PyPI - json-repair - Versions diffs - 0.47.4__tar.gz → 0.47.6__tar.gz - Mend

json-repair 0.47.4__tar.gz → 0.47.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

{json_repair-0.47.4/src/json_repair.egg-info → json_repair-0.47.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: json_repair
-Version: 0.47.4
+Version: 0.47.6
 Summary: A package to repair broken json strings
 Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
 License: MIT License

{json_repair-0.47.4 → json_repair-0.47.6}/pyproject.toml RENAMED Viewed

@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"
 [project]
 name = "json_repair"
-version = "0.47.4"
+version = "0.47.6"
 license = {file = "LICENSE"}
 authors = [
   { name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },

json_repair-0.47.6/src/json_repair/__init__.py ADDED Viewed

@@ -0,0 +1,4 @@
+from .constants import JSONReturnType
+from .json_repair import from_file, load, loads, repair_json
+__all__ = ["from_file", "load", "loads", "repair_json", "JSONReturnType"]

json_repair-0.47.6/src/json_repair/constants.py ADDED Viewed

@@ -0,0 +1,4 @@
+from typing import Any
+JSONReturnType = dict[str, Any] | list[Any] | str | float | int | bool | None
+STRING_DELIMITERS: list[str] = ['"', "'", "“", "”"]

json_repair-0.47.6/src/json_repair/json_parser.py ADDED Viewed

@@ -0,0 +1,173 @@
+from typing import Literal, TextIO
+from .constants import STRING_DELIMITERS, JSONReturnType
+from .json_context import JsonContext
+from .object_comparer import ObjectComparer
+from .parse_array import parse_array
+from .parse_boolean_or_null import parse_boolean_or_null
+from .parse_comment import parse_comment
+from .parse_number import parse_number
+from .parse_object import parse_object
+from .parse_string import parse_string
+from .string_file_wrapper import StringFileWrapper
+class JSONParser:
+    # Split the parse methods into separate files because this one was like 3000 lines
+    parse_array = parse_array
+    parse_boolean_or_null = parse_boolean_or_null
+    parse_comment = parse_comment
+    parse_number = parse_number
+    parse_object = parse_object
+    parse_string = parse_string
+    def __init__(
+        self,
+        json_str: str | StringFileWrapper,
+        json_fd: TextIO | None,
+        logging: bool | None,
+        json_fd_chunk_length: int = 0,
+        stream_stable: bool = False,
+    ) -> None:
+        # The string to parse
+        self.json_str: str | StringFileWrapper = json_str
+        # Alternatively, the file description with a json file in it
+        if json_fd:
+            # This is a trick we do to treat the file wrapper as an array
+            self.json_str = StringFileWrapper(json_fd, json_fd_chunk_length)
+        # Index is our iterator that will keep track of which character we are looking at right now
+        self.index: int = 0
+        # This is used in the object member parsing to manage the special cases of missing quotes in key or value
+        self.context = JsonContext()
+        # Use this to log the activity, but only if logging is active
+        # This is a trick but a beautiful one. We call self.log in the code over and over even if it's not needed.
+        # We could add a guard in the code for each call but that would make this code unreadable, so here's this neat trick
+        # Replace self.log with a noop
+        self.logging = logging
+        if logging:
+            self.logger: list[dict[str, str]] = []
+            self.log = self._log
+        else:
+            # No-op
+            self.log = lambda *args, **kwargs: None  # noqa: ARG005
+        # When the json to be repaired is the accumulation of streaming json at a certain moment.
+        # e.g. json obtained from llm response.
+        # If this parameter to True will keep the repair results stable. For example:
+        #   case 1:  '{"key": "val\\' => '{"key": "val"}'
+        #   case 2:  '{"key": "val\\n' => '{"key": "val\\n"}'
+        #   case 3:  '{"key": "val\\n123,`key2:value2' => '{"key": "val\\n123,`key2:value2"}'
+        #   case 4:  '{"key": "val\\n123,`key2:value2`"}' => '{"key": "val\\n123,`key2:value2`"}'
+        self.stream_stable = stream_stable
+    def parse(
+        self,
+    ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
+        json = self.parse_json()
+        if self.index < len(self.json_str):
+            self.log(
+                "The parser returned early, checking if there's more json elements",
+            )
+            json = [json]
+            while self.index < len(self.json_str):
+                j = self.parse_json()
+                if j != "":
+                    if ObjectComparer.is_same_object(json[-1], j):
+                        # replace the last entry with the new one since the new one seems an update
+                        json.pop()
+                    json.append(j)
+                else:
+                    # this was a bust, move the index
+                    self.index += 1
+            # If nothing extra was found, don't return an array
+            if len(json) == 1:
+                self.log(
+                    "There were no more elements, returning the element without the array",
+                )
+                json = json[0]
+        if self.logging:
+            return json, self.logger
+        else:
+            return json
+    def parse_json(
+        self,
+    ) -> JSONReturnType:
+        while True:
+            char = self.get_char_at()
+            # False means that we are at the end of the string provided
+            if char is False:
+                return ""
+            # <object> starts with '{'
+            elif char == "{":
+                self.index += 1
+                return self.parse_object()
+            # <array> starts with '['
+            elif char == "[":
+                self.index += 1
+                return self.parse_array()
+            # <string> starts with a quote
+            elif not self.context.empty and (char in STRING_DELIMITERS or char.isalpha()):
+                return self.parse_string()
+            # <number> starts with [0-9] or minus
+            elif not self.context.empty and (char.isdigit() or char == "-" or char == "."):
+                return self.parse_number()
+            elif char in ["#", "/"]:
+                return self.parse_comment()
+            # If everything else fails, we just ignore and move on
+            else:
+                self.index += 1
+    def get_char_at(self, count: int = 0) -> str | Literal[False]:
+        # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
+        try:
+            return self.json_str[self.index + count]
+        except IndexError:
+            return False
+    def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
+        """
+        This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
+        """
+        try:
+            char = self.json_str[self.index + idx]
+        except IndexError:
+            return idx
+        while char.isspace():
+            if move_main_index:
+                self.index += 1
+            else:
+                idx += 1
+            try:
+                char = self.json_str[self.index + idx]
+            except IndexError:
+                return idx
+        return idx
+    def skip_to_character(self, character: str, idx: int = 0) -> int:
+        """
+        This function quickly iterates to find a character, syntactic sugar to make the code more concise
+        """
+        try:
+            char = self.json_str[self.index + idx]
+        except IndexError:
+            return idx
+        while char != character:
+            idx += 1
+            try:
+                char = self.json_str[self.index + idx]
+            except IndexError:
+                return idx
+        return idx
+    def _log(self, text: str) -> None:
+        window: int = 10
+        start: int = max(self.index - window, 0)
+        end: int = min(self.index + window, len(self.json_str))
+        context: str = self.json_str[start:end]
+        self.logger.append(
+            {
+                "text": text,
+                "context": context,
+            }
+        )

{json_repair-0.47.4 → json_repair-0.47.6}/src/json_repair/json_repair.py RENAMED Viewed

@@ -27,7 +27,8 @@ import json
 import sys
 from typing import Literal, TextIO, overload
-from .json_parser import JSONParser, JSONReturnType
+from .constants import JSONReturnType
+from .json_parser import JSONParser
 @overload

json_repair-0.47.6/src/json_repair/parse_array.py ADDED Viewed

@@ -0,0 +1,50 @@
+from .constants import STRING_DELIMITERS, JSONReturnType
+from .json_context import ContextValues
+def parse_array(self) -> list[JSONReturnType]:
+    # <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
+    arr = []
+    self.context.set(ContextValues.ARRAY)
+    # Stop when you either find the closing parentheses or you have iterated over the entire string
+    char = self.get_char_at()
+    while char and char not in ["]", "}"]:
+        self.skip_whitespaces_at()
+        value: JSONReturnType = ""
+        if char in STRING_DELIMITERS:
+            # Sometimes it can happen that LLMs forget to start an object and then you think it's a string in an array
+            # So we are going to check if this string is followed by a : or not
+            # And either parse the string or parse the object
+            i = 1
+            i = self.skip_to_character(char, i)
+            i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
+            value = self.parse_object() if self.get_char_at(i) == ":" else self.parse_string()
+        else:
+            value = self.parse_json()
+        # It is possible that parse_json() returns nothing valid, so we increase by 1
+        if value == "":
+            self.index += 1
+        elif value == "..." and self.get_char_at(-1) == ".":
+            self.log(
+                "While parsing an array, found a stray '...'; ignoring it",
+            )
+        else:
+            arr.append(value)
+        # skip over whitespace after a value but before closing ]
+        char = self.get_char_at()
+        while char and char != "]" and (char.isspace() or char == ","):
+            self.index += 1
+            char = self.get_char_at()
+    # Especially at the end of an LLM generated json you might miss the last "]"
+    if char and char != "]":
+        self.log(
+            "While parsing an array we missed the closing ], ignoring it",
+        )
+    self.index += 1
+    self.context.reset()
+    return arr

json_repair-0.47.6/src/json_repair/parse_boolean_or_null.py ADDED Viewed

@@ -0,0 +1,24 @@
+def parse_boolean_or_null(self) -> bool | str | None:
+    # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
+    starting_index = self.index
+    char = (self.get_char_at() or "").lower()
+    value: tuple[str, bool | None] | None = None
+    if char == "t":
+        value = ("true", True)
+    elif char == "f":
+        value = ("false", False)
+    elif char == "n":
+        value = ("null", None)
+    if value:
+        i = 0
+        while char and i < len(value[0]) and char == value[0][i]:
+            i += 1
+            self.index += 1
+            char = (self.get_char_at() or "").lower()
+        if i == len(value[0]):
+            return value[1]
+    # If nothing works reset the index before returning
+    self.index = starting_index
+    return ""

json_repair-0.47.6/src/json_repair/parse_comment.py ADDED Viewed

@@ -0,0 +1,65 @@
+from .json_context import ContextValues
+def parse_comment(self) -> str:
+    """
+    Parse code-like comments:
+    - "# comment": A line comment that continues until a newline.
+    - "// comment": A line comment that continues until a newline.
+    - "/* comment */": A block comment that continues until the closing delimiter "*/".
+    The comment is skipped over and an empty string is returned so that comments do not interfere
+    with the actual JSON elements.
+    """
+    char = self.get_char_at()
+    termination_characters = ["\n", "\r"]
+    if ContextValues.ARRAY in self.context.context:
+        termination_characters.append("]")
+    if ContextValues.OBJECT_VALUE in self.context.context:
+        termination_characters.append("}")
+    if ContextValues.OBJECT_KEY in self.context.context:
+        termination_characters.append(":")
+    # Line comment starting with #
+    if char == "#":
+        comment = ""
+        while char and char not in termination_characters:
+            comment += char
+            self.index += 1
+            char = self.get_char_at()
+        self.log(f"Found line comment: {comment}, ignoring")
+    # Comments starting with '/'
+    elif char == "/":
+        next_char = self.get_char_at(1)
+        # Handle line comment starting with //
+        if next_char == "/":
+            comment = "//"
+            self.index += 2  # Skip both slashes.
+            char = self.get_char_at()
+            while char and char not in termination_characters:
+                comment += char
+                self.index += 1
+                char = self.get_char_at()
+            self.log(f"Found line comment: {comment}, ignoring")
+        # Handle block comment starting with /*
+        elif next_char == "*":
+            comment = "/*"
+            self.index += 2  # Skip '/*'
+            while True:
+                char = self.get_char_at()
+                if not char:
+                    self.log("Reached end-of-string while parsing block comment; unclosed block comment.")
+                    break
+                comment += char
+                self.index += 1
+                if comment.endswith("*/"):
+                    break
+            self.log(f"Found block comment: {comment}, ignoring")
+        else:
+            # Skip standalone '/' characters that are not part of a comment
+            # to avoid getting stuck in an infinite loop
+            self.index += 1
+    if self.context.empty:
+        return self.parse_json()
+    else:
+        return ""

json_repair-0.47.6/src/json_repair/parse_number.py ADDED Viewed

@@ -0,0 +1,32 @@
+from .constants import JSONReturnType
+from .json_context import ContextValues
+NUMBER_CHARS: set[str] = set("0123456789-.eE/,")
+def parse_number(self) -> float | int | str | JSONReturnType:
+    # <number> is a valid real number expressed in one of a number of given formats
+    number_str = ""
+    char = self.get_char_at()
+    is_array = self.context.current == ContextValues.ARRAY
+    while char and char in NUMBER_CHARS and (not is_array or char != ","):
+        number_str += char
+        self.index += 1
+        char = self.get_char_at()
+    if number_str and number_str[-1] in "-eE/,":
+        # The number ends with a non valid character for a number/currency, rolling back one
+        number_str = number_str[:-1]
+        self.index -= 1
+    elif (self.get_char_at() or "").isalpha():
+        # this was a string instead, sorry
+        self.index -= len(number_str)
+        return self.parse_string()
+    try:
+        if "," in number_str:
+            return str(number_str)
+        if "." in number_str or "e" in number_str or "E" in number_str:
+            return float(number_str)
+        else:
+            return int(number_str)
+    except ValueError:
+        return number_str

json_repair-0.47.6/src/json_repair/parse_object.py ADDED Viewed

@@ -0,0 +1,110 @@
+from .constants import JSONReturnType
+from .json_context import ContextValues
+def parse_object(self) -> dict[str, JSONReturnType]:
+    # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
+    obj: dict[str, JSONReturnType] = {}
+    # Stop when you either find the closing parentheses or you have iterated over the entire string
+    while (self.get_char_at() or "}") != "}":
+        # This is what we expect to find:
+        # <member> ::= <string> ': ' <json>
+        # Skip filler whitespaces
+        self.skip_whitespaces_at()
+        # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
+        if (self.get_char_at() or "") == ":":
+            self.log(
+                "While parsing an object we found a : before a key, ignoring",
+            )
+            self.index += 1
+        # We are now searching for they string key
+        # Context is used in the string parser to manage the lack of quotes
+        self.context.set(ContextValues.OBJECT_KEY)
+        # Save this index in case we need find a duplicate key
+        rollback_index = self.index
+        # <member> starts with a <string>
+        key = ""
+        while self.get_char_at():
+            # The rollback index needs to be updated here in case the key is empty
+            rollback_index = self.index
+            if self.get_char_at() == "[" and key == "":
+                # Is this an array?
+                # Need to check if the previous parsed value contained in obj is an array and in that case parse and merge the two
+                prev_key = list(obj.keys())[-1] if obj else None
+                if prev_key and isinstance(obj[prev_key], list):
+                    # If the previous key's value is an array, parse the new array and merge
+                    self.index += 1
+                    new_array = self.parse_array()
+                    if isinstance(new_array, list):
+                        # Merge and flatten the arrays
+                        prev_value = obj[prev_key]
+                        if isinstance(prev_value, list):
+                            prev_value.extend(
+                                new_array[0] if len(new_array) == 1 and isinstance(new_array[0], list) else new_array
+                            )
+                        self.skip_whitespaces_at()
+                        if self.get_char_at() == ",":
+                            self.index += 1
+                        self.skip_whitespaces_at()
+                        continue
+            key = str(self.parse_string())
+            if key == "":
+                self.skip_whitespaces_at()
+            if key != "" or (key == "" and self.get_char_at() in [":", "}"]):
+                # If the string is empty but there is a object divider, we are done here
+                break
+        if ContextValues.ARRAY in self.context.context and key in obj:
+            self.log(
+                "While parsing an object we found a duplicate key, closing the object here and rolling back the index",
+            )
+            self.index = rollback_index - 1
+            # add an opening curly brace to make this work
+            self.json_str = self.json_str[: self.index + 1] + "{" + self.json_str[self.index + 1 :]
+            break
+        # Skip filler whitespaces
+        self.skip_whitespaces_at()
+        # We reached the end here
+        if (self.get_char_at() or "}") == "}":
+            continue
+        self.skip_whitespaces_at()
+        # An extreme case of missing ":" after a key
+        if (self.get_char_at() or "") != ":":
+            self.log(
+                "While parsing an object we missed a : after a key",
+            )
+        self.index += 1
+        self.context.reset()
+        self.context.set(ContextValues.OBJECT_VALUE)
+        # The value can be any valid json
+        self.skip_whitespaces_at()
+        # Corner case, a lone comma
+        value: JSONReturnType = ""
+        if (self.get_char_at() or "") in [",", "}"]:
+            self.log(
+                "While parsing an object value we found a stray , ignoring it",
+            )
+        else:
+            value = self.parse_json()
+        # Reset context since our job is done
+        self.context.reset()
+        obj[key] = value
+        if (self.get_char_at() or "") in [",", "'", '"']:
+            self.index += 1
+        # Remove trailing spaces
+        self.skip_whitespaces_at()
+    self.index += 1
+    return obj

json-repair 0.47.4__tar.gz → 0.47.6__tar.gz

json-repair 0.47.4__tar.gz → 0.47.6__tar.gz