PyPI - json-repair - Versions diffs - 0.39.0__tar.gz → 0.40.0__tar.gz - Mend

json-repair 0.39.0.tar.gz → 0.40.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20) hide show

{json_repair-0.39.0/src/json_repair.egg-info → json_repair-0.40.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: json_repair
-Version: 0.39.0
+Version: 0.40.0
 Summary: A package to repair broken json strings
 Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
 License: MIT License
@@ -196,12 +196,12 @@ pipx install json-repair
 to know all options available:
 ```
 $ json_repair -h
-usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] filename
+usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] [filename]
 Repair and parse JSON files.
 positional arguments:
-  filename              The JSON file to repair
+  filename              The JSON file to repair (if omitted, reads from stdin)
 options:
   -h, --help            show this help message and exit
@@ -226,13 +226,13 @@ In this example, any version that starts with `0.` will be acceptable, allowing
 # How to cite
 If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
-    @software{Baccianella_JSON_Repair_-_2024,
-        author = {Baccianella, Stefano},
-        month = aug,
-        title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
-        url = {https://github.com/mangiucugna/json_repair},
-        version = {0.28.3},
-        year = {2024}
+    @software{Baccianella_JSON_Repair_-_2025,
+        author  = "Stefano {Baccianella}",
+        month   = "feb",
+        title   = "JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs",
+        url     = "https://github.com/mangiucugna/json_repair",
+        version = "0.39.1",
+        year    = 2025
     }
 Thank you for citing my work and please send me a link to the paper if you can!

{json_repair-0.39.0 → json_repair-0.40.0}/README.md RENAMED Viewed

@@ -158,12 +158,12 @@ pipx install json-repair
 to know all options available:
 ```
 $ json_repair -h
-usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] filename
+usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] [filename]
 Repair and parse JSON files.
 positional arguments:
-  filename              The JSON file to repair
+  filename              The JSON file to repair (if omitted, reads from stdin)
 options:
   -h, --help            show this help message and exit
@@ -188,13 +188,13 @@ In this example, any version that starts with `0.` will be acceptable, allowing
 # How to cite
 If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
-    @software{Baccianella_JSON_Repair_-_2024,
-        author = {Baccianella, Stefano},
-        month = aug,
-        title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
-        url = {https://github.com/mangiucugna/json_repair},
-        version = {0.28.3},
-        year = {2024}
+    @software{Baccianella_JSON_Repair_-_2025,
+        author  = "Stefano {Baccianella}",
+        month   = "feb",
+        title   = "JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs",
+        url     = "https://github.com/mangiucugna/json_repair",
+        version = "0.39.1",
+        year    = 2025
     }
 Thank you for citing my work and please send me a link to the paper if you can!

{json_repair-0.39.0 → json_repair-0.40.0}/pyproject.toml RENAMED Viewed

@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"
 [project]
 name = "json_repair"
-version = "0.39.0"
+version = "0.40.0"
 license = {file = "LICENSE"}
 authors = [
   { name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },

{json_repair-0.39.0 → json_repair-0.40.0}/src/json_repair/json_parser.py RENAMED Viewed

@@ -9,6 +9,7 @@ JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
 class JSONParser:
     # Constants
     STRING_DELIMITERS = ['"', "'", "“", "”"]
+    NUMBER_CHARS = set("0123456789-.eE/,")
     def __init__(
         self,
@@ -129,8 +130,6 @@ class JSONParser:
             # Context is used in the string parser to manage the lack of quotes
             self.context.set(ContextValues.OBJECT_KEY)
-            self.skip_whitespaces_at()
             # Save this index in case we need find a duplicate key
             rollback_index = self.index
@@ -219,18 +218,13 @@ class JSONParser:
                 char = self.get_char_at()
         # Especially at the end of an LLM generated json you might miss the last "]"
-        char = self.get_char_at()
         if char and char != "]":
             self.log(
-                "While parsing an array we missed the closing ], adding it back",
-            )
-            self.index -= 1
-            # Add the missing closing bracket
-            self.json_str = (
-                self.json_str[: self.index + 1] + "]" + self.json_str[self.index + 1 :]
+                "While parsing an array we missed the closing ], ignoring it",
             )
         self.index += 1
         self.context.reset()
         return arr
@@ -275,15 +269,11 @@ class JSONParser:
             self.log(
                 "While parsing a string, we found a literal instead of a quote",
             )
-            self.log(
-                "While parsing a string, we found no starting quote. Will add the quote back",
-            )
             missing_quotes = True
         if not missing_quotes:
             self.index += 1
-        self.skip_whitespaces_at()
         # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
         if self.get_char_at() in self.STRING_DELIMITERS:
             # If the next character is the same type of quote, then we manage it as double quotes
@@ -583,6 +573,13 @@ class JSONParser:
                     elif (
                         next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\"
                     ):
+                        # Check if self.index:self.index+i is only whitespaces, break if that's the case
+                        if all(
+                            str(self.get_char_at(j)).isspace()
+                            for j in range(1, i)
+                            if self.get_char_at(j)
+                        ):
+                            break
                         if self.context.current == ContextValues.OBJECT_VALUE:
                             # But this might not be it! This could be just a missing comma
                             # We found a delimiter and we need to check if this is a key
@@ -610,27 +607,24 @@ class JSONParser:
                                 self.index += 1
                                 char = self.get_char_at()
                         elif self.context.current == ContextValues.ARRAY:
-                            # In array context this could be something like "lorem "ipsum" sic"
-                            # So let's check if we find a rstring_delimiter forward otherwise end early
-                            i = self.skip_to_character(rstring_delimiter, idx=i + 1)
-                            next_c = self.get_char_at(i)
-                            if next_c and next_c == rstring_delimiter:
-                                # Ok now if I find a comma or a closing ], that can be have also an optional rstring_delimiter before them
-                                # We can consider this a misplaced quote
-                                i += 1
-                                i = self.skip_whitespaces_at(
-                                    idx=i, move_main_index=False
-                                )
-                                next_c = self.get_char_at(i)
-                                if next_c and next_c in [",", "]"]:
-                                    self.log(
-                                        "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
-                                    )
-                                    unmatched_delimiter = not unmatched_delimiter
-                                    string_acc += str(char)
-                                    self.index += 1
-                                    char = self.get_char_at()
+                            # If we got up to here it means that this is a situation like this:
+                            # ["bla bla bla "puppy" bla bla bla "kitty" bla bla"]
+                            # So we need to ignore this quote
+                            self.log(
+                                "While parsing a string in Array context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
+                            )
+                            unmatched_delimiter = not unmatched_delimiter
+                            string_acc += str(char)
+                            self.index += 1
+                            char = self.get_char_at()
+                        elif self.context.current == ContextValues.OBJECT_KEY:
+                            # In this case we just ignore this and move on
+                            self.log(
+                                "While parsing a string in Object Key context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
+                            )
+                            string_acc += str(char)
+                            self.index += 1
+                            char = self.get_char_at()
         if (
             char
             and missing_quotes
@@ -663,10 +657,9 @@ class JSONParser:
     def parse_number(self) -> Union[float, int, str, JSONReturnType]:
         # <number> is a valid real number expressed in one of a number of given formats
         number_str = ""
-        number_chars = set("0123456789-.eE/,")
         char = self.get_char_at()
         is_array = self.context.current == ContextValues.ARRAY
-        while char and char in number_chars and (char != "," or not is_array):
+        while char and char in self.NUMBER_CHARS and (not is_array or char != ","):
             number_str += char
             self.index += 1
             char = self.get_char_at()
@@ -712,51 +705,6 @@ class JSONParser:
         self.index = starting_index
         return ""
-    def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
-        # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
-        try:
-            return self.json_str[self.index + count]
-        except IndexError:
-            return False
-    def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
-        """
-        This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
-        """
-        try:
-            char = self.json_str[self.index + idx]
-        except IndexError:
-            return idx
-        while char.isspace():
-            if move_main_index:
-                self.index += 1
-            else:
-                idx += 1
-            try:
-                char = self.json_str[self.index + idx]
-            except IndexError:
-                return idx
-        return idx
-    def skip_to_character(self, character: str, idx: int = 0) -> int:
-        """
-        This function quickly iterates to find a character, syntactic sugar to make the code more concise
-        """
-        try:
-            char = self.json_str[self.index + idx]
-        except IndexError:
-            return idx
-        while char != character:
-            idx += 1
-            try:
-                char = self.json_str[self.index + idx]
-            except IndexError:
-                return idx
-        if self.index + idx > 0 and self.json_str[self.index + idx - 1] == "\\":
-            # Ah this is an escaped character, try again
-            return self.skip_to_character(character=character, idx=idx + 1)
-        return idx
     def parse_comment(self) -> str:
         """
         Parse code-like comments:
@@ -827,6 +775,51 @@ class JSONParser:
             self.index += 1
             return ""
+    def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
+        # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
+        try:
+            return self.json_str[self.index + count]
+        except IndexError:
+            return False
+    def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
+        """
+        This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
+        """
+        try:
+            char = self.json_str[self.index + idx]
+        except IndexError:
+            return idx
+        while char.isspace():
+            if move_main_index:
+                self.index += 1
+            else:
+                idx += 1
+            try:
+                char = self.json_str[self.index + idx]
+            except IndexError:
+                return idx
+        return idx
+    def skip_to_character(self, character: str, idx: int = 0) -> int:
+        """
+        This function quickly iterates to find a character, syntactic sugar to make the code more concise
+        """
+        try:
+            char = self.json_str[self.index + idx]
+        except IndexError:
+            return idx
+        while char != character:
+            idx += 1
+            try:
+                char = self.json_str[self.index + idx]
+            except IndexError:
+                return idx
+        if self.index + idx > 0 and self.json_str[self.index + idx - 1] == "\\":
+            # Ah this is an escaped character, try again
+            return self.skip_to_character(character=character, idx=idx + 1)
+        return idx
     def _log(self, text: str) -> None:
         window: int = 10
         start: int = max(self.index - window, 0)

{json_repair-0.39.0 → json_repair-0.40.0/src/json_repair.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: json_repair
-Version: 0.39.0
+Version: 0.40.0
 Summary: A package to repair broken json strings
 Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
 License: MIT License
@@ -196,12 +196,12 @@ pipx install json-repair
 to know all options available:
 ```
 $ json_repair -h
-usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] filename
+usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] [filename]
 Repair and parse JSON files.
 positional arguments:
-  filename              The JSON file to repair
+  filename              The JSON file to repair (if omitted, reads from stdin)
 options:
   -h, --help            show this help message and exit
@@ -226,13 +226,13 @@ In this example, any version that starts with `0.` will be acceptable, allowing
 # How to cite
 If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
-    @software{Baccianella_JSON_Repair_-_2024,
-        author = {Baccianella, Stefano},
-        month = aug,
-        title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
-        url = {https://github.com/mangiucugna/json_repair},
-        version = {0.28.3},
-        year = {2024}
+    @software{Baccianella_JSON_Repair_-_2025,
+        author  = "Stefano {Baccianella}",
+        month   = "feb",
+        title   = "JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs",
+        url     = "https://github.com/mangiucugna/json_repair",
+        version = "0.39.1",
+        year    = 2025
     }
 Thank you for citing my work and please send me a link to the paper if you can!

{json_repair-0.39.0 → json_repair-0.40.0}/tests/test_json_repair.py RENAMED Viewed

@@ -126,6 +126,9 @@ def test_array_edge_cases():
     assert repair_json('["lorem "ipsum" sic"]') == '["lorem \\"ipsum\\" sic"]'
     assert repair_json('{"key1": ["value1", "value2"}, "key2": ["value3", "value4"]}') == '{"key1": ["value1", "value2"], "key2": ["value3", "value4"]}'
     assert repair_json('[ "value", /* comment */ "value2" ]') == '["value", "value2"]'
+    assert repair_json('{"key": ["value" "value1" "value2"]}') == '{"key": ["value", "value1", "value2"]}'
+    assert repair_json('{"key": ["lorem "ipsum" dolor "sit" amet, "consectetur" ", "lorem "ipsum" dolor", "lorem"]}') == '{"key": ["lorem \\"ipsum\\" dolor \\"sit\\" amet, \\"consectetur\\" ", "lorem \\"ipsum\\" dolor", "lorem"]}'
+    assert repair_json('{"k"e"y": "value"}') == '{"k\\"e\\"y": "value"}'
 def test_escaping():
     assert repair_json("'\"'") == '""'
@@ -150,7 +153,7 @@ def test_object_edge_cases():
     assert repair_json('{"lorem": ipsum, sic, datum.",}') == '{"lorem": "ipsum, sic, datum."}'
     assert repair_json('{"lorem": sic tamet. "ipsum": sic tamet, quick brown fox. "sic": ipsum}') == '{"lorem": "sic tamet.", "ipsum": "sic tamet", "sic": "ipsum"}'
     assert repair_json('{"lorem_ipsum": "sic tamet, quick brown fox. }') == '{"lorem_ipsum": "sic tamet, quick brown fox."}'
-    assert repair_json('{"key":value, " key2":"value2" }') == '{"key": "value", "key2": "value2"}'
+    assert repair_json('{"key":value, " key2":"value2" }') == '{"key": "value", " key2": "value2"}'
     assert repair_json('{"key":value "key2":"value2" }') == '{"key": "value", "key2": "value2"}'
     assert repair_json("{'text': 'words{words in brackets}more words'}") == '{"text": "words{words in brackets}more words"}'
     assert repair_json('{text:words{words in brackets}}') == '{"text": "words{words in brackets}"}'
@@ -265,8 +268,8 @@ def test_repair_json_from_file():
         # Write content to the temporary file
         with os.fdopen(temp_fd, 'w') as tmp:
             tmp.write("{key:value}")
-        assert from_file(filename=temp_path, logging=True) == ({'key': 'value'}, [{'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object key context, we found a :, stopping here',}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}, {'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn\'t determine that a right delimiter was present. Stopping here'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}])
-        assert from_file(filename=temp_path, logging=True, chunk_length=2) == ({'key': 'value'}, [{'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object key context, we found a :, stopping here',}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}, {'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn\'t determine that a right delimiter was present. Stopping here'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}])
+        assert from_file(filename=temp_path, logging=True) == ({'key': 'value'}, [{'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object key context, we found a :, stopping here',}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}, {'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn\'t determine that a right delimiter was present. Stopping here'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}])
+        assert from_file(filename=temp_path, logging=True, chunk_length=2) == ({'key': 'value'}, [{'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object key context, we found a :, stopping here',}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}, {'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn\'t determine that a right delimiter was present. Stopping here'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}])
     finally:
         # Clean up - delete the temporary file
         os.remove(temp_path)