json-repair 0.47.3__tar.gz → 0.47.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. {json_repair-0.47.3/src/json_repair.egg-info → json_repair-0.47.5}/PKG-INFO +2 -1
  2. {json_repair-0.47.3 → json_repair-0.47.5}/README.md +1 -0
  3. {json_repair-0.47.3 → json_repair-0.47.5}/pyproject.toml +1 -1
  4. json_repair-0.47.5/src/json_repair/__init__.py +4 -0
  5. json_repair-0.47.5/src/json_repair/constants.py +4 -0
  6. json_repair-0.47.5/src/json_repair/json_parser.py +173 -0
  7. {json_repair-0.47.3 → json_repair-0.47.5}/src/json_repair/json_repair.py +2 -1
  8. json_repair-0.47.5/src/json_repair/parse_array.py +50 -0
  9. json_repair-0.47.5/src/json_repair/parse_boolean_or_null.py +24 -0
  10. json_repair-0.47.5/src/json_repair/parse_comment.py +65 -0
  11. json_repair-0.47.5/src/json_repair/parse_number.py +32 -0
  12. json_repair-0.47.5/src/json_repair/parse_object.py +110 -0
  13. json_repair-0.47.5/src/json_repair/parse_string.py +396 -0
  14. {json_repair-0.47.3 → json_repair-0.47.5/src/json_repair.egg-info}/PKG-INFO +2 -1
  15. {json_repair-0.47.3 → json_repair-0.47.5}/src/json_repair.egg-info/SOURCES.txt +8 -0
  16. {json_repair-0.47.3 → json_repair-0.47.5}/tests/test_json_repair.py +0 -14
  17. json_repair-0.47.5/tests/test_parse_comment.py +19 -0
  18. json_repair-0.47.3/src/json_repair/__init__.py +0 -3
  19. json_repair-0.47.3/src/json_repair/json_parser.py +0 -828
  20. {json_repair-0.47.3 → json_repair-0.47.5}/LICENSE +0 -0
  21. {json_repair-0.47.3 → json_repair-0.47.5}/setup.cfg +0 -0
  22. {json_repair-0.47.3 → json_repair-0.47.5}/src/json_repair/__main__.py +0 -0
  23. {json_repair-0.47.3 → json_repair-0.47.5}/src/json_repair/json_context.py +0 -0
  24. {json_repair-0.47.3 → json_repair-0.47.5}/src/json_repair/object_comparer.py +0 -0
  25. {json_repair-0.47.3 → json_repair-0.47.5}/src/json_repair/py.typed +0 -0
  26. {json_repair-0.47.3 → json_repair-0.47.5}/src/json_repair/string_file_wrapper.py +0 -0
  27. {json_repair-0.47.3 → json_repair-0.47.5}/src/json_repair.egg-info/dependency_links.txt +0 -0
  28. {json_repair-0.47.3 → json_repair-0.47.5}/src/json_repair.egg-info/entry_points.txt +0 -0
  29. {json_repair-0.47.3 → json_repair-0.47.5}/src/json_repair.egg-info/top_level.txt +0 -0
  30. {json_repair-0.47.3 → json_repair-0.47.5}/tests/test_performance.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: json_repair
3
- Version: 0.47.3
3
+ Version: 0.47.5
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -294,6 +294,7 @@ You will need owner access to this repository
294
294
  - Go: https://github.com/RealAlexandreAI/json-repair
295
295
  - Ruby: https://github.com/sashazykov/json-repair-rb
296
296
  - Rust: https://github.com/oramasearch/llm_json
297
+ - R: https://github.com/cgxjdzz/jsonRepair
297
298
  ---
298
299
  ## Star History
299
300
 
@@ -255,6 +255,7 @@ You will need owner access to this repository
255
255
  - Go: https://github.com/RealAlexandreAI/json-repair
256
256
  - Ruby: https://github.com/sashazykov/json-repair-rb
257
257
  - Rust: https://github.com/oramasearch/llm_json
258
+ - R: https://github.com/cgxjdzz/jsonRepair
258
259
  ---
259
260
  ## Star History
260
261
 
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
3
3
  build-backend = "setuptools.build_meta"
4
4
  [project]
5
5
  name = "json_repair"
6
- version = "0.47.3"
6
+ version = "0.47.5"
7
7
  license = {file = "LICENSE"}
8
8
  authors = [
9
9
  { name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },
@@ -0,0 +1,4 @@
1
+ from .constants import JSONReturnType
2
+ from .json_repair import from_file, load, loads, repair_json
3
+
4
# Names exported by the package: the public repair/load helpers plus the
# JSONReturnType alias.
__all__ = ["from_file", "load", "loads", "repair_json", "JSONReturnType"]
@@ -0,0 +1,4 @@
1
+ from typing import Any
2
+
3
# Type alias used as the return annotation of the parsing functions: any value
# a JSON document can decode to (object, array, string, number, boolean, null).
JSONReturnType = dict[str, Any] | list[Any] | str | float | int | bool | None
# Characters treated as string quotes: ASCII double and single quotes plus the
# typographic (curly) double quotes.
STRING_DELIMITERS: list[str] = ['"', "'", "“", "”"]
@@ -0,0 +1,173 @@
1
+ from typing import Literal, TextIO
2
+
3
+ from .constants import STRING_DELIMITERS, JSONReturnType
4
+ from .json_context import JsonContext
5
+ from .object_comparer import ObjectComparer
6
+ from .parse_array import parse_array
7
+ from .parse_boolean_or_null import parse_boolean_or_null
8
+ from .parse_comment import parse_comment
9
+ from .parse_number import parse_number
10
+ from .parse_object import parse_object
11
+ from .parse_string import parse_string
12
+ from .string_file_wrapper import StringFileWrapper
13
+
14
+
15
class JSONParser:
    """Cursor-based parser that rebuilds Python values from possibly malformed JSON text.

    The per-element parsers are implemented in sibling modules and bound below as
    methods; this class holds the shared state (input, cursor, context, log) and
    the low-level scanning helpers they rely on.
    """

    # Element parsers live in their own modules to keep this file small
    parse_array = parse_array
    parse_boolean_or_null = parse_boolean_or_null
    parse_comment = parse_comment
    parse_number = parse_number
    parse_object = parse_object
    parse_string = parse_string

    def __init__(
        self,
        json_str: str | StringFileWrapper,
        json_fd: TextIO | None,
        logging: bool | None,
        json_fd_chunk_length: int = 0,
        stream_stable: bool = False,
    ) -> None:
        """Set up the parser state.

        Args:
            json_str: The text to parse.
            json_fd: Optional file object holding the JSON; when given it is used
                instead of ``json_str``.
            logging: When truthy, repair actions are recorded and returned by ``parse``.
            json_fd_chunk_length: Chunk length forwarded to ``StringFileWrapper``.
            stream_stable: Keep repair results stable for partially streamed input.
        """
        # A file descriptor takes precedence: the wrapper lets the rest of the
        # parser index and slice the file as if it were a string
        self.json_str: str | StringFileWrapper = (
            StringFileWrapper(json_fd, json_fd_chunk_length) if json_fd else json_str
        )
        # Cursor pointing at the character currently being examined
        self.index: int = 0
        # Used while parsing object members to handle missing quotes in keys or values
        self.context = JsonContext()
        # self.log is called unconditionally throughout the parsers; rather than
        # guarding every call site, it is bound to a no-op when logging is off
        self.logging = logging
        if logging:
            self.logger: list[dict[str, str]] = []
            self.log = self._log
        else:
            self.log = lambda *args, **kwargs: None  # noqa: ARG005
        # Set to True when the input is the accumulation of a JSON stream at some
        # moment in time (e.g. a partial LLM response) and the repair result must
        # stay stable as more text arrives. For example:
        # case 1: '{"key": "val\\' => '{"key": "val"}'
        # case 2: '{"key": "val\\n' => '{"key": "val\\n"}'
        # case 3: '{"key": "val\\n123,`key2:value2' => '{"key": "val\\n123,`key2:value2"}'
        # case 4: '{"key": "val\\n123,`key2:value2`"}' => '{"key": "val\\n123,`key2:value2`"}'
        self.stream_stable = stream_stable

    def parse(
        self,
    ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
        """Parse the whole input.

        Returns the parsed value, or a list of values when several top-level
        elements are found. When logging is enabled the result is a
        ``(value, log_entries)`` tuple instead.
        """
        result = self.parse_json()
        if self.index < len(self.json_str):
            self.log(
                "The parser returned early, checking if there's more json elements",
            )
            elements = [result]
            while self.index < len(self.json_str):
                candidate = self.parse_json()
                if candidate == "":
                    # Nothing usable at this position, step over one character
                    self.index += 1
                    continue
                if ObjectComparer.is_same_object(elements[-1], candidate):
                    # The new element looks like an update of the previous one: replace it
                    elements.pop()
                elements.append(candidate)
            if len(elements) == 1:
                # Nothing extra was found, so do not wrap the value in a list
                self.log(
                    "There were no more elements, returning the element without the array",
                )
                result = elements[0]
            else:
                result = elements
        if self.logging:
            return result, self.logger
        return result

    def parse_json(
        self,
    ) -> JSONReturnType:
        """Parse the next element starting at the cursor.

        Characters that cannot start an element are skipped one at a time.
        Returns ``""`` when the end of the input is reached.
        """
        while True:
            char = self.get_char_at()
            if char is False:
                # End of the provided string
                return ""
            if char == "{":
                # <object>
                self.index += 1
                return self.parse_object()
            if char == "[":
                # <array>
                self.index += 1
                return self.parse_array()
            if not self.context.empty:
                # Strings and numbers are only accepted while the context is not empty
                if char in STRING_DELIMITERS or char.isalpha():
                    return self.parse_string()
                if char.isdigit() or char == "-" or char == ".":
                    return self.parse_number()
            if char in ("#", "/"):
                return self.parse_comment()
            # Anything else is ignored
            self.index += 1

    def get_char_at(self, count: int = 0) -> str | Literal[False]:
        """Return the character ``count`` positions from the cursor, or False past the end."""
        position = self.index + count
        # try/except is used instead of a bounds check because the lookup
        # succeeds far more often than it fails
        try:
            return self.json_str[position]
        except IndexError:
            return False

    def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
        """Skip consecutive whitespace starting ``idx`` positions from the cursor.

        With ``move_main_index`` the cursor itself advances and ``idx`` is returned
        unchanged; otherwise the cursor stays put and the advanced offset is returned.
        """
        while True:
            try:
                char = self.json_str[self.index + idx]
            except IndexError:
                return idx
            if not char.isspace():
                return idx
            if move_main_index:
                self.index += 1
            else:
                idx += 1

    def skip_to_character(self, character: str, idx: int = 0) -> int:
        """Return the offset from the cursor of the next ``character`` at or after ``idx``.

        The cursor is not moved. If the end of the input is reached first, the
        offset just past the end is returned.
        """
        while True:
            try:
                char = self.json_str[self.index + idx]
            except IndexError:
                return idx
            if char == character:
                return idx
            idx += 1

    def _log(self, text: str) -> None:
        """Record ``text`` together with the input surrounding the cursor."""
        radius = 10
        lower = max(self.index - radius, 0)
        upper = min(self.index + radius, len(self.json_str))
        self.logger.append(
            {
                "text": text,
                "context": self.json_str[lower:upper],
            }
        )
@@ -27,7 +27,8 @@ import json
27
27
  import sys
28
28
  from typing import Literal, TextIO, overload
29
29
 
30
- from .json_parser import JSONParser, JSONReturnType
30
+ from .constants import JSONReturnType
31
+ from .json_parser import JSONParser
31
32
 
32
33
 
33
34
  @overload
@@ -0,0 +1,50 @@
1
+ from .constants import STRING_DELIMITERS, JSONReturnType
2
+ from .json_context import ContextValues
3
+
4
+
5
def parse_array(self) -> list[JSONReturnType]:
    """Parse an array whose opening '[' has already been consumed.

    <array> ::= '[' [ <json> *(', ' <json>) ] ']'

    Returns the list of parsed elements. The array context is pushed on entry
    and reset before returning.
    """
    self.context.set(ContextValues.ARRAY)
    items = []
    current = self.get_char_at()
    # Stop at a closing bracket/brace or when the input is exhausted
    while current and current not in ("]", "}"):
        self.skip_whitespaces_at()
        if current in STRING_DELIMITERS:
            # LLMs sometimes forget to open an object, so what looks like a string
            # element may really be a key. Look past the matching quote and any
            # whitespace: a ':' there means this is an object member.
            closing_quote = self.skip_to_character(current, 1)
            after_quote = self.skip_whitespaces_at(idx=closing_quote + 1, move_main_index=False)
            if self.get_char_at(after_quote) == ":":
                element: JSONReturnType = self.parse_object()
            else:
                element = self.parse_string()
        else:
            element = self.parse_json()

        if element == "":
            # Nothing valid was parsed, step over one character
            self.index += 1
        elif element == "..." and self.get_char_at(-1) == ".":
            self.log(
                "While parsing an array, found a stray '...'; ignoring it",
            )
        else:
            items.append(element)

        # Skip separators and whitespace between a value and the next one or the closing ]
        current = self.get_char_at()
        while current and current != "]" and (current.isspace() or current == ","):
            self.index += 1
            current = self.get_char_at()

    # The closing "]" is often missing at the end of LLM-generated JSON
    if current and current != "]":
        self.log(
            "While parsing an array we missed the closing ], ignoring it",
        )

    self.index += 1

    self.context.reset()
    return items
@@ -0,0 +1,24 @@
1
+ def parse_boolean_or_null(self) -> bool | str | None:
2
+ # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
3
+ starting_index = self.index
4
+ char = (self.get_char_at() or "").lower()
5
+ value: tuple[str, bool | None] | None = None
6
+ if char == "t":
7
+ value = ("true", True)
8
+ elif char == "f":
9
+ value = ("false", False)
10
+ elif char == "n":
11
+ value = ("null", None)
12
+
13
+ if value:
14
+ i = 0
15
+ while char and i < len(value[0]) and char == value[0][i]:
16
+ i += 1
17
+ self.index += 1
18
+ char = (self.get_char_at() or "").lower()
19
+ if i == len(value[0]):
20
+ return value[1]
21
+
22
+ # If nothing works reset the index before returning
23
+ self.index = starting_index
24
+ return ""
@@ -0,0 +1,65 @@
1
+ from .json_context import ContextValues
2
+
3
+
4
def parse_comment(self) -> str:
    """
    Parse code-like comments:

    - "# comment": A line comment that continues until a newline.
    - "// comment": A line comment that continues until a newline.
    - "/* comment */": A block comment that continues until the closing delimiter "*/".

    Inside an array, object value or object key, line comments additionally stop
    at "]", "}" or ":" respectively, so the delimiter is left for the enclosing parser.

    The comment is skipped over and an empty string is returned so that comments do not interfere
    with the actual JSON elements. When the context is empty, parsing continues and the result of
    ``parse_json()`` is returned instead.
    """
    char = self.get_char_at()
    termination_characters = ["\n", "\r"]
    if ContextValues.ARRAY in self.context.context:
        termination_characters.append("]")
    if ContextValues.OBJECT_VALUE in self.context.context:
        termination_characters.append("}")
    if ContextValues.OBJECT_KEY in self.context.context:
        termination_characters.append(":")
    # Line comment starting with #
    if char == "#":
        comment = _consume_line_comment(self, "", termination_characters)
        self.log(f"Found line comment: {comment}, ignoring")
    # Comments starting with '/'
    elif char == "/":
        next_char = self.get_char_at(1)
        # Handle line comment starting with //
        if next_char == "/":
            self.index += 2  # Skip both slashes.
            comment = _consume_line_comment(self, "//", termination_characters)
            self.log(f"Found line comment: {comment}, ignoring")
        # Handle block comment starting with /*
        elif next_char == "*":
            comment = "/*"
            self.index += 2  # Skip '/*'
            while True:
                char = self.get_char_at()
                if not char:
                    self.log("Reached end-of-string while parsing block comment; unclosed block comment.")
                    break
                comment += char
                self.index += 1
                # Require at least "/**/": the '*' of the opener must not double
                # as the '*' of the terminator, otherwise "/*/" would wrongly
                # count as a complete comment
                if len(comment) >= 4 and comment.endswith("*/"):
                    break
            self.log(f"Found block comment: {comment}, ignoring")
        else:
            # Skip standalone '/' characters that are not part of a comment
            # to avoid getting stuck in an infinite loop
            self.index += 1
    if self.context.empty:
        return self.parse_json()
    else:
        return ""


def _consume_line_comment(self, prefix: str, termination_characters: list[str]) -> str:
    """Advance the cursor to the end of a line comment and return the skipped text.

    ``prefix`` is comment text the caller has already consumed (e.g. "//").
    Reading stops, without consuming it, at the first termination character or
    at the end of the input.
    """
    pieces = [prefix]
    char = self.get_char_at()
    while char and char not in termination_characters:
        pieces.append(char)
        self.index += 1
        char = self.get_char_at()
    return "".join(pieces)
@@ -0,0 +1,32 @@
1
+ from .constants import JSONReturnType
2
+ from .json_context import ContextValues
3
+
4
# Characters that may appear in a number-like token; '/' and ',' are included so
# that values such as fractions or comma-separated figures are read in one piece.
NUMBER_CHARS: set[str] = set("0123456789-.eE/,")


def parse_number(self) -> float | int | str | JSONReturnType:
    """Read a number-like token at the cursor.

    Returns an ``int`` or ``float`` when the token converts cleanly, the raw text
    when it contains a comma or cannot be converted, or the result of
    ``parse_string()`` when the token turns out to be the start of a word.
    """
    # Inside an array a comma separates elements, so it must end the token
    stop_at_comma = self.context.current == ContextValues.ARRAY
    collected: list[str] = []
    current = self.get_char_at()
    while current and current in NUMBER_CHARS:
        if stop_at_comma and current == ",":
            break
        collected.append(current)
        self.index += 1
        current = self.get_char_at()
    literal = "".join(collected)

    if literal.endswith(("-", "e", "E", "/", ",")):
        # The token ends with a character that cannot close a number/currency: give it back
        literal = literal[:-1]
        self.index -= 1
    elif (self.get_char_at() or "").isalpha():
        # Digits followed directly by a letter: this was a string, rewind and reparse
        self.index -= len(literal)
        return self.parse_string()

    if "," in literal:
        return literal
    try:
        if any(marker in literal for marker in (".", "e", "E")):
            return float(literal)
        return int(literal)
    except ValueError:
        return literal
@@ -0,0 +1,110 @@
1
+ from .constants import JSONReturnType
2
+ from .json_context import ContextValues
3
+
4
+
5
def parse_object(self) -> dict[str, JSONReturnType]:
    """Parse an object starting at the cursor and return it as a dict.

    The loop reads ``key: value`` members until a "}" or the end of the input is
    found, tolerating a stray ":" before a key, a missing ":" after a key, a
    missing value, and a bare array that continues the previous member's array.
    The cursor is advanced one position past the stop point before returning.
    """
    # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
    obj: dict[str, JSONReturnType] = {}
    # Stop when you either find the closing brace or you have iterated over the entire string
    while (self.get_char_at() or "}") != "}":
        # This is what we expect to find:
        # <member> ::= <string> ': ' <json>

        # Skip filler whitespaces
        self.skip_whitespaces_at()

        # Sometimes LLMs do weird things: if we find a ":" before any key, log it and skip over it
        if (self.get_char_at() or "") == ":":
            self.log(
                "While parsing an object we found a : before a key, ignoring",
            )
            self.index += 1

        # We are now searching for the string key
        # Context is used in the string parser to manage the lack of quotes
        self.context.set(ContextValues.OBJECT_KEY)

        # Save this index in case we find a duplicate key
        rollback_index = self.index

        # <member> starts with a <string>
        key = ""
        while self.get_char_at():
            # The rollback index needs to be updated here in case the key is empty
            rollback_index = self.index
            if self.get_char_at() == "[" and key == "":
                # Is this an array?
                # Need to check if the previous parsed value contained in obj is an array and in that case parse and merge the two
                prev_key = list(obj.keys())[-1] if obj else None
                if prev_key and isinstance(obj[prev_key], list):
                    # If the previous key's value is an array, parse the new array and merge
                    self.index += 1
                    new_array = self.parse_array()
                    if isinstance(new_array, list):
                        # Merge and flatten the arrays: a single nested list is unwrapped before extending
                        prev_value = obj[prev_key]
                        if isinstance(prev_value, list):
                            prev_value.extend(
                                new_array[0] if len(new_array) == 1 and isinstance(new_array[0], list) else new_array
                            )
                        # Step over an optional "," after the merged array, then look for a key again
                        self.skip_whitespaces_at()
                        if self.get_char_at() == ",":
                            self.index += 1
                        self.skip_whitespaces_at()
                        continue
            key = str(self.parse_string())
            if key == "":
                self.skip_whitespaces_at()
            if key != "" or (key == "" and self.get_char_at() in [":", "}"]):
                # If the string is empty but there is an object divider, we are done here
                break
        # NOTE(review): nesting reconstructed from a flattened diff; this check is
        # taken to sit at member-loop level because its break is meant to close the object
        if ContextValues.ARRAY in self.context.context and key in obj:
            self.log(
                "While parsing an object we found a duplicate key, closing the object here and rolling back the index",
            )
            self.index = rollback_index - 1
            # add an opening curly brace to make this work
            self.json_str = self.json_str[: self.index + 1] + "{" + self.json_str[self.index + 1 :]
            break

        # Skip filler whitespaces
        self.skip_whitespaces_at()

        # We reached the end here: let the loop condition terminate the object
        if (self.get_char_at() or "}") == "}":
            continue

        self.skip_whitespaces_at()

        # An extreme case of missing ":" after a key
        if (self.get_char_at() or "") != ":":
            self.log(
                "While parsing an object we missed a : after a key",
            )

        # The cursor advances by one whether or not a ":" was actually found
        self.index += 1
        # Swap the key context for the value context
        self.context.reset()
        self.context.set(ContextValues.OBJECT_VALUE)
        # The value can be any valid json
        self.skip_whitespaces_at()
        # Corner case, a lone comma (or closing brace): the value stays an empty string
        value: JSONReturnType = ""
        if (self.get_char_at() or "") in [",", "}"]:
            self.log(
                "While parsing an object value we found a stray , ignoring it",
            )
        else:
            value = self.parse_json()

        # Reset context since our job is done
        self.context.reset()
        obj[key] = value

        # Step over the member separator, or a leftover quote character
        if (self.get_char_at() or "") in [",", "'", '"']:
            self.index += 1

        # Remove trailing spaces
        self.skip_whitespaces_at()

    # Move past the character the loop stopped on
    self.index += 1
    return obj