json-repair 0.47.4__py3-none-any.whl → 0.47.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/__init__.py +2 -1
- json_repair/constants.py +4 -0
- json_repair/json_parser.py +17 -674
- json_repair/json_repair.py +2 -1
- json_repair/parse_array.py +50 -0
- json_repair/parse_boolean_or_null.py +24 -0
- json_repair/parse_comment.py +65 -0
- json_repair/parse_number.py +32 -0
- json_repair/parse_object.py +110 -0
- json_repair/parse_string.py +413 -0
- {json_repair-0.47.4.dist-info → json_repair-0.47.6.dist-info}/METADATA +1 -1
- json_repair-0.47.6.dist-info/RECORD +21 -0
- json_repair-0.47.4.dist-info/RECORD +0 -14
- {json_repair-0.47.4.dist-info → json_repair-0.47.6.dist-info}/WHEEL +0 -0
- {json_repair-0.47.4.dist-info → json_repair-0.47.6.dist-info}/entry_points.txt +0 -0
- {json_repair-0.47.4.dist-info → json_repair-0.47.6.dist-info}/licenses/LICENSE +0 -0
- {json_repair-0.47.4.dist-info → json_repair-0.47.6.dist-info}/top_level.txt +0 -0
json_repair/json_repair.py
CHANGED
@@ -0,0 +1,50 @@
|
|
1
|
+
from .constants import STRING_DELIMITERS, JSONReturnType
|
2
|
+
from .json_context import ContextValues
|
3
|
+
|
4
|
+
|
5
|
+
def parse_array(self) -> list[JSONReturnType]:
    """Parse the elements of a JSON array, tolerating malformed input.

    Grammar being approximated::

        <array> ::= '[' [ <json> *(', ' <json>) ] ']'

    Parsing starts at the parser's current index and pushes an ARRAY context
    for the duration of the call. Elements are read until a "]" or "}" is
    met or the input runs out; the index is then advanced by one and the
    context is popped.

    Returns:
        The list of parsed elements. Empty-string results and stray "..."
        placeholders are not included.
    """
    items: list[JSONReturnType] = []
    self.context.set(ContextValues.ARRAY)

    current = self.get_char_at()
    # Keep going until a closing bracket/brace shows up or the input is exhausted.
    while current and current not in ("]", "}"):
        self.skip_whitespaces_at()

        if current in STRING_DELIMITERS:
            # LLMs sometimes forget the opening "{" of an object inside an array,
            # so what looks like a string element may really be an object key.
            # Peek ahead (the main index is not moved) for a ":" after the string.
            # NOTE(review): skip_to_character presumably returns the offset of the
            # next occurrence of the delimiter - confirm against its definition.
            delimiter_offset = self.skip_to_character(current, 1)
            lookahead = self.skip_whitespaces_at(idx=delimiter_offset + 1, move_main_index=False)
            if self.get_char_at(lookahead) == ":":
                element = self.parse_object()
            else:
                element = self.parse_string()
        else:
            element = self.parse_json()

        if element == "":
            # Nothing usable was parsed; step forward so the loop makes progress.
            self.index += 1
        elif element == "..." and self.get_char_at(-1) == ".":
            self.log(
                "While parsing an array, found a stray '...'; ignoring it",
            )
        else:
            items.append(element)

        # Consume separators (whitespace and commas) up to the next element or "]".
        current = self.get_char_at()
        while current and current != "]" and (current == "," or current.isspace()):
            self.index += 1
            current = self.get_char_at()

    # LLM-generated JSON frequently lacks the final "]".
    if current and current != "]":
        self.log(
            "While parsing an array we missed the closing ], ignoring it",
        )

    self.index += 1

    self.context.reset()
    return items
|
@@ -0,0 +1,24 @@
|
|
1
|
+
def parse_boolean_or_null(self) -> bool | str | None:
|
2
|
+
# <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
|
3
|
+
starting_index = self.index
|
4
|
+
char = (self.get_char_at() or "").lower()
|
5
|
+
value: tuple[str, bool | None] | None = None
|
6
|
+
if char == "t":
|
7
|
+
value = ("true", True)
|
8
|
+
elif char == "f":
|
9
|
+
value = ("false", False)
|
10
|
+
elif char == "n":
|
11
|
+
value = ("null", None)
|
12
|
+
|
13
|
+
if value:
|
14
|
+
i = 0
|
15
|
+
while char and i < len(value[0]) and char == value[0][i]:
|
16
|
+
i += 1
|
17
|
+
self.index += 1
|
18
|
+
char = (self.get_char_at() or "").lower()
|
19
|
+
if i == len(value[0]):
|
20
|
+
return value[1]
|
21
|
+
|
22
|
+
# If nothing works reset the index before returning
|
23
|
+
self.index = starting_index
|
24
|
+
return ""
|
@@ -0,0 +1,65 @@
|
|
1
|
+
from .json_context import ContextValues
|
2
|
+
|
3
|
+
|
4
|
+
def parse_comment(self) -> str:
    """
    Parse code-like comments:

    - "# comment": A line comment that continues until a newline.
    - "// comment": A line comment that continues until a newline.
    - "/* comment */": A block comment that continues until the closing delimiter "*/".

    Line comments also stop early at a character that closes the surrounding
    structure: "]" when an array context is active, "}" when an object-value
    context is active, and ":" when an object-key context is active.

    The comment is skipped over and an empty string is returned so that comments do not interfere
    with the actual JSON elements. When no context is active, parsing continues
    with ``self.parse_json()`` and that result is returned instead.
    """
    char = self.get_char_at()
    termination_characters = ["\n", "\r"]
    if ContextValues.ARRAY in self.context.context:
        termination_characters.append("]")
    if ContextValues.OBJECT_VALUE in self.context.context:
        termination_characters.append("}")
    if ContextValues.OBJECT_KEY in self.context.context:
        termination_characters.append(":")
    # Line comment starting with #
    if char == "#":
        comment = ""
        while char and char not in termination_characters:
            comment += char
            self.index += 1
            char = self.get_char_at()
        self.log(f"Found line comment: {comment}, ignoring")
    # Comments starting with '/'
    elif char == "/":
        next_char = self.get_char_at(1)
        # Handle line comment starting with //
        if next_char == "/":
            comment = "//"
            self.index += 2  # Skip both slashes.
            char = self.get_char_at()
            while char and char not in termination_characters:
                comment += char
                self.index += 1
                char = self.get_char_at()
            self.log(f"Found line comment: {comment}, ignoring")
        # Handle block comment starting with /*
        elif next_char == "*":
            self.index += 2  # Skip '/*'
            # Only characters read AFTER the opener may form the closing "*/".
            # Testing the whole "/*..." buffer would let the opener's "*" pair
            # with an immediately following "/" and end the comment at "/*/".
            body: list[str] = []
            while True:
                char = self.get_char_at()
                if not char:
                    self.log("Reached end-of-string while parsing block comment; unclosed block comment.")
                    break
                body.append(char)
                self.index += 1
                if body[-2:] == ["*", "/"]:
                    break
            comment = "/*" + "".join(body)
            self.log(f"Found block comment: {comment}, ignoring")
        else:
            # Skip standalone '/' characters that are not part of a comment
            # to avoid getting stuck in an infinite loop
            self.index += 1
    if self.context.empty:
        return self.parse_json()
    else:
        return ""
|
@@ -0,0 +1,32 @@
|
|
1
|
+
from .constants import JSONReturnType
|
2
|
+
from .json_context import ContextValues
|
3
|
+
|
4
|
+
# Characters accepted while scanning a number-like token. Besides digits, sign,
# decimal point and exponent markers this also admits "/" and ",".
NUMBER_CHARS: set[str] = set("0123456789-.eE/,")


def parse_number(self) -> float | int | str | JSONReturnType:
    """Parse a number-like token starting at the parser's current index.

    Characters from ``NUMBER_CHARS`` are consumed greedily; inside an array
    context a comma ends the token instead of being part of it.

    Returns:
        - the token itself as a string when it contains a comma, or when it
          cannot be converted to a number;
        - a ``float`` when the token contains ".", "e" or "E";
        - an ``int`` otherwise;
        - the result of ``self.parse_string()`` when the token is immediately
          followed by a letter (the index is rewound to the token start first).
    """
    token = ""
    current = self.get_char_at()
    in_array = self.context.current == ContextValues.ARRAY
    while current and current in NUMBER_CHARS and not (in_array and current == ","):
        token += current
        self.index += 1
        current = self.get_char_at()

    if token and token[-1] in "-eE/,":
        # A trailing sign/exponent/separator is not part of the number: give it back.
        token = token[:-1]
        self.index -= 1
    elif (self.get_char_at() or "").isalpha():
        # Letters follow directly, so this was really a string: rewind and reparse.
        self.index -= len(token)
        return self.parse_string()

    # Comma-containing tokens are kept verbatim as text.
    if "," in token:
        return str(token)

    looks_like_float = "." in token or "e" in token or "E" in token
    try:
        return float(token) if looks_like_float else int(token)
    except ValueError:
        return token
|
@@ -0,0 +1,110 @@
|
|
1
|
+
from .constants import JSONReturnType
|
2
|
+
from .json_context import ContextValues
|
3
|
+
|
4
|
+
|
5
|
+
def parse_object(self) -> dict[str, JSONReturnType]:
    """Parse the members of a JSON object, tolerating malformed input.

    Grammar being approximated::

        <object> ::= '{' [ <member> *(', ' <member>) ] '}'
        <member> ::= <string> ': ' <json>

    Parsing starts at the parser's current index (no opening brace is
    consumed here) and runs until a "}" or the end of the input is found;
    the index is then advanced by one.

    Returns:
        A dict mapping each parsed key to its parsed value. A key whose value
        is missing maps to an empty string.
    """
    members: dict[str, JSONReturnType] = {}

    # Keep reading members until the closing brace or the end of the input.
    while (self.get_char_at() or "}") != "}":
        self.skip_whitespaces_at()

        # A ":" showing up before any key is noise: log it and step over it.
        if (self.get_char_at() or "") == ":":
            self.log(
                "While parsing an object we found a : before a key, ignoring",
            )
            self.index += 1

        # The string parser consults the context to cope with unquoted keys.
        # NOTE(review): the early `continue` / `break` exits further down leave
        # this OBJECT_KEY context without a matching reset() - confirm intended.
        self.context.set(ContextValues.OBJECT_KEY)

        # Remembered so the index can be rolled back if the key is a duplicate.
        rollback_index = self.index

        key = ""
        while self.get_char_at():
            # Refreshed on every pass so it stays accurate when the key is empty.
            rollback_index = self.index
            if self.get_char_at() == "[" and key == "":
                # An array where a key is expected: if the most recently stored
                # value is itself an array, parse this one and merge it in.
                last_key = next(reversed(members), None)
                if last_key and isinstance(members[last_key], list):
                    self.index += 1
                    extra = self.parse_array()
                    if isinstance(extra, list):
                        target = members[last_key]
                        if isinstance(target, list):
                            # A single nested list is flattened into the target.
                            if len(extra) == 1 and isinstance(extra[0], list):
                                target.extend(extra[0])
                            else:
                                target.extend(extra)
                        self.skip_whitespaces_at()
                        if self.get_char_at() == ",":
                            self.index += 1
                        self.skip_whitespaces_at()
                        continue

            key = str(self.parse_string())
            if key != "":
                break
            self.skip_whitespaces_at()
            # An empty key is accepted only when a divider or closing brace follows.
            if self.get_char_at() in (":", "}"):
                break

        if ContextValues.ARRAY in self.context.context and key in members:
            self.log(
                "While parsing an object we found a duplicate key, closing the object here and rolling back the index",
            )
            self.index = rollback_index - 1
            # Insert an opening brace so the repeated key starts a new object.
            insert_at = self.index + 1
            self.json_str = self.json_str[:insert_at] + "{" + self.json_str[insert_at:]
            break

        self.skip_whitespaces_at()

        # Closing brace or end of input right after the key: nothing more to read.
        if (self.get_char_at() or "}") == "}":
            continue

        self.skip_whitespaces_at()

        # Extreme case: the ":" after the key is missing.
        if (self.get_char_at() or "") != ":":
            self.log(
                "While parsing an object we missed a : after a key",
            )

        self.index += 1
        self.context.reset()
        self.context.set(ContextValues.OBJECT_VALUE)

        # The value can be any JSON element.
        self.skip_whitespaces_at()
        if (self.get_char_at() or "") in (",", "}"):
            # Corner case: a separator where the value should be.
            self.log(
                "While parsing an object value we found a stray , ignoring it",
            )
            value: JSONReturnType = ""
        else:
            value = self.parse_json()

        # The value is done, so drop the OBJECT_VALUE context.
        self.context.reset()
        members[key] = value

        if (self.get_char_at() or "") in (",", "'", '"'):
            self.index += 1

        # Drop whitespace trailing the member.
        self.skip_whitespaces_at()

    self.index += 1
    return members
|