json-repair 0.53.0__py3-none-any.whl → 0.53.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/__init__.py +1 -1
- json_repair/json_parser.py +36 -34
- json_repair/json_repair.py +6 -6
- json_repair/parse_array.py +8 -8
- json_repair/parse_comment.py +2 -2
- json_repair/parse_number.py +3 -2
- json_repair/parse_object.py +18 -18
- json_repair/parse_string.py +46 -54
- json_repair/parse_string_helpers/parse_boolean_or_null.py +28 -0
- json_repair/parse_string_helpers/parse_json_llm_block.py +7 -7
- {json_repair-0.53.0.dist-info → json_repair-0.53.1.dist-info}/METADATA +1 -1
- json_repair-0.53.1.dist-info/RECORD +22 -0
- json_repair/parse_boolean_or_null.py +0 -30
- json_repair-0.53.0.dist-info/RECORD +0 -22
- /json_repair/{constants.py → utils/constants.py} +0 -0
- /json_repair/{json_context.py → utils/json_context.py} +0 -0
- /json_repair/{object_comparer.py → utils/object_comparer.py} +0 -0
- /json_repair/{string_file_wrapper.py → utils/string_file_wrapper.py} +0 -0
- {json_repair-0.53.0.dist-info → json_repair-0.53.1.dist-info}/WHEEL +0 -0
- {json_repair-0.53.0.dist-info → json_repair-0.53.1.dist-info}/entry_points.txt +0 -0
- {json_repair-0.53.0.dist-info → json_repair-0.53.1.dist-info}/licenses/LICENSE +0 -0
- {json_repair-0.53.0.dist-info → json_repair-0.53.1.dist-info}/top_level.txt +0 -0
json_repair/__init__.py
CHANGED
json_repair/json_parser.py
CHANGED
|
@@ -1,36 +1,32 @@
|
|
|
1
|
-
from typing import
|
|
1
|
+
from typing import TextIO
|
|
2
2
|
|
|
3
|
-
from .constants import STRING_DELIMITERS, JSONReturnType
|
|
4
|
-
from .json_context import JsonContext
|
|
5
|
-
from .object_comparer import ObjectComparer
|
|
6
3
|
from .parse_array import parse_array as _parse_array
|
|
7
|
-
from .parse_boolean_or_null import parse_boolean_or_null as _parse_boolean_or_null
|
|
8
4
|
from .parse_comment import parse_comment as _parse_comment
|
|
9
5
|
from .parse_number import parse_number as _parse_number
|
|
10
6
|
from .parse_object import parse_object as _parse_object
|
|
11
7
|
from .parse_string import parse_string as _parse_string
|
|
12
|
-
from .
|
|
8
|
+
from .utils.constants import STRING_DELIMITERS, JSONReturnType
|
|
9
|
+
from .utils.json_context import JsonContext
|
|
10
|
+
from .utils.object_comparer import ObjectComparer
|
|
11
|
+
from .utils.string_file_wrapper import StringFileWrapper
|
|
13
12
|
|
|
14
13
|
|
|
15
14
|
class JSONParser:
|
|
16
15
|
# Split the parse methods into separate files because this one was like 3000 lines
|
|
17
|
-
def parse_array(self
|
|
18
|
-
return _parse_array(self
|
|
16
|
+
def parse_array(self) -> list[JSONReturnType]:
|
|
17
|
+
return _parse_array(self)
|
|
19
18
|
|
|
20
|
-
def
|
|
21
|
-
return
|
|
19
|
+
def parse_comment(self) -> JSONReturnType:
|
|
20
|
+
return _parse_comment(self)
|
|
22
21
|
|
|
23
|
-
def
|
|
24
|
-
return
|
|
22
|
+
def parse_number(self) -> JSONReturnType:
|
|
23
|
+
return _parse_number(self)
|
|
25
24
|
|
|
26
|
-
def
|
|
27
|
-
return
|
|
25
|
+
def parse_object(self) -> JSONReturnType:
|
|
26
|
+
return _parse_object(self)
|
|
28
27
|
|
|
29
|
-
def
|
|
30
|
-
return
|
|
31
|
-
|
|
32
|
-
def parse_string(self, *args, **kwargs):
|
|
33
|
-
return _parse_string(self, *args, **kwargs)
|
|
28
|
+
def parse_string(self) -> JSONReturnType:
|
|
29
|
+
return _parse_string(self)
|
|
34
30
|
|
|
35
31
|
def __init__(
|
|
36
32
|
self,
|
|
@@ -107,8 +103,8 @@ class JSONParser:
|
|
|
107
103
|
) -> JSONReturnType:
|
|
108
104
|
while True:
|
|
109
105
|
char = self.get_char_at()
|
|
110
|
-
#
|
|
111
|
-
if char is
|
|
106
|
+
# None means that we are at the end of the string provided
|
|
107
|
+
if char is None:
|
|
112
108
|
return ""
|
|
113
109
|
# <object> starts with '{'
|
|
114
110
|
elif char == "{":
|
|
@@ -130,30 +126,36 @@ class JSONParser:
|
|
|
130
126
|
else:
|
|
131
127
|
self.index += 1
|
|
132
128
|
|
|
133
|
-
def get_char_at(self, count: int = 0) -> str |
|
|
129
|
+
def get_char_at(self, count: int = 0) -> str | None:
|
|
134
130
|
# Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
|
|
135
131
|
try:
|
|
136
132
|
return self.json_str[self.index + count]
|
|
137
133
|
except IndexError:
|
|
138
|
-
return
|
|
134
|
+
return None
|
|
139
135
|
|
|
140
|
-
def
|
|
136
|
+
def skip_whitespaces(self) -> None:
|
|
141
137
|
"""
|
|
142
|
-
This function quickly iterates on whitespaces,
|
|
138
|
+
This function quickly iterates on whitespaces, moving the self.index forward
|
|
143
139
|
"""
|
|
144
140
|
try:
|
|
145
|
-
char = self.json_str[self.index
|
|
146
|
-
|
|
147
|
-
return idx
|
|
148
|
-
while char.isspace():
|
|
149
|
-
if move_main_index:
|
|
141
|
+
char = self.json_str[self.index]
|
|
142
|
+
while char.isspace():
|
|
150
143
|
self.index += 1
|
|
151
|
-
|
|
144
|
+
char = self.json_str[self.index]
|
|
145
|
+
except IndexError:
|
|
146
|
+
pass
|
|
147
|
+
|
|
148
|
+
def scroll_whitespaces(self, idx: int = 0) -> int:
|
|
149
|
+
"""
|
|
150
|
+
This function quickly iterates on whitespaces. Doesn't move the self.index and returns the offset from self.index
|
|
151
|
+
"""
|
|
152
|
+
try:
|
|
153
|
+
char = self.json_str[self.index + idx]
|
|
154
|
+
while char.isspace():
|
|
152
155
|
idx += 1
|
|
153
|
-
try:
|
|
154
156
|
char = self.json_str[self.index + idx]
|
|
155
|
-
|
|
156
|
-
|
|
157
|
+
except IndexError:
|
|
158
|
+
pass
|
|
157
159
|
return idx
|
|
158
160
|
|
|
159
161
|
def skip_to_character(self, character: str | list[str], idx: int = 0) -> int:
|
json_repair/json_repair.py
CHANGED
|
@@ -25,10 +25,10 @@ All supported use cases are in the unit tests
|
|
|
25
25
|
import argparse
|
|
26
26
|
import json
|
|
27
27
|
import sys
|
|
28
|
-
from typing import Literal, TextIO, overload
|
|
28
|
+
from typing import Any, Literal, TextIO, overload
|
|
29
29
|
|
|
30
|
-
from .constants import JSONReturnType
|
|
31
30
|
from .json_parser import JSONParser
|
|
31
|
+
from .utils.constants import JSONReturnType
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
@overload
|
|
@@ -40,7 +40,7 @@ def repair_json(
|
|
|
40
40
|
json_fd: TextIO | None = None,
|
|
41
41
|
chunk_length: int = 0,
|
|
42
42
|
stream_stable: bool = False,
|
|
43
|
-
**json_dumps_args,
|
|
43
|
+
**json_dumps_args: Any,
|
|
44
44
|
) -> str: ...
|
|
45
45
|
|
|
46
46
|
|
|
@@ -53,7 +53,7 @@ def repair_json(
|
|
|
53
53
|
json_fd: TextIO | None = None,
|
|
54
54
|
chunk_length: int = 0,
|
|
55
55
|
stream_stable: bool = False,
|
|
56
|
-
**json_dumps_args,
|
|
56
|
+
**json_dumps_args: Any,
|
|
57
57
|
) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]: ...
|
|
58
58
|
|
|
59
59
|
|
|
@@ -65,8 +65,8 @@ def repair_json(
|
|
|
65
65
|
json_fd: TextIO | None = None,
|
|
66
66
|
chunk_length: int = 0,
|
|
67
67
|
stream_stable: bool = False,
|
|
68
|
-
**json_dumps_args,
|
|
69
|
-
) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]
|
|
68
|
+
**json_dumps_args: Any,
|
|
69
|
+
) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
|
|
70
70
|
"""
|
|
71
71
|
Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
|
|
72
72
|
|
json_repair/parse_array.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
from typing import TYPE_CHECKING
|
|
2
2
|
|
|
3
|
-
from .constants import STRING_DELIMITERS, JSONReturnType
|
|
4
|
-
from .json_context import ContextValues
|
|
5
|
-
from .object_comparer import ObjectComparer
|
|
3
|
+
from .utils.constants import STRING_DELIMITERS, JSONReturnType
|
|
4
|
+
from .utils.json_context import ContextValues
|
|
5
|
+
from .utils.object_comparer import ObjectComparer
|
|
6
6
|
|
|
7
7
|
if TYPE_CHECKING:
|
|
8
8
|
from .json_parser import JSONParser
|
|
@@ -15,7 +15,7 @@ def parse_array(self: "JSONParser") -> list[JSONReturnType]:
|
|
|
15
15
|
# Stop when you either find the closing parentheses or you have iterated over the entire string
|
|
16
16
|
char = self.get_char_at()
|
|
17
17
|
while char and char not in ["]", "}"]:
|
|
18
|
-
self.
|
|
18
|
+
self.skip_whitespaces()
|
|
19
19
|
value: JSONReturnType = ""
|
|
20
20
|
if char in STRING_DELIMITERS:
|
|
21
21
|
# Sometimes it can happen that LLMs forget to start an object and then you think it's a string in an array
|
|
@@ -23,13 +23,13 @@ def parse_array(self: "JSONParser") -> list[JSONReturnType]:
|
|
|
23
23
|
# And either parse the string or parse the object
|
|
24
24
|
i = 1
|
|
25
25
|
i = self.skip_to_character(char, i)
|
|
26
|
-
i = self.
|
|
26
|
+
i = self.scroll_whitespaces(idx=i + 1)
|
|
27
27
|
value = self.parse_object() if self.get_char_at(i) == ":" else self.parse_string()
|
|
28
28
|
else:
|
|
29
29
|
value = self.parse_json()
|
|
30
30
|
|
|
31
|
-
# It is possible that parse_json() returns nothing valid, so we increase by 1
|
|
32
|
-
if ObjectComparer.is_strictly_empty(value):
|
|
31
|
+
# It is possible that parse_json() returns nothing valid, so we increase by 1, unless we find an array separator
|
|
32
|
+
if ObjectComparer.is_strictly_empty(value) and self.get_char_at() not in ["]", ","]:
|
|
33
33
|
self.index += 1
|
|
34
34
|
elif value == "..." and self.get_char_at(-1) == ".":
|
|
35
35
|
self.log(
|
|
@@ -45,7 +45,7 @@ def parse_array(self: "JSONParser") -> list[JSONReturnType]:
|
|
|
45
45
|
char = self.get_char_at()
|
|
46
46
|
|
|
47
47
|
# Especially at the end of an LLM generated json you might miss the last "]"
|
|
48
|
-
if char
|
|
48
|
+
if char != "]":
|
|
49
49
|
self.log(
|
|
50
50
|
"While parsing an array we missed the closing ], ignoring it",
|
|
51
51
|
)
|
json_repair/parse_comment.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import TYPE_CHECKING
|
|
2
2
|
|
|
3
|
-
from .constants import JSONReturnType
|
|
4
|
-
from .json_context import ContextValues
|
|
3
|
+
from .utils.constants import JSONReturnType
|
|
4
|
+
from .utils.json_context import ContextValues
|
|
5
5
|
|
|
6
6
|
if TYPE_CHECKING:
|
|
7
7
|
from .json_parser import JSONParser
|
json_repair/parse_number.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from typing import TYPE_CHECKING
|
|
2
2
|
|
|
3
|
-
from .
|
|
3
|
+
from .utils.constants import JSONReturnType
|
|
4
|
+
from .utils.json_context import ContextValues
|
|
4
5
|
|
|
5
6
|
NUMBER_CHARS: set[str] = set("0123456789-.eE/,")
|
|
6
7
|
|
|
@@ -9,7 +10,7 @@ if TYPE_CHECKING:
|
|
|
9
10
|
from .json_parser import JSONParser
|
|
10
11
|
|
|
11
12
|
|
|
12
|
-
def parse_number(self: "JSONParser") ->
|
|
13
|
+
def parse_number(self: "JSONParser") -> JSONReturnType:
|
|
13
14
|
# <number> is a valid real number expressed in one of a number of given formats
|
|
14
15
|
number_str = ""
|
|
15
16
|
char = self.get_char_at()
|
json_repair/parse_object.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
from typing import TYPE_CHECKING
|
|
2
2
|
|
|
3
|
-
from .constants import STRING_DELIMITERS, JSONReturnType
|
|
4
|
-
from .json_context import ContextValues
|
|
3
|
+
from .utils.constants import STRING_DELIMITERS, JSONReturnType
|
|
4
|
+
from .utils.json_context import ContextValues
|
|
5
5
|
|
|
6
6
|
if TYPE_CHECKING:
|
|
7
7
|
from .json_parser import JSONParser
|
|
@@ -17,10 +17,10 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
|
|
|
17
17
|
# <member> ::= <string> ': ' <json>
|
|
18
18
|
|
|
19
19
|
# Skip filler whitespaces
|
|
20
|
-
self.
|
|
20
|
+
self.skip_whitespaces()
|
|
21
21
|
|
|
22
22
|
# Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
|
|
23
|
-
if
|
|
23
|
+
if self.get_char_at() == ":":
|
|
24
24
|
self.log(
|
|
25
25
|
"While parsing an object we found a : before a key, ignoring",
|
|
26
26
|
)
|
|
@@ -53,14 +53,14 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
|
|
|
53
53
|
prev_value.extend(
|
|
54
54
|
new_array[0] if len(new_array) == 1 and isinstance(new_array[0], list) else new_array
|
|
55
55
|
)
|
|
56
|
-
self.
|
|
56
|
+
self.skip_whitespaces()
|
|
57
57
|
if self.get_char_at() == ",":
|
|
58
58
|
self.index += 1
|
|
59
|
-
self.
|
|
59
|
+
self.skip_whitespaces()
|
|
60
60
|
continue
|
|
61
61
|
key = str(self.parse_string())
|
|
62
62
|
if key == "":
|
|
63
|
-
self.
|
|
63
|
+
self.skip_whitespaces()
|
|
64
64
|
if key != "" or (key == "" and self.get_char_at() in [":", "}"]):
|
|
65
65
|
# If the string is empty but there is a object divider, we are done here
|
|
66
66
|
break
|
|
@@ -74,16 +74,16 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
|
|
|
74
74
|
break
|
|
75
75
|
|
|
76
76
|
# Skip filler whitespaces
|
|
77
|
-
self.
|
|
77
|
+
self.skip_whitespaces()
|
|
78
78
|
|
|
79
79
|
# We reached the end here
|
|
80
80
|
if (self.get_char_at() or "}") == "}":
|
|
81
81
|
continue
|
|
82
82
|
|
|
83
|
-
self.
|
|
83
|
+
self.skip_whitespaces()
|
|
84
84
|
|
|
85
85
|
# An extreme case of missing ":" after a key
|
|
86
|
-
if
|
|
86
|
+
if self.get_char_at() != ":":
|
|
87
87
|
self.log(
|
|
88
88
|
"While parsing an object we missed a : after a key",
|
|
89
89
|
)
|
|
@@ -92,10 +92,10 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
|
|
|
92
92
|
self.context.reset()
|
|
93
93
|
self.context.set(ContextValues.OBJECT_VALUE)
|
|
94
94
|
# The value can be any valid json
|
|
95
|
-
self.
|
|
95
|
+
self.skip_whitespaces()
|
|
96
96
|
# Corner case, a lone comma
|
|
97
97
|
value: JSONReturnType = ""
|
|
98
|
-
if
|
|
98
|
+
if self.get_char_at() in [",", "}"]:
|
|
99
99
|
self.log(
|
|
100
100
|
"While parsing an object value we found a stray , ignoring it",
|
|
101
101
|
)
|
|
@@ -106,11 +106,11 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
|
|
|
106
106
|
self.context.reset()
|
|
107
107
|
obj[key] = value
|
|
108
108
|
|
|
109
|
-
if
|
|
109
|
+
if self.get_char_at() in [",", "'", '"']:
|
|
110
110
|
self.index += 1
|
|
111
111
|
|
|
112
112
|
# Remove trailing spaces
|
|
113
|
-
self.
|
|
113
|
+
self.skip_whitespaces()
|
|
114
114
|
|
|
115
115
|
self.index += 1
|
|
116
116
|
|
|
@@ -126,12 +126,12 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
|
|
|
126
126
|
if not self.context.empty:
|
|
127
127
|
return obj
|
|
128
128
|
|
|
129
|
-
self.
|
|
130
|
-
if
|
|
129
|
+
self.skip_whitespaces()
|
|
130
|
+
if self.get_char_at() != ",":
|
|
131
131
|
return obj
|
|
132
132
|
self.index += 1
|
|
133
|
-
self.
|
|
134
|
-
if
|
|
133
|
+
self.skip_whitespaces()
|
|
134
|
+
if self.get_char_at() not in STRING_DELIMITERS:
|
|
135
135
|
return obj
|
|
136
136
|
self.log(
|
|
137
137
|
"Found a comma and string delimiter after object closing brace, checking for additional key-value pairs",
|
json_repair/parse_string.py
CHANGED
|
@@ -1,14 +1,22 @@
|
|
|
1
1
|
from typing import TYPE_CHECKING
|
|
2
2
|
|
|
3
|
-
from .
|
|
4
|
-
from .json_context import ContextValues
|
|
3
|
+
from .parse_string_helpers.parse_boolean_or_null import parse_boolean_or_null
|
|
5
4
|
from .parse_string_helpers.parse_json_llm_block import parse_json_llm_block
|
|
5
|
+
from .utils.constants import STRING_DELIMITERS, JSONReturnType
|
|
6
|
+
from .utils.json_context import ContextValues
|
|
6
7
|
|
|
7
8
|
if TYPE_CHECKING:
|
|
8
9
|
from .json_parser import JSONParser
|
|
9
10
|
|
|
10
11
|
|
|
11
12
|
def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
13
|
+
# Utility function to append a character to the accumulator and update the index
|
|
14
|
+
def _append_literal_char(acc: str, current_char: str | None) -> tuple[str, str | None]:
|
|
15
|
+
acc += str(current_char)
|
|
16
|
+
self.index += 1
|
|
17
|
+
char = self.get_char_at()
|
|
18
|
+
return acc, char
|
|
19
|
+
|
|
12
20
|
# <string> is a string of valid characters enclosed in quotes
|
|
13
21
|
# i.e. { name: "John" }
|
|
14
22
|
# Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
|
|
@@ -40,7 +48,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
40
48
|
# This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
|
|
41
49
|
# But remember, object keys are only of type string
|
|
42
50
|
if char.lower() in ["t", "f", "n"] and self.context.current != ContextValues.OBJECT_KEY:
|
|
43
|
-
value =
|
|
51
|
+
value = parse_boolean_or_null(self)
|
|
44
52
|
if value != "":
|
|
45
53
|
return value
|
|
46
54
|
self.log(
|
|
@@ -59,10 +67,12 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
59
67
|
"While parsing a string, we found code fences but they did not enclose valid JSON, continuing parsing the string",
|
|
60
68
|
)
|
|
61
69
|
# There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
|
|
62
|
-
if self.get_char_at()
|
|
70
|
+
if self.get_char_at() == lstring_delimiter:
|
|
63
71
|
# If it's an empty key, this was easy
|
|
64
|
-
if (
|
|
65
|
-
self.context.current == ContextValues.
|
|
72
|
+
if (
|
|
73
|
+
(self.context.current == ContextValues.OBJECT_KEY and self.get_char_at(1) == ":")
|
|
74
|
+
or (self.context.current == ContextValues.OBJECT_VALUE and self.get_char_at(1) in [",", "}"])
|
|
75
|
+
or (self.context.current == ContextValues.ARRAY and self.get_char_at(1) in [",", "]"])
|
|
66
76
|
):
|
|
67
77
|
self.index += 1
|
|
68
78
|
return ""
|
|
@@ -77,7 +87,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
77
87
|
next_c = self.get_char_at(i)
|
|
78
88
|
# Now check that the next character is also a delimiter to ensure that we have "".....""
|
|
79
89
|
# In that case we ignore this rstring delimiter
|
|
80
|
-
if
|
|
90
|
+
if self.get_char_at(i + 1) == rstring_delimiter:
|
|
81
91
|
self.log(
|
|
82
92
|
"While parsing a string, we found a valid starting doubled quote",
|
|
83
93
|
)
|
|
@@ -85,7 +95,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
85
95
|
self.index += 1
|
|
86
96
|
else:
|
|
87
97
|
# Ok this is not a doubled quote, check if this is an empty string or not
|
|
88
|
-
i = self.
|
|
98
|
+
i = self.scroll_whitespaces(idx=1)
|
|
89
99
|
next_c = self.get_char_at(i)
|
|
90
100
|
if next_c in STRING_DELIMITERS + ["{", "["]:
|
|
91
101
|
# something fishy is going on here
|
|
@@ -135,7 +145,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
135
145
|
):
|
|
136
146
|
rstring_delimiter_missing = True
|
|
137
147
|
# check if this is a case in which the closing comma is NOT missing instead
|
|
138
|
-
self.
|
|
148
|
+
self.skip_whitespaces()
|
|
139
149
|
if self.get_char_at(1) == "\\":
|
|
140
150
|
# Ok this is a quoted string, skip
|
|
141
151
|
rstring_delimiter_missing = False
|
|
@@ -145,7 +155,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
145
155
|
i += 1
|
|
146
156
|
# found a delimiter, now we need to check that is followed strictly by a comma or brace
|
|
147
157
|
# or the string ended
|
|
148
|
-
i = self.
|
|
158
|
+
i = self.scroll_whitespaces(idx=i)
|
|
149
159
|
next_c = self.get_char_at(i)
|
|
150
160
|
if not next_c or next_c in [",", "}"]:
|
|
151
161
|
rstring_delimiter_missing = False
|
|
@@ -160,7 +170,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
160
170
|
else:
|
|
161
171
|
# But again, this could just be something a bit stupid like "lorem, "ipsum" sic"
|
|
162
172
|
# Check if we find a : afterwards (skipping space)
|
|
163
|
-
i = self.
|
|
173
|
+
i = self.scroll_whitespaces(idx=i + 1)
|
|
164
174
|
next_c = self.get_char_at(i)
|
|
165
175
|
if next_c and next_c != ":":
|
|
166
176
|
rstring_delimiter_missing = False
|
|
@@ -175,7 +185,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
175
185
|
break
|
|
176
186
|
else:
|
|
177
187
|
# skip any whitespace first
|
|
178
|
-
i = self.
|
|
188
|
+
i = self.scroll_whitespaces(idx=1)
|
|
179
189
|
# We couldn't find any rstring_delimeter before the end of the string
|
|
180
190
|
# check if this is the last string of an object and therefore we can keep going
|
|
181
191
|
# make an exception if this is the last char before the closing brace
|
|
@@ -212,19 +222,15 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
212
222
|
if self.context.current == ContextValues.OBJECT_VALUE and char == "}":
|
|
213
223
|
# We found the end of an object while parsing a value
|
|
214
224
|
# Check if the object is really over, to avoid doubling the closing brace
|
|
215
|
-
i = self.
|
|
225
|
+
i = self.scroll_whitespaces(idx=1)
|
|
216
226
|
next_c = self.get_char_at(i)
|
|
217
|
-
if next_c and
|
|
227
|
+
if next_c == "`" and self.get_char_at(i + 1) == "`" and self.get_char_at(i + 2) == "`":
|
|
218
228
|
# This could be a special case in which the LLM added code fences after the object
|
|
219
229
|
# So we need to check if there are another two ` after this one`
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
self.log(
|
|
225
|
-
"While parsing a string in object value context, we found a } that closes the object before code fences, stopping here",
|
|
226
|
-
)
|
|
227
|
-
break
|
|
230
|
+
self.log(
|
|
231
|
+
"While parsing a string in object value context, we found a } that closes the object before code fences, stopping here",
|
|
232
|
+
)
|
|
233
|
+
break
|
|
228
234
|
if not next_c:
|
|
229
235
|
self.log(
|
|
230
236
|
"While parsing a string in object value context, we found a } that closes the object, stopping here",
|
|
@@ -282,12 +288,11 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
282
288
|
# found a second delimiter
|
|
283
289
|
i += 1
|
|
284
290
|
# Skip spaces
|
|
285
|
-
i = self.
|
|
286
|
-
|
|
287
|
-
if next_c and next_c in [",", "}"]:
|
|
291
|
+
i = self.scroll_whitespaces(idx=i)
|
|
292
|
+
if self.get_char_at(i) in [",", "}"]:
|
|
288
293
|
# Ok then this is a missing right quote
|
|
289
294
|
self.log(
|
|
290
|
-
"While parsing a string missing the right delimiter in object key context, we found a
|
|
295
|
+
"While parsing a string missing the right delimiter in object key context, we found a , or } stopping here",
|
|
291
296
|
)
|
|
292
297
|
break
|
|
293
298
|
else:
|
|
@@ -316,9 +321,8 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
316
321
|
# We found a quote, now let's make sure there's a ":" following
|
|
317
322
|
i += 1
|
|
318
323
|
# found a delimiter, now we need to check that is followed strictly by a comma or brace
|
|
319
|
-
i = self.
|
|
320
|
-
|
|
321
|
-
if next_c and next_c == ":":
|
|
324
|
+
i = self.scroll_whitespaces(idx=i)
|
|
325
|
+
if self.get_char_at(i) == ":":
|
|
322
326
|
# Reset the cursor
|
|
323
327
|
self.index -= 1
|
|
324
328
|
char = self.get_char_at()
|
|
@@ -328,9 +332,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
328
332
|
break
|
|
329
333
|
elif unmatched_delimiter:
|
|
330
334
|
unmatched_delimiter = False
|
|
331
|
-
string_acc
|
|
332
|
-
self.index += 1
|
|
333
|
-
char = self.get_char_at()
|
|
335
|
+
string_acc, char = _append_literal_char(string_acc, char)
|
|
334
336
|
else:
|
|
335
337
|
# Check if eventually there is a rstring delimiter, otherwise we bail
|
|
336
338
|
i = 1
|
|
@@ -365,22 +367,20 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
365
367
|
next_c = self.get_char_at(i)
|
|
366
368
|
# Ok now I found a delimiter, let's skip whitespaces and see if next we find a } or a ,
|
|
367
369
|
i += 1
|
|
368
|
-
i = self.
|
|
370
|
+
i = self.scroll_whitespaces(idx=i)
|
|
369
371
|
next_c = self.get_char_at(i)
|
|
370
372
|
if next_c in ["}", ","]:
|
|
371
373
|
self.log(
|
|
372
|
-
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
|
|
374
|
+
"While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
|
|
373
375
|
)
|
|
374
|
-
string_acc
|
|
375
|
-
self.index += 1
|
|
376
|
-
char = self.get_char_at()
|
|
376
|
+
string_acc, char = _append_literal_char(string_acc, char)
|
|
377
377
|
continue
|
|
378
378
|
elif next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\":
|
|
379
379
|
# Check if self.index:self.index+i is only whitespaces, break if that's the case
|
|
380
380
|
if all(str(self.get_char_at(j)).isspace() for j in range(1, i) if self.get_char_at(j)):
|
|
381
381
|
break
|
|
382
382
|
if self.context.current == ContextValues.OBJECT_VALUE:
|
|
383
|
-
i = self.
|
|
383
|
+
i = self.scroll_whitespaces(idx=i + 1)
|
|
384
384
|
if self.get_char_at(i) == ",":
|
|
385
385
|
# So we found a comma, this could be a case of a single quote like "va"lue",
|
|
386
386
|
# Search if it's followed by another key, starting with the first delimeter
|
|
@@ -388,15 +388,13 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
388
388
|
i += 1
|
|
389
389
|
i = self.skip_to_character(character=rstring_delimiter, idx=i + 1)
|
|
390
390
|
i += 1
|
|
391
|
-
i = self.
|
|
391
|
+
i = self.scroll_whitespaces(idx=i)
|
|
392
392
|
next_c = self.get_char_at(i)
|
|
393
393
|
if next_c == ":":
|
|
394
394
|
self.log(
|
|
395
|
-
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
|
|
395
|
+
"While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
|
|
396
396
|
)
|
|
397
|
-
string_acc
|
|
398
|
-
self.index += 1
|
|
399
|
-
char = self.get_char_at()
|
|
397
|
+
string_acc, char = _append_literal_char(string_acc, char)
|
|
400
398
|
continue
|
|
401
399
|
# We found a delimiter and we need to check if this is a key
|
|
402
400
|
# so find a rstring_delimiter and a colon after
|
|
@@ -413,12 +411,10 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
413
411
|
# Only if we fail to find a ':' then we know this is misplaced quote
|
|
414
412
|
if next_c != ":":
|
|
415
413
|
self.log(
|
|
416
|
-
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
|
|
414
|
+
"While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
|
|
417
415
|
)
|
|
418
416
|
unmatched_delimiter = not unmatched_delimiter
|
|
419
|
-
string_acc
|
|
420
|
-
self.index += 1
|
|
421
|
-
char = self.get_char_at()
|
|
417
|
+
string_acc, char = _append_literal_char(string_acc, char)
|
|
422
418
|
elif self.context.current == ContextValues.ARRAY:
|
|
423
419
|
# So here we can have a few valid cases:
|
|
424
420
|
# ["bla bla bla "puppy" bla bla bla "kitty" bla bla"]
|
|
@@ -442,9 +438,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
442
438
|
"While parsing a string in Array context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
|
|
443
439
|
)
|
|
444
440
|
unmatched_delimiter = not unmatched_delimiter
|
|
445
|
-
string_acc
|
|
446
|
-
self.index += 1
|
|
447
|
-
char = self.get_char_at()
|
|
441
|
+
string_acc, char = _append_literal_char(string_acc, char)
|
|
448
442
|
else:
|
|
449
443
|
break
|
|
450
444
|
elif self.context.current == ContextValues.OBJECT_KEY:
|
|
@@ -452,14 +446,12 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
452
446
|
self.log(
|
|
453
447
|
"While parsing a string in Object Key context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
|
|
454
448
|
)
|
|
455
|
-
string_acc
|
|
456
|
-
self.index += 1
|
|
457
|
-
char = self.get_char_at()
|
|
449
|
+
string_acc, char = _append_literal_char(string_acc, char)
|
|
458
450
|
if char and missing_quotes and self.context.current == ContextValues.OBJECT_KEY and char.isspace():
|
|
459
451
|
self.log(
|
|
460
452
|
"While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
|
|
461
453
|
)
|
|
462
|
-
self.
|
|
454
|
+
self.skip_whitespaces()
|
|
463
455
|
if self.get_char_at() not in [":", ","]:
|
|
464
456
|
return ""
|
|
465
457
|
|
|
json_repair/parse_string_helpers/parse_boolean_or_null.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING
|
|
2
|
+
|
|
3
|
+
if TYPE_CHECKING:
|
|
4
|
+
from ..json_parser import JSONParser # noqa: TID252
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def parse_boolean_or_null(parser: "JSONParser") -> bool | str | None:
|
|
8
|
+
# <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
|
|
9
|
+
char = (parser.get_char_at() or "").lower()
|
|
10
|
+
value_map: dict[str, tuple[str, bool | None]] = {
|
|
11
|
+
"t": ("true", True),
|
|
12
|
+
"f": ("false", False),
|
|
13
|
+
"n": ("null", None),
|
|
14
|
+
}
|
|
15
|
+
value: tuple[str, bool | None] = value_map[char]
|
|
16
|
+
|
|
17
|
+
i = 0
|
|
18
|
+
starting_index = parser.index
|
|
19
|
+
while char and i < len(value[0]) and char == value[0][i]:
|
|
20
|
+
i += 1
|
|
21
|
+
parser.index += 1
|
|
22
|
+
char = (parser.get_char_at() or "").lower()
|
|
23
|
+
if i == len(value[0]):
|
|
24
|
+
return value[1]
|
|
25
|
+
|
|
26
|
+
# If nothing works reset the index before returning
|
|
27
|
+
parser.index = starting_index
|
|
28
|
+
return ""
|
|
json_repair/parse_string_helpers/parse_json_llm_block.py
CHANGED
|
@@ -1,19 +1,19 @@
|
|
|
1
1
|
from typing import TYPE_CHECKING
|
|
2
2
|
|
|
3
|
-
from ..constants import JSONReturnType # noqa: TID252
|
|
3
|
+
from ..utils.constants import JSONReturnType # noqa: TID252
|
|
4
4
|
|
|
5
5
|
if TYPE_CHECKING:
|
|
6
6
|
from ..json_parser import JSONParser # noqa: TID252
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
def parse_json_llm_block(
|
|
9
|
+
def parse_json_llm_block(parser: "JSONParser") -> JSONReturnType:
|
|
10
10
|
"""
|
|
11
11
|
Extracts and normalizes JSON enclosed in ```json ... ``` blocks.
|
|
12
12
|
"""
|
|
13
13
|
# Try to find a ```json ... ``` block
|
|
14
|
-
if
|
|
15
|
-
i =
|
|
16
|
-
if
|
|
17
|
-
|
|
18
|
-
return
|
|
14
|
+
if parser.json_str[parser.index : parser.index + 7] == "```json":
|
|
15
|
+
i = parser.skip_to_character("`", idx=7)
|
|
16
|
+
if parser.json_str[parser.index + i : parser.index + i + 3] == "```":
|
|
17
|
+
parser.index += 7 # Move past ```json
|
|
18
|
+
return parser.parse_json()
|
|
19
19
|
return False
|
|
json_repair-0.53.1.dist-info/RECORD
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
json_repair/__init__.py,sha256=JQ4Nm8YzR8Id2a527Ql0Az-rKapTp8DCMPKybLtQ620,180
|
|
2
|
+
json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
|
|
3
|
+
json_repair/json_parser.py,sha256=tryp6hMKkjL8Rn1C1SgrmCDbK0ftvFMAQFg8vPzMHmw,7883
|
|
4
|
+
json_repair/json_repair.py,sha256=tf0FA4qorTFCae0-kz09AKHVvh2rOMpYcJYuA4I0WYc,11568
|
|
5
|
+
json_repair/parse_array.py,sha256=rZfnRiS86vBATOUHqSx2T5fE79Ndlk2NoTsg9Wek7l4,2239
|
|
6
|
+
json_repair/parse_comment.py,sha256=MUDxrx8BFfAaKvx6x4gWviJNvwRi2yv5qnrR6honmas,2660
|
|
7
|
+
json_repair/parse_number.py,sha256=Ddv3Dih1VYfdasUe5DxQWAqy7YAE3aZJ7iePCfdi1EQ,1292
|
|
8
|
+
json_repair/parse_object.py,sha256=ousZReaYw1PUZwF0muRvXVPtqL6O_X8uBTTeipnPD80,5505
|
|
9
|
+
json_repair/parse_string.py,sha256=vWpcjknLqdFB77m5erRlQAAK6YRWQ4CKoKra6cyGjVo,25740
|
|
10
|
+
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
json_repair/parse_string_helpers/parse_boolean_or_null.py,sha256=pGmH1QATBls70kTvUlJv4F8NiPaBWcyGhRL03sTOnto,871
|
|
12
|
+
json_repair/parse_string_helpers/parse_json_llm_block.py,sha256=wPSm-8RY30Ek8HxzjCkCRtdLq4-Cez-PJB3vOk_vP3w,670
|
|
13
|
+
json_repair/utils/constants.py,sha256=cv2gvyosuq0me0600WyTysM9avrtfXPuXYR26tawcuo,158
|
|
14
|
+
json_repair/utils/json_context.py,sha256=WsMOjqpGSr6aaDONcrk8UFtTurzWon2Qq9AoBBYseoI,934
|
|
15
|
+
json_repair/utils/object_comparer.py,sha256=XKV3MRab8H7_v4sm-wpEa5le0XX9OeycWo5S-MFm-GI,1716
|
|
16
|
+
json_repair/utils/string_file_wrapper.py,sha256=tGkWBEUPE-CZPf4uSM5NE9oSDTpskX0myJiXsl-gbds,4333
|
|
17
|
+
json_repair-0.53.1.dist-info/licenses/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
|
18
|
+
json_repair-0.53.1.dist-info/METADATA,sha256=IFe-MSHaytu_BrjLp4UK9YCriXiRjmCsQg3CTpp6UD4,11027
|
|
19
|
+
json_repair-0.53.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
20
|
+
json_repair-0.53.1.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
|
|
21
|
+
json_repair-0.53.1.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
|
22
|
+
json_repair-0.53.1.dist-info/RECORD,,
|
|
@@ -1,30 +0,0 @@
|
|
|
1
|
-
from typing import TYPE_CHECKING
|
|
2
|
-
|
|
3
|
-
if TYPE_CHECKING:
|
|
4
|
-
from .json_parser import JSONParser
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
def parse_boolean_or_null(self: "JSONParser") -> bool | str | None:
|
|
8
|
-
# <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
|
|
9
|
-
starting_index = self.index
|
|
10
|
-
char = (self.get_char_at() or "").lower()
|
|
11
|
-
value: tuple[str, bool | None] | None = None
|
|
12
|
-
if char == "t":
|
|
13
|
-
value = ("true", True)
|
|
14
|
-
elif char == "f":
|
|
15
|
-
value = ("false", False)
|
|
16
|
-
elif char == "n":
|
|
17
|
-
value = ("null", None)
|
|
18
|
-
|
|
19
|
-
if value:
|
|
20
|
-
i = 0
|
|
21
|
-
while char and i < len(value[0]) and char == value[0][i]:
|
|
22
|
-
i += 1
|
|
23
|
-
self.index += 1
|
|
24
|
-
char = (self.get_char_at() or "").lower()
|
|
25
|
-
if i == len(value[0]):
|
|
26
|
-
return value[1]
|
|
27
|
-
|
|
28
|
-
# If nothing works reset the index before returning
|
|
29
|
-
self.index = starting_index
|
|
30
|
-
return ""
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
json_repair/__init__.py,sha256=JdJIZNCKV3MfIviryqK8NH8yGssCta2-192CekcwH-o,174
|
|
2
|
-
json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
|
|
3
|
-
json_repair/constants.py,sha256=cv2gvyosuq0me0600WyTysM9avrtfXPuXYR26tawcuo,158
|
|
4
|
-
json_repair/json_context.py,sha256=WsMOjqpGSr6aaDONcrk8UFtTurzWon2Qq9AoBBYseoI,934
|
|
5
|
-
json_repair/json_parser.py,sha256=vy5Z8aiJUVhVmvYEgy0dkYy5WgUmyOeS6PEFiR3cW44,7948
|
|
6
|
-
json_repair/json_repair.py,sha256=sDhXzDZxu0QmaFzICPTtf_q7yOY1A1Lf_iQG6Potsco,11572
|
|
7
|
-
json_repair/object_comparer.py,sha256=XKV3MRab8H7_v4sm-wpEa5le0XX9OeycWo5S-MFm-GI,1716
|
|
8
|
-
json_repair/parse_array.py,sha256=-rh65JcfT-FtXiR6s8RYlMfI-6LzVr08ytlDh6Z2CFE,2181
|
|
9
|
-
json_repair/parse_boolean_or_null.py,sha256=WMSkvvxsp4wvauBcDqtt9WnLMD5SMoxeRfZFXp3FEBc,890
|
|
10
|
-
json_repair/parse_comment.py,sha256=JHtQ_QlxOvPNnMh7lhUaoTjFGelqjhTNq7qn9xUE7SU,2648
|
|
11
|
-
json_repair/parse_number.py,sha256=33zAtkbuVzi9Lqjxu7cXn9WlVzd3WjRx9Ln_LFzVL4o,1259
|
|
12
|
-
json_repair/parse_object.py,sha256=rnuH5Oxo98OrXhktF0wrOC1vRb5Th_m819Li1EFJzm4,5571
|
|
13
|
-
json_repair/parse_string.py,sha256=--coxoyH4nxl7osxgs1fIu31IEtB0HHwVbbOewypG4g,26146
|
|
14
|
-
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
-
json_repair/string_file_wrapper.py,sha256=tGkWBEUPE-CZPf4uSM5NE9oSDTpskX0myJiXsl-gbds,4333
|
|
16
|
-
json_repair/parse_string_helpers/parse_json_llm_block.py,sha256=taREF3pwb35kGBGJYbUHkTybATX3GI-SOwOz3yXaEQs,644
|
|
17
|
-
json_repair-0.53.0.dist-info/licenses/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
|
18
|
-
json_repair-0.53.0.dist-info/METADATA,sha256=JvMUVYGDDIzmym7MqbQ6k6PjbnuuskW_myvk0EWp7V8,11027
|
|
19
|
-
json_repair-0.53.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
20
|
-
json_repair-0.53.0.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
|
|
21
|
-
json_repair-0.53.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
|
22
|
-
json_repair-0.53.0.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|