json-repair 0.53.0__py3-none-any.whl → 0.54__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/__init__.py +1 -1
- json_repair/json_parser.py +46 -34
- json_repair/json_repair.py +27 -9
- json_repair/parse_array.py +8 -8
- json_repair/parse_comment.py +2 -2
- json_repair/parse_number.py +3 -2
- json_repair/parse_object.py +51 -28
- json_repair/parse_string.py +56 -55
- json_repair/parse_string_helpers/parse_boolean_or_null.py +28 -0
- json_repair/parse_string_helpers/parse_json_llm_block.py +7 -7
- json_repair/utils/string_file_wrapper.py +176 -0
- {json_repair-0.53.0.dist-info → json_repair-0.54.dist-info}/METADATA +19 -1
- json_repair-0.54.dist-info/RECORD +22 -0
- json_repair/parse_boolean_or_null.py +0 -30
- json_repair/string_file_wrapper.py +0 -108
- json_repair-0.53.0.dist-info/RECORD +0 -22
- /json_repair/{constants.py → utils/constants.py} +0 -0
- /json_repair/{json_context.py → utils/json_context.py} +0 -0
- /json_repair/{object_comparer.py → utils/object_comparer.py} +0 -0
- {json_repair-0.53.0.dist-info → json_repair-0.54.dist-info}/WHEEL +0 -0
- {json_repair-0.53.0.dist-info → json_repair-0.54.dist-info}/entry_points.txt +0 -0
- {json_repair-0.53.0.dist-info → json_repair-0.54.dist-info}/licenses/LICENSE +0 -0
- {json_repair-0.53.0.dist-info → json_repair-0.54.dist-info}/top_level.txt +0 -0
json_repair/__init__.py
CHANGED
json_repair/json_parser.py
CHANGED
@@ -1,36 +1,32 @@
-from typing import …
+from typing import TextIO
 
-from .constants import STRING_DELIMITERS, JSONReturnType
-from .json_context import JsonContext
-from .object_comparer import ObjectComparer
 from .parse_array import parse_array as _parse_array
-from .parse_boolean_or_null import parse_boolean_or_null as _parse_boolean_or_null
 from .parse_comment import parse_comment as _parse_comment
 from .parse_number import parse_number as _parse_number
 from .parse_object import parse_object as _parse_object
 from .parse_string import parse_string as _parse_string
-from .…
+from .utils.constants import STRING_DELIMITERS, JSONReturnType
+from .utils.json_context import JsonContext
+from .utils.object_comparer import ObjectComparer
+from .utils.string_file_wrapper import StringFileWrapper
 
 
 class JSONParser:
     # Split the parse methods into separate files because this one was like 3000 lines
-    def parse_array(self…
-        return _parse_array(self…
+    def parse_array(self) -> list[JSONReturnType]:
+        return _parse_array(self)
 
-    def …
-        return …
+    def parse_comment(self) -> JSONReturnType:
+        return _parse_comment(self)
 
-    def …
-        return …
+    def parse_number(self) -> JSONReturnType:
+        return _parse_number(self)
 
-    def …
-        return …
+    def parse_object(self) -> JSONReturnType:
+        return _parse_object(self)
 
-    def …
-        return …
-
-    def parse_string(self, *args, **kwargs):
-        return _parse_string(self, *args, **kwargs)
+    def parse_string(self) -> JSONReturnType:
+        return _parse_string(self)
 
     def __init__(
         self,
@@ -39,6 +35,7 @@ class JSONParser:
         logging: bool | None,
         json_fd_chunk_length: int = 0,
         stream_stable: bool = False,
+        strict: bool = False,
     ) -> None:
         # The string to parse
         self.json_str: str | StringFileWrapper = json_str
@@ -70,6 +67,10 @@
        # case 3: '{"key": "val\\n123,`key2:value2' => '{"key": "val\\n123,`key2:value2"}'
        # case 4: '{"key": "val\\n123,`key2:value2`"}' => '{"key": "val\\n123,`key2:value2`"}'
        self.stream_stable = stream_stable
+        # Over time the library got more and more complex heuristics to repair JSON. Some of these heuristics
+        # may not be desirable in some use cases and the user would prefer json_repair to return an exception.
+        # So strict mode was added to disable some of those heuristics.
+        self.strict = strict
 
     def parse(
         self,
@@ -97,6 +98,11 @@
                     "There were no more elements, returning the element without the array",
                 )
                 json = json[0]
+            elif self.strict:
+                self.log(
+                    "Multiple top-level JSON elements found in strict mode, raising an error",
+                )
+                raise ValueError("Multiple top-level JSON elements found in strict mode.")
         if self.logging:
             return json, self.logger
         else:
@@ -107,8 +113,8 @@
     ) -> JSONReturnType:
         while True:
             char = self.get_char_at()
-            # …
-            if char is …
+            # None means that we are at the end of the string provided
+            if char is None:
                 return ""
             # <object> starts with '{'
             elif char == "{":
@@ -130,30 +136,36 @@
             else:
                 self.index += 1
 
-    def get_char_at(self, count: int = 0) -> str | …
+    def get_char_at(self, count: int = 0) -> str | None:
         # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
         try:
             return self.json_str[self.index + count]
         except IndexError:
-            return …
+            return None
 
-    def …
+    def skip_whitespaces(self) -> None:
         """
-        This function quickly iterates on whitespaces, …
+        This function quickly iterates on whitespaces, moving the self.index forward
         """
         try:
-            char = self.json_str[self.index…
-            …
-            return idx
-            while char.isspace():
-                if move_main_index:
+            char = self.json_str[self.index]
+            while char.isspace():
                 self.index += 1
-            …
+                char = self.json_str[self.index]
+        except IndexError:
+            pass
+
+    def scroll_whitespaces(self, idx: int = 0) -> int:
+        """
+        This function quickly iterates on whitespaces. Doesn't move the self.index and returns the offset from self.index
+        """
+        try:
+            char = self.json_str[self.index + idx]
+            while char.isspace():
                 idx += 1
-            try:
                 char = self.json_str[self.index + idx]
-            …
-            …
+        except IndexError:
+            pass
         return idx
 
     def skip_to_character(self, character: str | list[str], idx: int = 0) -> int:
json_repair/json_repair.py
CHANGED
@@ -25,10 +25,10 @@ All supported use cases are in the unit tests
 import argparse
 import json
 import sys
-from typing import Literal, TextIO, overload
+from typing import Any, Literal, TextIO, overload
 
-from .constants import JSONReturnType
 from .json_parser import JSONParser
+from .utils.constants import JSONReturnType
 
 
 @overload
@@ -40,7 +40,8 @@ def repair_json(
     json_fd: TextIO | None = None,
     chunk_length: int = 0,
     stream_stable: bool = False,
-    …
+    strict: bool = False,
+    **json_dumps_args: Any,
 ) -> str: ...
 
 
@@ -53,7 +54,8 @@ def repair_json(
     json_fd: TextIO | None = None,
     chunk_length: int = 0,
     stream_stable: bool = False,
-    …
+    strict: bool = False,
+    **json_dumps_args: Any,
 ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]: ...
 
 
@@ -65,8 +67,9 @@ def repair_json(
     json_fd: TextIO | None = None,
     chunk_length: int = 0,
     stream_stable: bool = False,
-    …
-    …
+    strict: bool = False,
+    **json_dumps_args: Any,
+) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
     """
     Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
 
@@ -79,10 +82,11 @@ def repair_json(
         ensure_ascii (bool, optional): Set to False to avoid converting non-latin characters to ascii (for example when using chinese characters). Defaults to True. Ignored if `skip_json_loads` is True.
         chunk_length (int, optional): Size in bytes of the file chunks to read at once. Ignored if `json_fd` is None. Do not use! Use `from_file` or `load` instead. Defaults to 1MB.
         stream_stable (bool, optional): When the json to be repaired is the accumulation of streaming json at a certain moment.If this parameter to True will keep the repair results stable.
+        strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
     Returns:
         Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON or a tuple with the repaired JSON and repair log when logging is True.
     """
-    parser = JSONParser(json_str, json_fd, logging, chunk_length, stream_stable)
+    parser = JSONParser(json_str, json_fd, logging, chunk_length, stream_stable, strict)
     if skip_json_loads:
         parsed_json = parser.parse()
     else:
@@ -109,6 +113,7 @@ def loads(
     skip_json_loads: bool = False,
     logging: bool = False,
     stream_stable: bool = False,
+    strict: bool = False,
 ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | str:
     """
     This function works like `json.loads()` except that it will fix your JSON in the process.
@@ -118,6 +123,7 @@ def loads(
         json_str (str): The JSON string to load and repair.
         skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
         logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
+        strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
 
     Returns:
         Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]], str]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
@@ -128,6 +134,7 @@ def loads(
         skip_json_loads=skip_json_loads,
         logging=logging,
         stream_stable=stream_stable,
+        strict=strict,
     )
 
 
@@ -136,6 +143,7 @@ def load(
     skip_json_loads: bool = False,
     logging: bool = False,
     chunk_length: int = 0,
+    strict: bool = False,
 ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
     """
     This function works like `json.load()` except that it will fix your JSON in the process.
@@ -146,6 +154,7 @@ def load(
         skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
         logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
         chunk_length (int, optional): Size in bytes of the file chunks to read at once. Defaults to 1MB.
+        strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
 
     Returns:
         Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
@@ -156,6 +165,7 @@ def load(
         return_objects=True,
         skip_json_loads=skip_json_loads,
         logging=logging,
+        strict=strict,
     )
 
 
@@ -164,6 +174,7 @@ def from_file(
     skip_json_loads: bool = False,
     logging: bool = False,
     chunk_length: int = 0,
+    strict: bool = False,
 ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
     """
     This function is a wrapper around `load()` so you can pass the filename as string
@@ -173,6 +184,7 @@ def from_file(
         skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
         logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
         chunk_length (int, optional): Size in bytes of the file chunks to read at once. Defaults to 1MB.
+        strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
 
     Returns:
         Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
@@ -183,6 +195,7 @@ def from_file(
         skip_json_loads=skip_json_loads,
         logging=logging,
         chunk_length=chunk_length,
+        strict=strict,
     )
 
     return jsonobj
@@ -240,6 +253,11 @@ def cli(inline_args: list[str] | None = None) -> int:
         default=2,
         help="Number of spaces for indentation (Default 2)",
     )
+    parser.add_argument(
+        "--strict",
+        action="store_true",
+        help="Raise on duplicate keys, missing separators, empty keys/values, and other unrecoverable structures instead of repairing them",
+    )
 
     args = parser.parse_args() if inline_args is None else parser.parse_args(inline_args)
 
@@ -259,10 +277,10 @@ def cli(inline_args: list[str] | None = None) -> int:
     try:
         # Use from_file if a filename is provided; otherwise read from stdin.
         if args.filename:
-            result = from_file(args.filename)
+            result = from_file(args.filename, strict=args.strict)
         else:
            data = sys.stdin.read()
-            result = loads(data)
+            result = loads(data, strict=args.strict)
         if args.inline or args.output:
             with open(args.output or args.filename, mode="w") as fd:
                 json.dump(result, fd, indent=args.indent, ensure_ascii=ensure_ascii)
json_repair/parse_array.py
CHANGED
@@ -1,8 +1,8 @@
 from typing import TYPE_CHECKING
 
-from .constants import STRING_DELIMITERS, JSONReturnType
-from .json_context import ContextValues
-from .object_comparer import ObjectComparer
+from .utils.constants import STRING_DELIMITERS, JSONReturnType
+from .utils.json_context import ContextValues
+from .utils.object_comparer import ObjectComparer
 
 if TYPE_CHECKING:
     from .json_parser import JSONParser
@@ -15,7 +15,7 @@ def parse_array(self: "JSONParser") -> list[JSONReturnType]:
     # Stop when you either find the closing parentheses or you have iterated over the entire string
     char = self.get_char_at()
     while char and char not in ["]", "}"]:
-        self.…
+        self.skip_whitespaces()
         value: JSONReturnType = ""
         if char in STRING_DELIMITERS:
             # Sometimes it can happen that LLMs forget to start an object and then you think it's a string in an array
@@ -23,13 +23,13 @@
             # And either parse the string or parse the object
             i = 1
             i = self.skip_to_character(char, i)
-            i = self.…
+            i = self.scroll_whitespaces(idx=i + 1)
             value = self.parse_object() if self.get_char_at(i) == ":" else self.parse_string()
         else:
             value = self.parse_json()
 
-        # It is possible that parse_json() returns nothing valid, so we increase by 1
-        if ObjectComparer.is_strictly_empty(value):
+        # It is possible that parse_json() returns nothing valid, so we increase by 1, unless we find an array separator
+        if ObjectComparer.is_strictly_empty(value) and self.get_char_at() not in ["]", ","]:
             self.index += 1
         elif value == "..." and self.get_char_at(-1) == ".":
             self.log(
@@ -45,7 +45,7 @@
     char = self.get_char_at()
 
     # Especially at the end of an LLM generated json you might miss the last "]"
-    if char …
+    if char != "]":
         self.log(
             "While parsing an array we missed the closing ], ignoring it",
         )
json_repair/parse_comment.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import TYPE_CHECKING
 
-from .constants import JSONReturnType
-from .json_context import ContextValues
+from .utils.constants import JSONReturnType
+from .utils.json_context import ContextValues
 
 if TYPE_CHECKING:
     from .json_parser import JSONParser
json_repair/parse_number.py
CHANGED
@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING
 
-from .…
+from .utils.constants import JSONReturnType
+from .utils.json_context import ContextValues
 
 NUMBER_CHARS: set[str] = set("0123456789-.eE/,")
 
@@ -9,7 +10,7 @@ if TYPE_CHECKING:
     from .json_parser import JSONParser
 
 
-def parse_number(self: "JSONParser") -> …
+def parse_number(self: "JSONParser") -> JSONReturnType:
     # <number> is a valid real number expressed in one of a number of given formats
     number_str = ""
     char = self.get_char_at()
json_repair/parse_object.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import TYPE_CHECKING
 
-from .constants import STRING_DELIMITERS, JSONReturnType
-from .json_context import ContextValues
+from .utils.constants import STRING_DELIMITERS, JSONReturnType
+from .utils.json_context import ContextValues
 
 if TYPE_CHECKING:
     from .json_parser import JSONParser
@@ -17,10 +17,10 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
     # <member> ::= <string> ': ' <json>
 
     # Skip filler whitespaces
-    self.…
+    self.skip_whitespaces()
 
     # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
-    if …
+    if self.get_char_at() == ":":
         self.log(
             "While parsing an object we found a : before a key, ignoring",
         )
@@ -53,18 +53,26 @@
                 prev_value.extend(
                     new_array[0] if len(new_array) == 1 and isinstance(new_array[0], list) else new_array
                 )
-                self.…
+                self.skip_whitespaces()
                 if self.get_char_at() == ",":
                     self.index += 1
-                self.…
+                self.skip_whitespaces()
                 continue
         key = str(self.parse_string())
         if key == "":
-            self.…
+            self.skip_whitespaces()
         if key != "" or (key == "" and self.get_char_at() in [":", "}"]):
-            # …
+            # Empty keys now trigger in strict mode, otherwise we keep repairing as before
+            if key == "" and self.strict:
+                self.log(
+                    "Empty key found in strict mode while parsing object, raising an error",
+                )
+                raise ValueError("Empty key found in strict mode while parsing object.")
             break
         if ContextValues.ARRAY in self.context.context and key in obj:
+            if self.strict:
+                self.log("Duplicate key found in strict mode while parsing object, raising an error")
+                raise ValueError("Duplicate key found in strict mode while parsing object.")
             self.log(
                 "While parsing an object we found a duplicate key, closing the object here and rolling back the index",
             )
@@ -74,16 +82,21 @@
             break
 
         # Skip filler whitespaces
-        self.…
+        self.skip_whitespaces()
 
         # We reached the end here
         if (self.get_char_at() or "}") == "}":
             continue
 
-        self.…
+        self.skip_whitespaces()
 
         # An extreme case of missing ":" after a key
-        if …
+        if self.get_char_at() != ":":
+            if self.strict:
+                self.log(
+                    "Missing ':' after key in strict mode while parsing object, raising an error",
+                )
+                raise ValueError("Missing ':' after key in strict mode while parsing object.")
             self.log(
                 "While parsing an object we missed a : after a key",
             )
@@ -91,31 +104,40 @@
        self.index += 1
        self.context.reset()
        self.context.set(ContextValues.OBJECT_VALUE)
-        # The value can be any valid json
-        self.…
+        # The value can be any valid json; strict mode will refuse repaired empties
+        self.skip_whitespaces()
        # Corner case, a lone comma
        value: JSONReturnType = ""
-        if …
+        if self.get_char_at() in [",", "}"]:
            self.log(
-                "While parsing an object value we found a stray , ignoring it",
+                "While parsing an object value we found a stray " + str(self.get_char_at()) + ", ignoring it",
            )
        else:
            value = self.parse_json()
-
+            if value == "" and self.strict and self.get_char_at(-1) not in STRING_DELIMITERS:
+                self.log(
+                    "Parsed value is empty in strict mode while parsing object, raising an error",
+                )
+                raise ValueError("Parsed value is empty in strict mode while parsing object.")
        # Reset context since our job is done
        self.context.reset()
        obj[key] = value
 
-        if …
+        if self.get_char_at() in [",", "'", '"']:
            self.index += 1
 
        # Remove trailing spaces
-        self.…
+        self.skip_whitespaces()
 
    self.index += 1
 
    # If the object is empty but also isn't just {}
    if not obj and self.index - start_index > 2:
+        if self.strict:
+            self.log(
+                "Parsed object is empty but contains extra characters in strict mode, raising an error",
+            )
+            raise ValueError("Parsed object is empty but contains extra characters in strict mode.")
        self.log("Parsed object is empty, we will try to parse this as an array instead")
        self.index = start_index
        return self.parse_array()
@@ -126,18 +148,19 @@
    if not self.context.empty:
        return obj
 
-    self.…
-    if …
+    self.skip_whitespaces()
+    if self.get_char_at() != ",":
        return obj
    self.index += 1
-    self.…
-    if …
+    self.skip_whitespaces()
+    if self.get_char_at() not in STRING_DELIMITERS:
        return obj
-    self.…
-    …
-    …
-    …
-    …
+    if not self.strict:
+        self.log(
+            "Found a comma and string delimiter after object closing brace, checking for additional key-value pairs",
+        )
+        additional_obj = self.parse_object()
+        if isinstance(additional_obj, dict):
+            obj.update(additional_obj)
 
    return obj
json_repair/parse_string.py
CHANGED
@@ -1,14 +1,22 @@
 from typing import TYPE_CHECKING
 
-from .…
-from .json_context import ContextValues
+from .parse_string_helpers.parse_boolean_or_null import parse_boolean_or_null
 from .parse_string_helpers.parse_json_llm_block import parse_json_llm_block
+from .utils.constants import STRING_DELIMITERS, JSONReturnType
+from .utils.json_context import ContextValues
 
 if TYPE_CHECKING:
     from .json_parser import JSONParser
 
 
 def parse_string(self: "JSONParser") -> JSONReturnType:
+    # Utility function to append a character to the accumulator and update the index
+    def _append_literal_char(acc: str, current_char: str | None) -> tuple[str, str | None]:
+        acc += str(current_char)
+        self.index += 1
+        char = self.get_char_at()
+        return acc, char
+
     # <string> is a string of valid characters enclosed in quotes
     # i.e. { name: "John" }
     # Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
@@ -40,7 +48,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
     # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
     # But remember, object keys are only of type string
     if char.lower() in ["t", "f", "n"] and self.context.current != ContextValues.OBJECT_KEY:
-        value = …
+        value = parse_boolean_or_null(self)
         if value != "":
             return value
         self.log(
@@ -59,10 +67,12 @@
             "While parsing a string, we found code fences but they did not enclose valid JSON, continuing parsing the string",
         )
     # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
-    if self.get_char_at() …
+    if self.get_char_at() == lstring_delimiter:
         # If it's an empty key, this was easy
-        if (…
-            self.context.current == ContextValues.…
+        if (
+            (self.context.current == ContextValues.OBJECT_KEY and self.get_char_at(1) == ":")
+            or (self.context.current == ContextValues.OBJECT_VALUE and self.get_char_at(1) in [",", "}"])
+            or (self.context.current == ContextValues.ARRAY and self.get_char_at(1) in [",", "]"])
         ):
             self.index += 1
             return ""
@@ -71,13 +81,16 @@
             self.log(
                 "While parsing a string, we found a doubled quote and then a quote again, ignoring it",
             )
-            …
+            if self.strict:
+                raise ValueError("Found doubled quotes followed by another quote.")
+            else:
+                return ""
         # Find the next delimiter
         i = self.skip_to_character(character=rstring_delimiter, idx=1)
         next_c = self.get_char_at(i)
         # Now check that the next character is also a delimiter to ensure that we have "".....""
         # In that case we ignore this rstring delimiter
-        if …
+        if self.get_char_at(i + 1) == rstring_delimiter:
             self.log(
                 "While parsing a string, we found a valid starting doubled quote",
             )
@@ -85,13 +98,17 @@
             self.index += 1
         else:
             # Ok this is not a doubled quote, check if this is an empty string or not
-            i = self.…
+            i = self.scroll_whitespaces(idx=1)
             next_c = self.get_char_at(i)
             if next_c in STRING_DELIMITERS + ["{", "["]:
                 # something fishy is going on here
                 self.log(
                     "While parsing a string, we found a doubled quote but also another quote afterwards, ignoring it",
                 )
+                if self.strict:
+                    raise ValueError(
+                        "Found doubled quotes followed by another quote while parsing a string.",
+                    )
                 self.index += 1
                 return ""
             elif next_c not in [",", "]", "}"]:
@@ -135,7 +152,7 @@
         ):
             rstring_delimiter_missing = True
             # check if this is a case in which the closing comma is NOT missing instead
-            self.…
+            self.skip_whitespaces()
             if self.get_char_at(1) == "\\":
                 # Ok this is a quoted string, skip
                 rstring_delimiter_missing = False
@@ -145,7 +162,7 @@
                 i += 1
                 # found a delimiter, now we need to check that is followed strictly by a comma or brace
                 # or the string ended
-                i = self.…
+                i = self.scroll_whitespaces(idx=i)
                 next_c = self.get_char_at(i)
                 if not next_c or next_c in [",", "}"]:
                     rstring_delimiter_missing = False
@@ -160,7 +177,7 @@
             else:
                 # But again, this could just be something a bit stupid like "lorem, "ipsum" sic"
                 # Check if we find a : afterwards (skipping space)
-                i = self.…
+                i = self.scroll_whitespaces(idx=i + 1)
                 next_c = self.get_char_at(i)
                 if next_c and next_c != ":":
                     rstring_delimiter_missing = False
@@ -175,7 +192,7 @@
             break
         else:
             # skip any whitespace first
-            i = self.…
+            i = self.scroll_whitespaces(idx=1)
             # We couldn't find any rstring_delimeter before the end of the string
             # check if this is the last string of an object and therefore we can keep going
             # make an exception if this is the last char before the closing brace
@@ -212,19 +229,15 @@
         if self.context.current == ContextValues.OBJECT_VALUE and char == "}":
             # We found the end of an object while parsing a value
             # Check if the object is really over, to avoid doubling the closing brace
-            i = self.…
+            i = self.scroll_whitespaces(idx=1)
             next_c = self.get_char_at(i)
-            if next_c and …
+            if next_c == "`" and self.get_char_at(i + 1) == "`" and self.get_char_at(i + 2) == "`":
                 # This could be a special case in which the LLM added code fences after the object
                 # So we need to check if there are another two ` after this one`
-                …
-                …
-                …
-                …
-                self.log(
-                    "While parsing a string in object value context, we found a } that closes the object before code fences, stopping here",
-                )
-                break
+                self.log(
+                    "While parsing a string in object value context, we found a } that closes the object before code fences, stopping here",
+                )
+                break
             if not next_c:
                 self.log(
                     "While parsing a string in object value context, we found a } that closes the object, stopping here",
@@ -282,12 +295,13 @@
                 # found a second delimiter
                 i += 1
                 # Skip spaces
-                i = self.…
-                …
-                if next_c and next_c in [",", "}"]:
+                i = self.scroll_whitespaces(idx=i)
+                if self.get_char_at(i) in [",", "}"]:
                     # Ok then this is a missing right quote
                     self.log(
-                        "While parsing a string missing the right delimiter in object key context, we found a …
+                        "While parsing a string missing the right delimiter in object key context, we found a "
+                        + str(self.get_char_at(i))
+                        + " stopping here",
                     )
                     break
                 else:
@@ -316,9 +330,8 @@
                 # We found a quote, now let's make sure there's a ":" following
                 i += 1
                 # found a delimiter, now we need to check that is followed strictly by a comma or brace
-                i = self.…
-                …
-                if next_c and next_c == ":":
+                i = self.scroll_whitespaces(idx=i)
+                if self.get_char_at(i) == ":":
                     # Reset the cursor
                     self.index -= 1
                     char = self.get_char_at()
@@ -328,9 +341,7 @@
                 break
             elif unmatched_delimiter:
                 unmatched_delimiter = False
-                string_acc …
-                self.index += 1
-                char = self.get_char_at()
+                string_acc, char = _append_literal_char(string_acc, char)
             else:
                 # Check if eventually there is a rstring delimiter, otherwise we bail
                 i = 1
@@ -365,22 +376,20 @@
                 next_c = self.get_char_at(i)
                 # Ok now I found a delimiter, let's skip whitespaces and see if next we find a } or a ,
                 i += 1
-                i = self.…
+                i = self.scroll_whitespaces(idx=i)
                 next_c = self.get_char_at(i)
                 if next_c in ["}", ","]:
                     self.log(
-                        "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
+                        "While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                     )
-                    string_acc …
-                    self.index += 1
-                    char = self.get_char_at()
+                    string_acc, char = _append_literal_char(string_acc, char)
                     continue
             elif next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\":
                 # Check if self.index:self.index+i is only whitespaces, break if that's the case
                 if all(str(self.get_char_at(j)).isspace() for j in range(1, i) if self.get_char_at(j)):
                     break
                 if self.context.current == ContextValues.OBJECT_VALUE:
-                    i = self.…
+                    i = self.scroll_whitespaces(idx=i + 1)
                     if self.get_char_at(i) == ",":
                         # So we found a comma, this could be a case of a single quote like "va"lue",
                         # Search if it's followed by another key, starting with the first delimeter
@@ -388,15 +397,13 @@
                         i += 1
                         i = self.skip_to_character(character=rstring_delimiter, idx=i + 1)
                         i += 1
-                        i = self.…
+                        i = self.scroll_whitespaces(idx=i)
                         next_c = self.get_char_at(i)
                         if next_c == ":":
                             self.log(
-                                "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
+                                "While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                             )
-                            string_acc …
-                            self.index += 1
-                            char = self.get_char_at()
+                            string_acc, char = _append_literal_char(string_acc, char)
                             continue
                     # We found a delimiter and we need to check if this is a key
                     # so find a rstring_delimiter and a colon after
@@ -413,12 +420,10 @@
                     # Only if we fail to find a ':' then we know this is misplaced quote
                     if next_c != ":":
                         self.log(
-                            "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
+                            "While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                         )
                         unmatched_delimiter = not unmatched_delimiter
-                        string_acc …
-                        self.index += 1
-                        char = self.get_char_at()
+                        string_acc, char = _append_literal_char(string_acc, char)
                 elif self.context.current == ContextValues.ARRAY:
                     # So here we can have a few valid cases:
                     # ["bla bla bla "puppy" bla bla bla "kitty" bla bla"]
@@ -442,9 +447,7 @@
                         "While parsing a string in Array context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
                     )
                     unmatched_delimiter = not unmatched_delimiter
-                    string_acc …
-                    self.index += 1
-                    char = self.get_char_at()
+                    string_acc, char = _append_literal_char(string_acc, char)
                 else:
                     break
             elif self.context.current == ContextValues.OBJECT_KEY:
@@ -452,14 +455,12 @@
                 self.log(
                     "While parsing a string in Object Key context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
                 )
-                string_acc …
-                self.index += 1
-                char = self.get_char_at()
+                string_acc, char = _append_literal_char(string_acc, char)
     if char and missing_quotes and self.context.current == ContextValues.OBJECT_KEY and char.isspace():
         self.log(
             "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
         )
-        self.…
+        self.skip_whitespaces()
         if self.get_char_at() not in [":", ","]:
             return ""
json_repair/parse_string_helpers/parse_boolean_or_null.py
ADDED
@@ -0,0 +1,28 @@
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from ..json_parser import JSONParser  # noqa: TID252
+
+
+def parse_boolean_or_null(parser: "JSONParser") -> bool | str | None:
+    # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
+    char = (parser.get_char_at() or "").lower()
+    value_map: dict[str, tuple[str, bool | None]] = {
+        "t": ("true", True),
+        "f": ("false", False),
+        "n": ("null", None),
+    }
+    value: tuple[str, bool | None] = value_map[char]
+
+    i = 0
+    starting_index = parser.index
+    while char and i < len(value[0]) and char == value[0][i]:
+        i += 1
+        parser.index += 1
+        char = (parser.get_char_at() or "").lower()
+    if i == len(value[0]):
+        return value[1]
+
+    # If nothing works reset the index before returning
+    parser.index = starting_index
+    return ""
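Through the public API this helper is what turns bare (even wrongly cased) literals into Python values, since the comparison lowercases each character. A quick illustration; the output in the comment is inferred from the code above, not verified:

```
from json_repair import loads

loads('{"ok": True, "value": NULL}')   # expected: {'ok': True, 'value': None}
```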
json_repair/parse_string_helpers/parse_json_llm_block.py
CHANGED
@@ -1,19 +1,19 @@
 from typing import TYPE_CHECKING
 
-from ..constants import JSONReturnType  # noqa: TID252
+from ..utils.constants import JSONReturnType  # noqa: TID252
 
 if TYPE_CHECKING:
     from ..json_parser import JSONParser  # noqa: TID252
 
 
-def parse_json_llm_block(…
+def parse_json_llm_block(parser: "JSONParser") -> JSONReturnType:
     """
     Extracts and normalizes JSON enclosed in ```json ... ``` blocks.
     """
     # Try to find a ```json ... ``` block
-    if …
-        i = …
-        if …
-            …
-            return …
+    if parser.json_str[parser.index : parser.index + 7] == "```json":
+        i = parser.skip_to_character("`", idx=7)
+        if parser.json_str[parser.index + i : parser.index + i + 3] == "```":
+            parser.index += 7  # Move past ```json
+            return parser.parse_json()
     return False
json_repair/utils/string_file_wrapper.py
ADDED
@@ -0,0 +1,176 @@
+import os
+from typing import TextIO
+
+
+class StringFileWrapper:
+    # This is a trick to simplify the code, transform the filedescriptor handling into a string handling
+    def __init__(self, fd: TextIO, chunk_length: int) -> None:
+        """
+        Initialize the StringFileWrapper with a file descriptor and chunk length.
+
+        Args:
+            fd (TextIO): The file descriptor to wrap.
+            CHUNK_LENGTH (int): The length of each chunk to read from the file.
+
+        Attributes:
+            fd (TextIO): The wrapped file descriptor.
+            length (int): The total length of the file content.
+            buffers (dict[int, str]): Dictionary to store chunks of file content.
+            buffer_length (int): The length of each buffer chunk.
+        """
+        self.fd = fd
+        # Buffers are chunks of text read from the file and cached to reduce disk access.
+        self.buffers: dict[int, str] = {}
+        if not chunk_length or chunk_length < 2:
+            chunk_length = 1_000_000
+        # chunk_length now refers to the number of characters per chunk.
+        self.buffer_length = chunk_length
+        # Keep track of the starting file position ("cookie") for each chunk so we can
+        # seek safely without landing in the middle of a multibyte code point.
+        self._chunk_positions: list[int] = [0]
+        self.length: int | None = None
+
+    def get_buffer(self, index: int) -> str:
+        """
+        Retrieve or load a buffer chunk from the file.
+
+        Args:
+            index (int): The index of the buffer chunk to retrieve.
+
+        Returns:
+            str: The buffer chunk at the specified index.
+        """
+        if index < 0:
+            raise IndexError("Negative indexing is not supported")
+
+        cached = self.buffers.get(index)
+        if cached is not None:
+            return cached
+
+        self._ensure_chunk_position(index)
+        start_pos = self._chunk_positions[index]
+        self.fd.seek(start_pos)
+        chunk = self.fd.read(self.buffer_length)
+        if not chunk:
+            raise IndexError("Chunk index out of range")
+        end_pos = self.fd.tell()
+        if len(self._chunk_positions) <= index + 1:
+            self._chunk_positions.append(end_pos)
+        if len(chunk) < self.buffer_length:
+            self.length = index * self.buffer_length + len(chunk)
+
+        self.buffers[index] = chunk
+        # Save memory by keeping max 2MB buffer chunks and min 2 chunks
+        max_buffers = max(2, int(2_000_000 / self.buffer_length))
+        if len(self.buffers) > max_buffers:
+            oldest_key = next(iter(self.buffers))
+            if oldest_key != index:
+                self.buffers.pop(oldest_key)
+        return chunk
+
+    def __getitem__(self, index: int | slice) -> str:
+        """
+        Retrieve a character or a slice of characters from the file.
+
+        Args:
+            index (Union[int, slice]): The index or slice of characters to retrieve.
+
+        Returns:
+            str: The character(s) at the specified index or slice.
+        """
+        # The buffer is an array that is seek like a RAM:
+        # self.buffers[index]: the row in the array of length 1MB, index is `i` modulo CHUNK_LENGTH
+        # self.buffures[index][j]: the column of the row that is `i` remainder CHUNK_LENGTH
+        if isinstance(index, slice):
+            total_len = len(self)
+            start = 0 if index.start is None else index.start
+            stop = total_len if index.stop is None else index.stop
+            step = 1 if index.step is None else index.step
+
+            if start < 0:
+                start += total_len
+            if stop < 0:
+                stop += total_len
+
+            start = max(start, 0)
+            stop = min(stop, total_len)
+
+            if step == 0:
+                raise ValueError("slice step cannot be zero")
+            if step != 1:
+                return "".join(self[i] for i in range(start, stop, step))
+
+            if start >= stop:
+                return ""
+
+            buffer_index = start // self.buffer_length
+            buffer_end = (stop - 1) // self.buffer_length
+            start_mod = start % self.buffer_length
+            stop_mod = stop % self.buffer_length
+            if stop_mod == 0 and stop > start:
+                stop_mod = self.buffer_length
+            if buffer_index == buffer_end:
+                buffer = self.get_buffer(buffer_index)
+                return buffer[start_mod:stop_mod]
+
+            start_slice = self.get_buffer(buffer_index)[start_mod:]
+            end_slice = self.get_buffer(buffer_end)[:stop_mod]
+            middle_slices = [self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)]
+            return start_slice + "".join(middle_slices) + end_slice
+        else:
+            if index < 0:
+                index += len(self)
+                if index < 0:
+                    raise IndexError("string index out of range")
+            buffer_index = index // self.buffer_length
+            buffer = self.get_buffer(buffer_index)
+            return buffer[index % self.buffer_length]
+
+    def __len__(self) -> int:
+        """
+        Get the total length of the file.
+
+        Returns:
+            int: The total number of characters in the file.
+        """
+        if self.length is None:
+            while self.length is None:
+                chunk_index = len(self._chunk_positions)
+                self._ensure_chunk_position(chunk_index)
+        return self.length
+
+    def __setitem__(self, index: int | slice, value: str) -> None:  # pragma: no cover
+        """
+        Set a character or a slice of characters in the file.
+
+        Args:
+            index (slice): The slice of characters to set.
+            value (str): The value to set at the specified index or slice.
+        """
+        start = index.start or 0 if isinstance(index, slice) else index or 0
+
+        if start < 0:
+            start += len(self)
+
+        current_position = self.fd.tell()
+        self.fd.seek(start)
+        self.fd.write(value)
+        self.fd.seek(current_position)
+
+    def _ensure_chunk_position(self, chunk_index: int) -> None:
+        """
+        Ensure that we know the starting file position for the given chunk index.
+        """
+        while len(self._chunk_positions) <= chunk_index:
+            prev_index = len(self._chunk_positions) - 1
+            start_pos = self._chunk_positions[-1]
+            self.fd.seek(start_pos, os.SEEK_SET)
+            chunk = self.fd.read(self.buffer_length)
+            end_pos = self.fd.tell()
+            if len(chunk) < self.buffer_length:
+                self.length = prev_index * self.buffer_length + len(chunk)
+            self._chunk_positions.append(end_pos)
+            if not chunk:
+                break
+        if len(self._chunk_positions) <= chunk_index:
+            raise IndexError("Chunk index out of range")
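The wrapper above is an internal helper that `load()` and `from_file()` drive automatically, but its string-like interface can be exercised directly. A small sketch; the tiny chunk_length is only there to force multiple cached chunks:

```
import io
from json_repair.utils.string_file_wrapper import StringFileWrapper

fd = io.StringIO('{"key": "value"}')
wrapped = StringFileWrapper(fd, chunk_length=4)  # chunks of 4 characters, cached per chunk

print(wrapped[0])      # '{'
print(wrapped[1:7])    # '"key":'  (the slice spans two cached chunks)
print(len(wrapped))    # 16, discovered by walking chunk positions instead of seeking to EOF
```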
{json_repair-0.53.0.dist-info → json_repair-0.54.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: json_repair
-Version: 0.53.0
+Version: 0.54
 Summary: A package to repair broken json strings
 Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
 License-Expression: MIT
@@ -167,6 +167,23 @@ Some rules of thumb to use:
 - `skip_json_loads` is faster only if you 100% know that the string is not a valid JSON
 - If you are having issues with escaping pass the string as **raw** string like: `r"string with escaping\""`
 
+### Strict mode
+
+By default `json_repair` does its best to “fix” input, even when the JSON is far from valid.
+In some scenarios you want the opposite behavior and need the parser to error out instead of repairing; pass `strict=True` to `repair_json`, `loads`, `load`, or `from_file` to enable that mode:
+
+```
+from json_repair import repair_json
+
+repair_json(bad_json_string, strict=True)
+```
+
+The CLI exposes the same behavior with `json_repair --strict input.json` (or piping data via stdin).
+
+In strict mode the parser raises `ValueError` as soon as it encounters structural issues such as duplicate keys, missing `:` separators, empty keys/values introduced by stray commas, multiple top-level elements, or other ambiguous constructs. This is useful when you just need validation with friendlier error messages while still benefiting from json_repair’s resilience elsewhere in your stack.
+
+Strict mode still honors `skip_json_loads=True`; combining them lets you skip the initial `json.loads` check but still enforce strict parsing rules.
+
 ### Use json_repair with streaming
 
 Sometimes you are streaming some data and want to repair the JSON coming from it. Normally this won't work but you can pass `stream_stable` to `repair_json()` or `loads()` to make it work:
@@ -198,6 +215,7 @@ options:
   If specified, the output will be written to TARGET filename instead of stdout
   --ensure_ascii       Pass ensure_ascii=True to json.dumps()
   --indent INDENT      Number of spaces for indentation (Default 2)
+  --strict             Raise on duplicate keys, missing separators, empty keys/values, and similar structural issues instead of repairing them
 ```
 
 ## Adding to requirements
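The last point of the new README section, combining `strict=True` with `skip_json_loads=True`, looks like this in practice; a small sketch with an illustrative input, the expected error follows from the parser hunks above:

```
from json_repair import repair_json

# Skip the initial json.loads() validation but keep the strict structural checks:
repair_json('{"key" "value"}', skip_json_loads=True, strict=True)   # raises ValueError
```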
json_repair-0.54.dist-info/RECORD
ADDED
@@ -0,0 +1,22 @@
+json_repair/__init__.py,sha256=JQ4Nm8YzR8Id2a527Ql0Az-rKapTp8DCMPKybLtQ620,180
+json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
+json_repair/json_parser.py,sha256=nATFDlcEnPD8G2NDSKj2nme_v1la_cCcFZrdQvEjTZs,8495
+json_repair/json_repair.py,sha256=iT-OJgpBnKUJVIV4IUlXmMUkOyW6bNnKCZLB7Fys8hk,12758
+json_repair/parse_array.py,sha256=rZfnRiS86vBATOUHqSx2T5fE79Ndlk2NoTsg9Wek7l4,2239
+json_repair/parse_comment.py,sha256=MUDxrx8BFfAaKvx6x4gWviJNvwRi2yv5qnrR6honmas,2660
+json_repair/parse_number.py,sha256=Ddv3Dih1VYfdasUe5DxQWAqy7YAE3aZJ7iePCfdi1EQ,1292
+json_repair/parse_object.py,sha256=noaiP10kzl-jA-1jc6tMmtFoJMIputpB3zFxcAuYQvY,6986
+json_repair/parse_string.py,sha256=L4McLWzRkbW_7Xx_hSGOmfpoPMwbYTGEKBAjqwanLEs,26146
+json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+json_repair/parse_string_helpers/parse_boolean_or_null.py,sha256=pGmH1QATBls70kTvUlJv4F8NiPaBWcyGhRL03sTOnto,871
+json_repair/parse_string_helpers/parse_json_llm_block.py,sha256=wPSm-8RY30Ek8HxzjCkCRtdLq4-Cez-PJB3vOk_vP3w,670
+json_repair/utils/constants.py,sha256=cv2gvyosuq0me0600WyTysM9avrtfXPuXYR26tawcuo,158
+json_repair/utils/json_context.py,sha256=WsMOjqpGSr6aaDONcrk8UFtTurzWon2Qq9AoBBYseoI,934
+json_repair/utils/object_comparer.py,sha256=XKV3MRab8H7_v4sm-wpEa5le0XX9OeycWo5S-MFm-GI,1716
+json_repair/utils/string_file_wrapper.py,sha256=Zlm0ZfJAw_VPlIy-QldL_OKYrPk3TYGq1JVAFPv7SnQ,6862
+json_repair-0.54.dist-info/licenses/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
+json_repair-0.54.dist-info/METADATA,sha256=xoD5G1EZ7muIRVbzdjsgD10OQbxS-K06sNGqlNDvvdQ,12220
+json_repair-0.54.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+json_repair-0.54.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
+json_repair-0.54.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
+json_repair-0.54.dist-info/RECORD,,
json_repair/parse_boolean_or_null.py
REMOVED
@@ -1,30 +0,0 @@
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from .json_parser import JSONParser
-
-
-def parse_boolean_or_null(self: "JSONParser") -> bool | str | None:
-    # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
-    starting_index = self.index
-    char = (self.get_char_at() or "").lower()
-    value: tuple[str, bool | None] | None = None
-    if char == "t":
-        value = ("true", True)
-    elif char == "f":
-        value = ("false", False)
-    elif char == "n":
-        value = ("null", None)
-
-    if value:
-        i = 0
-        while char and i < len(value[0]) and char == value[0][i]:
-            i += 1
-            self.index += 1
-            char = (self.get_char_at() or "").lower()
-        if i == len(value[0]):
-            return value[1]
-
-    # If nothing works reset the index before returning
-    self.index = starting_index
-    return ""
json_repair/string_file_wrapper.py
REMOVED
@@ -1,108 +0,0 @@
-import os
-from typing import TextIO
-
-
-class StringFileWrapper:
-    # This is a trick to simplify the code, transform the filedescriptor handling into a string handling
-    def __init__(self, fd: TextIO, chunk_length: int) -> None:
-        """
-        Initialize the StringFileWrapper with a file descriptor and chunk length.
-
-        Args:
-            fd (TextIO): The file descriptor to wrap.
-            CHUNK_LENGTH (int): The length of each chunk to read from the file.
-
-        Attributes:
-            fd (TextIO): The wrapped file descriptor.
-            length (int): The total length of the file content.
-            buffers (dict[int, str]): Dictionary to store chunks of file content.
-            buffer_length (int): The length of each buffer chunk.
-        """
-        self.fd = fd
-        self.length: int = 0
-        # Buffers are 1MB strings that are read from the file
-        # and kept in memory to keep reads low
-        self.buffers: dict[int, str] = {}
-        # chunk_length is in bytes
-        if not chunk_length or chunk_length < 2:
-            chunk_length = 1_000_000
-        self.buffer_length = chunk_length
-
-    def get_buffer(self, index: int) -> str:
-        """
-        Retrieve or load a buffer chunk from the file.
-
-        Args:
-            index (int): The index of the buffer chunk to retrieve.
-
-        Returns:
-            str: The buffer chunk at the specified index.
-        """
-        if self.buffers.get(index) is None:
-            self.fd.seek(index * self.buffer_length)
-            self.buffers[index] = self.fd.read(self.buffer_length)
-            # Save memory by keeping max 2MB buffer chunks and min 2 chunks
-            if len(self.buffers) > max(2, 2_000_000 / self.buffer_length):
-                oldest_key = next(iter(self.buffers))
-                if oldest_key != index:
-                    self.buffers.pop(oldest_key)
-        return self.buffers[index]
-
-    def __getitem__(self, index: int | slice) -> str:
-        """
-        Retrieve a character or a slice of characters from the file.
-
-        Args:
-            index (Union[int, slice]): The index or slice of characters to retrieve.
-
-        Returns:
-            str: The character(s) at the specified index or slice.
-        """
-        # The buffer is an array that is seek like a RAM:
-        # self.buffers[index]: the row in the array of length 1MB, index is `i` modulo CHUNK_LENGTH
-        # self.buffures[index][j]: the column of the row that is `i` remainder CHUNK_LENGTH
-        if isinstance(index, slice):
-            buffer_index = index.start // self.buffer_length
-            buffer_end = index.stop // self.buffer_length
-            if buffer_index == buffer_end:
-                return self.get_buffer(buffer_index)[index.start % self.buffer_length : index.stop % self.buffer_length]
-            else:
-                start_slice = self.get_buffer(buffer_index)[index.start % self.buffer_length :]
-                end_slice = self.get_buffer(buffer_end)[: index.stop % self.buffer_length]
-                middle_slices = [self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)]
-                return start_slice + "".join(middle_slices) + end_slice
-        else:
-            buffer_index = index // self.buffer_length
-            return self.get_buffer(buffer_index)[index % self.buffer_length]
-
-    def __len__(self) -> int:
-        """
-        Get the total length of the file.
-
-        Returns:
-            int: The total number of characters in the file.
-        """
-        if self.length < 1:
-            current_position = self.fd.tell()
-            self.fd.seek(0, os.SEEK_END)
-            self.length = self.fd.tell()
-            self.fd.seek(current_position)
-        return self.length
-
-    def __setitem__(self, index: int | slice, value: str) -> None:  # pragma: no cover
-        """
-        Set a character or a slice of characters in the file.
-
-        Args:
-            index (slice): The slice of characters to set.
-            value (str): The value to set at the specified index or slice.
-        """
-        start = index.start or 0 if isinstance(index, slice) else index or 0
-
-        if start < 0:
-            start += len(self)
-
-        current_position = self.fd.tell()
-        self.fd.seek(start)
-        self.fd.write(value)
-        self.fd.seek(current_position)
@@ -1,22 +0,0 @@
|
|
|
1
|
-
json_repair/__init__.py,sha256=JdJIZNCKV3MfIviryqK8NH8yGssCta2-192CekcwH-o,174
|
|
2
|
-
json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
|
|
3
|
-
json_repair/constants.py,sha256=cv2gvyosuq0me0600WyTysM9avrtfXPuXYR26tawcuo,158
|
|
4
|
-
json_repair/json_context.py,sha256=WsMOjqpGSr6aaDONcrk8UFtTurzWon2Qq9AoBBYseoI,934
|
|
5
|
-
json_repair/json_parser.py,sha256=vy5Z8aiJUVhVmvYEgy0dkYy5WgUmyOeS6PEFiR3cW44,7948
|
|
6
|
-
json_repair/json_repair.py,sha256=sDhXzDZxu0QmaFzICPTtf_q7yOY1A1Lf_iQG6Potsco,11572
|
|
7
|
-
json_repair/object_comparer.py,sha256=XKV3MRab8H7_v4sm-wpEa5le0XX9OeycWo5S-MFm-GI,1716
|
|
8
|
-
json_repair/parse_array.py,sha256=-rh65JcfT-FtXiR6s8RYlMfI-6LzVr08ytlDh6Z2CFE,2181
|
|
9
|
-
json_repair/parse_boolean_or_null.py,sha256=WMSkvvxsp4wvauBcDqtt9WnLMD5SMoxeRfZFXp3FEBc,890
|
|
10
|
-
json_repair/parse_comment.py,sha256=JHtQ_QlxOvPNnMh7lhUaoTjFGelqjhTNq7qn9xUE7SU,2648
|
|
11
|
-
json_repair/parse_number.py,sha256=33zAtkbuVzi9Lqjxu7cXn9WlVzd3WjRx9Ln_LFzVL4o,1259
|
|
12
|
-
json_repair/parse_object.py,sha256=rnuH5Oxo98OrXhktF0wrOC1vRb5Th_m819Li1EFJzm4,5571
|
|
13
|
-
json_repair/parse_string.py,sha256=--coxoyH4nxl7osxgs1fIu31IEtB0HHwVbbOewypG4g,26146
|
|
14
|
-
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
-
json_repair/string_file_wrapper.py,sha256=tGkWBEUPE-CZPf4uSM5NE9oSDTpskX0myJiXsl-gbds,4333
|
|
16
|
-
json_repair/parse_string_helpers/parse_json_llm_block.py,sha256=taREF3pwb35kGBGJYbUHkTybATX3GI-SOwOz3yXaEQs,644
|
|
17
|
-
json_repair-0.53.0.dist-info/licenses/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
|
18
|
-
json_repair-0.53.0.dist-info/METADATA,sha256=JvMUVYGDDIzmym7MqbQ6k6PjbnuuskW_myvk0EWp7V8,11027
|
|
19
|
-
json_repair-0.53.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
20
|
-
json_repair-0.53.0.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
|
|
21
|
-
json_repair-0.53.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
|
22
|
-
json_repair-0.53.0.dist-info/RECORD,,
|
|
The remaining seven files listed above (constants.py, json_context.py, and object_comparer.py, which were only moved under json_repair/utils/, plus the WHEEL, entry_points.txt, LICENSE, and top_level.txt dist-info files) have no content changes.