json-repair 0.55.1__py3-none-any.whl → 0.56.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/json_parser.py +66 -28
- json_repair/json_repair.py +101 -25
- json_repair/parse_array.py +76 -15
- json_repair/parse_comment.py +1 -2
- json_repair/parse_number.py +1 -2
- json_repair/parse_object.py +150 -18
- json_repair/parse_string.py +23 -25
- json_repair/schema_repair.py +508 -0
- json_repair/utils/constants.py +11 -0
- json_repair/utils/object_comparer.py +1 -1
- json_repair/utils/string_file_wrapper.py +40 -35
- {json_repair-0.55.1.dist-info → json_repair-0.56.0.dist-info}/METADATA +70 -3
- json_repair-0.56.0.dist-info/RECORD +23 -0
- {json_repair-0.55.1.dist-info → json_repair-0.56.0.dist-info}/WHEEL +1 -1
- json_repair-0.55.1.dist-info/RECORD +0 -22
- {json_repair-0.55.1.dist-info → json_repair-0.56.0.dist-info}/entry_points.txt +0 -0
- {json_repair-0.55.1.dist-info → json_repair-0.56.0.dist-info}/licenses/LICENSE +0 -0
- {json_repair-0.55.1.dist-info → json_repair-0.56.0.dist-info}/top_level.txt +0 -0
json_repair/json_parser.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
from
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
from typing import TYPE_CHECKING, Any, TextIO
|
|
2
3
|
|
|
3
4
|
from .parse_array import parse_array as _parse_array
|
|
4
5
|
from .parse_comment import parse_comment as _parse_comment
|
|
@@ -10,11 +11,18 @@ from .utils.json_context import JsonContext
|
|
|
10
11
|
from .utils.object_comparer import ObjectComparer
|
|
11
12
|
from .utils.string_file_wrapper import StringFileWrapper
|
|
12
13
|
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from .schema_repair import SchemaRepairer
|
|
16
|
+
|
|
13
17
|
|
|
14
18
|
class JSONParser:
|
|
15
19
|
# Split the parse methods into separate files because this one was like 3000 lines
|
|
16
|
-
def parse_array(
|
|
17
|
-
|
|
20
|
+
def parse_array(
|
|
21
|
+
self,
|
|
22
|
+
schema: dict[str, Any] | bool | None = None,
|
|
23
|
+
path: str = "$",
|
|
24
|
+
) -> list[JSONReturnType]:
|
|
25
|
+
return _parse_array(self, schema, path)
|
|
18
26
|
|
|
19
27
|
def parse_comment(self) -> JSONReturnType:
|
|
20
28
|
return _parse_comment(self)
|
|
@@ -22,8 +30,12 @@ class JSONParser:
|
|
|
22
30
|
def parse_number(self) -> JSONReturnType:
|
|
23
31
|
return _parse_number(self)
|
|
24
32
|
|
|
25
|
-
def parse_object(
|
|
26
|
-
|
|
33
|
+
def parse_object(
|
|
34
|
+
self,
|
|
35
|
+
schema: dict[str, Any] | bool | None = None,
|
|
36
|
+
path: str = "$",
|
|
37
|
+
) -> JSONReturnType:
|
|
38
|
+
return _parse_object(self, schema, path)
|
|
27
39
|
|
|
28
40
|
def parse_string(self) -> JSONReturnType:
|
|
29
41
|
return _parse_string(self)
|
|
@@ -53,8 +65,8 @@ class JSONParser:
|
|
|
53
65
|
# We could add a guard in the code for each call but that would make this code unreadable, so here's this neat trick
|
|
54
66
|
# Replace self.log with a noop
|
|
55
67
|
self.logging = logging
|
|
68
|
+
self.logger: list[dict[str, str]] = []
|
|
56
69
|
if logging:
|
|
57
|
-
self.logger: list[dict[str, str]] = []
|
|
58
70
|
self.log = self._log
|
|
59
71
|
else:
|
|
60
72
|
# No-op
|
|
@@ -71,11 +83,26 @@ class JSONParser:
|
|
|
71
83
|
# may not be desirable in some use cases and the user would prefer json_repair to return an exception.
|
|
72
84
|
# So strict mode was added to disable some of those heuristics.
|
|
73
85
|
self.strict = strict
|
|
86
|
+
self.schema_repairer: SchemaRepairer | None = None
|
|
74
87
|
|
|
75
88
|
def parse(
|
|
76
89
|
self,
|
|
77
|
-
) -> JSONReturnType
|
|
78
|
-
|
|
90
|
+
) -> JSONReturnType:
|
|
91
|
+
return self._parse_top_level(self.parse_json)
|
|
92
|
+
|
|
93
|
+
def parse_with_schema(
|
|
94
|
+
self,
|
|
95
|
+
repairer: "SchemaRepairer",
|
|
96
|
+
schema: dict[str, Any] | bool,
|
|
97
|
+
) -> JSONReturnType:
|
|
98
|
+
"""Parse with schema guidance enabled for all nested values."""
|
|
99
|
+
self.schema_repairer = repairer
|
|
100
|
+
return self._parse_top_level(lambda: self.parse_json(schema, "$"))
|
|
101
|
+
|
|
102
|
+
# Consolidate top-level parsing so we handle multiple sequential JSON values consistently
|
|
103
|
+
# (including update semantics and strict-mode validation).
|
|
104
|
+
def _parse_top_level(self, parse_element: Callable[[], JSONReturnType]) -> JSONReturnType:
|
|
105
|
+
json = parse_element()
|
|
79
106
|
if self.index < len(self.json_str):
|
|
80
107
|
self.log(
|
|
81
108
|
"The parser returned early, checking if there's more json elements",
|
|
@@ -83,19 +110,17 @@ class JSONParser:
|
|
|
83
110
|
json = [json]
|
|
84
111
|
while self.index < len(self.json_str):
|
|
85
112
|
self.context.reset()
|
|
86
|
-
j =
|
|
113
|
+
j = parse_element()
|
|
87
114
|
if j:
|
|
88
115
|
if ObjectComparer.is_same_object(json[-1], j):
|
|
89
|
-
#
|
|
116
|
+
# Treat repeated objects as updates: keep the newest value.
|
|
90
117
|
json.pop()
|
|
91
118
|
else:
|
|
92
119
|
if not json[-1]:
|
|
93
120
|
json.pop()
|
|
94
121
|
json.append(j)
|
|
95
122
|
else:
|
|
96
|
-
# this was a bust, move the index
|
|
97
123
|
self.index += 1
|
|
98
|
-
# If nothing extra was found, don't return an array
|
|
99
124
|
if len(json) == 1:
|
|
100
125
|
self.log(
|
|
101
126
|
"There were no more elements, returning the element without the array",
|
|
@@ -106,38 +131,51 @@ class JSONParser:
|
|
|
106
131
|
"Multiple top-level JSON elements found in strict mode, raising an error",
|
|
107
132
|
)
|
|
108
133
|
raise ValueError("Multiple top-level JSON elements found in strict mode.")
|
|
109
|
-
|
|
110
|
-
return json, self.logger
|
|
111
|
-
else:
|
|
112
|
-
return json
|
|
134
|
+
return json
|
|
113
135
|
|
|
114
136
|
def parse_json(
|
|
115
137
|
self,
|
|
138
|
+
schema: dict[str, Any] | bool | None = None,
|
|
139
|
+
path: str = "$",
|
|
116
140
|
) -> JSONReturnType:
|
|
141
|
+
"""Parse the next JSON value and, when configured, enforce schema constraints."""
|
|
142
|
+
repairer = self.schema_repairer if self.schema_repairer is not None and schema not in (None, True) else None
|
|
143
|
+
if repairer is not None:
|
|
144
|
+
# Resolve references once and decide whether schema-guided repairs are needed.
|
|
145
|
+
schema = repairer.resolve_schema(schema)
|
|
146
|
+
if schema is True:
|
|
147
|
+
repairer = None
|
|
148
|
+
elif schema is False:
|
|
149
|
+
raise ValueError("Schema does not allow any values.")
|
|
150
|
+
|
|
117
151
|
while True:
|
|
118
152
|
char = self.get_char_at()
|
|
119
153
|
# None means that we are at the end of the string provided
|
|
120
154
|
if char is None:
|
|
121
155
|
return ""
|
|
122
156
|
# <object> starts with '{'
|
|
123
|
-
|
|
157
|
+
if char == "{":
|
|
124
158
|
self.index += 1
|
|
125
|
-
|
|
159
|
+
value = self.parse_object(schema, path) if repairer else self.parse_object()
|
|
160
|
+
return repairer.repair_value(value, schema, path) if repairer else value
|
|
126
161
|
# <array> starts with '['
|
|
127
|
-
|
|
162
|
+
if char == "[":
|
|
128
163
|
self.index += 1
|
|
129
|
-
|
|
164
|
+
value = self.parse_array(schema, path) if repairer else self.parse_array()
|
|
165
|
+
return repairer.repair_value(value, schema, path) if repairer else value
|
|
130
166
|
# <string> starts with a quote
|
|
131
|
-
|
|
132
|
-
|
|
167
|
+
if not self.context.empty and (char in STRING_DELIMITERS or char.isalpha()):
|
|
168
|
+
value = self.parse_string()
|
|
169
|
+
return repairer.repair_value(value, schema, path) if repairer else value
|
|
133
170
|
# <number> starts with [0-9] or minus
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
171
|
+
if not self.context.empty and (char.isdigit() or char == "-" or char == "."):
|
|
172
|
+
value = self.parse_number()
|
|
173
|
+
return repairer.repair_value(value, schema, path) if repairer else value
|
|
174
|
+
if char in ["#", "/"]:
|
|
175
|
+
value = self.parse_comment()
|
|
176
|
+
return repairer.repair_value(value, schema, path) if repairer else value
|
|
138
177
|
# If everything else fails, we just ignore and move on
|
|
139
|
-
|
|
140
|
-
self.index += 1
|
|
178
|
+
self.index += 1
|
|
141
179
|
|
|
142
180
|
def get_char_at(self, count: int = 0) -> str | None:
|
|
143
181
|
# Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
|
json_repair/json_repair.py
CHANGED
|
@@ -25,9 +25,11 @@ All supported use cases are in the unit tests
|
|
|
25
25
|
import argparse
|
|
26
26
|
import json
|
|
27
27
|
import sys
|
|
28
|
+
from pathlib import Path
|
|
28
29
|
from typing import Any, Literal, TextIO, overload
|
|
29
30
|
|
|
30
31
|
from .json_parser import JSONParser
|
|
32
|
+
from .schema_repair import SchemaRepairer, load_schema_model, schema_from_input
|
|
31
33
|
from .utils.constants import JSONReturnType
|
|
32
34
|
|
|
33
35
|
|
|
@@ -41,6 +43,7 @@ def repair_json(
|
|
|
41
43
|
chunk_length: int = 0,
|
|
42
44
|
stream_stable: bool = False,
|
|
43
45
|
strict: bool = False,
|
|
46
|
+
schema: Any | None = None,
|
|
44
47
|
**json_dumps_args: Any,
|
|
45
48
|
) -> str: ...
|
|
46
49
|
|
|
@@ -55,6 +58,7 @@ def repair_json(
|
|
|
55
58
|
chunk_length: int = 0,
|
|
56
59
|
stream_stable: bool = False,
|
|
57
60
|
strict: bool = False,
|
|
61
|
+
schema: Any | None = None,
|
|
58
62
|
**json_dumps_args: Any,
|
|
59
63
|
) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]: ...
|
|
60
64
|
|
|
@@ -68,6 +72,7 @@ def repair_json(
|
|
|
68
72
|
chunk_length: int = 0,
|
|
69
73
|
stream_stable: bool = False,
|
|
70
74
|
strict: bool = False,
|
|
75
|
+
schema: Any | None = None,
|
|
71
76
|
**json_dumps_args: Any,
|
|
72
77
|
) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
|
|
73
78
|
"""
|
|
@@ -83,27 +88,49 @@ def repair_json(
|
|
|
83
88
|
chunk_length (int, optional): Size in bytes of the file chunks to read at once. Ignored if `json_fd` is None. Do not use! Use `from_file` or `load` instead. Defaults to 1MB.
|
|
84
89
|
stream_stable (bool, optional): Set this to True when the JSON to be repaired is the accumulation of streaming JSON at a certain moment; it keeps the repair results stable. Defaults to False.
|
|
85
90
|
strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
|
|
91
|
+
schema (Any, optional): JSON Schema dict, boolean schema, or pydantic v2 model used to guide repairs. Schema guidance is skipped for already-valid JSON unless `skip_json_loads=True`.
|
|
86
92
|
Returns:
|
|
87
93
|
Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON or a tuple with the repaired JSON and repair log when logging is True.
|
|
88
94
|
"""
|
|
95
|
+
# Schema-guided repairs and strict mode are mutually exclusive to avoid conflicting behavior.
|
|
96
|
+
if schema is not None and strict:
|
|
97
|
+
raise ValueError("schema and strict cannot be used together.")
|
|
98
|
+
|
|
89
99
|
parser = JSONParser(json_str, json_fd, logging, chunk_length, stream_stable, strict)
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
100
|
+
# When JSON is already valid, skip schema guidance unless the caller explicitly disables json.loads.
|
|
101
|
+
if not skip_json_loads:
|
|
102
|
+
loaded_json: JSONReturnType | None
|
|
93
103
|
try:
|
|
94
|
-
|
|
104
|
+
loaded_json = json.load(json_fd) if json_fd else json.loads(json_str)
|
|
95
105
|
except json.JSONDecodeError:
|
|
96
|
-
|
|
106
|
+
loaded_json = None
|
|
107
|
+
else:
|
|
108
|
+
if logging:
|
|
109
|
+
return loaded_json, []
|
|
110
|
+
if return_objects:
|
|
111
|
+
return loaded_json
|
|
112
|
+
if loaded_json == "":
|
|
113
|
+
return ""
|
|
114
|
+
return json.dumps(loaded_json, **json_dumps_args)
|
|
115
|
+
|
|
116
|
+
# Schema guidance only happens in parser mode.
|
|
117
|
+
schema_obj = schema_from_input(schema) if schema is not None else None
|
|
118
|
+
parsed_json: JSONReturnType
|
|
119
|
+
if schema_obj is None:
|
|
120
|
+
parsed_json = parser.parse()
|
|
121
|
+
else:
|
|
122
|
+
repairer = SchemaRepairer(schema_obj, parser.logger if logging else None)
|
|
123
|
+
parsed_json = parser.parse_with_schema(repairer, schema_obj)
|
|
124
|
+
# Post-parse validation ensures we reject values that cannot satisfy the schema.
|
|
125
|
+
repairer.validate(parsed_json, schema_obj)
|
|
97
126
|
# It's useful to return the actual object instead of the json string,
|
|
98
127
|
# it allows this lib to be a replacement of the json library
|
|
99
|
-
if
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
if logging and not isinstance(parsed_json, tuple):
|
|
103
|
-
return parsed_json, []
|
|
128
|
+
if logging:
|
|
129
|
+
return parsed_json, parser.logger
|
|
130
|
+
if return_objects:
|
|
104
131
|
return parsed_json
|
|
105
132
|
# Avoid returning only a pair of quotes if it's an empty string
|
|
106
|
-
|
|
133
|
+
if parsed_json == "":
|
|
107
134
|
return ""
|
|
108
135
|
return json.dumps(parsed_json, **json_dumps_args)
|
|
109
136
|
|
|
@@ -114,6 +141,7 @@ def loads(
|
|
|
114
141
|
logging: bool = False,
|
|
115
142
|
stream_stable: bool = False,
|
|
116
143
|
strict: bool = False,
|
|
144
|
+
schema: Any | None = None,
|
|
117
145
|
) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | str:
|
|
118
146
|
"""
|
|
119
147
|
This function works like `json.loads()` except that it will fix your JSON in the process.
|
|
@@ -124,6 +152,7 @@ def loads(
|
|
|
124
152
|
skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
|
|
125
153
|
logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
|
|
126
154
|
strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
|
|
155
|
+
schema (Any, optional): JSON Schema dict, boolean schema, or pydantic v2 model used to guide repairs. Schema guidance is skipped for already-valid JSON unless `skip_json_loads=True`.
|
|
127
156
|
|
|
128
157
|
Returns:
|
|
129
158
|
Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]], str]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
|
|
@@ -135,6 +164,7 @@ def loads(
|
|
|
135
164
|
logging=logging,
|
|
136
165
|
stream_stable=stream_stable,
|
|
137
166
|
strict=strict,
|
|
167
|
+
schema=schema,
|
|
138
168
|
)
|
|
139
169
|
|
|
140
170
|
|
|
@@ -144,6 +174,7 @@ def load(
|
|
|
144
174
|
logging: bool = False,
|
|
145
175
|
chunk_length: int = 0,
|
|
146
176
|
strict: bool = False,
|
|
177
|
+
schema: Any | None = None,
|
|
147
178
|
) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
|
|
148
179
|
"""
|
|
149
180
|
This function works like `json.load()` except that it will fix your JSON in the process.
|
|
@@ -155,6 +186,7 @@ def load(
|
|
|
155
186
|
logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
|
|
156
187
|
chunk_length (int, optional): Size in bytes of the file chunks to read at once. Defaults to 1MB.
|
|
157
188
|
strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
|
|
189
|
+
schema (Any, optional): JSON Schema dict, boolean schema, or pydantic v2 model used to guide repairs. Schema guidance is skipped for already-valid JSON unless `skip_json_loads=True`.
|
|
158
190
|
|
|
159
191
|
Returns:
|
|
160
192
|
Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
|
|
@@ -166,40 +198,42 @@ def load(
|
|
|
166
198
|
skip_json_loads=skip_json_loads,
|
|
167
199
|
logging=logging,
|
|
168
200
|
strict=strict,
|
|
201
|
+
schema=schema,
|
|
169
202
|
)
|
|
170
203
|
|
|
171
204
|
|
|
172
205
|
def from_file(
|
|
173
|
-
filename: str,
|
|
206
|
+
filename: str | Path,
|
|
174
207
|
skip_json_loads: bool = False,
|
|
175
208
|
logging: bool = False,
|
|
176
209
|
chunk_length: int = 0,
|
|
177
210
|
strict: bool = False,
|
|
211
|
+
schema: Any | None = None,
|
|
178
212
|
) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
|
|
179
213
|
"""
|
|
180
214
|
This function is a wrapper around `load()` so you can pass the filename as a string or `Path`
|
|
181
215
|
|
|
182
216
|
Args:
|
|
183
|
-
filename (str): The name of the file containing JSON data to load and repair.
|
|
217
|
+
filename (str | Path): The name of the file containing JSON data to load and repair.
|
|
184
218
|
skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
|
|
185
219
|
logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
|
|
186
220
|
chunk_length (int, optional): Size in bytes of the file chunks to read at once. Defaults to 1MB.
|
|
187
221
|
strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
|
|
222
|
+
schema (Any, optional): JSON Schema dict, boolean schema, or pydantic v2 model used to guide repairs. Schema guidance is skipped for already-valid JSON unless `skip_json_loads=True`.
|
|
188
223
|
|
|
189
224
|
Returns:
|
|
190
225
|
Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
|
|
191
226
|
"""
|
|
192
|
-
with
|
|
193
|
-
|
|
227
|
+
with Path(filename).open() as fd:
|
|
228
|
+
return load(
|
|
194
229
|
fd=fd,
|
|
195
230
|
skip_json_loads=skip_json_loads,
|
|
196
231
|
logging=logging,
|
|
197
232
|
chunk_length=chunk_length,
|
|
198
233
|
strict=strict,
|
|
234
|
+
schema=schema,
|
|
199
235
|
)
|
|
200
236
|
|
|
201
|
-
return jsonobj
|
|
202
|
-
|
|
203
237
|
|
|
204
238
|
def cli(inline_args: list[str] | None = None) -> int:
|
|
205
239
|
"""
|
|
@@ -212,6 +246,10 @@ def cli(inline_args: list[str] | None = None) -> int:
|
|
|
212
246
|
- -o, --output TARGET (str): If specified, the output will be written to TARGET filename instead of stdout.
|
|
213
247
|
- --ensure_ascii (bool): Pass ensure_ascii=True to json.dumps(). Will pass False otherwise.
|
|
214
248
|
- --indent INDENT (int): Number of spaces for indentation (Default 2).
|
|
249
|
+
- --skip-json-loads (bool): Skip initial json.loads validation (needed to force schema on valid JSON).
|
|
250
|
+
- --schema SCHEMA (str): Path to a JSON Schema file that guides repairs.
|
|
251
|
+
- --schema-model MODEL (str): Pydantic v2 model in 'module:ClassName' form that guides repairs.
|
|
252
|
+
- --strict (bool): Raise on duplicate keys, missing separators, empty keys/values, and other unrecoverable structures instead of repairing them.
|
|
215
253
|
|
|
216
254
|
Returns:
|
|
217
255
|
int: Exit code of the CLI operation.
|
|
@@ -253,13 +291,28 @@ def cli(inline_args: list[str] | None = None) -> int:
|
|
|
253
291
|
default=2,
|
|
254
292
|
help="Number of spaces for indentation (Default 2)",
|
|
255
293
|
)
|
|
294
|
+
parser.add_argument(
|
|
295
|
+
"--skip-json-loads",
|
|
296
|
+
action="store_true",
|
|
297
|
+
help="Skip initial json.loads validation (needed to force schema on valid JSON)",
|
|
298
|
+
)
|
|
299
|
+
parser.add_argument(
|
|
300
|
+
"--schema",
|
|
301
|
+
metavar="SCHEMA",
|
|
302
|
+
help="Path to a JSON Schema file that guides repairs",
|
|
303
|
+
)
|
|
304
|
+
parser.add_argument(
|
|
305
|
+
"--schema-model",
|
|
306
|
+
metavar="MODEL",
|
|
307
|
+
help="Pydantic v2 model in 'module:ClassName' form that guides repairs",
|
|
308
|
+
)
|
|
256
309
|
parser.add_argument(
|
|
257
310
|
"--strict",
|
|
258
311
|
action="store_true",
|
|
259
312
|
help="Raise on duplicate keys, missing separators, empty keys/values, and other unrecoverable structures instead of repairing them",
|
|
260
313
|
)
|
|
261
314
|
|
|
262
|
-
args = parser.parse_args(
|
|
315
|
+
args = parser.parse_args(inline_args)
|
|
263
316
|
|
|
264
317
|
# Inline mode requires a filename, so error out if none was provided.
|
|
265
318
|
if args.inline and not args.filename: # pragma: no cover
|
|
@@ -270,23 +323,46 @@ def cli(inline_args: list[str] | None = None) -> int:
|
|
|
270
323
|
print("Error: You cannot pass both --inline and --output", file=sys.stderr)
|
|
271
324
|
sys.exit(1)
|
|
272
325
|
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
326
|
+
if args.schema and args.schema_model:
|
|
327
|
+
print("Error: You cannot pass both --schema and --schema-model", file=sys.stderr)
|
|
328
|
+
sys.exit(1)
|
|
329
|
+
|
|
330
|
+
if args.strict and (args.schema or args.schema_model):
|
|
331
|
+
print("Error: --strict cannot be used with --schema or --schema-model", file=sys.stderr)
|
|
332
|
+
sys.exit(1)
|
|
333
|
+
|
|
334
|
+
ensure_ascii = args.ensure_ascii
|
|
276
335
|
|
|
277
336
|
try:
|
|
337
|
+
schema = None
|
|
338
|
+
if args.schema:
|
|
339
|
+
with Path(args.schema).open() as fd:
|
|
340
|
+
schema = json.load(fd)
|
|
341
|
+
elif args.schema_model:
|
|
342
|
+
schema = load_schema_model(args.schema_model)
|
|
343
|
+
|
|
278
344
|
# Use from_file if a filename is provided; otherwise read from stdin.
|
|
279
345
|
if args.filename:
|
|
280
|
-
result = from_file(
|
|
346
|
+
result = from_file(
|
|
347
|
+
args.filename,
|
|
348
|
+
skip_json_loads=args.skip_json_loads,
|
|
349
|
+
strict=args.strict,
|
|
350
|
+
schema=schema,
|
|
351
|
+
)
|
|
281
352
|
else:
|
|
282
353
|
data = sys.stdin.read()
|
|
283
|
-
result = loads(
|
|
354
|
+
result = loads(
|
|
355
|
+
data,
|
|
356
|
+
skip_json_loads=args.skip_json_loads,
|
|
357
|
+
strict=args.strict,
|
|
358
|
+
schema=schema,
|
|
359
|
+
)
|
|
284
360
|
if args.inline or args.output:
|
|
285
|
-
with
|
|
361
|
+
with Path(args.output or args.filename).open(mode="w") as fd:
|
|
286
362
|
json.dump(result, fd, indent=args.indent, ensure_ascii=ensure_ascii)
|
|
287
363
|
else:
|
|
288
364
|
print(json.dumps(result, indent=args.indent, ensure_ascii=ensure_ascii))
|
|
289
|
-
except
|
|
365
|
+
except (OSError, TypeError, ValueError) as e: # pragma: no cover
|
|
290
366
|
print(f"Error: {str(e)}", file=sys.stderr)
|
|
291
367
|
return 1
|
|
292
368
|
|
json_repair/parse_array.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import TYPE_CHECKING
|
|
1
|
+
from typing import TYPE_CHECKING, Any
|
|
2
2
|
|
|
3
3
|
from .utils.constants import STRING_DELIMITERS, JSONReturnType
|
|
4
4
|
from .utils.json_context import ContextValues
|
|
@@ -6,51 +6,112 @@ from .utils.object_comparer import ObjectComparer
|
|
|
6
6
|
|
|
7
7
|
if TYPE_CHECKING:
|
|
8
8
|
from .json_parser import JSONParser
|
|
9
|
+
from .schema_repair import SchemaRepairer
|
|
9
10
|
|
|
10
11
|
|
|
11
|
-
def parse_array(
|
|
12
|
+
def parse_array(
|
|
13
|
+
self: "JSONParser",
|
|
14
|
+
schema: dict[str, Any] | bool | None = None,
|
|
15
|
+
path: str = "$",
|
|
16
|
+
) -> list[JSONReturnType]:
|
|
12
17
|
# <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
|
|
13
|
-
|
|
18
|
+
# Only activate schema-guided parsing if a repairer is available and schema looks array-like.
|
|
19
|
+
schema_repairer: SchemaRepairer | None = None
|
|
20
|
+
items_schema: object | None = None
|
|
21
|
+
additional_items: object | None = None
|
|
22
|
+
if schema is not None and schema is not True:
|
|
23
|
+
repairer = self.schema_repairer
|
|
24
|
+
if repairer is not None:
|
|
25
|
+
schema = repairer.resolve_schema(schema)
|
|
26
|
+
if schema is False:
|
|
27
|
+
raise ValueError("Schema does not allow any values.")
|
|
28
|
+
if schema is not True and repairer.is_array_schema(schema):
|
|
29
|
+
schema_repairer = repairer
|
|
30
|
+
items_schema = schema.get("items")
|
|
31
|
+
additional_items = schema.get("additionalItems", None)
|
|
32
|
+
|
|
33
|
+
arr: list[JSONReturnType] = []
|
|
14
34
|
self.context.set(ContextValues.ARRAY)
|
|
15
|
-
# Stop when you either find the closing parentheses or you have iterated over the entire string
|
|
16
35
|
char = self.get_char_at()
|
|
36
|
+
idx = 0
|
|
37
|
+
|
|
17
38
|
while char and char not in ["]", "}"]:
|
|
18
39
|
self.skip_whitespaces()
|
|
19
|
-
|
|
40
|
+
|
|
41
|
+
# Resolve per-item schema (tuple schemas + additionalItems) when schema guidance is active.
|
|
42
|
+
item_schema: dict[str, Any] | bool | None = None
|
|
43
|
+
drop_item = False
|
|
44
|
+
if schema_repairer is not None:
|
|
45
|
+
if isinstance(items_schema, list):
|
|
46
|
+
if idx < len(items_schema):
|
|
47
|
+
raw_schema = items_schema[idx]
|
|
48
|
+
# Tuple schemas must contain dict/bool entries only.
|
|
49
|
+
if raw_schema is not None and not isinstance(raw_schema, (dict, bool)):
|
|
50
|
+
raise ValueError("Schema must be an object.")
|
|
51
|
+
item_schema = raw_schema
|
|
52
|
+
else:
|
|
53
|
+
if additional_items is False:
|
|
54
|
+
drop_item = True
|
|
55
|
+
elif isinstance(additional_items, dict):
|
|
56
|
+
item_schema = additional_items
|
|
57
|
+
else:
|
|
58
|
+
item_schema = True
|
|
59
|
+
elif isinstance(items_schema, dict):
|
|
60
|
+
item_schema = items_schema
|
|
61
|
+
else:
|
|
62
|
+
item_schema = True
|
|
63
|
+
|
|
64
|
+
item_path = f"{path}[{idx}]"
|
|
65
|
+
|
|
20
66
|
if char in STRING_DELIMITERS:
|
|
21
|
-
#
|
|
22
|
-
# So we are going to check if this string is followed by a : or not
|
|
23
|
-
# And either parse the string or parse the object
|
|
67
|
+
# A string followed by ':' is often a missing object start; treat it as an object.
|
|
24
68
|
i = 1
|
|
25
69
|
i = self.skip_to_character(char, i)
|
|
26
70
|
i = self.scroll_whitespaces(idx=i + 1)
|
|
27
|
-
|
|
71
|
+
if self.get_char_at(i) == ":":
|
|
72
|
+
if schema_repairer is not None and not drop_item:
|
|
73
|
+
# Schema-guided object parsing, then enforce schema on the parsed object.
|
|
74
|
+
value = self.parse_object(item_schema, item_path)
|
|
75
|
+
value = schema_repairer.repair_value(value, item_schema, item_path)
|
|
76
|
+
else:
|
|
77
|
+
# No schema (or dropping): still parse to keep the cursor in sync.
|
|
78
|
+
value = self.parse_object()
|
|
79
|
+
else:
|
|
80
|
+
value = self.parse_string()
|
|
81
|
+
if schema_repairer is not None and not drop_item:
|
|
82
|
+
# Apply schema constraints/coercions to scalar values when configured.
|
|
83
|
+
value = schema_repairer.repair_value(value, item_schema, item_path)
|
|
28
84
|
else:
|
|
29
|
-
|
|
85
|
+
if schema_repairer is not None and not drop_item:
|
|
86
|
+
# Use schema-aware parsing to guide nested repairs.
|
|
87
|
+
value = self.parse_json(item_schema, item_path)
|
|
88
|
+
else:
|
|
89
|
+
# Parse normally (or discard) to keep the index aligned.
|
|
90
|
+
value = self.parse_json()
|
|
30
91
|
|
|
31
|
-
# It is possible that parse_json() returns nothing valid, so we increase by 1, unless we find an array separator
|
|
32
92
|
if ObjectComparer.is_strictly_empty(value) and self.get_char_at() not in ["]", ","]:
|
|
33
93
|
self.index += 1
|
|
34
94
|
elif value == "..." and self.get_char_at(-1) == ".":
|
|
35
95
|
self.log(
|
|
36
96
|
"While parsing an array, found a stray '...'; ignoring it",
|
|
37
97
|
)
|
|
38
|
-
|
|
98
|
+
elif not drop_item:
|
|
39
99
|
arr.append(value)
|
|
100
|
+
elif schema_repairer is not None:
|
|
101
|
+
# Record drops for visibility when schema forbids extra tuple items.
|
|
102
|
+
schema_repairer._log("Dropped extra array item not covered by schema", item_path)
|
|
40
103
|
|
|
41
|
-
|
|
104
|
+
idx += 1
|
|
42
105
|
char = self.get_char_at()
|
|
43
106
|
while char and char != "]" and (char.isspace() or char == ","):
|
|
44
107
|
self.index += 1
|
|
45
108
|
char = self.get_char_at()
|
|
46
109
|
|
|
47
|
-
# Especially at the end of an LLM generated json you might miss the last "]"
|
|
48
110
|
if char != "]":
|
|
49
111
|
self.log(
|
|
50
112
|
"While parsing an array we missed the closing ], ignoring it",
|
|
51
113
|
)
|
|
52
114
|
|
|
53
115
|
self.index += 1
|
|
54
|
-
|
|
55
116
|
self.context.reset()
|
|
56
117
|
return arr
|
json_repair/parse_comment.py
CHANGED
json_repair/parse_number.py
CHANGED
|
@@ -33,7 +33,6 @@ def parse_number(self: "JSONParser") -> JSONReturnType:
|
|
|
33
33
|
return number_str
|
|
34
34
|
if "." in number_str or "e" in number_str or "E" in number_str:
|
|
35
35
|
return float(number_str)
|
|
36
|
-
|
|
37
|
-
return int(number_str)
|
|
36
|
+
return int(number_str)
|
|
38
37
|
except ValueError:
|
|
39
38
|
return number_str
|