json-repair 0.55.1__py3-none-any.whl → 0.56.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,5 @@
1
- from typing import TextIO
1
+ from collections.abc import Callable
2
+ from typing import TYPE_CHECKING, Any, TextIO
2
3
 
3
4
  from .parse_array import parse_array as _parse_array
4
5
  from .parse_comment import parse_comment as _parse_comment
@@ -10,11 +11,18 @@ from .utils.json_context import JsonContext
10
11
  from .utils.object_comparer import ObjectComparer
11
12
  from .utils.string_file_wrapper import StringFileWrapper
12
13
 
14
+ if TYPE_CHECKING:
15
+ from .schema_repair import SchemaRepairer
16
+
13
17
 
14
18
  class JSONParser:
15
19
  # Split the parse methods into separate files because this one was like 3000 lines
16
- def parse_array(self) -> list[JSONReturnType]:
17
- return _parse_array(self)
20
+ def parse_array(
21
+ self,
22
+ schema: dict[str, Any] | bool | None = None,
23
+ path: str = "$",
24
+ ) -> list[JSONReturnType]:
25
+ return _parse_array(self, schema, path)
18
26
 
19
27
  def parse_comment(self) -> JSONReturnType:
20
28
  return _parse_comment(self)
@@ -22,8 +30,12 @@ class JSONParser:
22
30
  def parse_number(self) -> JSONReturnType:
23
31
  return _parse_number(self)
24
32
 
25
- def parse_object(self) -> JSONReturnType:
26
- return _parse_object(self)
33
+ def parse_object(
34
+ self,
35
+ schema: dict[str, Any] | bool | None = None,
36
+ path: str = "$",
37
+ ) -> JSONReturnType:
38
+ return _parse_object(self, schema, path)
27
39
 
28
40
  def parse_string(self) -> JSONReturnType:
29
41
  return _parse_string(self)
@@ -53,8 +65,8 @@ class JSONParser:
53
65
  # We could add a guard in the code for each call but that would make this code unreadable, so here's this neat trick
54
66
  # Replace self.log with a noop
55
67
  self.logging = logging
68
+ self.logger: list[dict[str, str]] = []
56
69
  if logging:
57
- self.logger: list[dict[str, str]] = []
58
70
  self.log = self._log
59
71
  else:
60
72
  # No-op
@@ -71,11 +83,26 @@ class JSONParser:
71
83
  # may not be desirable in some use cases and the user would prefer json_repair to return an exception.
72
84
  # So strict mode was added to disable some of those heuristics.
73
85
  self.strict = strict
86
+ self.schema_repairer: SchemaRepairer | None = None
74
87
 
75
88
  def parse(
76
89
  self,
77
- ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
78
- json = self.parse_json()
90
+ ) -> JSONReturnType:
91
+ return self._parse_top_level(self.parse_json)
92
+
93
+ def parse_with_schema(
94
+ self,
95
+ repairer: "SchemaRepairer",
96
+ schema: dict[str, Any] | bool,
97
+ ) -> JSONReturnType:
98
+ """Parse with schema guidance enabled for all nested values."""
99
+ self.schema_repairer = repairer
100
+ return self._parse_top_level(lambda: self.parse_json(schema, "$"))
101
+
102
+ # Consolidate top-level parsing so we handle multiple sequential JSON values consistently
103
+ # (including update semantics and strict-mode validation).
104
+ def _parse_top_level(self, parse_element: Callable[[], JSONReturnType]) -> JSONReturnType:
105
+ json = parse_element()
79
106
  if self.index < len(self.json_str):
80
107
  self.log(
81
108
  "The parser returned early, checking if there's more json elements",
@@ -83,19 +110,17 @@ class JSONParser:
83
110
  json = [json]
84
111
  while self.index < len(self.json_str):
85
112
  self.context.reset()
86
- j = self.parse_json()
113
+ j = parse_element()
87
114
  if j:
88
115
  if ObjectComparer.is_same_object(json[-1], j):
89
- # replace the last entry with the new one since the new one seems an update
116
+ # Treat repeated objects as updates: keep the newest value.
90
117
  json.pop()
91
118
  else:
92
119
  if not json[-1]:
93
120
  json.pop()
94
121
  json.append(j)
95
122
  else:
96
- # this was a bust, move the index
97
123
  self.index += 1
98
- # If nothing extra was found, don't return an array
99
124
  if len(json) == 1:
100
125
  self.log(
101
126
  "There were no more elements, returning the element without the array",
@@ -106,38 +131,51 @@ class JSONParser:
106
131
  "Multiple top-level JSON elements found in strict mode, raising an error",
107
132
  )
108
133
  raise ValueError("Multiple top-level JSON elements found in strict mode.")
109
- if self.logging:
110
- return json, self.logger
111
- else:
112
- return json
134
+ return json
113
135
 
114
136
  def parse_json(
115
137
  self,
138
+ schema: dict[str, Any] | bool | None = None,
139
+ path: str = "$",
116
140
  ) -> JSONReturnType:
141
+ """Parse the next JSON value and, when configured, enforce schema constraints."""
142
+ repairer = self.schema_repairer if self.schema_repairer is not None and schema not in (None, True) else None
143
+ if repairer is not None:
144
+ # Resolve references once and decide whether schema-guided repairs are needed.
145
+ schema = repairer.resolve_schema(schema)
146
+ if schema is True:
147
+ repairer = None
148
+ elif schema is False:
149
+ raise ValueError("Schema does not allow any values.")
150
+
117
151
  while True:
118
152
  char = self.get_char_at()
119
153
  # None means that we are at the end of the string provided
120
154
  if char is None:
121
155
  return ""
122
156
  # <object> starts with '{'
123
- elif char == "{":
157
+ if char == "{":
124
158
  self.index += 1
125
- return self.parse_object()
159
+ value = self.parse_object(schema, path) if repairer else self.parse_object()
160
+ return repairer.repair_value(value, schema, path) if repairer else value
126
161
  # <array> starts with '['
127
- elif char == "[":
162
+ if char == "[":
128
163
  self.index += 1
129
- return self.parse_array()
164
+ value = self.parse_array(schema, path) if repairer else self.parse_array()
165
+ return repairer.repair_value(value, schema, path) if repairer else value
130
166
  # <string> starts with a quote
131
- elif not self.context.empty and (char in STRING_DELIMITERS or char.isalpha()):
132
- return self.parse_string()
167
+ if not self.context.empty and (char in STRING_DELIMITERS or char.isalpha()):
168
+ value = self.parse_string()
169
+ return repairer.repair_value(value, schema, path) if repairer else value
133
170
  # <number> starts with [0-9] or minus
134
- elif not self.context.empty and (char.isdigit() or char == "-" or char == "."):
135
- return self.parse_number()
136
- elif char in ["#", "/"]:
137
- return self.parse_comment()
171
+ if not self.context.empty and (char.isdigit() or char == "-" or char == "."):
172
+ value = self.parse_number()
173
+ return repairer.repair_value(value, schema, path) if repairer else value
174
+ if char in ["#", "/"]:
175
+ value = self.parse_comment()
176
+ return repairer.repair_value(value, schema, path) if repairer else value
138
177
  # If everything else fails, we just ignore and move on
139
- else:
140
- self.index += 1
178
+ self.index += 1
141
179
 
142
180
  def get_char_at(self, count: int = 0) -> str | None:
143
181
  # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
@@ -25,9 +25,11 @@ All supported use cases are in the unit tests
25
25
  import argparse
26
26
  import json
27
27
  import sys
28
+ from pathlib import Path
28
29
  from typing import Any, Literal, TextIO, overload
29
30
 
30
31
  from .json_parser import JSONParser
32
+ from .schema_repair import SchemaRepairer, load_schema_model, schema_from_input
31
33
  from .utils.constants import JSONReturnType
32
34
 
33
35
 
@@ -41,6 +43,7 @@ def repair_json(
41
43
  chunk_length: int = 0,
42
44
  stream_stable: bool = False,
43
45
  strict: bool = False,
46
+ schema: Any | None = None,
44
47
  **json_dumps_args: Any,
45
48
  ) -> str: ...
46
49
 
@@ -55,6 +58,7 @@ def repair_json(
55
58
  chunk_length: int = 0,
56
59
  stream_stable: bool = False,
57
60
  strict: bool = False,
61
+ schema: Any | None = None,
58
62
  **json_dumps_args: Any,
59
63
  ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]: ...
60
64
 
@@ -68,6 +72,7 @@ def repair_json(
68
72
  chunk_length: int = 0,
69
73
  stream_stable: bool = False,
70
74
  strict: bool = False,
75
+ schema: Any | None = None,
71
76
  **json_dumps_args: Any,
72
77
  ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
73
78
  """
@@ -83,27 +88,49 @@ def repair_json(
83
88
  chunk_length (int, optional): Size in bytes of the file chunks to read at once. Ignored if `json_fd` is None. Do not use! Use `from_file` or `load` instead. Defaults to 1MB.
84
89
  stream_stable (bool, optional): Set this to True when the JSON to be repaired is the accumulation of streaming JSON at a certain moment; doing so keeps the repair results stable. Defaults to False.
85
90
  strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
91
+ schema (Any, optional): JSON Schema dict, boolean schema, or pydantic v2 model used to guide repairs. Schema guidance is skipped for already-valid JSON unless `skip_json_loads=True`.
86
92
  Returns:
87
93
  Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON or a tuple with the repaired JSON and repair log when logging is True.
88
94
  """
95
+ # Schema-guided repairs and strict mode are mutually exclusive to avoid conflicting behavior.
96
+ if schema is not None and strict:
97
+ raise ValueError("schema and strict cannot be used together.")
98
+
89
99
  parser = JSONParser(json_str, json_fd, logging, chunk_length, stream_stable, strict)
90
- if skip_json_loads:
91
- parsed_json = parser.parse()
92
- else:
100
+ # When JSON is already valid, skip schema guidance unless the caller explicitly disables json.loads.
101
+ if not skip_json_loads:
102
+ loaded_json: JSONReturnType | None
93
103
  try:
94
- parsed_json = json.load(json_fd) if json_fd else json.loads(json_str)
104
+ loaded_json = json.load(json_fd) if json_fd else json.loads(json_str)
95
105
  except json.JSONDecodeError:
96
- parsed_json = parser.parse()
106
+ loaded_json = None
107
+ else:
108
+ if logging:
109
+ return loaded_json, []
110
+ if return_objects:
111
+ return loaded_json
112
+ if loaded_json == "":
113
+ return ""
114
+ return json.dumps(loaded_json, **json_dumps_args)
115
+
116
+ # Schema guidance only happens in parser mode.
117
+ schema_obj = schema_from_input(schema) if schema is not None else None
118
+ parsed_json: JSONReturnType
119
+ if schema_obj is None:
120
+ parsed_json = parser.parse()
121
+ else:
122
+ repairer = SchemaRepairer(schema_obj, parser.logger if logging else None)
123
+ parsed_json = parser.parse_with_schema(repairer, schema_obj)
124
+ # Post-parse validation ensures we reject values that cannot satisfy the schema.
125
+ repairer.validate(parsed_json, schema_obj)
97
126
  # It's useful to return the actual object instead of the json string,
98
127
  # it allows this lib to be a replacement of the json library
99
- if return_objects or logging:
100
- # If logging is True, the user should expect a tuple.
101
- # If json.load(s) worked, the repair log list is empty
102
- if logging and not isinstance(parsed_json, tuple):
103
- return parsed_json, []
128
+ if logging:
129
+ return parsed_json, parser.logger
130
+ if return_objects:
104
131
  return parsed_json
105
132
  # Avoid returning only a pair of quotes if it's an empty string
106
- elif parsed_json == "":
133
+ if parsed_json == "":
107
134
  return ""
108
135
  return json.dumps(parsed_json, **json_dumps_args)
109
136
 
@@ -114,6 +141,7 @@ def loads(
114
141
  logging: bool = False,
115
142
  stream_stable: bool = False,
116
143
  strict: bool = False,
144
+ schema: Any | None = None,
117
145
  ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | str:
118
146
  """
119
147
  This function works like `json.loads()` except that it will fix your JSON in the process.
@@ -124,6 +152,7 @@ def loads(
124
152
  skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
125
153
  logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
126
154
  strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
155
+ schema (Any, optional): JSON Schema dict, boolean schema, or pydantic v2 model used to guide repairs. Schema guidance is skipped for already-valid JSON unless `skip_json_loads=True`.
127
156
 
128
157
  Returns:
129
158
  Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]], str]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
@@ -135,6 +164,7 @@ def loads(
135
164
  logging=logging,
136
165
  stream_stable=stream_stable,
137
166
  strict=strict,
167
+ schema=schema,
138
168
  )
139
169
 
140
170
 
@@ -144,6 +174,7 @@ def load(
144
174
  logging: bool = False,
145
175
  chunk_length: int = 0,
146
176
  strict: bool = False,
177
+ schema: Any | None = None,
147
178
  ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
148
179
  """
149
180
  This function works like `json.load()` except that it will fix your JSON in the process.
@@ -155,6 +186,7 @@ def load(
155
186
  logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
156
187
  chunk_length (int, optional): Size in bytes of the file chunks to read at once. Defaults to 1MB.
157
188
  strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
189
+ schema (Any, optional): JSON Schema dict, boolean schema, or pydantic v2 model used to guide repairs. Schema guidance is skipped for already-valid JSON unless `skip_json_loads=True`.
158
190
 
159
191
  Returns:
160
192
  Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
@@ -166,40 +198,42 @@ def load(
166
198
  skip_json_loads=skip_json_loads,
167
199
  logging=logging,
168
200
  strict=strict,
201
+ schema=schema,
169
202
  )
170
203
 
171
204
 
172
205
  def from_file(
173
- filename: str,
206
+ filename: str | Path,
174
207
  skip_json_loads: bool = False,
175
208
  logging: bool = False,
176
209
  chunk_length: int = 0,
177
210
  strict: bool = False,
211
+ schema: Any | None = None,
178
212
  ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
179
213
  """
180
214
  This function is a wrapper around `load()` that lets you pass a filename instead of an open file object.
181
215
 
182
216
  Args:
183
- filename (str): The name of the file containing JSON data to load and repair.
217
+ filename (str | Path): The name of the file containing JSON data to load and repair.
184
218
  skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
185
219
  logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
186
220
  chunk_length (int, optional): Size in bytes of the file chunks to read at once. Defaults to 1MB.
187
221
  strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
222
+ schema (Any, optional): JSON Schema dict, boolean schema, or pydantic v2 model used to guide repairs. Schema guidance is skipped for already-valid JSON unless `skip_json_loads=True`.
188
223
 
189
224
  Returns:
190
225
  Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
191
226
  """
192
- with open(filename) as fd:
193
- jsonobj = load(
227
+ with Path(filename).open() as fd:
228
+ return load(
194
229
  fd=fd,
195
230
  skip_json_loads=skip_json_loads,
196
231
  logging=logging,
197
232
  chunk_length=chunk_length,
198
233
  strict=strict,
234
+ schema=schema,
199
235
  )
200
236
 
201
- return jsonobj
202
-
203
237
 
204
238
  def cli(inline_args: list[str] | None = None) -> int:
205
239
  """
@@ -212,6 +246,10 @@ def cli(inline_args: list[str] | None = None) -> int:
212
246
  - -o, --output TARGET (str): If specified, the output will be written to TARGET filename instead of stdout.
213
247
  - --ensure_ascii (bool): Pass ensure_ascii=True to json.dumps(). Will pass False otherwise.
214
248
  - --indent INDENT (int): Number of spaces for indentation (Default 2).
249
+ - --skip-json-loads (bool): Skip initial json.loads validation (needed to force schema on valid JSON).
250
+ - --schema SCHEMA (str): Path to a JSON Schema file that guides repairs.
251
+ - --schema-model MODEL (str): Pydantic v2 model in 'module:ClassName' form that guides repairs.
252
+ - --strict (bool): Raise on duplicate keys, missing separators, empty keys/values, and other unrecoverable structures instead of repairing them.
215
253
 
216
254
  Returns:
217
255
  int: Exit code of the CLI operation.
@@ -253,13 +291,28 @@ def cli(inline_args: list[str] | None = None) -> int:
253
291
  default=2,
254
292
  help="Number of spaces for indentation (Default 2)",
255
293
  )
294
+ parser.add_argument(
295
+ "--skip-json-loads",
296
+ action="store_true",
297
+ help="Skip initial json.loads validation (needed to force schema on valid JSON)",
298
+ )
299
+ parser.add_argument(
300
+ "--schema",
301
+ metavar="SCHEMA",
302
+ help="Path to a JSON Schema file that guides repairs",
303
+ )
304
+ parser.add_argument(
305
+ "--schema-model",
306
+ metavar="MODEL",
307
+ help="Pydantic v2 model in 'module:ClassName' form that guides repairs",
308
+ )
256
309
  parser.add_argument(
257
310
  "--strict",
258
311
  action="store_true",
259
312
  help="Raise on duplicate keys, missing separators, empty keys/values, and other unrecoverable structures instead of repairing them",
260
313
  )
261
314
 
262
- args = parser.parse_args() if inline_args is None else parser.parse_args(inline_args)
315
+ args = parser.parse_args(inline_args)
263
316
 
264
317
  # Inline mode requires a filename, so error out if none was provided.
265
318
  if args.inline and not args.filename: # pragma: no cover
@@ -270,23 +323,46 @@ def cli(inline_args: list[str] | None = None) -> int:
270
323
  print("Error: You cannot pass both --inline and --output", file=sys.stderr)
271
324
  sys.exit(1)
272
325
 
273
- ensure_ascii = False
274
- if args.ensure_ascii:
275
- ensure_ascii = True
326
+ if args.schema and args.schema_model:
327
+ print("Error: You cannot pass both --schema and --schema-model", file=sys.stderr)
328
+ sys.exit(1)
329
+
330
+ if args.strict and (args.schema or args.schema_model):
331
+ print("Error: --strict cannot be used with --schema or --schema-model", file=sys.stderr)
332
+ sys.exit(1)
333
+
334
+ ensure_ascii = args.ensure_ascii
276
335
 
277
336
  try:
337
+ schema = None
338
+ if args.schema:
339
+ with Path(args.schema).open() as fd:
340
+ schema = json.load(fd)
341
+ elif args.schema_model:
342
+ schema = load_schema_model(args.schema_model)
343
+
278
344
  # Use from_file if a filename is provided; otherwise read from stdin.
279
345
  if args.filename:
280
- result = from_file(args.filename, strict=args.strict)
346
+ result = from_file(
347
+ args.filename,
348
+ skip_json_loads=args.skip_json_loads,
349
+ strict=args.strict,
350
+ schema=schema,
351
+ )
281
352
  else:
282
353
  data = sys.stdin.read()
283
- result = loads(data, strict=args.strict)
354
+ result = loads(
355
+ data,
356
+ skip_json_loads=args.skip_json_loads,
357
+ strict=args.strict,
358
+ schema=schema,
359
+ )
284
360
  if args.inline or args.output:
285
- with open(args.output or args.filename, mode="w") as fd:
361
+ with Path(args.output or args.filename).open(mode="w") as fd:
286
362
  json.dump(result, fd, indent=args.indent, ensure_ascii=ensure_ascii)
287
363
  else:
288
364
  print(json.dumps(result, indent=args.indent, ensure_ascii=ensure_ascii))
289
- except Exception as e: # pragma: no cover
365
+ except (OSError, TypeError, ValueError) as e: # pragma: no cover
290
366
  print(f"Error: {str(e)}", file=sys.stderr)
291
367
  return 1
292
368
 
@@ -1,4 +1,4 @@
1
- from typing import TYPE_CHECKING
1
+ from typing import TYPE_CHECKING, Any
2
2
 
3
3
  from .utils.constants import STRING_DELIMITERS, JSONReturnType
4
4
  from .utils.json_context import ContextValues
@@ -6,51 +6,112 @@ from .utils.object_comparer import ObjectComparer
6
6
 
7
7
  if TYPE_CHECKING:
8
8
  from .json_parser import JSONParser
9
+ from .schema_repair import SchemaRepairer
9
10
 
10
11
 
11
- def parse_array(self: "JSONParser") -> list[JSONReturnType]:
12
+ def parse_array(
13
+ self: "JSONParser",
14
+ schema: dict[str, Any] | bool | None = None,
15
+ path: str = "$",
16
+ ) -> list[JSONReturnType]:
12
17
  # <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
13
- arr = []
18
+ # Only activate schema-guided parsing if a repairer is available and schema looks array-like.
19
+ schema_repairer: SchemaRepairer | None = None
20
+ items_schema: object | None = None
21
+ additional_items: object | None = None
22
+ if schema is not None and schema is not True:
23
+ repairer = self.schema_repairer
24
+ if repairer is not None:
25
+ schema = repairer.resolve_schema(schema)
26
+ if schema is False:
27
+ raise ValueError("Schema does not allow any values.")
28
+ if schema is not True and repairer.is_array_schema(schema):
29
+ schema_repairer = repairer
30
+ items_schema = schema.get("items")
31
+ additional_items = schema.get("additionalItems", None)
32
+
33
+ arr: list[JSONReturnType] = []
14
34
  self.context.set(ContextValues.ARRAY)
15
- # Stop when you either find the closing parentheses or you have iterated over the entire string
16
35
  char = self.get_char_at()
36
+ idx = 0
37
+
17
38
  while char and char not in ["]", "}"]:
18
39
  self.skip_whitespaces()
19
- value: JSONReturnType = ""
40
+
41
+ # Resolve per-item schema (tuple schemas + additionalItems) when schema guidance is active.
42
+ item_schema: dict[str, Any] | bool | None = None
43
+ drop_item = False
44
+ if schema_repairer is not None:
45
+ if isinstance(items_schema, list):
46
+ if idx < len(items_schema):
47
+ raw_schema = items_schema[idx]
48
+ # Tuple schemas must contain dict/bool entries only.
49
+ if raw_schema is not None and not isinstance(raw_schema, (dict, bool)):
50
+ raise ValueError("Schema must be an object.")
51
+ item_schema = raw_schema
52
+ else:
53
+ if additional_items is False:
54
+ drop_item = True
55
+ elif isinstance(additional_items, dict):
56
+ item_schema = additional_items
57
+ else:
58
+ item_schema = True
59
+ elif isinstance(items_schema, dict):
60
+ item_schema = items_schema
61
+ else:
62
+ item_schema = True
63
+
64
+ item_path = f"{path}[{idx}]"
65
+
20
66
  if char in STRING_DELIMITERS:
21
- # Sometimes it can happen that LLMs forget to start an object and then you think it's a string in an array
22
- # So we are going to check if this string is followed by a : or not
23
- # And either parse the string or parse the object
67
+ # A string followed by ':' is often a missing object start; treat it as an object.
24
68
  i = 1
25
69
  i = self.skip_to_character(char, i)
26
70
  i = self.scroll_whitespaces(idx=i + 1)
27
- value = self.parse_object() if self.get_char_at(i) == ":" else self.parse_string()
71
+ if self.get_char_at(i) == ":":
72
+ if schema_repairer is not None and not drop_item:
73
+ # Schema-guided object parsing, then enforce schema on the parsed object.
74
+ value = self.parse_object(item_schema, item_path)
75
+ value = schema_repairer.repair_value(value, item_schema, item_path)
76
+ else:
77
+ # No schema (or dropping): still parse to keep the cursor in sync.
78
+ value = self.parse_object()
79
+ else:
80
+ value = self.parse_string()
81
+ if schema_repairer is not None and not drop_item:
82
+ # Apply schema constraints/coercions to scalar values when configured.
83
+ value = schema_repairer.repair_value(value, item_schema, item_path)
28
84
  else:
29
- value = self.parse_json()
85
+ if schema_repairer is not None and not drop_item:
86
+ # Use schema-aware parsing to guide nested repairs.
87
+ value = self.parse_json(item_schema, item_path)
88
+ else:
89
+ # Parse normally (or discard) to keep the index aligned.
90
+ value = self.parse_json()
30
91
 
31
- # It is possible that parse_json() returns nothing valid, so we increase by 1, unless we find an array separator
32
92
  if ObjectComparer.is_strictly_empty(value) and self.get_char_at() not in ["]", ","]:
33
93
  self.index += 1
34
94
  elif value == "..." and self.get_char_at(-1) == ".":
35
95
  self.log(
36
96
  "While parsing an array, found a stray '...'; ignoring it",
37
97
  )
38
- else:
98
+ elif not drop_item:
39
99
  arr.append(value)
100
+ elif schema_repairer is not None:
101
+ # Record drops for visibility when schema forbids extra tuple items.
102
+ schema_repairer._log("Dropped extra array item not covered by schema", item_path)
40
103
 
41
- # skip over whitespace after a value but before closing ]
104
+ idx += 1
42
105
  char = self.get_char_at()
43
106
  while char and char != "]" and (char.isspace() or char == ","):
44
107
  self.index += 1
45
108
  char = self.get_char_at()
46
109
 
47
- # Especially at the end of an LLM generated json you might miss the last "]"
48
110
  if char != "]":
49
111
  self.log(
50
112
  "While parsing an array we missed the closing ], ignoring it",
51
113
  )
52
114
 
53
115
  self.index += 1
54
-
55
116
  self.context.reset()
56
117
  return arr
@@ -67,5 +67,4 @@ def parse_comment(self: "JSONParser") -> JSONReturnType:
67
67
  self.index += 1
68
68
  if self.context.empty:
69
69
  return self.parse_json()
70
- else:
71
- return ""
70
+ return ""
@@ -33,7 +33,6 @@ def parse_number(self: "JSONParser") -> JSONReturnType:
33
33
  return number_str
34
34
  if "." in number_str or "e" in number_str or "E" in number_str:
35
35
  return float(number_str)
36
- else:
37
- return int(number_str)
36
+ return int(number_str)
38
37
  except ValueError:
39
38
  return number_str