json-repair 0.53.0__tar.gz → 0.54__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. {json_repair-0.53.0/src/json_repair.egg-info → json_repair-0.54}/PKG-INFO +19 -1
  2. {json_repair-0.53.0 → json_repair-0.54}/README.md +18 -0
  3. {json_repair-0.53.0 → json_repair-0.54}/pyproject.toml +3 -1
  4. {json_repair-0.53.0 → json_repair-0.54}/src/json_repair/__init__.py +1 -1
  5. {json_repair-0.53.0 → json_repair-0.54}/src/json_repair/json_parser.py +46 -34
  6. {json_repair-0.53.0 → json_repair-0.54}/src/json_repair/json_repair.py +27 -9
  7. {json_repair-0.53.0 → json_repair-0.54}/src/json_repair/parse_array.py +8 -8
  8. {json_repair-0.53.0 → json_repair-0.54}/src/json_repair/parse_comment.py +2 -2
  9. {json_repair-0.53.0 → json_repair-0.54}/src/json_repair/parse_number.py +3 -2
  10. {json_repair-0.53.0 → json_repair-0.54}/src/json_repair/parse_object.py +51 -28
  11. {json_repair-0.53.0 → json_repair-0.54}/src/json_repair/parse_string.py +56 -55
  12. json_repair-0.54/src/json_repair/parse_string_helpers/parse_boolean_or_null.py +28 -0
  13. json_repair-0.54/src/json_repair/parse_string_helpers/parse_json_llm_block.py +19 -0
  14. json_repair-0.54/src/json_repair/utils/string_file_wrapper.py +176 -0
  15. {json_repair-0.53.0 → json_repair-0.54/src/json_repair.egg-info}/PKG-INFO +19 -1
  16. {json_repair-0.53.0 → json_repair-0.54}/src/json_repair.egg-info/SOURCES.txt +7 -7
  17. {json_repair-0.53.0 → json_repair-0.54}/tests/test_parse_string.py +11 -0
  18. {json_repair-0.53.0 → json_repair-0.54}/tests/test_repair_json_cli.py +5 -5
  19. json_repair-0.54/tests/test_strict_mode.py +49 -0
  20. json_repair-0.53.0/src/json_repair/parse_boolean_or_null.py +0 -30
  21. json_repair-0.53.0/src/json_repair/parse_string_helpers/parse_json_llm_block.py +0 -19
  22. json_repair-0.53.0/src/json_repair/string_file_wrapper.py +0 -108
  23. json_repair-0.53.0/tests/test_parse_boolean_or_null.py +0 -12
  24. {json_repair-0.53.0 → json_repair-0.54}/LICENSE +0 -0
  25. {json_repair-0.53.0 → json_repair-0.54}/setup.cfg +0 -0
  26. {json_repair-0.53.0 → json_repair-0.54}/src/json_repair/__main__.py +0 -0
  27. {json_repair-0.53.0 → json_repair-0.54}/src/json_repair/py.typed +0 -0
  28. {json_repair-0.53.0/src/json_repair → json_repair-0.54/src/json_repair/utils}/constants.py +0 -0
  29. {json_repair-0.53.0/src/json_repair → json_repair-0.54/src/json_repair/utils}/json_context.py +0 -0
  30. {json_repair-0.53.0/src/json_repair → json_repair-0.54/src/json_repair/utils}/object_comparer.py +0 -0
  31. {json_repair-0.53.0 → json_repair-0.54}/src/json_repair.egg-info/dependency_links.txt +0 -0
  32. {json_repair-0.53.0 → json_repair-0.54}/src/json_repair.egg-info/entry_points.txt +0 -0
  33. {json_repair-0.53.0 → json_repair-0.54}/src/json_repair.egg-info/top_level.txt +0 -0
  34. {json_repair-0.53.0 → json_repair-0.54}/tests/test_json_repair.py +0 -0
  35. {json_repair-0.53.0 → json_repair-0.54}/tests/test_parse_array.py +0 -0
  36. {json_repair-0.53.0 → json_repair-0.54}/tests/test_parse_comment.py +0 -0
  37. {json_repair-0.53.0 → json_repair-0.54}/tests/test_parse_number.py +0 -0
  38. {json_repair-0.53.0 → json_repair-0.54}/tests/test_parse_object.py +0 -0
  39. {json_repair-0.53.0 → json_repair-0.54}/tests/test_performance.py +0 -0
  40. {json_repair-0.53.0 → json_repair-0.54}/tests/test_repair_json_from_file.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: json_repair
3
- Version: 0.53.0
3
+ Version: 0.54
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License-Expression: MIT
@@ -167,6 +167,23 @@ Some rules of thumb to use:
167
167
  - `skip_json_loads` is faster only if you 100% know that the string is not a valid JSON
168
168
  - If you are having issues with escaping pass the string as **raw** string like: `r"string with escaping\""`
169
169
 
170
+ ### Strict mode
171
+
172
+ By default `json_repair` does its best to “fix” input, even when the JSON is far from valid.
173
+ In some scenarios you want the opposite behavior and need the parser to error out instead of repairing; pass `strict=True` to `repair_json`, `loads`, `load`, or `from_file` to enable that mode:
174
+
175
+ ```
176
+ from json_repair import repair_json
177
+
178
+ repair_json(bad_json_string, strict=True)
179
+ ```
180
+
181
+ The CLI exposes the same behavior with `json_repair --strict input.json` (or piping data via stdin).
182
+
183
+ In strict mode the parser raises `ValueError` as soon as it encounters structural issues such as duplicate keys, missing `:` separators, empty keys/values introduced by stray commas, multiple top-level elements, or other ambiguous constructs. This is useful when you just need validation with friendlier error messages while still benefiting from json_repair’s resilience elsewhere in your stack.
184
+
185
+ Strict mode still honors `skip_json_loads=True`; combining them lets you skip the initial `json.loads` check but still enforce strict parsing rules.
186
+
170
187
  ### Use json_repair with streaming
171
188
 
172
189
  Sometimes you are streaming some data and want to repair the JSON coming from it. Normally this won't work but you can pass `stream_stable` to `repair_json()` or `loads()` to make it work:
@@ -198,6 +215,7 @@ options:
198
215
  If specified, the output will be written to TARGET filename instead of stdout
199
216
  --ensure_ascii Pass ensure_ascii=True to json.dumps()
200
217
  --indent INDENT Number of spaces for indentation (Default 2)
218
+ --strict Raise on duplicate keys, missing separators, empty keys/values, and similar structural issues instead of repairing them
201
219
  ```
202
220
 
203
221
  ## Adding to requirements
@@ -150,6 +150,23 @@ Some rules of thumb to use:
150
150
  - `skip_json_loads` is faster only if you 100% know that the string is not a valid JSON
151
151
  - If you are having issues with escaping pass the string as **raw** string like: `r"string with escaping\""`
152
152
 
153
+ ### Strict mode
154
+
155
+ By default `json_repair` does its best to “fix” input, even when the JSON is far from valid.
156
+ In some scenarios you want the opposite behavior and need the parser to error out instead of repairing; pass `strict=True` to `repair_json`, `loads`, `load`, or `from_file` to enable that mode:
157
+
158
+ ```
159
+ from json_repair import repair_json
160
+
161
+ repair_json(bad_json_string, strict=True)
162
+ ```
163
+
164
+ The CLI exposes the same behavior with `json_repair --strict input.json` (or piping data via stdin).
165
+
166
+ In strict mode the parser raises `ValueError` as soon as it encounters structural issues such as duplicate keys, missing `:` separators, empty keys/values introduced by stray commas, multiple top-level elements, or other ambiguous constructs. This is useful when you just need validation with friendlier error messages while still benefiting from json_repair’s resilience elsewhere in your stack.
167
+
168
+ Strict mode still honors `skip_json_loads=True`; combining them lets you skip the initial `json.loads` check but still enforce strict parsing rules.
169
+
153
170
  ### Use json_repair with streaming
154
171
 
155
172
  Sometimes you are streaming some data and want to repair the JSON coming from it. Normally this won't work but you can pass `stream_stable` to `repair_json()` or `loads()` to make it work:
@@ -181,6 +198,7 @@ options:
181
198
  If specified, the output will be written to TARGET filename instead of stdout
182
199
  --ensure_ascii Pass ensure_ascii=True to json.dumps()
183
200
  --indent INDENT Number of spaces for indentation (Default 2)
201
+ --strict Raise on duplicate keys, missing separators, empty keys/values, and similar structural issues instead of repairing them
184
202
  ```
185
203
 
186
204
  ## Adding to requirements
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
3
3
  build-backend = "setuptools.build_meta"
4
4
  [project]
5
5
  name = "json_repair"
6
- version = "0.53.0"
6
+ version = "0.54"
7
7
  license = "MIT"
8
8
  license-files = ["LICENSE"]
9
9
  authors = [
@@ -117,3 +117,5 @@ line-ending = "auto"
117
117
  [tool.ruff.lint.per-file-ignores]
118
118
  # Explicit re-exports is fine in __init__.py, still a code smell elsewhere.
119
119
  "__init__.py" = ["PLC0414"]
120
+ [tool.mypy]
121
+ strict = true
@@ -1,4 +1,4 @@
1
- from .constants import JSONReturnType
2
1
  from .json_repair import from_file, load, loads, repair_json
2
+ from .utils.constants import JSONReturnType
3
3
 
4
4
  __all__ = ["from_file", "load", "loads", "repair_json", "JSONReturnType"]
@@ -1,36 +1,32 @@
1
- from typing import Literal, TextIO
1
+ from typing import TextIO
2
2
 
3
- from .constants import STRING_DELIMITERS, JSONReturnType
4
- from .json_context import JsonContext
5
- from .object_comparer import ObjectComparer
6
3
  from .parse_array import parse_array as _parse_array
7
- from .parse_boolean_or_null import parse_boolean_or_null as _parse_boolean_or_null
8
4
  from .parse_comment import parse_comment as _parse_comment
9
5
  from .parse_number import parse_number as _parse_number
10
6
  from .parse_object import parse_object as _parse_object
11
7
  from .parse_string import parse_string as _parse_string
12
- from .string_file_wrapper import StringFileWrapper
8
+ from .utils.constants import STRING_DELIMITERS, JSONReturnType
9
+ from .utils.json_context import JsonContext
10
+ from .utils.object_comparer import ObjectComparer
11
+ from .utils.string_file_wrapper import StringFileWrapper
13
12
 
14
13
 
15
14
  class JSONParser:
16
15
  # Split the parse methods into separate files because this one was like 3000 lines
17
- def parse_array(self, *args, **kwargs):
18
- return _parse_array(self, *args, **kwargs)
16
+ def parse_array(self) -> list[JSONReturnType]:
17
+ return _parse_array(self)
19
18
 
20
- def parse_boolean_or_null(self, *args, **kwargs):
21
- return _parse_boolean_or_null(self, *args, **kwargs)
19
+ def parse_comment(self) -> JSONReturnType:
20
+ return _parse_comment(self)
22
21
 
23
- def parse_comment(self, *args, **kwargs):
24
- return _parse_comment(self, *args, **kwargs)
22
+ def parse_number(self) -> JSONReturnType:
23
+ return _parse_number(self)
25
24
 
26
- def parse_number(self, *args, **kwargs):
27
- return _parse_number(self, *args, **kwargs)
25
+ def parse_object(self) -> JSONReturnType:
26
+ return _parse_object(self)
28
27
 
29
- def parse_object(self, *args, **kwargs):
30
- return _parse_object(self, *args, **kwargs)
31
-
32
- def parse_string(self, *args, **kwargs):
33
- return _parse_string(self, *args, **kwargs)
28
+ def parse_string(self) -> JSONReturnType:
29
+ return _parse_string(self)
34
30
 
35
31
  def __init__(
36
32
  self,
@@ -39,6 +35,7 @@ class JSONParser:
39
35
  logging: bool | None,
40
36
  json_fd_chunk_length: int = 0,
41
37
  stream_stable: bool = False,
38
+ strict: bool = False,
42
39
  ) -> None:
43
40
  # The string to parse
44
41
  self.json_str: str | StringFileWrapper = json_str
@@ -70,6 +67,10 @@ class JSONParser:
70
67
  # case 3: '{"key": "val\\n123,`key2:value2' => '{"key": "val\\n123,`key2:value2"}'
71
68
  # case 4: '{"key": "val\\n123,`key2:value2`"}' => '{"key": "val\\n123,`key2:value2`"}'
72
69
  self.stream_stable = stream_stable
70
+ # Over time the library got more and more complex heuristics to repair JSON. Some of these heuristics
71
+ # may not be desirable in some use cases and the user would prefer json_repair to return an exception.
72
+ # So strict mode was added to disable some of those heuristics.
73
+ self.strict = strict
73
74
 
74
75
  def parse(
75
76
  self,
@@ -97,6 +98,11 @@ class JSONParser:
97
98
  "There were no more elements, returning the element without the array",
98
99
  )
99
100
  json = json[0]
101
+ elif self.strict:
102
+ self.log(
103
+ "Multiple top-level JSON elements found in strict mode, raising an error",
104
+ )
105
+ raise ValueError("Multiple top-level JSON elements found in strict mode.")
100
106
  if self.logging:
101
107
  return json, self.logger
102
108
  else:
@@ -107,8 +113,8 @@ class JSONParser:
107
113
  ) -> JSONReturnType:
108
114
  while True:
109
115
  char = self.get_char_at()
110
- # False means that we are at the end of the string provided
111
- if char is False:
116
+ # None means that we are at the end of the string provided
117
+ if char is None:
112
118
  return ""
113
119
  # <object> starts with '{'
114
120
  elif char == "{":
@@ -130,30 +136,36 @@ class JSONParser:
130
136
  else:
131
137
  self.index += 1
132
138
 
133
- def get_char_at(self, count: int = 0) -> str | Literal[False]:
139
+ def get_char_at(self, count: int = 0) -> str | None:
134
140
  # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
135
141
  try:
136
142
  return self.json_str[self.index + count]
137
143
  except IndexError:
138
- return False
144
+ return None
139
145
 
140
- def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
146
+ def skip_whitespaces(self) -> None:
141
147
  """
142
- This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
148
+ This function quickly iterates on whitespaces, moving the self.index forward
143
149
  """
144
150
  try:
145
- char = self.json_str[self.index + idx]
146
- except IndexError:
147
- return idx
148
- while char.isspace():
149
- if move_main_index:
151
+ char = self.json_str[self.index]
152
+ while char.isspace():
150
153
  self.index += 1
151
- else:
154
+ char = self.json_str[self.index]
155
+ except IndexError:
156
+ pass
157
+
158
+ def scroll_whitespaces(self, idx: int = 0) -> int:
159
+ """
160
+ This function quickly iterates over whitespaces without moving self.index and returns the offset from self.index at which it stopped
161
+ """
162
+ try:
163
+ char = self.json_str[self.index + idx]
164
+ while char.isspace():
152
165
  idx += 1
153
- try:
154
166
  char = self.json_str[self.index + idx]
155
- except IndexError:
156
- return idx
167
+ except IndexError:
168
+ pass
157
169
  return idx
158
170
 
159
171
  def skip_to_character(self, character: str | list[str], idx: int = 0) -> int:
@@ -25,10 +25,10 @@ All supported use cases are in the unit tests
25
25
  import argparse
26
26
  import json
27
27
  import sys
28
- from typing import Literal, TextIO, overload
28
+ from typing import Any, Literal, TextIO, overload
29
29
 
30
- from .constants import JSONReturnType
31
30
  from .json_parser import JSONParser
31
+ from .utils.constants import JSONReturnType
32
32
 
33
33
 
34
34
  @overload
@@ -40,7 +40,8 @@ def repair_json(
40
40
  json_fd: TextIO | None = None,
41
41
  chunk_length: int = 0,
42
42
  stream_stable: bool = False,
43
- **json_dumps_args,
43
+ strict: bool = False,
44
+ **json_dumps_args: Any,
44
45
  ) -> str: ...
45
46
 
46
47
 
@@ -53,7 +54,8 @@ def repair_json(
53
54
  json_fd: TextIO | None = None,
54
55
  chunk_length: int = 0,
55
56
  stream_stable: bool = False,
56
- **json_dumps_args,
57
+ strict: bool = False,
58
+ **json_dumps_args: Any,
57
59
  ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]: ...
58
60
 
59
61
 
@@ -65,8 +67,9 @@ def repair_json(
65
67
  json_fd: TextIO | None = None,
66
68
  chunk_length: int = 0,
67
69
  stream_stable: bool = False,
68
- **json_dumps_args,
69
- ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | tuple[JSONReturnType, list]:
70
+ strict: bool = False,
71
+ **json_dumps_args: Any,
72
+ ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
70
73
  """
71
74
  Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
72
75
 
@@ -79,10 +82,11 @@ def repair_json(
79
82
  ensure_ascii (bool, optional): Set to False to avoid converting non-latin characters to ascii (for example when using chinese characters). Defaults to True. Ignored if `skip_json_loads` is True.
80
83
  chunk_length (int, optional): Size in bytes of the file chunks to read at once. Ignored if `json_fd` is None. Do not use! Use `from_file` or `load` instead. Defaults to 1MB.
81
84
  stream_stable (bool, optional): Use when the json to be repaired is the accumulation of streaming json at a certain moment. If this parameter is set to True, the repair results are kept stable.
85
+ strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
82
86
  Returns:
83
87
  Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON or a tuple with the repaired JSON and repair log when logging is True.
84
88
  """
85
- parser = JSONParser(json_str, json_fd, logging, chunk_length, stream_stable)
89
+ parser = JSONParser(json_str, json_fd, logging, chunk_length, stream_stable, strict)
86
90
  if skip_json_loads:
87
91
  parsed_json = parser.parse()
88
92
  else:
@@ -109,6 +113,7 @@ def loads(
109
113
  skip_json_loads: bool = False,
110
114
  logging: bool = False,
111
115
  stream_stable: bool = False,
116
+ strict: bool = False,
112
117
  ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | str:
113
118
  """
114
119
  This function works like `json.loads()` except that it will fix your JSON in the process.
@@ -118,6 +123,7 @@ def loads(
118
123
  json_str (str): The JSON string to load and repair.
119
124
  skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
120
125
  logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
126
+ strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
121
127
 
122
128
  Returns:
123
129
  Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]], str]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
@@ -128,6 +134,7 @@ def loads(
128
134
  skip_json_loads=skip_json_loads,
129
135
  logging=logging,
130
136
  stream_stable=stream_stable,
137
+ strict=strict,
131
138
  )
132
139
 
133
140
 
@@ -136,6 +143,7 @@ def load(
136
143
  skip_json_loads: bool = False,
137
144
  logging: bool = False,
138
145
  chunk_length: int = 0,
146
+ strict: bool = False,
139
147
  ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
140
148
  """
141
149
  This function works like `json.load()` except that it will fix your JSON in the process.
@@ -146,6 +154,7 @@ def load(
146
154
  skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
147
155
  logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
148
156
  chunk_length (int, optional): Size in bytes of the file chunks to read at once. Defaults to 1MB.
157
+ strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
149
158
 
150
159
  Returns:
151
160
  Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
@@ -156,6 +165,7 @@ def load(
156
165
  return_objects=True,
157
166
  skip_json_loads=skip_json_loads,
158
167
  logging=logging,
168
+ strict=strict,
159
169
  )
160
170
 
161
171
 
@@ -164,6 +174,7 @@ def from_file(
164
174
  skip_json_loads: bool = False,
165
175
  logging: bool = False,
166
176
  chunk_length: int = 0,
177
+ strict: bool = False,
167
178
  ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
168
179
  """
169
180
  This function is a wrapper around `load()` so you can pass the filename as string
@@ -173,6 +184,7 @@ def from_file(
173
184
  skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
174
185
  logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
175
186
  chunk_length (int, optional): Size in bytes of the file chunks to read at once. Defaults to 1MB.
187
+ strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
176
188
 
177
189
  Returns:
178
190
  Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
@@ -183,6 +195,7 @@ def from_file(
183
195
  skip_json_loads=skip_json_loads,
184
196
  logging=logging,
185
197
  chunk_length=chunk_length,
198
+ strict=strict,
186
199
  )
187
200
 
188
201
  return jsonobj
@@ -240,6 +253,11 @@ def cli(inline_args: list[str] | None = None) -> int:
240
253
  default=2,
241
254
  help="Number of spaces for indentation (Default 2)",
242
255
  )
256
+ parser.add_argument(
257
+ "--strict",
258
+ action="store_true",
259
+ help="Raise on duplicate keys, missing separators, empty keys/values, and other unrecoverable structures instead of repairing them",
260
+ )
243
261
 
244
262
  args = parser.parse_args() if inline_args is None else parser.parse_args(inline_args)
245
263
 
@@ -259,10 +277,10 @@ def cli(inline_args: list[str] | None = None) -> int:
259
277
  try:
260
278
  # Use from_file if a filename is provided; otherwise read from stdin.
261
279
  if args.filename:
262
- result = from_file(args.filename)
280
+ result = from_file(args.filename, strict=args.strict)
263
281
  else:
264
282
  data = sys.stdin.read()
265
- result = loads(data)
283
+ result = loads(data, strict=args.strict)
266
284
  if args.inline or args.output:
267
285
  with open(args.output or args.filename, mode="w") as fd:
268
286
  json.dump(result, fd, indent=args.indent, ensure_ascii=ensure_ascii)
@@ -1,8 +1,8 @@
1
1
  from typing import TYPE_CHECKING
2
2
 
3
- from .constants import STRING_DELIMITERS, JSONReturnType
4
- from .json_context import ContextValues
5
- from .object_comparer import ObjectComparer
3
+ from .utils.constants import STRING_DELIMITERS, JSONReturnType
4
+ from .utils.json_context import ContextValues
5
+ from .utils.object_comparer import ObjectComparer
6
6
 
7
7
  if TYPE_CHECKING:
8
8
  from .json_parser import JSONParser
@@ -15,7 +15,7 @@ def parse_array(self: "JSONParser") -> list[JSONReturnType]:
15
15
  # Stop when you either find the closing parentheses or you have iterated over the entire string
16
16
  char = self.get_char_at()
17
17
  while char and char not in ["]", "}"]:
18
- self.skip_whitespaces_at()
18
+ self.skip_whitespaces()
19
19
  value: JSONReturnType = ""
20
20
  if char in STRING_DELIMITERS:
21
21
  # Sometimes it can happen that LLMs forget to start an object and then you think it's a string in an array
@@ -23,13 +23,13 @@ def parse_array(self: "JSONParser") -> list[JSONReturnType]:
23
23
  # And either parse the string or parse the object
24
24
  i = 1
25
25
  i = self.skip_to_character(char, i)
26
- i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
26
+ i = self.scroll_whitespaces(idx=i + 1)
27
27
  value = self.parse_object() if self.get_char_at(i) == ":" else self.parse_string()
28
28
  else:
29
29
  value = self.parse_json()
30
30
 
31
- # It is possible that parse_json() returns nothing valid, so we increase by 1
32
- if ObjectComparer.is_strictly_empty(value):
31
+ # It is possible that parse_json() returns nothing valid, so we increase by 1, unless we find an array separator
32
+ if ObjectComparer.is_strictly_empty(value) and self.get_char_at() not in ["]", ","]:
33
33
  self.index += 1
34
34
  elif value == "..." and self.get_char_at(-1) == ".":
35
35
  self.log(
@@ -45,7 +45,7 @@ def parse_array(self: "JSONParser") -> list[JSONReturnType]:
45
45
  char = self.get_char_at()
46
46
 
47
47
  # Especially at the end of an LLM generated json you might miss the last "]"
48
- if char and char != "]":
48
+ if char != "]":
49
49
  self.log(
50
50
  "While parsing an array we missed the closing ], ignoring it",
51
51
  )
@@ -1,7 +1,7 @@
1
1
  from typing import TYPE_CHECKING
2
2
 
3
- from .constants import JSONReturnType
4
- from .json_context import ContextValues
3
+ from .utils.constants import JSONReturnType
4
+ from .utils.json_context import ContextValues
5
5
 
6
6
  if TYPE_CHECKING:
7
7
  from .json_parser import JSONParser
@@ -1,6 +1,7 @@
1
1
  from typing import TYPE_CHECKING
2
2
 
3
- from .json_context import ContextValues
3
+ from .utils.constants import JSONReturnType
4
+ from .utils.json_context import ContextValues
4
5
 
5
6
  NUMBER_CHARS: set[str] = set("0123456789-.eE/,")
6
7
 
@@ -9,7 +10,7 @@ if TYPE_CHECKING:
9
10
  from .json_parser import JSONParser
10
11
 
11
12
 
12
- def parse_number(self: "JSONParser") -> float | int | str | bool | None:
13
+ def parse_number(self: "JSONParser") -> JSONReturnType:
13
14
  # <number> is a valid real number expressed in one of a number of given formats
14
15
  number_str = ""
15
16
  char = self.get_char_at()
@@ -1,7 +1,7 @@
1
1
  from typing import TYPE_CHECKING
2
2
 
3
- from .constants import STRING_DELIMITERS, JSONReturnType
4
- from .json_context import ContextValues
3
+ from .utils.constants import STRING_DELIMITERS, JSONReturnType
4
+ from .utils.json_context import ContextValues
5
5
 
6
6
  if TYPE_CHECKING:
7
7
  from .json_parser import JSONParser
@@ -17,10 +17,10 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
17
17
  # <member> ::= <string> ': ' <json>
18
18
 
19
19
  # Skip filler whitespaces
20
- self.skip_whitespaces_at()
20
+ self.skip_whitespaces()
21
21
 
22
22
  # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
23
- if (self.get_char_at() or "") == ":":
23
+ if self.get_char_at() == ":":
24
24
  self.log(
25
25
  "While parsing an object we found a : before a key, ignoring",
26
26
  )
@@ -53,18 +53,26 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
53
53
  prev_value.extend(
54
54
  new_array[0] if len(new_array) == 1 and isinstance(new_array[0], list) else new_array
55
55
  )
56
- self.skip_whitespaces_at()
56
+ self.skip_whitespaces()
57
57
  if self.get_char_at() == ",":
58
58
  self.index += 1
59
- self.skip_whitespaces_at()
59
+ self.skip_whitespaces()
60
60
  continue
61
61
  key = str(self.parse_string())
62
62
  if key == "":
63
- self.skip_whitespaces_at()
63
+ self.skip_whitespaces()
64
64
  if key != "" or (key == "" and self.get_char_at() in [":", "}"]):
65
- # If the string is empty but there is a object divider, we are done here
65
+ # Empty keys now trigger in strict mode, otherwise we keep repairing as before
66
+ if key == "" and self.strict:
67
+ self.log(
68
+ "Empty key found in strict mode while parsing object, raising an error",
69
+ )
70
+ raise ValueError("Empty key found in strict mode while parsing object.")
66
71
  break
67
72
  if ContextValues.ARRAY in self.context.context and key in obj:
73
+ if self.strict:
74
+ self.log("Duplicate key found in strict mode while parsing object, raising an error")
75
+ raise ValueError("Duplicate key found in strict mode while parsing object.")
68
76
  self.log(
69
77
  "While parsing an object we found a duplicate key, closing the object here and rolling back the index",
70
78
  )
@@ -74,16 +82,21 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
74
82
  break
75
83
 
76
84
  # Skip filler whitespaces
77
- self.skip_whitespaces_at()
85
+ self.skip_whitespaces()
78
86
 
79
87
  # We reached the end here
80
88
  if (self.get_char_at() or "}") == "}":
81
89
  continue
82
90
 
83
- self.skip_whitespaces_at()
91
+ self.skip_whitespaces()
84
92
 
85
93
  # An extreme case of missing ":" after a key
86
- if (self.get_char_at() or "") != ":":
94
+ if self.get_char_at() != ":":
95
+ if self.strict:
96
+ self.log(
97
+ "Missing ':' after key in strict mode while parsing object, raising an error",
98
+ )
99
+ raise ValueError("Missing ':' after key in strict mode while parsing object.")
87
100
  self.log(
88
101
  "While parsing an object we missed a : after a key",
89
102
  )
@@ -91,31 +104,40 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
91
104
  self.index += 1
92
105
  self.context.reset()
93
106
  self.context.set(ContextValues.OBJECT_VALUE)
94
- # The value can be any valid json
95
- self.skip_whitespaces_at()
107
+ # The value can be any valid json; strict mode will refuse repaired empties
108
+ self.skip_whitespaces()
96
109
  # Corner case, a lone comma
97
110
  value: JSONReturnType = ""
98
- if (self.get_char_at() or "") in [",", "}"]:
111
+ if self.get_char_at() in [",", "}"]:
99
112
  self.log(
100
- "While parsing an object value we found a stray , ignoring it",
113
+ "While parsing an object value we found a stray " + str(self.get_char_at()) + ", ignoring it",
101
114
  )
102
115
  else:
103
116
  value = self.parse_json()
104
-
117
+ if value == "" and self.strict and self.get_char_at(-1) not in STRING_DELIMITERS:
118
+ self.log(
119
+ "Parsed value is empty in strict mode while parsing object, raising an error",
120
+ )
121
+ raise ValueError("Parsed value is empty in strict mode while parsing object.")
105
122
  # Reset context since our job is done
106
123
  self.context.reset()
107
124
  obj[key] = value
108
125
 
109
- if (self.get_char_at() or "") in [",", "'", '"']:
126
+ if self.get_char_at() in [",", "'", '"']:
110
127
  self.index += 1
111
128
 
112
129
  # Remove trailing spaces
113
- self.skip_whitespaces_at()
130
+ self.skip_whitespaces()
114
131
 
115
132
  self.index += 1
116
133
 
117
134
  # If the object is empty but also isn't just {}
118
135
  if not obj and self.index - start_index > 2:
136
+ if self.strict:
137
+ self.log(
138
+ "Parsed object is empty but contains extra characters in strict mode, raising an error",
139
+ )
140
+ raise ValueError("Parsed object is empty but contains extra characters in strict mode.")
119
141
  self.log("Parsed object is empty, we will try to parse this as an array instead")
120
142
  self.index = start_index
121
143
  return self.parse_array()
@@ -126,18 +148,19 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
126
148
  if not self.context.empty:
127
149
  return obj
128
150
 
129
- self.skip_whitespaces_at()
130
- if (self.get_char_at() or "") != ",":
151
+ self.skip_whitespaces()
152
+ if self.get_char_at() != ",":
131
153
  return obj
132
154
  self.index += 1
133
- self.skip_whitespaces_at()
134
- if (self.get_char_at() or "") not in STRING_DELIMITERS:
155
+ self.skip_whitespaces()
156
+ if self.get_char_at() not in STRING_DELIMITERS:
135
157
  return obj
136
- self.log(
137
- "Found a comma and string delimiter after object closing brace, checking for additional key-value pairs",
138
- )
139
- additional_obj = self.parse_object()
140
- if isinstance(additional_obj, dict):
141
- obj.update(additional_obj)
158
+ if not self.strict:
159
+ self.log(
160
+ "Found a comma and string delimiter after object closing brace, checking for additional key-value pairs",
161
+ )
162
+ additional_obj = self.parse_object()
163
+ if isinstance(additional_obj, dict):
164
+ obj.update(additional_obj)
142
165
 
143
166
  return obj