json-repair 0.53.0__py3-none-any.whl → 0.54__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
json_repair/__init__.py CHANGED
@@ -1,4 +1,4 @@
1
- from .constants import JSONReturnType
2
1
  from .json_repair import from_file, load, loads, repair_json
2
+ from .utils.constants import JSONReturnType
3
3
 
4
4
  __all__ = ["from_file", "load", "loads", "repair_json", "JSONReturnType"]
json_repair/json_parser.py CHANGED
@@ -1,36 +1,32 @@
1
- from typing import Literal, TextIO
1
+ from typing import TextIO
2
2
 
3
- from .constants import STRING_DELIMITERS, JSONReturnType
4
- from .json_context import JsonContext
5
- from .object_comparer import ObjectComparer
6
3
  from .parse_array import parse_array as _parse_array
7
- from .parse_boolean_or_null import parse_boolean_or_null as _parse_boolean_or_null
8
4
  from .parse_comment import parse_comment as _parse_comment
9
5
  from .parse_number import parse_number as _parse_number
10
6
  from .parse_object import parse_object as _parse_object
11
7
  from .parse_string import parse_string as _parse_string
12
- from .string_file_wrapper import StringFileWrapper
8
+ from .utils.constants import STRING_DELIMITERS, JSONReturnType
9
+ from .utils.json_context import JsonContext
10
+ from .utils.object_comparer import ObjectComparer
11
+ from .utils.string_file_wrapper import StringFileWrapper
13
12
 
14
13
 
15
14
  class JSONParser:
16
15
  # Split the parse methods into separate files because this one was like 3000 lines
17
- def parse_array(self, *args, **kwargs):
18
- return _parse_array(self, *args, **kwargs)
16
+ def parse_array(self) -> list[JSONReturnType]:
17
+ return _parse_array(self)
19
18
 
20
- def parse_boolean_or_null(self, *args, **kwargs):
21
- return _parse_boolean_or_null(self, *args, **kwargs)
19
+ def parse_comment(self) -> JSONReturnType:
20
+ return _parse_comment(self)
22
21
 
23
- def parse_comment(self, *args, **kwargs):
24
- return _parse_comment(self, *args, **kwargs)
22
+ def parse_number(self) -> JSONReturnType:
23
+ return _parse_number(self)
25
24
 
26
- def parse_number(self, *args, **kwargs):
27
- return _parse_number(self, *args, **kwargs)
25
+ def parse_object(self) -> JSONReturnType:
26
+ return _parse_object(self)
28
27
 
29
- def parse_object(self, *args, **kwargs):
30
- return _parse_object(self, *args, **kwargs)
31
-
32
- def parse_string(self, *args, **kwargs):
33
- return _parse_string(self, *args, **kwargs)
28
+ def parse_string(self) -> JSONReturnType:
29
+ return _parse_string(self)
34
30
 
35
31
  def __init__(
36
32
  self,
@@ -39,6 +35,7 @@ class JSONParser:
39
35
  logging: bool | None,
40
36
  json_fd_chunk_length: int = 0,
41
37
  stream_stable: bool = False,
38
+ strict: bool = False,
42
39
  ) -> None:
43
40
  # The string to parse
44
41
  self.json_str: str | StringFileWrapper = json_str
@@ -70,6 +67,10 @@ class JSONParser:
70
67
  # case 3: '{"key": "val\\n123,`key2:value2' => '{"key": "val\\n123,`key2:value2"}'
71
68
  # case 4: '{"key": "val\\n123,`key2:value2`"}' => '{"key": "val\\n123,`key2:value2`"}'
72
69
  self.stream_stable = stream_stable
70
+ # Over time the library got more and more complex heuristics to repair JSON. Some of these heuristics
71
+ # may not be desirable in some use cases and the user would prefer json_repair to return an exception.
72
+ # So strict mode was added to disable some of those heuristics.
73
+ self.strict = strict
73
74
 
74
75
  def parse(
75
76
  self,
@@ -97,6 +98,11 @@ class JSONParser:
97
98
  "There were no more elements, returning the element without the array",
98
99
  )
99
100
  json = json[0]
101
+ elif self.strict:
102
+ self.log(
103
+ "Multiple top-level JSON elements found in strict mode, raising an error",
104
+ )
105
+ raise ValueError("Multiple top-level JSON elements found in strict mode.")
100
106
  if self.logging:
101
107
  return json, self.logger
102
108
  else:
@@ -107,8 +113,8 @@ class JSONParser:
107
113
  ) -> JSONReturnType:
108
114
  while True:
109
115
  char = self.get_char_at()
110
- # False means that we are at the end of the string provided
111
- if char is False:
116
+ # None means that we are at the end of the string provided
117
+ if char is None:
112
118
  return ""
113
119
  # <object> starts with '{'
114
120
  elif char == "{":
@@ -130,30 +136,36 @@ class JSONParser:
130
136
  else:
131
137
  self.index += 1
132
138
 
133
- def get_char_at(self, count: int = 0) -> str | Literal[False]:
139
+ def get_char_at(self, count: int = 0) -> str | None:
134
140
  # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
135
141
  try:
136
142
  return self.json_str[self.index + count]
137
143
  except IndexError:
138
- return False
144
+ return None
139
145
 
140
- def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
146
+ def skip_whitespaces(self) -> None:
141
147
  """
142
- This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
148
+ This function quickly iterates on whitespaces, moving the self.index forward
143
149
  """
144
150
  try:
145
- char = self.json_str[self.index + idx]
146
- except IndexError:
147
- return idx
148
- while char.isspace():
149
- if move_main_index:
151
+ char = self.json_str[self.index]
152
+ while char.isspace():
150
153
  self.index += 1
151
- else:
154
+ char = self.json_str[self.index]
155
+ except IndexError:
156
+ pass
157
+
158
+ def scroll_whitespaces(self, idx: int = 0) -> int:
159
+ """
160
+ This function quickly iterates on whitespaces. Doesn't move the self.index and returns the offset from self.index
161
+ """
162
+ try:
163
+ char = self.json_str[self.index + idx]
164
+ while char.isspace():
152
165
  idx += 1
153
- try:
154
166
  char = self.json_str[self.index + idx]
155
- except IndexError:
156
- return idx
167
+ except IndexError:
168
+ pass
157
169
  return idx
158
170
 
159
171
  def skip_to_character(self, character: str | list[str], idx: int = 0) -> int:
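Two behavioral changes in this parser core: `get_char_at()` now reports end-of-input as `None` instead of `False`, and the old `skip_whitespaces_at(idx, move_main_index)` is split into `skip_whitespaces()`, which advances `self.index`, and `scroll_whitespaces(idx)`, which only peeks ahead and returns the offset it reached. A standalone paraphrase of the two whitespace contracts (editor's sketch, not the package's own code):

```python
# Editor's paraphrase of the split: one helper consumes whitespace by moving the
# index, the other only reports how far ahead the next non-space character is.

def skip_whitespaces(text: str, index: int) -> int:
    """Return the new index after consuming whitespace starting at `index`."""
    while index < len(text) and text[index].isspace():
        index += 1
    return index

def scroll_whitespaces(text: str, index: int, idx: int = 0) -> int:
    """Return the offset from `index` of the first non-whitespace character."""
    while index + idx < len(text) and text[index + idx].isspace():
        idx += 1
    return idx

s = '{"key":   "value"}'
assert skip_whitespaces(s, 7) == 10   # the caller's index moves past the spaces
assert scroll_whitespaces(s, 7) == 3  # the index is untouched, only the offset is returned
```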
json_repair/json_repair.py CHANGED
@@ -25,10 +25,10 @@ All supported use cases are in the unit tests
25
25
  import argparse
26
26
  import json
27
27
  import sys
28
- from typing import Literal, TextIO, overload
28
+ from typing import Any, Literal, TextIO, overload
29
29
 
30
- from .constants import JSONReturnType
31
30
  from .json_parser import JSONParser
31
+ from .utils.constants import JSONReturnType
32
32
 
33
33
 
34
34
  @overload
@@ -40,7 +40,8 @@ def repair_json(
40
40
  json_fd: TextIO | None = None,
41
41
  chunk_length: int = 0,
42
42
  stream_stable: bool = False,
43
- **json_dumps_args,
43
+ strict: bool = False,
44
+ **json_dumps_args: Any,
44
45
  ) -> str: ...
45
46
 
46
47
 
@@ -53,7 +54,8 @@ def repair_json(
53
54
  json_fd: TextIO | None = None,
54
55
  chunk_length: int = 0,
55
56
  stream_stable: bool = False,
56
- **json_dumps_args,
57
+ strict: bool = False,
58
+ **json_dumps_args: Any,
57
59
  ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]: ...
58
60
 
59
61
 
@@ -65,8 +67,9 @@ def repair_json(
65
67
  json_fd: TextIO | None = None,
66
68
  chunk_length: int = 0,
67
69
  stream_stable: bool = False,
68
- **json_dumps_args,
69
- ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | tuple[JSONReturnType, list]:
70
+ strict: bool = False,
71
+ **json_dumps_args: Any,
72
+ ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
70
73
  """
71
74
  Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
72
75
 
@@ -79,10 +82,11 @@ def repair_json(
79
82
  ensure_ascii (bool, optional): Set to False to avoid converting non-latin characters to ascii (for example when using chinese characters). Defaults to True. Ignored if `skip_json_loads` is True.
80
83
  chunk_length (int, optional): Size in bytes of the file chunks to read at once. Ignored if `json_fd` is None. Do not use! Use `from_file` or `load` instead. Defaults to 1MB.
81
84
  stream_stable (bool, optional): When the json to be repaired is the accumulation of streaming json at a certain moment.If this parameter to True will keep the repair results stable.
85
+ strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
82
86
  Returns:
83
87
  Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON or a tuple with the repaired JSON and repair log when logging is True.
84
88
  """
85
- parser = JSONParser(json_str, json_fd, logging, chunk_length, stream_stable)
89
+ parser = JSONParser(json_str, json_fd, logging, chunk_length, stream_stable, strict)
86
90
  if skip_json_loads:
87
91
  parsed_json = parser.parse()
88
92
  else:
@@ -109,6 +113,7 @@ def loads(
109
113
  skip_json_loads: bool = False,
110
114
  logging: bool = False,
111
115
  stream_stable: bool = False,
116
+ strict: bool = False,
112
117
  ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | str:
113
118
  """
114
119
  This function works like `json.loads()` except that it will fix your JSON in the process.
@@ -118,6 +123,7 @@ def loads(
118
123
  json_str (str): The JSON string to load and repair.
119
124
  skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
120
125
  logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
126
+ strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
121
127
 
122
128
  Returns:
123
129
  Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]], str]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
@@ -128,6 +134,7 @@ def loads(
128
134
  skip_json_loads=skip_json_loads,
129
135
  logging=logging,
130
136
  stream_stable=stream_stable,
137
+ strict=strict,
131
138
  )
132
139
 
133
140
 
@@ -136,6 +143,7 @@ def load(
136
143
  skip_json_loads: bool = False,
137
144
  logging: bool = False,
138
145
  chunk_length: int = 0,
146
+ strict: bool = False,
139
147
  ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
140
148
  """
141
149
  This function works like `json.load()` except that it will fix your JSON in the process.
@@ -146,6 +154,7 @@ def load(
146
154
  skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
147
155
  logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
148
156
  chunk_length (int, optional): Size in bytes of the file chunks to read at once. Defaults to 1MB.
157
+ strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
149
158
 
150
159
  Returns:
151
160
  Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
@@ -156,6 +165,7 @@ def load(
156
165
  return_objects=True,
157
166
  skip_json_loads=skip_json_loads,
158
167
  logging=logging,
168
+ strict=strict,
159
169
  )
160
170
 
161
171
 
@@ -164,6 +174,7 @@ def from_file(
164
174
  skip_json_loads: bool = False,
165
175
  logging: bool = False,
166
176
  chunk_length: int = 0,
177
+ strict: bool = False,
167
178
  ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
168
179
  """
169
180
  This function is a wrapper around `load()` so you can pass the filename as string
@@ -173,6 +184,7 @@ def from_file(
173
184
  skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
174
185
  logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
175
186
  chunk_length (int, optional): Size in bytes of the file chunks to read at once. Defaults to 1MB.
187
+ strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
176
188
 
177
189
  Returns:
178
190
  Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
@@ -183,6 +195,7 @@ def from_file(
183
195
  skip_json_loads=skip_json_loads,
184
196
  logging=logging,
185
197
  chunk_length=chunk_length,
198
+ strict=strict,
186
199
  )
187
200
 
188
201
  return jsonobj
@@ -240,6 +253,11 @@ def cli(inline_args: list[str] | None = None) -> int:
240
253
  default=2,
241
254
  help="Number of spaces for indentation (Default 2)",
242
255
  )
256
+ parser.add_argument(
257
+ "--strict",
258
+ action="store_true",
259
+ help="Raise on duplicate keys, missing separators, empty keys/values, and other unrecoverable structures instead of repairing them",
260
+ )
243
261
 
244
262
  args = parser.parse_args() if inline_args is None else parser.parse_args(inline_args)
245
263
 
@@ -259,10 +277,10 @@ def cli(inline_args: list[str] | None = None) -> int:
259
277
  try:
260
278
  # Use from_file if a filename is provided; otherwise read from stdin.
261
279
  if args.filename:
262
- result = from_file(args.filename)
280
+ result = from_file(args.filename, strict=args.strict)
263
281
  else:
264
282
  data = sys.stdin.read()
265
- result = loads(data)
283
+ result = loads(data, strict=args.strict)
266
284
  if args.inline or args.output:
267
285
  with open(args.output or args.filename, mode="w") as fd:
268
286
  json.dump(result, fd, indent=args.indent, ensure_ascii=ensure_ascii)
json_repair/parse_array.py CHANGED
@@ -1,8 +1,8 @@
1
1
  from typing import TYPE_CHECKING
2
2
 
3
- from .constants import STRING_DELIMITERS, JSONReturnType
4
- from .json_context import ContextValues
5
- from .object_comparer import ObjectComparer
3
+ from .utils.constants import STRING_DELIMITERS, JSONReturnType
4
+ from .utils.json_context import ContextValues
5
+ from .utils.object_comparer import ObjectComparer
6
6
 
7
7
  if TYPE_CHECKING:
8
8
  from .json_parser import JSONParser
@@ -15,7 +15,7 @@ def parse_array(self: "JSONParser") -> list[JSONReturnType]:
15
15
  # Stop when you either find the closing parentheses or you have iterated over the entire string
16
16
  char = self.get_char_at()
17
17
  while char and char not in ["]", "}"]:
18
- self.skip_whitespaces_at()
18
+ self.skip_whitespaces()
19
19
  value: JSONReturnType = ""
20
20
  if char in STRING_DELIMITERS:
21
21
  # Sometimes it can happen that LLMs forget to start an object and then you think it's a string in an array
@@ -23,13 +23,13 @@ def parse_array(self: "JSONParser") -> list[JSONReturnType]:
23
23
  # And either parse the string or parse the object
24
24
  i = 1
25
25
  i = self.skip_to_character(char, i)
26
- i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
26
+ i = self.scroll_whitespaces(idx=i + 1)
27
27
  value = self.parse_object() if self.get_char_at(i) == ":" else self.parse_string()
28
28
  else:
29
29
  value = self.parse_json()
30
30
 
31
- # It is possible that parse_json() returns nothing valid, so we increase by 1
32
- if ObjectComparer.is_strictly_empty(value):
31
+ # It is possible that parse_json() returns nothing valid, so we increase by 1, unless we find an array separator
32
+ if ObjectComparer.is_strictly_empty(value) and self.get_char_at() not in ["]", ","]:
33
33
  self.index += 1
34
34
  elif value == "..." and self.get_char_at(-1) == ".":
35
35
  self.log(
@@ -45,7 +45,7 @@ def parse_array(self: "JSONParser") -> list[JSONReturnType]:
45
45
  char = self.get_char_at()
46
46
 
47
47
  # Especially at the end of an LLM generated json you might miss the last "]"
48
- if char and char != "]":
48
+ if char != "]":
49
49
  self.log(
50
50
  "While parsing an array we missed the closing ], ignoring it",
51
51
  )
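The visible behavior for the common truncation case is unchanged; a quick hedged check of the "missed the closing ]" branch (the repaired value is my expectation, verify against the installed package):

```python
# Exercises the missing-"]" repair logged above.
from json_repair import loads

print(loads('{"items": ["a", "b"'))  # expected: {'items': ['a', 'b']}
```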
json_repair/parse_comment.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from typing import TYPE_CHECKING
2
2
 
3
- from .constants import JSONReturnType
4
- from .json_context import ContextValues
3
+ from .utils.constants import JSONReturnType
4
+ from .utils.json_context import ContextValues
5
5
 
6
6
  if TYPE_CHECKING:
7
7
  from .json_parser import JSONParser
json_repair/parse_number.py CHANGED
@@ -1,6 +1,7 @@
1
1
  from typing import TYPE_CHECKING
2
2
 
3
- from .json_context import ContextValues
3
+ from .utils.constants import JSONReturnType
4
+ from .utils.json_context import ContextValues
4
5
 
5
6
  NUMBER_CHARS: set[str] = set("0123456789-.eE/,")
6
7
 
@@ -9,7 +10,7 @@ if TYPE_CHECKING:
9
10
  from .json_parser import JSONParser
10
11
 
11
12
 
12
- def parse_number(self: "JSONParser") -> float | int | str | bool | None:
13
+ def parse_number(self: "JSONParser") -> JSONReturnType:
13
14
  # <number> is a valid real number expressed in one of a number of given formats
14
15
  number_str = ""
15
16
  char = self.get_char_at()
json_repair/parse_object.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from typing import TYPE_CHECKING
2
2
 
3
- from .constants import STRING_DELIMITERS, JSONReturnType
4
- from .json_context import ContextValues
3
+ from .utils.constants import STRING_DELIMITERS, JSONReturnType
4
+ from .utils.json_context import ContextValues
5
5
 
6
6
  if TYPE_CHECKING:
7
7
  from .json_parser import JSONParser
@@ -17,10 +17,10 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
17
17
  # <member> ::= <string> ': ' <json>
18
18
 
19
19
  # Skip filler whitespaces
20
- self.skip_whitespaces_at()
20
+ self.skip_whitespaces()
21
21
 
22
22
  # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
23
- if (self.get_char_at() or "") == ":":
23
+ if self.get_char_at() == ":":
24
24
  self.log(
25
25
  "While parsing an object we found a : before a key, ignoring",
26
26
  )
@@ -53,18 +53,26 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
53
53
  prev_value.extend(
54
54
  new_array[0] if len(new_array) == 1 and isinstance(new_array[0], list) else new_array
55
55
  )
56
- self.skip_whitespaces_at()
56
+ self.skip_whitespaces()
57
57
  if self.get_char_at() == ",":
58
58
  self.index += 1
59
- self.skip_whitespaces_at()
59
+ self.skip_whitespaces()
60
60
  continue
61
61
  key = str(self.parse_string())
62
62
  if key == "":
63
- self.skip_whitespaces_at()
63
+ self.skip_whitespaces()
64
64
  if key != "" or (key == "" and self.get_char_at() in [":", "}"]):
65
- # If the string is empty but there is a object divider, we are done here
65
+ # Empty keys now trigger in strict mode, otherwise we keep repairing as before
66
+ if key == "" and self.strict:
67
+ self.log(
68
+ "Empty key found in strict mode while parsing object, raising an error",
69
+ )
70
+ raise ValueError("Empty key found in strict mode while parsing object.")
66
71
  break
67
72
  if ContextValues.ARRAY in self.context.context and key in obj:
73
+ if self.strict:
74
+ self.log("Duplicate key found in strict mode while parsing object, raising an error")
75
+ raise ValueError("Duplicate key found in strict mode while parsing object.")
68
76
  self.log(
69
77
  "While parsing an object we found a duplicate key, closing the object here and rolling back the index",
70
78
  )
@@ -74,16 +82,21 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
74
82
  break
75
83
 
76
84
  # Skip filler whitespaces
77
- self.skip_whitespaces_at()
85
+ self.skip_whitespaces()
78
86
 
79
87
  # We reached the end here
80
88
  if (self.get_char_at() or "}") == "}":
81
89
  continue
82
90
 
83
- self.skip_whitespaces_at()
91
+ self.skip_whitespaces()
84
92
 
85
93
  # An extreme case of missing ":" after a key
86
- if (self.get_char_at() or "") != ":":
94
+ if self.get_char_at() != ":":
95
+ if self.strict:
96
+ self.log(
97
+ "Missing ':' after key in strict mode while parsing object, raising an error",
98
+ )
99
+ raise ValueError("Missing ':' after key in strict mode while parsing object.")
87
100
  self.log(
88
101
  "While parsing an object we missed a : after a key",
89
102
  )
@@ -91,31 +104,40 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
91
104
  self.index += 1
92
105
  self.context.reset()
93
106
  self.context.set(ContextValues.OBJECT_VALUE)
94
- # The value can be any valid json
95
- self.skip_whitespaces_at()
107
+ # The value can be any valid json; strict mode will refuse repaired empties
108
+ self.skip_whitespaces()
96
109
  # Corner case, a lone comma
97
110
  value: JSONReturnType = ""
98
- if (self.get_char_at() or "") in [",", "}"]:
111
+ if self.get_char_at() in [",", "}"]:
99
112
  self.log(
100
- "While parsing an object value we found a stray , ignoring it",
113
+ "While parsing an object value we found a stray " + str(self.get_char_at()) + ", ignoring it",
101
114
  )
102
115
  else:
103
116
  value = self.parse_json()
104
-
117
+ if value == "" and self.strict and self.get_char_at(-1) not in STRING_DELIMITERS:
118
+ self.log(
119
+ "Parsed value is empty in strict mode while parsing object, raising an error",
120
+ )
121
+ raise ValueError("Parsed value is empty in strict mode while parsing object.")
105
122
  # Reset context since our job is done
106
123
  self.context.reset()
107
124
  obj[key] = value
108
125
 
109
- if (self.get_char_at() or "") in [",", "'", '"']:
126
+ if self.get_char_at() in [",", "'", '"']:
110
127
  self.index += 1
111
128
 
112
129
  # Remove trailing spaces
113
- self.skip_whitespaces_at()
130
+ self.skip_whitespaces()
114
131
 
115
132
  self.index += 1
116
133
 
117
134
  # If the object is empty but also isn't just {}
118
135
  if not obj and self.index - start_index > 2:
136
+ if self.strict:
137
+ self.log(
138
+ "Parsed object is empty but contains extra characters in strict mode, raising an error",
139
+ )
140
+ raise ValueError("Parsed object is empty but contains extra characters in strict mode.")
119
141
  self.log("Parsed object is empty, we will try to parse this as an array instead")
120
142
  self.index = start_index
121
143
  return self.parse_array()
@@ -126,18 +148,19 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
126
148
  if not self.context.empty:
127
149
  return obj
128
150
 
129
- self.skip_whitespaces_at()
130
- if (self.get_char_at() or "") != ",":
151
+ self.skip_whitespaces()
152
+ if self.get_char_at() != ",":
131
153
  return obj
132
154
  self.index += 1
133
- self.skip_whitespaces_at()
134
- if (self.get_char_at() or "") not in STRING_DELIMITERS:
155
+ self.skip_whitespaces()
156
+ if self.get_char_at() not in STRING_DELIMITERS:
135
157
  return obj
136
- self.log(
137
- "Found a comma and string delimiter after object closing brace, checking for additional key-value pairs",
138
- )
139
- additional_obj = self.parse_object()
140
- if isinstance(additional_obj, dict):
141
- obj.update(additional_obj)
158
+ if not self.strict:
159
+ self.log(
160
+ "Found a comma and string delimiter after object closing brace, checking for additional key-value pairs",
161
+ )
162
+ additional_obj = self.parse_object()
163
+ if isinstance(additional_obj, dict):
164
+ obj.update(additional_obj)
142
165
 
143
166
  return obj
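The strict checks added above turn several object repairs into `ValueError`s. A hedged sketch of the difference (expected outputs inferred from the branches in this hunk, not copied from the package's tests):

```python
from json_repair import loads

missing_colon = '{"key" "value"}'
print(loads(missing_colon))            # lenient: repaired, expected {'key': 'value'}
try:
    loads(missing_colon, strict=True)  # strict: missing ':' after a key -> ValueError
except ValueError as exc:
    print("rejected:", exc)

# Duplicate keys inside an array: skip_json_loads=True forces the repair parser,
# because json.loads() would silently accept the duplicate (last value wins).
dupes = '[{"key": 1, "key": 2}]'
print(loads(dupes, skip_json_loads=True))            # lenient: expected [{'key': 1}, {'key': 2}]
try:
    loads(dupes, skip_json_loads=True, strict=True)  # strict: ValueError
except ValueError as exc:
    print("rejected:", exc)
```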
json_repair/parse_string.py CHANGED
@@ -1,14 +1,22 @@
1
1
  from typing import TYPE_CHECKING
2
2
 
3
- from .constants import STRING_DELIMITERS, JSONReturnType
4
- from .json_context import ContextValues
3
+ from .parse_string_helpers.parse_boolean_or_null import parse_boolean_or_null
5
4
  from .parse_string_helpers.parse_json_llm_block import parse_json_llm_block
5
+ from .utils.constants import STRING_DELIMITERS, JSONReturnType
6
+ from .utils.json_context import ContextValues
6
7
 
7
8
  if TYPE_CHECKING:
8
9
  from .json_parser import JSONParser
9
10
 
10
11
 
11
12
  def parse_string(self: "JSONParser") -> JSONReturnType:
13
+ # Utility function to append a character to the accumulator and update the index
14
+ def _append_literal_char(acc: str, current_char: str | None) -> tuple[str, str | None]:
15
+ acc += str(current_char)
16
+ self.index += 1
17
+ char = self.get_char_at()
18
+ return acc, char
19
+
12
20
  # <string> is a string of valid characters enclosed in quotes
13
21
  # i.e. { name: "John" }
14
22
  # Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
@@ -40,7 +48,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
40
48
  # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
41
49
  # But remember, object keys are only of type string
42
50
  if char.lower() in ["t", "f", "n"] and self.context.current != ContextValues.OBJECT_KEY:
43
- value = self.parse_boolean_or_null()
51
+ value = parse_boolean_or_null(self)
44
52
  if value != "":
45
53
  return value
46
54
  self.log(
@@ -59,10 +67,12 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
59
67
  "While parsing a string, we found code fences but they did not enclose valid JSON, continuing parsing the string",
60
68
  )
61
69
  # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
62
- if self.get_char_at() in STRING_DELIMITERS and self.get_char_at() == lstring_delimiter:
70
+ if self.get_char_at() == lstring_delimiter:
63
71
  # If it's an empty key, this was easy
64
- if (self.context.current == ContextValues.OBJECT_KEY and self.get_char_at(1) == ":") or (
65
- self.context.current == ContextValues.OBJECT_VALUE and self.get_char_at(1) in [",", "}"]
72
+ if (
73
+ (self.context.current == ContextValues.OBJECT_KEY and self.get_char_at(1) == ":")
74
+ or (self.context.current == ContextValues.OBJECT_VALUE and self.get_char_at(1) in [",", "}"])
75
+ or (self.context.current == ContextValues.ARRAY and self.get_char_at(1) in [",", "]"])
66
76
  ):
67
77
  self.index += 1
68
78
  return ""
@@ -71,13 +81,16 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
71
81
  self.log(
72
82
  "While parsing a string, we found a doubled quote and then a quote again, ignoring it",
73
83
  )
74
- return ""
84
+ if self.strict:
85
+ raise ValueError("Found doubled quotes followed by another quote.")
86
+ else:
87
+ return ""
75
88
  # Find the next delimiter
76
89
  i = self.skip_to_character(character=rstring_delimiter, idx=1)
77
90
  next_c = self.get_char_at(i)
78
91
  # Now check that the next character is also a delimiter to ensure that we have "".....""
79
92
  # In that case we ignore this rstring delimiter
80
- if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
93
+ if self.get_char_at(i + 1) == rstring_delimiter:
81
94
  self.log(
82
95
  "While parsing a string, we found a valid starting doubled quote",
83
96
  )
@@ -85,13 +98,17 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
85
98
  self.index += 1
86
99
  else:
87
100
  # Ok this is not a doubled quote, check if this is an empty string or not
88
- i = self.skip_whitespaces_at(idx=1, move_main_index=False)
101
+ i = self.scroll_whitespaces(idx=1)
89
102
  next_c = self.get_char_at(i)
90
103
  if next_c in STRING_DELIMITERS + ["{", "["]:
91
104
  # something fishy is going on here
92
105
  self.log(
93
106
  "While parsing a string, we found a doubled quote but also another quote afterwards, ignoring it",
94
107
  )
108
+ if self.strict:
109
+ raise ValueError(
110
+ "Found doubled quotes followed by another quote while parsing a string.",
111
+ )
95
112
  self.index += 1
96
113
  return ""
97
114
  elif next_c not in [",", "]", "}"]:
@@ -135,7 +152,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
135
152
  ):
136
153
  rstring_delimiter_missing = True
137
154
  # check if this is a case in which the closing comma is NOT missing instead
138
- self.skip_whitespaces_at()
155
+ self.skip_whitespaces()
139
156
  if self.get_char_at(1) == "\\":
140
157
  # Ok this is a quoted string, skip
141
158
  rstring_delimiter_missing = False
@@ -145,7 +162,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
145
162
  i += 1
146
163
  # found a delimiter, now we need to check that is followed strictly by a comma or brace
147
164
  # or the string ended
148
- i = self.skip_whitespaces_at(idx=i, move_main_index=False)
165
+ i = self.scroll_whitespaces(idx=i)
149
166
  next_c = self.get_char_at(i)
150
167
  if not next_c or next_c in [",", "}"]:
151
168
  rstring_delimiter_missing = False
@@ -160,7 +177,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
160
177
  else:
161
178
  # But again, this could just be something a bit stupid like "lorem, "ipsum" sic"
162
179
  # Check if we find a : afterwards (skipping space)
163
- i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
180
+ i = self.scroll_whitespaces(idx=i + 1)
164
181
  next_c = self.get_char_at(i)
165
182
  if next_c and next_c != ":":
166
183
  rstring_delimiter_missing = False
@@ -175,7 +192,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
175
192
  break
176
193
  else:
177
194
  # skip any whitespace first
178
- i = self.skip_whitespaces_at(idx=1, move_main_index=False)
195
+ i = self.scroll_whitespaces(idx=1)
179
196
  # We couldn't find any rstring_delimeter before the end of the string
180
197
  # check if this is the last string of an object and therefore we can keep going
181
198
  # make an exception if this is the last char before the closing brace
@@ -212,19 +229,15 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
212
229
  if self.context.current == ContextValues.OBJECT_VALUE and char == "}":
213
230
  # We found the end of an object while parsing a value
214
231
  # Check if the object is really over, to avoid doubling the closing brace
215
- i = self.skip_whitespaces_at(idx=1, move_main_index=False)
232
+ i = self.scroll_whitespaces(idx=1)
216
233
  next_c = self.get_char_at(i)
217
- if next_c and next_c == "`":
234
+ if next_c == "`" and self.get_char_at(i + 1) == "`" and self.get_char_at(i + 2) == "`":
218
235
  # This could be a special case in which the LLM added code fences after the object
219
236
  # So we need to check if there are another two ` after this one`
220
- next_c = self.get_char_at(i + 1)
221
- if next_c and next_c == "`":
222
- next_c = self.get_char_at(i + 2)
223
- if next_c and next_c == "`":
224
- self.log(
225
- "While parsing a string in object value context, we found a } that closes the object before code fences, stopping here",
226
- )
227
- break
237
+ self.log(
238
+ "While parsing a string in object value context, we found a } that closes the object before code fences, stopping here",
239
+ )
240
+ break
228
241
  if not next_c:
229
242
  self.log(
230
243
  "While parsing a string in object value context, we found a } that closes the object, stopping here",
@@ -282,12 +295,13 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
282
295
  # found a second delimiter
283
296
  i += 1
284
297
  # Skip spaces
285
- i = self.skip_whitespaces_at(idx=i, move_main_index=False)
286
- next_c = self.get_char_at(i)
287
- if next_c and next_c in [",", "}"]:
298
+ i = self.scroll_whitespaces(idx=i)
299
+ if self.get_char_at(i) in [",", "}"]:
288
300
  # Ok then this is a missing right quote
289
301
  self.log(
290
- "While parsing a string missing the right delimiter in object key context, we found a :, stopping here",
302
+ "While parsing a string missing the right delimiter in object key context, we found a "
303
+ + str(self.get_char_at(i))
304
+ + " stopping here",
291
305
  )
292
306
  break
293
307
  else:
@@ -316,9 +330,8 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
316
330
  # We found a quote, now let's make sure there's a ":" following
317
331
  i += 1
318
332
  # found a delimiter, now we need to check that is followed strictly by a comma or brace
319
- i = self.skip_whitespaces_at(idx=i, move_main_index=False)
320
- next_c = self.get_char_at(i)
321
- if next_c and next_c == ":":
333
+ i = self.scroll_whitespaces(idx=i)
334
+ if self.get_char_at(i) == ":":
322
335
  # Reset the cursor
323
336
  self.index -= 1
324
337
  char = self.get_char_at()
@@ -328,9 +341,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
328
341
  break
329
342
  elif unmatched_delimiter:
330
343
  unmatched_delimiter = False
331
- string_acc += str(char)
332
- self.index += 1
333
- char = self.get_char_at()
344
+ string_acc, char = _append_literal_char(string_acc, char)
334
345
  else:
335
346
  # Check if eventually there is a rstring delimiter, otherwise we bail
336
347
  i = 1
@@ -365,22 +376,20 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
365
376
  next_c = self.get_char_at(i)
366
377
  # Ok now I found a delimiter, let's skip whitespaces and see if next we find a } or a ,
367
378
  i += 1
368
- i = self.skip_whitespaces_at(idx=i, move_main_index=False)
379
+ i = self.scroll_whitespaces(idx=i)
369
380
  next_c = self.get_char_at(i)
370
381
  if next_c in ["}", ","]:
371
382
  self.log(
372
- "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
383
+ "While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
373
384
  )
374
- string_acc += str(char)
375
- self.index += 1
376
- char = self.get_char_at()
385
+ string_acc, char = _append_literal_char(string_acc, char)
377
386
  continue
378
387
  elif next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\":
379
388
  # Check if self.index:self.index+i is only whitespaces, break if that's the case
380
389
  if all(str(self.get_char_at(j)).isspace() for j in range(1, i) if self.get_char_at(j)):
381
390
  break
382
391
  if self.context.current == ContextValues.OBJECT_VALUE:
383
- i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
392
+ i = self.scroll_whitespaces(idx=i + 1)
384
393
  if self.get_char_at(i) == ",":
385
394
  # So we found a comma, this could be a case of a single quote like "va"lue",
386
395
  # Search if it's followed by another key, starting with the first delimeter
@@ -388,15 +397,13 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
388
397
  i += 1
389
398
  i = self.skip_to_character(character=rstring_delimiter, idx=i + 1)
390
399
  i += 1
391
- i = self.skip_whitespaces_at(idx=i, move_main_index=False)
400
+ i = self.scroll_whitespaces(idx=i)
392
401
  next_c = self.get_char_at(i)
393
402
  if next_c == ":":
394
403
  self.log(
395
- "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
404
+ "While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
396
405
  )
397
- string_acc += str(char)
398
- self.index += 1
399
- char = self.get_char_at()
406
+ string_acc, char = _append_literal_char(string_acc, char)
400
407
  continue
401
408
  # We found a delimiter and we need to check if this is a key
402
409
  # so find a rstring_delimiter and a colon after
@@ -413,12 +420,10 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
413
420
  # Only if we fail to find a ':' then we know this is misplaced quote
414
421
  if next_c != ":":
415
422
  self.log(
416
- "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
423
+ "While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
417
424
  )
418
425
  unmatched_delimiter = not unmatched_delimiter
419
- string_acc += str(char)
420
- self.index += 1
421
- char = self.get_char_at()
426
+ string_acc, char = _append_literal_char(string_acc, char)
422
427
  elif self.context.current == ContextValues.ARRAY:
423
428
  # So here we can have a few valid cases:
424
429
  # ["bla bla bla "puppy" bla bla bla "kitty" bla bla"]
@@ -442,9 +447,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
442
447
  "While parsing a string in Array context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
443
448
  )
444
449
  unmatched_delimiter = not unmatched_delimiter
445
- string_acc += str(char)
446
- self.index += 1
447
- char = self.get_char_at()
450
+ string_acc, char = _append_literal_char(string_acc, char)
448
451
  else:
449
452
  break
450
453
  elif self.context.current == ContextValues.OBJECT_KEY:
@@ -452,14 +455,12 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
452
455
  self.log(
453
456
  "While parsing a string in Object Key context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
454
457
  )
455
- string_acc += str(char)
456
- self.index += 1
457
- char = self.get_char_at()
458
+ string_acc, char = _append_literal_char(string_acc, char)
458
459
  if char and missing_quotes and self.context.current == ContextValues.OBJECT_KEY and char.isspace():
459
460
  self.log(
460
461
  "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
461
462
  )
462
- self.skip_whitespaces_at()
463
+ self.skip_whitespaces()
463
464
  if self.get_char_at() not in [":", ","]:
464
465
  return ""
465
466
 
json_repair/parse_string_helpers/parse_boolean_or_null.py ADDED
@@ -0,0 +1,28 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ if TYPE_CHECKING:
4
+ from ..json_parser import JSONParser # noqa: TID252
5
+
6
+
7
+ def parse_boolean_or_null(parser: "JSONParser") -> bool | str | None:
8
+ # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
9
+ char = (parser.get_char_at() or "").lower()
10
+ value_map: dict[str, tuple[str, bool | None]] = {
11
+ "t": ("true", True),
12
+ "f": ("false", False),
13
+ "n": ("null", None),
14
+ }
15
+ value: tuple[str, bool | None] = value_map[char]
16
+
17
+ i = 0
18
+ starting_index = parser.index
19
+ while char and i < len(value[0]) and char == value[0][i]:
20
+ i += 1
21
+ parser.index += 1
22
+ char = (parser.get_char_at() or "").lower()
23
+ if i == len(value[0]):
24
+ return value[1]
25
+
26
+ # If nothing works reset the index before returning
27
+ parser.index = starting_index
28
+ return ""
json_repair/parse_string_helpers/parse_json_llm_block.py CHANGED
@@ -1,19 +1,19 @@
1
1
  from typing import TYPE_CHECKING
2
2
 
3
- from ..constants import JSONReturnType # noqa: TID252
3
+ from ..utils.constants import JSONReturnType # noqa: TID252
4
4
 
5
5
  if TYPE_CHECKING:
6
6
  from ..json_parser import JSONParser # noqa: TID252
7
7
 
8
8
 
9
- def parse_json_llm_block(self: "JSONParser") -> JSONReturnType:
9
+ def parse_json_llm_block(parser: "JSONParser") -> JSONReturnType:
10
10
  """
11
11
  Extracts and normalizes JSON enclosed in ```json ... ``` blocks.
12
12
  """
13
13
  # Try to find a ```json ... ``` block
14
- if self.json_str[self.index : self.index + 7] == "```json":
15
- i = self.skip_to_character("`", idx=7)
16
- if self.json_str[self.index + i : self.index + i + 3] == "```":
17
- self.index += 7 # Move past ```json
18
- return self.parse_json()
14
+ if parser.json_str[parser.index : parser.index + 7] == "```json":
15
+ i = parser.skip_to_character("`", idx=7)
16
+ if parser.json_str[parser.index + i : parser.index + i + 3] == "```":
17
+ parser.index += 7 # Move past ```json
18
+ return parser.parse_json()
19
19
  return False
json_repair/utils/string_file_wrapper.py ADDED
@@ -0,0 +1,176 @@
1
+ import os
2
+ from typing import TextIO
3
+
4
+
5
+ class StringFileWrapper:
6
+ # This is a trick to simplify the code, transform the filedescriptor handling into a string handling
7
+ def __init__(self, fd: TextIO, chunk_length: int) -> None:
8
+ """
9
+ Initialize the StringFileWrapper with a file descriptor and chunk length.
10
+
11
+ Args:
12
+ fd (TextIO): The file descriptor to wrap.
13
+ CHUNK_LENGTH (int): The length of each chunk to read from the file.
14
+
15
+ Attributes:
16
+ fd (TextIO): The wrapped file descriptor.
17
+ length (int): The total length of the file content.
18
+ buffers (dict[int, str]): Dictionary to store chunks of file content.
19
+ buffer_length (int): The length of each buffer chunk.
20
+ """
21
+ self.fd = fd
22
+ # Buffers are chunks of text read from the file and cached to reduce disk access.
23
+ self.buffers: dict[int, str] = {}
24
+ if not chunk_length or chunk_length < 2:
25
+ chunk_length = 1_000_000
26
+ # chunk_length now refers to the number of characters per chunk.
27
+ self.buffer_length = chunk_length
28
+ # Keep track of the starting file position ("cookie") for each chunk so we can
29
+ # seek safely without landing in the middle of a multibyte code point.
30
+ self._chunk_positions: list[int] = [0]
31
+ self.length: int | None = None
32
+
33
+ def get_buffer(self, index: int) -> str:
34
+ """
35
+ Retrieve or load a buffer chunk from the file.
36
+
37
+ Args:
38
+ index (int): The index of the buffer chunk to retrieve.
39
+
40
+ Returns:
41
+ str: The buffer chunk at the specified index.
42
+ """
43
+ if index < 0:
44
+ raise IndexError("Negative indexing is not supported")
45
+
46
+ cached = self.buffers.get(index)
47
+ if cached is not None:
48
+ return cached
49
+
50
+ self._ensure_chunk_position(index)
51
+ start_pos = self._chunk_positions[index]
52
+ self.fd.seek(start_pos)
53
+ chunk = self.fd.read(self.buffer_length)
54
+ if not chunk:
55
+ raise IndexError("Chunk index out of range")
56
+ end_pos = self.fd.tell()
57
+ if len(self._chunk_positions) <= index + 1:
58
+ self._chunk_positions.append(end_pos)
59
+ if len(chunk) < self.buffer_length:
60
+ self.length = index * self.buffer_length + len(chunk)
61
+
62
+ self.buffers[index] = chunk
63
+ # Save memory by keeping max 2MB buffer chunks and min 2 chunks
64
+ max_buffers = max(2, int(2_000_000 / self.buffer_length))
65
+ if len(self.buffers) > max_buffers:
66
+ oldest_key = next(iter(self.buffers))
67
+ if oldest_key != index:
68
+ self.buffers.pop(oldest_key)
69
+ return chunk
70
+
71
+ def __getitem__(self, index: int | slice) -> str:
72
+ """
73
+ Retrieve a character or a slice of characters from the file.
74
+
75
+ Args:
76
+ index (Union[int, slice]): The index or slice of characters to retrieve.
77
+
78
+ Returns:
79
+ str: The character(s) at the specified index or slice.
80
+ """
81
+ # The buffer is an array that is seek like a RAM:
82
+ # self.buffers[index]: the row in the array of length 1MB, index is `i` modulo CHUNK_LENGTH
83
+ # self.buffures[index][j]: the column of the row that is `i` remainder CHUNK_LENGTH
84
+ if isinstance(index, slice):
85
+ total_len = len(self)
86
+ start = 0 if index.start is None else index.start
87
+ stop = total_len if index.stop is None else index.stop
88
+ step = 1 if index.step is None else index.step
89
+
90
+ if start < 0:
91
+ start += total_len
92
+ if stop < 0:
93
+ stop += total_len
94
+
95
+ start = max(start, 0)
96
+ stop = min(stop, total_len)
97
+
98
+ if step == 0:
99
+ raise ValueError("slice step cannot be zero")
100
+ if step != 1:
101
+ return "".join(self[i] for i in range(start, stop, step))
102
+
103
+ if start >= stop:
104
+ return ""
105
+
106
+ buffer_index = start // self.buffer_length
107
+ buffer_end = (stop - 1) // self.buffer_length
108
+ start_mod = start % self.buffer_length
109
+ stop_mod = stop % self.buffer_length
110
+ if stop_mod == 0 and stop > start:
111
+ stop_mod = self.buffer_length
112
+ if buffer_index == buffer_end:
113
+ buffer = self.get_buffer(buffer_index)
114
+ return buffer[start_mod:stop_mod]
115
+
116
+ start_slice = self.get_buffer(buffer_index)[start_mod:]
117
+ end_slice = self.get_buffer(buffer_end)[:stop_mod]
118
+ middle_slices = [self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)]
119
+ return start_slice + "".join(middle_slices) + end_slice
120
+ else:
121
+ if index < 0:
122
+ index += len(self)
123
+ if index < 0:
124
+ raise IndexError("string index out of range")
125
+ buffer_index = index // self.buffer_length
126
+ buffer = self.get_buffer(buffer_index)
127
+ return buffer[index % self.buffer_length]
128
+
129
+ def __len__(self) -> int:
130
+ """
131
+ Get the total length of the file.
132
+
133
+ Returns:
134
+ int: The total number of characters in the file.
135
+ """
136
+ if self.length is None:
137
+ while self.length is None:
138
+ chunk_index = len(self._chunk_positions)
139
+ self._ensure_chunk_position(chunk_index)
140
+ return self.length
141
+
142
+ def __setitem__(self, index: int | slice, value: str) -> None: # pragma: no cover
143
+ """
144
+ Set a character or a slice of characters in the file.
145
+
146
+ Args:
147
+ index (slice): The slice of characters to set.
148
+ value (str): The value to set at the specified index or slice.
149
+ """
150
+ start = index.start or 0 if isinstance(index, slice) else index or 0
151
+
152
+ if start < 0:
153
+ start += len(self)
154
+
155
+ current_position = self.fd.tell()
156
+ self.fd.seek(start)
157
+ self.fd.write(value)
158
+ self.fd.seek(current_position)
159
+
160
+ def _ensure_chunk_position(self, chunk_index: int) -> None:
161
+ """
162
+ Ensure that we know the starting file position for the given chunk index.
163
+ """
164
+ while len(self._chunk_positions) <= chunk_index:
165
+ prev_index = len(self._chunk_positions) - 1
166
+ start_pos = self._chunk_positions[-1]
167
+ self.fd.seek(start_pos, os.SEEK_SET)
168
+ chunk = self.fd.read(self.buffer_length)
169
+ end_pos = self.fd.tell()
170
+ if len(chunk) < self.buffer_length:
171
+ self.length = prev_index * self.buffer_length + len(chunk)
172
+ self._chunk_positions.append(end_pos)
173
+ if not chunk:
174
+ break
175
+ if len(self._chunk_positions) <= chunk_index:
176
+ raise IndexError("Chunk index out of range")
json_repair-0.53.0.dist-info/METADATA → json_repair-0.54.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: json_repair
3
- Version: 0.53.0
3
+ Version: 0.54
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License-Expression: MIT
@@ -167,6 +167,23 @@ Some rules of thumb to use:
167
167
  - `skip_json_loads` is faster only if you 100% know that the string is not a valid JSON
168
168
  - If you are having issues with escaping pass the string as **raw** string like: `r"string with escaping\""`
169
169
 
170
+ ### Strict mode
171
+
172
+ By default `json_repair` does its best to “fix” input, even when the JSON is far from valid.
173
+ In some scenarios you want the opposite behavior and need the parser to error out instead of repairing; pass `strict=True` to `repair_json`, `loads`, `load`, or `from_file` to enable that mode:
174
+
175
+ ```
176
+ from json_repair import repair_json
177
+
178
+ repair_json(bad_json_string, strict=True)
179
+ ```
180
+
181
+ The CLI exposes the same behavior with `json_repair --strict input.json` (or piping data via stdin).
182
+
183
+ In strict mode the parser raises `ValueError` as soon as it encounters structural issues such as duplicate keys, missing `:` separators, empty keys/values introduced by stray commas, multiple top-level elements, or other ambiguous constructs. This is useful when you just need validation with friendlier error messages while still benefiting from json_repair’s resilience elsewhere in your stack.
184
+
185
+ Strict mode still honors `skip_json_loads=True`; combining them lets you skip the initial `json.loads` check but still enforce strict parsing rules.
186
+
170
187
  ### Use json_repair with streaming
171
188
 
172
189
  Sometimes you are streaming some data and want to repair the JSON coming from it. Normally this won't work but you can pass `stream_stable` to `repair_json()` or `loads()` to make it work:
@@ -198,6 +215,7 @@ options:
198
215
  If specified, the output will be written to TARGET filename instead of stdout
199
216
  --ensure_ascii Pass ensure_ascii=True to json.dumps()
200
217
  --indent INDENT Number of spaces for indentation (Default 2)
218
+ --strict Raise on duplicate keys, missing separators, empty keys/values, and similar structural issues instead of repairing them
201
219
  ```
202
220
 
203
221
  ## Adding to requirements
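To make the new "Strict mode" section concrete, here is a hedged sketch at the `repair_json` level (expected outputs inferred from the parser changes earlier in this diff, not copied from the package's tests):

```python
from json_repair import repair_json

# Unambiguous damage, such as a trailing comma, is still repaired in strict mode.
print(repair_json('{"a": 1,}', strict=True))  # expected: '{"a": 1}'

# Ambiguous structure raises instead of being repaired.
try:
    repair_json('{"a": 1} {"b": 2}', strict=True)  # multiple top-level elements
except ValueError as exc:
    print("rejected:", exc)

# strict combines with skip_json_loads: no json.loads() pre-check, strict parsing only.
print(repair_json('{"b": 2}', skip_json_loads=True, strict=True))  # expected: '{"b": 2}'
```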
json_repair-0.54.dist-info/RECORD ADDED
@@ -0,0 +1,22 @@
1
+ json_repair/__init__.py,sha256=JQ4Nm8YzR8Id2a527Ql0Az-rKapTp8DCMPKybLtQ620,180
2
+ json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
3
+ json_repair/json_parser.py,sha256=nATFDlcEnPD8G2NDSKj2nme_v1la_cCcFZrdQvEjTZs,8495
4
+ json_repair/json_repair.py,sha256=iT-OJgpBnKUJVIV4IUlXmMUkOyW6bNnKCZLB7Fys8hk,12758
5
+ json_repair/parse_array.py,sha256=rZfnRiS86vBATOUHqSx2T5fE79Ndlk2NoTsg9Wek7l4,2239
6
+ json_repair/parse_comment.py,sha256=MUDxrx8BFfAaKvx6x4gWviJNvwRi2yv5qnrR6honmas,2660
7
+ json_repair/parse_number.py,sha256=Ddv3Dih1VYfdasUe5DxQWAqy7YAE3aZJ7iePCfdi1EQ,1292
8
+ json_repair/parse_object.py,sha256=noaiP10kzl-jA-1jc6tMmtFoJMIputpB3zFxcAuYQvY,6986
9
+ json_repair/parse_string.py,sha256=L4McLWzRkbW_7Xx_hSGOmfpoPMwbYTGEKBAjqwanLEs,26146
10
+ json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ json_repair/parse_string_helpers/parse_boolean_or_null.py,sha256=pGmH1QATBls70kTvUlJv4F8NiPaBWcyGhRL03sTOnto,871
12
+ json_repair/parse_string_helpers/parse_json_llm_block.py,sha256=wPSm-8RY30Ek8HxzjCkCRtdLq4-Cez-PJB3vOk_vP3w,670
13
+ json_repair/utils/constants.py,sha256=cv2gvyosuq0me0600WyTysM9avrtfXPuXYR26tawcuo,158
14
+ json_repair/utils/json_context.py,sha256=WsMOjqpGSr6aaDONcrk8UFtTurzWon2Qq9AoBBYseoI,934
15
+ json_repair/utils/object_comparer.py,sha256=XKV3MRab8H7_v4sm-wpEa5le0XX9OeycWo5S-MFm-GI,1716
16
+ json_repair/utils/string_file_wrapper.py,sha256=Zlm0ZfJAw_VPlIy-QldL_OKYrPk3TYGq1JVAFPv7SnQ,6862
17
+ json_repair-0.54.dist-info/licenses/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
18
+ json_repair-0.54.dist-info/METADATA,sha256=xoD5G1EZ7muIRVbzdjsgD10OQbxS-K06sNGqlNDvvdQ,12220
19
+ json_repair-0.54.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
+ json_repair-0.54.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
21
+ json_repair-0.54.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
22
+ json_repair-0.54.dist-info/RECORD,,
json_repair/parse_boolean_or_null.py DELETED
@@ -1,30 +0,0 @@
1
- from typing import TYPE_CHECKING
2
-
3
- if TYPE_CHECKING:
4
- from .json_parser import JSONParser
5
-
6
-
7
- def parse_boolean_or_null(self: "JSONParser") -> bool | str | None:
8
- # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
9
- starting_index = self.index
10
- char = (self.get_char_at() or "").lower()
11
- value: tuple[str, bool | None] | None = None
12
- if char == "t":
13
- value = ("true", True)
14
- elif char == "f":
15
- value = ("false", False)
16
- elif char == "n":
17
- value = ("null", None)
18
-
19
- if value:
20
- i = 0
21
- while char and i < len(value[0]) and char == value[0][i]:
22
- i += 1
23
- self.index += 1
24
- char = (self.get_char_at() or "").lower()
25
- if i == len(value[0]):
26
- return value[1]
27
-
28
- # If nothing works reset the index before returning
29
- self.index = starting_index
30
- return ""
json_repair/string_file_wrapper.py DELETED
@@ -1,108 +0,0 @@
1
- import os
2
- from typing import TextIO
3
-
4
-
5
- class StringFileWrapper:
6
- # This is a trick to simplify the code, transform the filedescriptor handling into a string handling
7
- def __init__(self, fd: TextIO, chunk_length: int) -> None:
8
- """
9
- Initialize the StringFileWrapper with a file descriptor and chunk length.
10
-
11
- Args:
12
- fd (TextIO): The file descriptor to wrap.
13
- CHUNK_LENGTH (int): The length of each chunk to read from the file.
14
-
15
- Attributes:
16
- fd (TextIO): The wrapped file descriptor.
17
- length (int): The total length of the file content.
18
- buffers (dict[int, str]): Dictionary to store chunks of file content.
19
- buffer_length (int): The length of each buffer chunk.
20
- """
21
- self.fd = fd
22
- self.length: int = 0
23
- # Buffers are 1MB strings that are read from the file
24
- # and kept in memory to keep reads low
25
- self.buffers: dict[int, str] = {}
26
- # chunk_length is in bytes
27
- if not chunk_length or chunk_length < 2:
28
- chunk_length = 1_000_000
29
- self.buffer_length = chunk_length
30
-
31
- def get_buffer(self, index: int) -> str:
32
- """
33
- Retrieve or load a buffer chunk from the file.
34
-
35
- Args:
36
- index (int): The index of the buffer chunk to retrieve.
37
-
38
- Returns:
39
- str: The buffer chunk at the specified index.
40
- """
41
- if self.buffers.get(index) is None:
42
- self.fd.seek(index * self.buffer_length)
43
- self.buffers[index] = self.fd.read(self.buffer_length)
44
- # Save memory by keeping max 2MB buffer chunks and min 2 chunks
45
- if len(self.buffers) > max(2, 2_000_000 / self.buffer_length):
46
- oldest_key = next(iter(self.buffers))
47
- if oldest_key != index:
48
- self.buffers.pop(oldest_key)
49
- return self.buffers[index]
50
-
51
- def __getitem__(self, index: int | slice) -> str:
52
- """
53
- Retrieve a character or a slice of characters from the file.
54
-
55
- Args:
56
- index (Union[int, slice]): The index or slice of characters to retrieve.
57
-
58
- Returns:
59
- str: The character(s) at the specified index or slice.
60
- """
61
- # The buffer is an array that is seek like a RAM:
62
- # self.buffers[index]: the row in the array of length 1MB, index is `i` modulo CHUNK_LENGTH
63
- # self.buffures[index][j]: the column of the row that is `i` remainder CHUNK_LENGTH
64
- if isinstance(index, slice):
65
- buffer_index = index.start // self.buffer_length
66
- buffer_end = index.stop // self.buffer_length
67
- if buffer_index == buffer_end:
68
- return self.get_buffer(buffer_index)[index.start % self.buffer_length : index.stop % self.buffer_length]
69
- else:
70
- start_slice = self.get_buffer(buffer_index)[index.start % self.buffer_length :]
71
- end_slice = self.get_buffer(buffer_end)[: index.stop % self.buffer_length]
72
- middle_slices = [self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)]
73
- return start_slice + "".join(middle_slices) + end_slice
74
- else:
75
- buffer_index = index // self.buffer_length
76
- return self.get_buffer(buffer_index)[index % self.buffer_length]
77
-
78
- def __len__(self) -> int:
79
- """
80
- Get the total length of the file.
81
-
82
- Returns:
83
- int: The total number of characters in the file.
84
- """
85
- if self.length < 1:
86
- current_position = self.fd.tell()
87
- self.fd.seek(0, os.SEEK_END)
88
- self.length = self.fd.tell()
89
- self.fd.seek(current_position)
90
- return self.length
91
-
92
- def __setitem__(self, index: int | slice, value: str) -> None: # pragma: no cover
93
- """
94
- Set a character or a slice of characters in the file.
95
-
96
- Args:
97
- index (slice): The slice of characters to set.
98
- value (str): The value to set at the specified index or slice.
99
- """
100
- start = index.start or 0 if isinstance(index, slice) else index or 0
101
-
102
- if start < 0:
103
- start += len(self)
104
-
105
- current_position = self.fd.tell()
106
- self.fd.seek(start)
107
- self.fd.write(value)
108
- self.fd.seek(current_position)
json_repair-0.53.0.dist-info/RECORD DELETED
@@ -1,22 +0,0 @@
1
- json_repair/__init__.py,sha256=JdJIZNCKV3MfIviryqK8NH8yGssCta2-192CekcwH-o,174
2
- json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
3
- json_repair/constants.py,sha256=cv2gvyosuq0me0600WyTysM9avrtfXPuXYR26tawcuo,158
4
- json_repair/json_context.py,sha256=WsMOjqpGSr6aaDONcrk8UFtTurzWon2Qq9AoBBYseoI,934
5
- json_repair/json_parser.py,sha256=vy5Z8aiJUVhVmvYEgy0dkYy5WgUmyOeS6PEFiR3cW44,7948
6
- json_repair/json_repair.py,sha256=sDhXzDZxu0QmaFzICPTtf_q7yOY1A1Lf_iQG6Potsco,11572
7
- json_repair/object_comparer.py,sha256=XKV3MRab8H7_v4sm-wpEa5le0XX9OeycWo5S-MFm-GI,1716
8
- json_repair/parse_array.py,sha256=-rh65JcfT-FtXiR6s8RYlMfI-6LzVr08ytlDh6Z2CFE,2181
9
- json_repair/parse_boolean_or_null.py,sha256=WMSkvvxsp4wvauBcDqtt9WnLMD5SMoxeRfZFXp3FEBc,890
10
- json_repair/parse_comment.py,sha256=JHtQ_QlxOvPNnMh7lhUaoTjFGelqjhTNq7qn9xUE7SU,2648
11
- json_repair/parse_number.py,sha256=33zAtkbuVzi9Lqjxu7cXn9WlVzd3WjRx9Ln_LFzVL4o,1259
12
- json_repair/parse_object.py,sha256=rnuH5Oxo98OrXhktF0wrOC1vRb5Th_m819Li1EFJzm4,5571
13
- json_repair/parse_string.py,sha256=--coxoyH4nxl7osxgs1fIu31IEtB0HHwVbbOewypG4g,26146
14
- json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- json_repair/string_file_wrapper.py,sha256=tGkWBEUPE-CZPf4uSM5NE9oSDTpskX0myJiXsl-gbds,4333
16
- json_repair/parse_string_helpers/parse_json_llm_block.py,sha256=taREF3pwb35kGBGJYbUHkTybATX3GI-SOwOz3yXaEQs,644
17
- json_repair-0.53.0.dist-info/licenses/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
18
- json_repair-0.53.0.dist-info/METADATA,sha256=JvMUVYGDDIzmym7MqbQ6k6PjbnuuskW_myvk0EWp7V8,11027
19
- json_repair-0.53.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
20
- json_repair-0.53.0.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
21
- json_repair-0.53.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
22
- json_repair-0.53.0.dist-info/RECORD,,