json-repair 0.29.2__py3-none-any.whl → 0.29.3__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- json_repair/json_context.py +69 -0
- json_repair/json_parser.py +598 -0
- json_repair/json_repair.py +2 -643
- json_repair/string_file_wrapper.py +98 -0
- {json_repair-0.29.2.dist-info → json_repair-0.29.3.dist-info}/METADATA +41 -16
- json_repair-0.29.3.dist-info/RECORD +13 -0
- {json_repair-0.29.2.dist-info → json_repair-0.29.3.dist-info}/WHEEL +1 -1
- json_repair-0.29.2.dist-info/RECORD +0 -10
- {json_repair-0.29.2.dist-info → json_repair-0.29.3.dist-info}/LICENSE +0 -0
- {json_repair-0.29.2.dist-info → json_repair-0.29.3.dist-info}/entry_points.txt +0 -0
- {json_repair-0.29.2.dist-info → json_repair-0.29.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,69 @@
|
|
1
|
+
from enum import Enum, auto
|
2
|
+
from typing import List
|
3
|
+
|
4
|
+
|
5
|
+
class ContextValues(Enum):
    """Syntactic positions the parser can be in while walking a JSON document."""

    # The parser is reading the key of an object member.
    OBJECT_KEY = auto()
    # The parser is reading the value of an object member.
    OBJECT_VALUE = auto()
    # The parser is reading the elements of an array.
    ARRAY = auto()
|
9
|
+
|
10
|
+
|
11
|
+
class JsonContext:
    """
    Stack of the contexts the parser is currently nested in.

    The most recently pushed value is the "current" context; older values
    describe the enclosing structures.
    """

    def __init__(self) -> None:
        # Innermost context is the last element of the list.
        self.context: List["ContextValues"] = []

    def set(self, value: "ContextValues") -> None:
        """
        Push a new context value on the stack.

        Args:
            value (ContextValues): The context value to be added. Falsy
                values (e.g. ``None``) are ignored.

        Returns:
            None
        """
        if value:
            self.context.append(value)

    def reset(self) -> None:
        """
        Remove the most recent context value.

        Returns:
            None

        Raises:
            IndexError: If the stack is already empty.
        """
        self.context.pop()

    def is_current(self, context: "ContextValues") -> bool:
        """
        Check if the given context is the current (most recent) context.

        Args:
            context (ContextValues): The context value to check.

        Returns:
            bool: True if the given context is the same as the most recent
            context in the stack. False otherwise, including when the stack
            is empty (there is no current context to match).
        """
        # Guard the [-1] lookup: an empty stack must answer False, not raise.
        return bool(self.context) and self.context[-1] == context

    def is_any(self, context: "ContextValues") -> bool:
        """
        Check if the given context exists anywhere in the context stack.

        Args:
            context (ContextValues): The context value to check.

        Returns:
            bool: True if the given context exists in the stack, False otherwise.
        """
        return context in self.context

    def is_empty(self) -> bool:
        """
        Check if the context stack is empty.

        Returns:
            bool: True if the context stack is empty, False otherwise.
        """
        return not self.context
|
@@ -0,0 +1,598 @@
|
|
1
|
+
from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
|
2
|
+
|
3
|
+
from .string_file_wrapper import StringFileWrapper
|
4
|
+
from .json_context import JsonContext, ContextValues
|
5
|
+
|
6
|
+
# Union of the Python types used as return annotations by the parser:
# dict (object), list (array), str, float/int (number), bool and None (null).
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
|
7
|
+
|
8
|
+
|
9
|
+
class JSONParser:
    """
    Best-effort parser that turns malformed JSON text into Python values.

    The parser walks ``json_str`` one character at a time through ``index``
    and uses a ``JsonContext`` stack to decide how to interpret unquoted or
    badly delimited strings.
    """

    def __init__(
        self,
        json_str: Union[str, StringFileWrapper],
        json_fd: Optional[TextIO],
        logging: Optional[bool],
        json_fd_chunk_length: int = 0,
    ) -> None:
        """
        Args:
            json_str: The text to parse.
            json_fd: Optional open text file; when truthy it replaces
                ``json_str`` as the input, wrapped in a ``StringFileWrapper``.
            logging: When truthy, repair actions are recorded and ``parse``
                returns them alongside the parsed value.
            json_fd_chunk_length: Forwarded to ``StringFileWrapper``
                (presumably the read chunk size -- confirm against that class).
        """
        # The string to parse
        self.json_str: Union[str, StringFileWrapper] = json_str
        # Alternatively, the file description with a json file in it
        if json_fd:
            # This is a trick we do to treat the file wrapper as an array
            self.json_str = StringFileWrapper(json_fd, json_fd_chunk_length)
        # Index is our iterator that will keep track of which character we are looking at right now
        self.index: int = 0
        # This is used in the object member parsing to manage the special cases of missing quotes in key or value
        self.context = JsonContext()
        # Use this to log the activity, but only if logging is active

        # This is a trick but a beautiful one. We call self.log in the code over and over even if it's not needed.
        # We could add a guard in the code for each call but that would make this code unreadable, so here's this neat trick
        # Replace self.log with a noop
        self.logging = logging
        # Always define the log buffer so the attribute exists even when logging is off.
        self.logger: List[Dict[str, str]] = []
        self.log = self._log if logging else self.noop

    def parse(
        self,
    ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
        """
        Parse the whole input.

        Returns:
            The parsed value, or a list of values when several top-level JSON
            elements are found one after another. When logging is enabled the
            return value is a ``(value, log_entries)`` tuple instead.
        """
        result = self.parse_json()
        if self.index < len(self.json_str):
            self.log(
                "The parser returned early, checking if there's more json elements",
            )
            result = [result]
            last_index = self.index
            while self.index < len(self.json_str):
                extra = self.parse_json()
                if extra != "":
                    result.append(extra)
                # Guarantee progress even if parse_json() consumed nothing
                if self.index == last_index:
                    self.index += 1
                last_index = self.index
            # If nothing extra was found, don't return an array
            if len(result) == 1:
                self.log(
                    "There were no more elements, returning the element without the array",
                )
                result = result[0]
        if self.logging:
            return result, self.logger
        else:
            return result

    def parse_json(
        self,
    ) -> JSONReturnType:
        """
        Parse the next JSON value starting at the current index.

        Characters that cannot start a value are skipped. Strings and numbers
        are only recognised while inside an object or array (non-empty
        context). Returns ``""`` at end of input or on a stray ``}``.
        """
        while True:
            char = self.get_char_at()
            # False means that we are at the end of the string provided
            if char is False:
                return ""
            # <object> starts with '{'
            elif char == "{":
                self.index += 1
                return self.parse_object()
            # <array> starts with '['
            elif char == "[":
                self.index += 1
                return self.parse_array()
            # there can be an edge case in which a key is empty and at the end of an object
            # like "key": }. We return an empty string here to close the object properly
            elif char == "}":
                self.log(
                    "At the end of an object we found a key with missing value, skipping",
                )
                return ""
            # <string> starts with a quote
            elif not self.context.is_empty() and (
                char in ['"', "'", "“"] or char.isalpha()
            ):
                return self.parse_string()
            # <number> starts with [0-9] or minus
            elif not self.context.is_empty() and (
                char.isdigit() or char == "-" or char == "."
            ):
                return self.parse_number()
            # If everything else fails, we just ignore and move on
            else:
                self.index += 1

    def parse_object(self) -> Dict[str, JSONReturnType]:
        """
        Parse the members of an object; the opening ``{`` is already consumed.

        Returns:
            A dict of the members found up to the closing ``}`` or the end of
            the input.
        """
        # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
        obj = {}
        # Stop when you either find the closing parentheses or you have iterated over the entire string
        while (self.get_char_at() or "}") != "}":
            # This is what we expect to find:
            # <member> ::= <string> ': ' <json>

            # Skip filler whitespaces
            self.skip_whitespaces_at()

            # Sometimes LLMs do weird things, if we find a ":" so early, we skip it and move on
            if (self.get_char_at() or "") == ":":
                self.log(
                    "While parsing an object we found a : before a key, ignoring",
                )
                self.index += 1

            # We are now searching for the string key
            # Context is used in the string parser to manage the lack of quotes
            self.context.set(ContextValues.OBJECT_KEY)

            self.skip_whitespaces_at()

            # <member> starts with a <string>
            key = ""
            while self.get_char_at():
                key = str(self.parse_string())

                if key != "" or (key == "" and self.get_char_at() == ":"):
                    # If the string is empty but there is a object divider, we are done here
                    break

            self.skip_whitespaces_at()

            # We reached the end here
            if (self.get_char_at() or "}") == "}":
                # Pop the OBJECT_KEY pushed above before leaving, otherwise it
                # leaks into the enclosing scope and e.g. a following `true`
                # in a surrounding array is treated as an unquoted key.
                # The loop condition is false at this point, so `break` is
                # equivalent to re-evaluating it.
                self.context.reset()
                break

            self.skip_whitespaces_at()

            # An extreme case of missing ":" after a key
            if (self.get_char_at() or "") != ":":
                self.log(
                    "While parsing an object we missed a : after a key",
                )

            # NOTE(review): this advances even when no ":" was found, so one
            # character of the value is skipped in that case -- confirm intended.
            self.index += 1
            self.context.reset()
            self.context.set(ContextValues.OBJECT_VALUE)
            # The value can be any valid json
            value = self.parse_json()

            # Reset context since our job is done
            self.context.reset()
            obj[key] = value

            if (self.get_char_at() or "") in [",", "'", '"']:
                self.index += 1

            # Remove trailing spaces
            self.skip_whitespaces_at()

        # Step over the closing "}" (or past the end of the input)
        self.index += 1
        return obj

    def parse_array(self) -> List[JSONReturnType]:
        """
        Parse the elements of an array; the opening ``[`` is already consumed.

        Returns:
            A list of the values found up to the closing ``]``, the end of the
            input, or the first position where no value could be parsed.
        """
        # <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
        arr = []
        self.context.set(ContextValues.ARRAY)
        # Stop when you either find the closing parentheses or you have iterated over the entire string
        while (self.get_char_at() or "]") != "]":
            self.skip_whitespaces_at()
            value = self.parse_json()

            # It is possible that parse_json() returns nothing valid, so we stop
            if value == "":
                break

            if value == "..." and self.get_char_at(-1) == ".":
                self.log(
                    "While parsing an array, found a stray '...'; ignoring it",
                )
            else:
                arr.append(value)

            # skip over whitespace after a value but before closing ]
            char = self.get_char_at()
            while char and (char.isspace() or char == ","):
                self.index += 1
                char = self.get_char_at()

        # Especially at the end of an LLM generated json you might miss the last "]"
        char = self.get_char_at()
        if char and char != "]":
            self.log(
                "While parsing an array we missed the closing ], adding it back",
            )
            # Cancel out the increment below: there is no "]" to step over
            self.index -= 1

        self.index += 1
        self.context.reset()
        return arr

    def parse_string(self) -> Union[str, bool, None]:
        """
        Parse a string, tolerating missing, doubled or misplaced quotes.

        Unquoted literals are accepted; outside of object-key context a literal
        starting with t/f/n is first tried as ``true``/``false``/``null``.

        Returns:
            The string content with trailing whitespace stripped, a bool or
            ``None`` when a boolean/null literal was recognised, or ``""`` when
            nothing usable was found.
        """
        # <string> is a string of valid characters enclosed in quotes
        # i.e. { name: "John" }
        # Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here

        # Flag to manage corner cases related to missing starting quote
        missing_quotes = False
        doubled_quotes = False
        lstring_delimiter = rstring_delimiter = '"'

        char = self.get_char_at()
        # A valid string can only start with a valid quote or, in our case, with a literal
        while char and char not in ['"', "'", "“"] and not char.isalnum():
            self.index += 1
            char = self.get_char_at()

        if not char:
            # This is an empty string
            return ""

        # Ensuring we use the right delimiter
        if char == "'":
            lstring_delimiter = rstring_delimiter = "'"
        elif char == "“":
            lstring_delimiter = "“"
            rstring_delimiter = "”"
        elif char.isalnum():
            # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
            # But remember, object keys are only of type string
            if char.lower() in ["t", "f", "n"] and not self.context.is_current(
                ContextValues.OBJECT_KEY
            ):
                value = self.parse_boolean_or_null()
                if value != "":
                    return value
            self.log(
                "While parsing a string, we found a literal instead of a quote",
            )
            self.log(
                "While parsing a string, we found no starting quote. Will add the quote back",
            )
            missing_quotes = True

        # Step over the opening quote, if there was one
        if not missing_quotes:
            self.index += 1

        # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
        if self.get_char_at() == lstring_delimiter:
            # If it's an empty key, this was easy
            if (
                self.context.is_current(ContextValues.OBJECT_KEY)
                and self.get_char_at(1) == ":"
            ):
                self.index += 1
                return ""
            # Find the next delimiter
            i = self.skip_to_character(
                character=rstring_delimiter, idx=1, move_main_index=False
            )
            next_c = self.get_char_at(i)
            # Now check that the next character is also a delimiter to ensure that we have "".....""
            # In that case we ignore this rstring delimiter
            if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
                self.log(
                    "While parsing a string, we found a valid starting doubled quote, ignoring it",
                )
                doubled_quotes = True
                self.index += 1
            else:
                # Ok this is not a doubled quote, check if this is an empty string or not
                i = self.skip_whitespaces_at(idx=1, move_main_index=False)
                next_c = self.get_char_at(i)
                if next_c not in [",", "]", "}"]:
                    self.log(
                        "While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
                    )
                    self.index += 1

        # Initialize our return value
        string_acc = ""

        # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
        # In that case we need to use the ":|,|}" characters as terminators of the string
        # So this will stop if:
        # * It finds a closing quote
        # * It iterated over the entire sequence
        # * If we are fixing missing quotes in an object, when it finds the special terminators
        char = self.get_char_at()
        while char and char != rstring_delimiter:
            if (
                missing_quotes
                and self.context.is_current(ContextValues.OBJECT_KEY)
                and (char == ":" or char.isspace())
            ):
                self.log(
                    "While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
                )
                break
            if self.context.is_current(ContextValues.OBJECT_VALUE) and char in [
                ",",
                "}",
            ]:
                rstring_delimiter_missing = True
                # check if this is a case in which the closing comma is NOT missing instead
                i = self.skip_to_character(
                    character=rstring_delimiter, idx=1, move_main_index=False
                )
                next_c = self.get_char_at(i)
                if next_c:
                    i += 1
                    # found a delimiter, now we need to check that is followed strictly by a comma or brace
                    i = self.skip_whitespaces_at(idx=i, move_main_index=False)
                    next_c = self.get_char_at(i)
                    if next_c and next_c in [",", "}"]:
                        rstring_delimiter_missing = False
                if rstring_delimiter_missing:
                    self.log(
                        "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
                    )
                    break
            string_acc += char
            self.index += 1
            char = self.get_char_at()
            if char and len(string_acc) > 0 and string_acc[-1] == "\\":
                # This is a special case, if people use real strings this might happen
                self.log("Found a stray escape sequence, normalizing it")
                # Drop the backslash; the escaped character is appended below if recognised
                string_acc = string_acc[:-1]
                if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
                    escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
                    string_acc += escape_seqs.get(char, char) or char
                    self.index += 1
                    char = self.get_char_at()
            # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
            if char == rstring_delimiter:
                # Special case here, in case of double quotes one after another
                if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
                    self.log(
                        "While parsing a string, we found a doubled quote, ignoring it"
                    )
                    self.index += 1
                elif missing_quotes and self.context.is_current(
                    ContextValues.OBJECT_VALUE
                ):
                    # In case of missing starting quote I need to check if the delimiter is the end or the beginning of a key
                    i = 1
                    next_c = self.get_char_at(i)
                    while next_c and next_c not in [
                        rstring_delimiter,
                        lstring_delimiter,
                    ]:
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c:
                        # We found a quote, now let's make sure there's a ":" following
                        i += 1
                        # found a delimiter, now we need to check that is followed strictly by a colon
                        i = self.skip_whitespaces_at(idx=i, move_main_index=False)
                        next_c = self.get_char_at(i)
                        if next_c and next_c == ":":
                            # Reset the cursor
                            self.index -= 1
                            char = self.get_char_at()
                            self.log(
                                "In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
                            )
                            break
                else:
                    # Check if eventually there is a rstring delimiter, otherwise we bail
                    i = 1
                    next_c = self.get_char_at(i)
                    check_comma_in_object_value = True
                    while next_c and next_c not in [
                        rstring_delimiter,
                        lstring_delimiter,
                    ]:
                        # This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
                        # This is because the routine after will make sure to correct any bad guess and this solves a corner case
                        if check_comma_in_object_value and next_c.isalpha():
                            check_comma_in_object_value = False
                        # If we are in an object context, let's check for the right delimiters
                        if (
                            (
                                self.context.is_any(ContextValues.OBJECT_KEY)
                                and next_c in [":", "}"]
                            )
                            or (
                                self.context.is_any(ContextValues.OBJECT_VALUE)
                                and next_c == "}"
                            )
                            or (
                                self.context.is_any(ContextValues.ARRAY)
                                and next_c in ["]", ","]
                            )
                            or (
                                check_comma_in_object_value
                                and self.context.is_current(ContextValues.OBJECT_VALUE)
                                and next_c == ","
                            )
                        ):
                            break
                        i += 1
                        next_c = self.get_char_at(i)
                    # If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
                    if next_c == "," and self.context.is_current(
                        ContextValues.OBJECT_VALUE
                    ):
                        i += 1
                        i = self.skip_to_character(
                            character=rstring_delimiter, idx=i, move_main_index=False
                        )
                        next_c = self.get_char_at(i)
                        # Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
                        i += 1
                        i = self.skip_whitespaces_at(idx=i, move_main_index=False)
                        next_c = self.get_char_at(i)
                        if next_c == "}":
                            # OK this is valid then
                            self.log(
                                "While parsing a string, we misplaced a quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
                            )
                            string_acc += str(char)
                            self.index += 1
                            char = self.get_char_at()
                    elif next_c == rstring_delimiter:
                        if self.context.is_current(ContextValues.OBJECT_VALUE):
                            # But this might not be it! This could be just a missing comma
                            # We found a delimiter and we need to check if this is a key
                            # so find a rstring_delimiter and a colon after
                            i += 1
                            i = self.skip_to_character(
                                character=rstring_delimiter,
                                idx=i,
                                move_main_index=False,
                            )
                            i += 1
                            next_c = self.get_char_at(i)
                            while next_c and next_c != ":":
                                if next_c in [
                                    lstring_delimiter,
                                    rstring_delimiter,
                                    ",",
                                ]:
                                    break
                                i += 1
                                next_c = self.get_char_at(i)
                            # Only if we fail to find a ':' then we know this is misplaced quote
                            if next_c != ":":
                                self.log(
                                    "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                                )
                                string_acc += str(char)
                                self.index += 1
                                char = self.get_char_at()

        if (
            char
            and missing_quotes
            and self.context.is_current(ContextValues.OBJECT_KEY)
            and char.isspace()
        ):
            self.log(
                "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
            )
            self.skip_whitespaces_at()
            if self.get_char_at() not in [":", ","]:
                return ""

        # A fallout of the previous special case in the while loop,
        # we need to update the index only if we had a closing quote
        if char != rstring_delimiter:
            self.log(
                "While parsing a string, we missed the closing quote, ignoring",
            )
        else:
            self.index += 1

        return string_acc.rstrip()

    def parse_number(self) -> Union[float, int, str, JSONReturnType]:
        """
        Parse a number starting at the current index.

        Returns:
            An ``int`` or ``float`` when the collected text converts cleanly;
            the raw text when it contains a comma or cannot be converted; or
            the result of ``parse_json()`` when only a stray ``-`` was found.
        """
        # <number> is a valid real number expressed in one of a number of given formats
        number_str = ""
        number_chars = set("0123456789-.eE/,")
        char = self.get_char_at()
        # Inside an array a comma separates elements, so it must end the number
        is_array = self.context.is_current(ContextValues.ARRAY)
        while char and char in number_chars and (char != "," or not is_array):
            number_str += char
            self.index += 1
            char = self.get_char_at()
        if len(number_str) > 1 and number_str[-1] in "-eE/,":
            # The number ends with a non valid character for a number/currency, rolling back one
            number_str = number_str[:-1]
            self.index -= 1
        try:
            if "," in number_str:
                return str(number_str)
            if "." in number_str or "e" in number_str or "E" in number_str:
                return float(number_str)
            elif number_str == "-":
                # If there is a stray "-" this will throw an exception, throw away this character
                return self.parse_json()
            else:
                return int(number_str)
        except ValueError:
            return number_str

    def parse_boolean_or_null(self) -> Union[bool, str, None]:
        """
        Try to read ``true``, ``false`` or ``null`` (case-insensitive).

        Returns:
            ``True``, ``False`` or ``None`` when the full literal is matched.
            Otherwise ``""``, with the index restored to where it started.
        """
        # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
        starting_index = self.index
        char = (self.get_char_at() or "").lower()
        # Default to None so a non-literal first character falls through to
        # the "no match" path instead of raising UnboundLocalError.
        value: Optional[Tuple[str, Optional[bool]]] = None
        if char == "t":
            value = ("true", True)
        elif char == "f":
            value = ("false", False)
        elif char == "n":
            value = ("null", None)

        if value:
            i = 0
            while char and i < len(value[0]) and char == value[0][i]:
                i += 1
                self.index += 1
                char = (self.get_char_at() or "").lower()
            if i == len(value[0]):
                return value[1]

        # If nothing works reset the index before returning
        self.index = starting_index
        return ""

    def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
        """
        Return the character ``count`` positions from the current index.

        Returns:
            The character, or ``False`` when the lookup raises ``IndexError``
            (i.e. the position is past the end of the input).
        """
        # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
        try:
            return self.json_str[self.index + count]
        except IndexError:
            return False

    def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
        """
        This function quickly iterates on whitespaces, syntactic sugar to make the code more concise

        Args:
            idx: Offset from the current index at which to start looking.
            move_main_index: When True the parser index is advanced past the
                whitespace; when False only the returned offset is advanced.

        Returns:
            The offset (relative to the parser index) of the first
            non-whitespace character, or of the end of the input.
        """
        try:
            char = self.json_str[self.index + idx]
        except IndexError:
            return idx
        while char.isspace():
            if move_main_index:
                self.index += 1
            else:
                idx += 1
            try:
                char = self.json_str[self.index + idx]
            except IndexError:
                return idx
        return idx

    def skip_to_character(
        self, character: str, idx: int = 0, move_main_index=True
    ) -> int:
        """
        This function quickly iterates to find a character, syntactic sugar to make the code more concise

        Args:
            character: The character to look for.
            idx: Offset from the current index at which to start looking.
            move_main_index: When True the parser index is advanced while
                searching; when False only the returned offset is advanced.

        Returns:
            The offset (relative to the parser index) of ``character``, or of
            the end of the input when it is not found.
        """
        try:
            char = self.json_str[self.index + idx]
        except IndexError:
            return idx
        while char != character:
            if move_main_index:  # pragma: no cover
                self.index += 1
            else:
                idx += 1
            try:
                char = self.json_str[self.index + idx]
            except IndexError:
                return idx
        return idx

    def _log(self, text: str) -> None:
        """Record ``text`` together with up to 10 characters of input on each side of the index."""
        window: int = 10
        start: int = max(self.index - window, 0)
        end: int = min(self.index + window, len(self.json_str))
        context: str = self.json_str[start:end]
        self.logger.append(
            {
                "text": text,
                "context": context,
            }
        )

    def noop(*args: Any, **kwargs: Any) -> None:
        """Do nothing; stands in for ``log`` when logging is disabled."""
        pass
|