json-repair 0.47.4__py3-none-any.whl → 0.47.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/__init__.py +2 -1
- json_repair/constants.py +4 -0
- json_repair/json_parser.py +17 -674
- json_repair/json_repair.py +2 -1
- json_repair/parse_array.py +50 -0
- json_repair/parse_boolean_or_null.py +24 -0
- json_repair/parse_comment.py +65 -0
- json_repair/parse_number.py +32 -0
- json_repair/parse_object.py +110 -0
- json_repair/parse_string.py +413 -0
- {json_repair-0.47.4.dist-info → json_repair-0.47.6.dist-info}/METADATA +1 -1
- json_repair-0.47.6.dist-info/RECORD +21 -0
- json_repair-0.47.4.dist-info/RECORD +0 -14
- {json_repair-0.47.4.dist-info → json_repair-0.47.6.dist-info}/WHEEL +0 -0
- {json_repair-0.47.4.dist-info → json_repair-0.47.6.dist-info}/entry_points.txt +0 -0
- {json_repair-0.47.4.dist-info → json_repair-0.47.6.dist-info}/licenses/LICENSE +0 -0
- {json_repair-0.47.4.dist-info → json_repair-0.47.6.dist-info}/top_level.txt +0 -0
json_repair/json_parser.py
CHANGED
@@ -1,16 +1,25 @@
|
|
1
|
-
from typing import Any, Literal, TextIO
|
1
|
+
from typing import Literal, TextIO
|
2
2
|
|
3
|
-
from .json_context import ContextValues, JsonContext
|
3
|
+
from .constants import STRING_DELIMITERS, JSONReturnType
|
4
|
+
from .json_context import JsonContext
|
4
5
|
from .object_comparer import ObjectComparer
|
6
|
+
from .parse_array import parse_array
|
7
|
+
from .parse_boolean_or_null import parse_boolean_or_null
|
8
|
+
from .parse_comment import parse_comment
|
9
|
+
from .parse_number import parse_number
|
10
|
+
from .parse_object import parse_object
|
11
|
+
from .parse_string import parse_string
|
5
12
|
from .string_file_wrapper import StringFileWrapper
|
6
13
|
|
7
|
-
JSONReturnType = dict[str, Any] | list[Any] | str | float | int | bool | None
|
8
|
-
|
9
14
|
|
10
15
|
class JSONParser:
|
11
|
-
#
|
12
|
-
|
13
|
-
|
16
|
+
# Split the parse methods into separate files because this one was like 3000 lines
|
17
|
+
parse_array = parse_array
|
18
|
+
parse_boolean_or_null = parse_boolean_or_null
|
19
|
+
parse_comment = parse_comment
|
20
|
+
parse_number = parse_number
|
21
|
+
parse_object = parse_object
|
22
|
+
parse_string = parse_string
|
14
23
|
|
15
24
|
def __init__(
|
16
25
|
self,
|
@@ -98,7 +107,7 @@ class JSONParser:
|
|
98
107
|
self.index += 1
|
99
108
|
return self.parse_array()
|
100
109
|
# <string> starts with a quote
|
101
|
-
elif not self.context.empty and (char in self.STRING_DELIMITERS or char.isalpha()):
|
110
|
+
elif not self.context.empty and (char in STRING_DELIMITERS or char.isalpha()):
|
102
111
|
return self.parse_string()
|
103
112
|
# <number> starts with [0-9] or minus
|
104
113
|
elif not self.context.empty and (char.isdigit() or char == "-" or char == "."):
|
@@ -109,672 +118,6 @@ class JSONParser:
|
|
109
118
|
else:
|
110
119
|
self.index += 1
|
111
120
|
|
112
|
-
def parse_object(self) -> dict[str, JSONReturnType]:
|
113
|
-
# <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
|
114
|
-
obj: dict[str, JSONReturnType] = {}
|
115
|
-
# Stop when you either find the closing parentheses or you have iterated over the entire string
|
116
|
-
while (self.get_char_at() or "}") != "}":
|
117
|
-
# This is what we expect to find:
|
118
|
-
# <member> ::= <string> ': ' <json>
|
119
|
-
|
120
|
-
# Skip filler whitespaces
|
121
|
-
self.skip_whitespaces_at()
|
122
|
-
|
123
|
-
# Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
|
124
|
-
if (self.get_char_at() or "") == ":":
|
125
|
-
self.log(
|
126
|
-
"While parsing an object we found a : before a key, ignoring",
|
127
|
-
)
|
128
|
-
self.index += 1
|
129
|
-
|
130
|
-
# We are now searching for they string key
|
131
|
-
# Context is used in the string parser to manage the lack of quotes
|
132
|
-
self.context.set(ContextValues.OBJECT_KEY)
|
133
|
-
|
134
|
-
# Save this index in case we need find a duplicate key
|
135
|
-
rollback_index = self.index
|
136
|
-
|
137
|
-
# <member> starts with a <string>
|
138
|
-
key = ""
|
139
|
-
while self.get_char_at():
|
140
|
-
# The rollback index needs to be updated here in case the key is empty
|
141
|
-
rollback_index = self.index
|
142
|
-
if self.get_char_at() == "[" and key == "":
|
143
|
-
# Is this an array?
|
144
|
-
# Need to check if the previous parsed value contained in obj is an array and in that case parse and merge the two
|
145
|
-
prev_key = list(obj.keys())[-1] if obj else None
|
146
|
-
if prev_key and isinstance(obj[prev_key], list):
|
147
|
-
# If the previous key's value is an array, parse the new array and merge
|
148
|
-
self.index += 1
|
149
|
-
new_array = self.parse_array()
|
150
|
-
if isinstance(new_array, list):
|
151
|
-
# Merge and flatten the arrays
|
152
|
-
prev_value = obj[prev_key]
|
153
|
-
if isinstance(prev_value, list):
|
154
|
-
prev_value.extend(
|
155
|
-
new_array[0]
|
156
|
-
if len(new_array) == 1 and isinstance(new_array[0], list)
|
157
|
-
else new_array
|
158
|
-
)
|
159
|
-
self.skip_whitespaces_at()
|
160
|
-
if self.get_char_at() == ",":
|
161
|
-
self.index += 1
|
162
|
-
self.skip_whitespaces_at()
|
163
|
-
continue
|
164
|
-
key = str(self.parse_string())
|
165
|
-
if key == "":
|
166
|
-
self.skip_whitespaces_at()
|
167
|
-
if key != "" or (key == "" and self.get_char_at() in [":", "}"]):
|
168
|
-
# If the string is empty but there is a object divider, we are done here
|
169
|
-
break
|
170
|
-
if ContextValues.ARRAY in self.context.context and key in obj:
|
171
|
-
self.log(
|
172
|
-
"While parsing an object we found a duplicate key, closing the object here and rolling back the index",
|
173
|
-
)
|
174
|
-
self.index = rollback_index - 1
|
175
|
-
# add an opening curly brace to make this work
|
176
|
-
self.json_str = self.json_str[: self.index + 1] + "{" + self.json_str[self.index + 1 :]
|
177
|
-
break
|
178
|
-
|
179
|
-
# Skip filler whitespaces
|
180
|
-
self.skip_whitespaces_at()
|
181
|
-
|
182
|
-
# We reached the end here
|
183
|
-
if (self.get_char_at() or "}") == "}":
|
184
|
-
continue
|
185
|
-
|
186
|
-
self.skip_whitespaces_at()
|
187
|
-
|
188
|
-
# An extreme case of missing ":" after a key
|
189
|
-
if (self.get_char_at() or "") != ":":
|
190
|
-
self.log(
|
191
|
-
"While parsing an object we missed a : after a key",
|
192
|
-
)
|
193
|
-
|
194
|
-
self.index += 1
|
195
|
-
self.context.reset()
|
196
|
-
self.context.set(ContextValues.OBJECT_VALUE)
|
197
|
-
# The value can be any valid json
|
198
|
-
self.skip_whitespaces_at()
|
199
|
-
# Corner case, a lone comma
|
200
|
-
value: JSONReturnType = ""
|
201
|
-
if (self.get_char_at() or "") in [",", "}"]:
|
202
|
-
self.log(
|
203
|
-
"While parsing an object value we found a stray , ignoring it",
|
204
|
-
)
|
205
|
-
else:
|
206
|
-
value = self.parse_json()
|
207
|
-
|
208
|
-
# Reset context since our job is done
|
209
|
-
self.context.reset()
|
210
|
-
obj[key] = value
|
211
|
-
|
212
|
-
if (self.get_char_at() or "") in [",", "'", '"']:
|
213
|
-
self.index += 1
|
214
|
-
|
215
|
-
# Remove trailing spaces
|
216
|
-
self.skip_whitespaces_at()
|
217
|
-
|
218
|
-
self.index += 1
|
219
|
-
return obj
|
220
|
-
|
221
|
-
def parse_array(self) -> list[JSONReturnType]:
|
222
|
-
# <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
|
223
|
-
arr = []
|
224
|
-
self.context.set(ContextValues.ARRAY)
|
225
|
-
# Stop when you either find the closing parentheses or you have iterated over the entire string
|
226
|
-
char = self.get_char_at()
|
227
|
-
while char and char not in ["]", "}"]:
|
228
|
-
self.skip_whitespaces_at()
|
229
|
-
value: JSONReturnType = ""
|
230
|
-
if char in self.STRING_DELIMITERS:
|
231
|
-
# Sometimes it can happen that LLMs forget to start an object and then you think it's a string in an array
|
232
|
-
# So we are going to check if this string is followed by a : or not
|
233
|
-
# And either parse the string or parse the object
|
234
|
-
i = 1
|
235
|
-
i = self.skip_to_character(char, i)
|
236
|
-
i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
|
237
|
-
value = self.parse_object() if self.get_char_at(i) == ":" else self.parse_string()
|
238
|
-
else:
|
239
|
-
value = self.parse_json()
|
240
|
-
|
241
|
-
# It is possible that parse_json() returns nothing valid, so we increase by 1
|
242
|
-
if value == "":
|
243
|
-
self.index += 1
|
244
|
-
elif value == "..." and self.get_char_at(-1) == ".":
|
245
|
-
self.log(
|
246
|
-
"While parsing an array, found a stray '...'; ignoring it",
|
247
|
-
)
|
248
|
-
else:
|
249
|
-
arr.append(value)
|
250
|
-
|
251
|
-
# skip over whitespace after a value but before closing ]
|
252
|
-
char = self.get_char_at()
|
253
|
-
while char and char != "]" and (char.isspace() or char == ","):
|
254
|
-
self.index += 1
|
255
|
-
char = self.get_char_at()
|
256
|
-
|
257
|
-
# Especially at the end of an LLM generated json you might miss the last "]"
|
258
|
-
if char and char != "]":
|
259
|
-
self.log(
|
260
|
-
"While parsing an array we missed the closing ], ignoring it",
|
261
|
-
)
|
262
|
-
|
263
|
-
self.index += 1
|
264
|
-
|
265
|
-
self.context.reset()
|
266
|
-
return arr
|
267
|
-
|
268
|
-
def parse_string(self) -> str | bool | None:
|
269
|
-
# <string> is a string of valid characters enclosed in quotes
|
270
|
-
# i.e. { name: "John" }
|
271
|
-
# Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
|
272
|
-
|
273
|
-
# Flag to manage corner cases related to missing starting quote
|
274
|
-
missing_quotes = False
|
275
|
-
doubled_quotes = False
|
276
|
-
lstring_delimiter = rstring_delimiter = '"'
|
277
|
-
|
278
|
-
char = self.get_char_at()
|
279
|
-
if char in ["#", "/"]:
|
280
|
-
return self.parse_comment()
|
281
|
-
# A valid string can only start with a valid quote or, in our case, with a literal
|
282
|
-
while char and char not in self.STRING_DELIMITERS and not char.isalnum():
|
283
|
-
self.index += 1
|
284
|
-
char = self.get_char_at()
|
285
|
-
|
286
|
-
if not char:
|
287
|
-
# This is an empty string
|
288
|
-
return ""
|
289
|
-
|
290
|
-
# Ensuring we use the right delimiter
|
291
|
-
if char == "'":
|
292
|
-
lstring_delimiter = rstring_delimiter = "'"
|
293
|
-
elif char == "“":
|
294
|
-
lstring_delimiter = "“"
|
295
|
-
rstring_delimiter = "”"
|
296
|
-
elif char.isalnum():
|
297
|
-
# This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
|
298
|
-
# But remember, object keys are only of type string
|
299
|
-
if char.lower() in ["t", "f", "n"] and self.context.current != ContextValues.OBJECT_KEY:
|
300
|
-
value = self.parse_boolean_or_null()
|
301
|
-
if value != "":
|
302
|
-
return value
|
303
|
-
self.log(
|
304
|
-
"While parsing a string, we found a literal instead of a quote",
|
305
|
-
)
|
306
|
-
missing_quotes = True
|
307
|
-
|
308
|
-
if not missing_quotes:
|
309
|
-
self.index += 1
|
310
|
-
|
311
|
-
# There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
|
312
|
-
if self.get_char_at() in self.STRING_DELIMITERS and self.get_char_at() == lstring_delimiter:
|
313
|
-
# If it's an empty key, this was easy
|
314
|
-
if (self.context.current == ContextValues.OBJECT_KEY and self.get_char_at(1) == ":") or (
|
315
|
-
self.context.current == ContextValues.OBJECT_VALUE and self.get_char_at(1) in [",", "}"]
|
316
|
-
):
|
317
|
-
self.index += 1
|
318
|
-
return ""
|
319
|
-
elif self.get_char_at(1) == lstring_delimiter:
|
320
|
-
# There's something fishy about this, we found doubled quotes and then again quotes
|
321
|
-
self.log(
|
322
|
-
"While parsing a string, we found a doubled quote and then a quote again, ignoring it",
|
323
|
-
)
|
324
|
-
return ""
|
325
|
-
# Find the next delimiter
|
326
|
-
i = self.skip_to_character(character=rstring_delimiter, idx=1)
|
327
|
-
next_c = self.get_char_at(i)
|
328
|
-
# Now check that the next character is also a delimiter to ensure that we have "".....""
|
329
|
-
# In that case we ignore this rstring delimiter
|
330
|
-
if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
|
331
|
-
self.log(
|
332
|
-
"While parsing a string, we found a valid starting doubled quote",
|
333
|
-
)
|
334
|
-
doubled_quotes = True
|
335
|
-
self.index += 1
|
336
|
-
else:
|
337
|
-
# Ok this is not a doubled quote, check if this is an empty string or not
|
338
|
-
i = self.skip_whitespaces_at(idx=1, move_main_index=False)
|
339
|
-
next_c = self.get_char_at(i)
|
340
|
-
if next_c in self.STRING_DELIMITERS + ["{", "["]:
|
341
|
-
# something fishy is going on here
|
342
|
-
self.log(
|
343
|
-
"While parsing a string, we found a doubled quote but also another quote afterwards, ignoring it",
|
344
|
-
)
|
345
|
-
self.index += 1
|
346
|
-
return ""
|
347
|
-
elif next_c not in [",", "]", "}"]:
|
348
|
-
self.log(
|
349
|
-
"While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
|
350
|
-
)
|
351
|
-
self.index += 1
|
352
|
-
|
353
|
-
# Initialize our return value
|
354
|
-
string_acc = ""
|
355
|
-
|
356
|
-
# Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
|
357
|
-
# In that case we need to use the ":|,|}" characters as terminators of the string
|
358
|
-
# So this will stop if:
|
359
|
-
# * It finds a closing quote
|
360
|
-
# * It iterated over the entire sequence
|
361
|
-
# * If we are fixing missing quotes in an object, when it finds the special terminators
|
362
|
-
char = self.get_char_at()
|
363
|
-
unmatched_delimiter = False
|
364
|
-
while char and char != rstring_delimiter:
|
365
|
-
if missing_quotes and self.context.current == ContextValues.OBJECT_KEY and (char == ":" or char.isspace()):
|
366
|
-
self.log(
|
367
|
-
"While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
|
368
|
-
)
|
369
|
-
break
|
370
|
-
if (
|
371
|
-
not self.stream_stable
|
372
|
-
and self.context.current == ContextValues.OBJECT_VALUE
|
373
|
-
and char
|
374
|
-
in [
|
375
|
-
",",
|
376
|
-
"}",
|
377
|
-
]
|
378
|
-
and (not string_acc or string_acc[-1] != rstring_delimiter)
|
379
|
-
):
|
380
|
-
rstring_delimiter_missing = True
|
381
|
-
# check if this is a case in which the closing comma is NOT missing instead
|
382
|
-
self.skip_whitespaces_at()
|
383
|
-
if self.get_char_at(1) == "\\":
|
384
|
-
# Ok this is a quoted string, skip
|
385
|
-
rstring_delimiter_missing = False
|
386
|
-
i = self.skip_to_character(character=rstring_delimiter, idx=1)
|
387
|
-
next_c = self.get_char_at(i)
|
388
|
-
if next_c:
|
389
|
-
i += 1
|
390
|
-
# found a delimiter, now we need to check that is followed strictly by a comma or brace
|
391
|
-
# or the string ended
|
392
|
-
i = self.skip_whitespaces_at(idx=i, move_main_index=False)
|
393
|
-
next_c = self.get_char_at(i)
|
394
|
-
if not next_c or next_c in [",", "}"]:
|
395
|
-
rstring_delimiter_missing = False
|
396
|
-
else:
|
397
|
-
# OK but this could still be some garbage at the end of the string
|
398
|
-
# So we need to check if we find a new lstring_delimiter afterwards
|
399
|
-
# If we do, maybe this is a missing delimiter
|
400
|
-
i = self.skip_to_character(character=lstring_delimiter, idx=i)
|
401
|
-
next_c = self.get_char_at(i)
|
402
|
-
if not next_c:
|
403
|
-
rstring_delimiter_missing = False
|
404
|
-
else:
|
405
|
-
# But again, this could just be something a bit stupid like "lorem, "ipsum" sic"
|
406
|
-
# Check if we find a : afterwards (skipping space)
|
407
|
-
i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
|
408
|
-
next_c = self.get_char_at(i)
|
409
|
-
if next_c and next_c != ":":
|
410
|
-
rstring_delimiter_missing = False
|
411
|
-
else:
|
412
|
-
# There could be a case in which even the next key:value is missing delimeters
|
413
|
-
# because it might be a systemic issue with the output
|
414
|
-
# So let's check if we can find a : in the string instead
|
415
|
-
i = self.skip_to_character(character=":", idx=1)
|
416
|
-
next_c = self.get_char_at(i)
|
417
|
-
if next_c:
|
418
|
-
# OK then this is a systemic issue with the output
|
419
|
-
break
|
420
|
-
else:
|
421
|
-
# skip any whitespace first
|
422
|
-
i = self.skip_whitespaces_at(idx=1, move_main_index=False)
|
423
|
-
# We couldn't find any rstring_delimeter before the end of the string
|
424
|
-
# check if this is the last string of an object and therefore we can keep going
|
425
|
-
# make an exception if this is the last char before the closing brace
|
426
|
-
j = self.skip_to_character(character="}", idx=i)
|
427
|
-
if j - i > 1:
|
428
|
-
# Ok it's not right after the comma
|
429
|
-
# Let's ignore
|
430
|
-
rstring_delimiter_missing = False
|
431
|
-
# Check that j was not out of bound
|
432
|
-
elif self.get_char_at(j):
|
433
|
-
# Check for an unmatched opening brace in string_acc
|
434
|
-
for c in reversed(string_acc):
|
435
|
-
if c == "{":
|
436
|
-
# Ok then this is part of the string
|
437
|
-
rstring_delimiter_missing = False
|
438
|
-
break
|
439
|
-
if rstring_delimiter_missing:
|
440
|
-
self.log(
|
441
|
-
"While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
|
442
|
-
)
|
443
|
-
break
|
444
|
-
if (
|
445
|
-
not self.stream_stable
|
446
|
-
and char == "]"
|
447
|
-
and ContextValues.ARRAY in self.context.context
|
448
|
-
and string_acc[-1] != rstring_delimiter
|
449
|
-
):
|
450
|
-
# We found the end of an array and we are in array context
|
451
|
-
# So let's check if we find a rstring_delimiter forward otherwise end early
|
452
|
-
i = self.skip_to_character(rstring_delimiter)
|
453
|
-
if not self.get_char_at(i):
|
454
|
-
# No delimiter found
|
455
|
-
break
|
456
|
-
string_acc += char
|
457
|
-
self.index += 1
|
458
|
-
char = self.get_char_at()
|
459
|
-
# Unclosed string ends with a \ character. This character is ignored if stream_stable = True.
|
460
|
-
if self.stream_stable and not char and string_acc[-1] == "\\":
|
461
|
-
string_acc = string_acc[:-1]
|
462
|
-
if char and string_acc[-1] == "\\":
|
463
|
-
# This is a special case, if people use real strings this might happen
|
464
|
-
self.log("Found a stray escape sequence, normalizing it")
|
465
|
-
if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
|
466
|
-
string_acc = string_acc[:-1]
|
467
|
-
escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
|
468
|
-
string_acc += escape_seqs.get(char, char)
|
469
|
-
self.index += 1
|
470
|
-
char = self.get_char_at()
|
471
|
-
while char and string_acc[-1] == "\\" and char in [rstring_delimiter, "\\"]:
|
472
|
-
# this is a bit of a special case, if I don't do this it will close the loop or create a train of \\
|
473
|
-
# I don't love it though
|
474
|
-
string_acc = string_acc[:-1]
|
475
|
-
string_acc += char
|
476
|
-
self.index += 1
|
477
|
-
char = self.get_char_at()
|
478
|
-
continue
|
479
|
-
elif char in ["u", "x"]:
|
480
|
-
# If we find a unicode escape sequence, normalize it
|
481
|
-
num_chars = 4 if char == "u" else 2
|
482
|
-
next_chars = self.json_str[self.index + 1 : self.index + 1 + num_chars]
|
483
|
-
if len(next_chars) == num_chars and all(c in "0123456789abcdefABCDEF" for c in next_chars):
|
484
|
-
self.log("Found a unicode escape sequence, normalizing it")
|
485
|
-
string_acc = string_acc[:-1]
|
486
|
-
string_acc += chr(int(next_chars, 16))
|
487
|
-
self.index += 1 + num_chars
|
488
|
-
char = self.get_char_at()
|
489
|
-
continue
|
490
|
-
# If we are in object key context and we find a colon, it could be a missing right quote
|
491
|
-
if char == ":" and not missing_quotes and self.context.current == ContextValues.OBJECT_KEY:
|
492
|
-
# Ok now we need to check if this is followed by a value like "..."
|
493
|
-
i = self.skip_to_character(character=lstring_delimiter, idx=1)
|
494
|
-
next_c = self.get_char_at(i)
|
495
|
-
if next_c:
|
496
|
-
i += 1
|
497
|
-
# found the first delimiter
|
498
|
-
i = self.skip_to_character(character=rstring_delimiter, idx=i)
|
499
|
-
next_c = self.get_char_at(i)
|
500
|
-
if next_c:
|
501
|
-
# found a second delimiter
|
502
|
-
i += 1
|
503
|
-
# Skip spaces
|
504
|
-
i = self.skip_whitespaces_at(idx=i, move_main_index=False)
|
505
|
-
next_c = self.get_char_at(i)
|
506
|
-
if next_c and next_c in [",", "}"]:
|
507
|
-
# Ok then this is a missing right quote
|
508
|
-
self.log(
|
509
|
-
"While parsing a string missing the right delimiter in object key context, we found a :, stopping here",
|
510
|
-
)
|
511
|
-
break
|
512
|
-
else:
|
513
|
-
# The string ended without finding a lstring_delimiter, I will assume this is a missing right quote
|
514
|
-
self.log(
|
515
|
-
"While parsing a string missing the right delimiter in object key context, we found a :, stopping here",
|
516
|
-
)
|
517
|
-
break
|
518
|
-
# ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
|
519
|
-
if char == rstring_delimiter and string_acc[-1] != "\\":
|
520
|
-
# Special case here, in case of double quotes one after another
|
521
|
-
if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
|
522
|
-
self.log("While parsing a string, we found a doubled quote, ignoring it")
|
523
|
-
self.index += 1
|
524
|
-
elif missing_quotes and self.context.current == ContextValues.OBJECT_VALUE:
|
525
|
-
# In case of missing starting quote I need to check if the delimeter is the end or the beginning of a key
|
526
|
-
i = 1
|
527
|
-
next_c = self.get_char_at(i)
|
528
|
-
while next_c and next_c not in [
|
529
|
-
rstring_delimiter,
|
530
|
-
lstring_delimiter,
|
531
|
-
]:
|
532
|
-
i += 1
|
533
|
-
next_c = self.get_char_at(i)
|
534
|
-
if next_c:
|
535
|
-
# We found a quote, now let's make sure there's a ":" following
|
536
|
-
i += 1
|
537
|
-
# found a delimiter, now we need to check that is followed strictly by a comma or brace
|
538
|
-
i = self.skip_whitespaces_at(idx=i, move_main_index=False)
|
539
|
-
next_c = self.get_char_at(i)
|
540
|
-
if next_c and next_c == ":":
|
541
|
-
# Reset the cursor
|
542
|
-
self.index -= 1
|
543
|
-
char = self.get_char_at()
|
544
|
-
self.log(
|
545
|
-
"In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
|
546
|
-
)
|
547
|
-
break
|
548
|
-
elif unmatched_delimiter:
|
549
|
-
unmatched_delimiter = False
|
550
|
-
string_acc += str(char)
|
551
|
-
self.index += 1
|
552
|
-
char = self.get_char_at()
|
553
|
-
else:
|
554
|
-
# Check if eventually there is a rstring delimiter, otherwise we bail
|
555
|
-
i = 1
|
556
|
-
next_c = self.get_char_at(i)
|
557
|
-
check_comma_in_object_value = True
|
558
|
-
while next_c and next_c not in [
|
559
|
-
rstring_delimiter,
|
560
|
-
lstring_delimiter,
|
561
|
-
]:
|
562
|
-
# This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
|
563
|
-
# This is because the routine after will make sure to correct any bad guess and this solves a corner case
|
564
|
-
if check_comma_in_object_value and next_c.isalpha():
|
565
|
-
check_comma_in_object_value = False
|
566
|
-
# If we are in an object context, let's check for the right delimiters
|
567
|
-
if (
|
568
|
-
(ContextValues.OBJECT_KEY in self.context.context and next_c in [":", "}"])
|
569
|
-
or (ContextValues.OBJECT_VALUE in self.context.context and next_c == "}")
|
570
|
-
or (ContextValues.ARRAY in self.context.context and next_c in ["]", ","])
|
571
|
-
or (
|
572
|
-
check_comma_in_object_value
|
573
|
-
and self.context.current == ContextValues.OBJECT_VALUE
|
574
|
-
and next_c == ","
|
575
|
-
)
|
576
|
-
):
|
577
|
-
break
|
578
|
-
i += 1
|
579
|
-
next_c = self.get_char_at(i)
|
580
|
-
# If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
|
581
|
-
if next_c == "," and self.context.current == ContextValues.OBJECT_VALUE:
|
582
|
-
i += 1
|
583
|
-
i = self.skip_to_character(character=rstring_delimiter, idx=i)
|
584
|
-
next_c = self.get_char_at(i)
|
585
|
-
# Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
|
586
|
-
i += 1
|
587
|
-
i = self.skip_whitespaces_at(idx=i, move_main_index=False)
|
588
|
-
next_c = self.get_char_at(i)
|
589
|
-
elif next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\":
|
590
|
-
# Check if self.index:self.index+i is only whitespaces, break if that's the case
|
591
|
-
if all(str(self.get_char_at(j)).isspace() for j in range(1, i) if self.get_char_at(j)):
|
592
|
-
break
|
593
|
-
if self.context.current == ContextValues.OBJECT_VALUE:
|
594
|
-
# But this might not be it! This could be just a missing comma
|
595
|
-
# We found a delimiter and we need to check if this is a key
|
596
|
-
# so find a rstring_delimiter and a colon after
|
597
|
-
i = self.skip_to_character(character=rstring_delimiter, idx=i + 1)
|
598
|
-
i += 1
|
599
|
-
next_c = self.get_char_at(i)
|
600
|
-
while next_c and next_c != ":":
|
601
|
-
if next_c in [",", "]", "}"] or (
|
602
|
-
next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\"
|
603
|
-
):
|
604
|
-
break
|
605
|
-
i += 1
|
606
|
-
next_c = self.get_char_at(i)
|
607
|
-
# Only if we fail to find a ':' then we know this is misplaced quote
|
608
|
-
if next_c != ":":
|
609
|
-
self.log(
|
610
|
-
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
|
611
|
-
)
|
612
|
-
unmatched_delimiter = not unmatched_delimiter
|
613
|
-
string_acc += str(char)
|
614
|
-
self.index += 1
|
615
|
-
char = self.get_char_at()
|
616
|
-
elif self.context.current == ContextValues.ARRAY:
|
617
|
-
# If we got up to here it means that this is a situation like this:
|
618
|
-
# ["bla bla bla "puppy" bla bla bla "kitty" bla bla"]
|
619
|
-
# So we need to ignore this quote
|
620
|
-
self.log(
|
621
|
-
"While parsing a string in Array context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
|
622
|
-
)
|
623
|
-
unmatched_delimiter = not unmatched_delimiter
|
624
|
-
string_acc += str(char)
|
625
|
-
self.index += 1
|
626
|
-
char = self.get_char_at()
|
627
|
-
elif self.context.current == ContextValues.OBJECT_KEY:
|
628
|
-
# In this case we just ignore this and move on
|
629
|
-
self.log(
|
630
|
-
"While parsing a string in Object Key context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
|
631
|
-
)
|
632
|
-
string_acc += str(char)
|
633
|
-
self.index += 1
|
634
|
-
char = self.get_char_at()
|
635
|
-
if char and missing_quotes and self.context.current == ContextValues.OBJECT_KEY and char.isspace():
|
636
|
-
self.log(
|
637
|
-
"While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
|
638
|
-
)
|
639
|
-
self.skip_whitespaces_at()
|
640
|
-
if self.get_char_at() not in [":", ","]:
|
641
|
-
return ""
|
642
|
-
|
643
|
-
# A fallout of the previous special case in the while loop,
|
644
|
-
# we need to update the index only if we had a closing quote
|
645
|
-
if char != rstring_delimiter:
|
646
|
-
# if stream_stable = True, unclosed strings do not trim trailing whitespace characters
|
647
|
-
if not self.stream_stable:
|
648
|
-
self.log(
|
649
|
-
"While parsing a string, we missed the closing quote, ignoring",
|
650
|
-
)
|
651
|
-
string_acc = string_acc.rstrip()
|
652
|
-
else:
|
653
|
-
self.index += 1
|
654
|
-
|
655
|
-
if not self.stream_stable and (missing_quotes or (string_acc and string_acc[-1] == "\n")):
|
656
|
-
# Clean the whitespaces for some corner cases
|
657
|
-
string_acc = string_acc.rstrip()
|
658
|
-
|
659
|
-
return string_acc
|
660
|
-
|
661
|
-
def parse_number(self) -> float | int | str | JSONReturnType:
|
662
|
-
# <number> is a valid real number expressed in one of a number of given formats
|
663
|
-
number_str = ""
|
664
|
-
char = self.get_char_at()
|
665
|
-
is_array = self.context.current == ContextValues.ARRAY
|
666
|
-
while char and char in self.NUMBER_CHARS and (not is_array or char != ","):
|
667
|
-
number_str += char
|
668
|
-
self.index += 1
|
669
|
-
char = self.get_char_at()
|
670
|
-
if number_str and number_str[-1] in "-eE/,":
|
671
|
-
# The number ends with a non valid character for a number/currency, rolling back one
|
672
|
-
number_str = number_str[:-1]
|
673
|
-
self.index -= 1
|
674
|
-
elif (self.get_char_at() or "").isalpha():
|
675
|
-
# this was a string instead, sorry
|
676
|
-
self.index -= len(number_str)
|
677
|
-
return self.parse_string()
|
678
|
-
try:
|
679
|
-
if "," in number_str:
|
680
|
-
return str(number_str)
|
681
|
-
if "." in number_str or "e" in number_str or "E" in number_str:
|
682
|
-
return float(number_str)
|
683
|
-
else:
|
684
|
-
return int(number_str)
|
685
|
-
except ValueError:
|
686
|
-
return number_str
|
687
|
-
|
688
|
-
def parse_boolean_or_null(self) -> bool | str | None:
|
689
|
-
# <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
|
690
|
-
starting_index = self.index
|
691
|
-
char = (self.get_char_at() or "").lower()
|
692
|
-
value: tuple[str, bool | None] | None = None
|
693
|
-
if char == "t":
|
694
|
-
value = ("true", True)
|
695
|
-
elif char == "f":
|
696
|
-
value = ("false", False)
|
697
|
-
elif char == "n":
|
698
|
-
value = ("null", None)
|
699
|
-
|
700
|
-
if value:
|
701
|
-
i = 0
|
702
|
-
while char and i < len(value[0]) and char == value[0][i]:
|
703
|
-
i += 1
|
704
|
-
self.index += 1
|
705
|
-
char = (self.get_char_at() or "").lower()
|
706
|
-
if i == len(value[0]):
|
707
|
-
return value[1]
|
708
|
-
|
709
|
-
# If nothing works reset the index before returning
|
710
|
-
self.index = starting_index
|
711
|
-
return ""
|
712
|
-
|
713
|
-
def parse_comment(self) -> str:
|
714
|
-
"""
|
715
|
-
Parse code-like comments:
|
716
|
-
|
717
|
-
- "# comment": A line comment that continues until a newline.
|
718
|
-
- "// comment": A line comment that continues until a newline.
|
719
|
-
- "/* comment */": A block comment that continues until the closing delimiter "*/".
|
720
|
-
|
721
|
-
The comment is skipped over and an empty string is returned so that comments do not interfere
|
722
|
-
with the actual JSON elements.
|
723
|
-
"""
|
724
|
-
char = self.get_char_at()
|
725
|
-
termination_characters = ["\n", "\r"]
|
726
|
-
if ContextValues.ARRAY in self.context.context:
|
727
|
-
termination_characters.append("]")
|
728
|
-
if ContextValues.OBJECT_VALUE in self.context.context:
|
729
|
-
termination_characters.append("}")
|
730
|
-
if ContextValues.OBJECT_KEY in self.context.context:
|
731
|
-
termination_characters.append(":")
|
732
|
-
# Line comment starting with #
|
733
|
-
if char == "#":
|
734
|
-
comment = ""
|
735
|
-
while char and char not in termination_characters:
|
736
|
-
comment += char
|
737
|
-
self.index += 1
|
738
|
-
char = self.get_char_at()
|
739
|
-
self.log(f"Found line comment: {comment}")
|
740
|
-
return ""
|
741
|
-
|
742
|
-
# Comments starting with '/'
|
743
|
-
elif char == "/":
|
744
|
-
next_char = self.get_char_at(1)
|
745
|
-
# Handle line comment starting with //
|
746
|
-
if next_char == "/":
|
747
|
-
comment = "//"
|
748
|
-
self.index += 2 # Skip both slashes.
|
749
|
-
char = self.get_char_at()
|
750
|
-
while char and char not in termination_characters:
|
751
|
-
comment += char
|
752
|
-
self.index += 1
|
753
|
-
char = self.get_char_at()
|
754
|
-
self.log(f"Found line comment: {comment}")
|
755
|
-
return ""
|
756
|
-
# Handle block comment starting with /*
|
757
|
-
elif next_char == "*":
|
758
|
-
comment = "/*"
|
759
|
-
self.index += 2 # Skip '/*'
|
760
|
-
while True:
|
761
|
-
char = self.get_char_at()
|
762
|
-
if not char:
|
763
|
-
self.log("Reached end-of-string while parsing block comment; unclosed block comment.")
|
764
|
-
break
|
765
|
-
comment += char
|
766
|
-
self.index += 1
|
767
|
-
if comment.endswith("*/"):
|
768
|
-
break
|
769
|
-
self.log(f"Found block comment: {comment}")
|
770
|
-
return ""
|
771
|
-
else:
|
772
|
-
# Skip standalone '/' characters that are not part of a comment
|
773
|
-
# to avoid getting stuck in an infinite loop
|
774
|
-
self.index += 1
|
775
|
-
return ""
|
776
|
-
return "" # pragma: no cover
|
777
|
-
|
778
121
|
def get_char_at(self, count: int = 0) -> str | Literal[False]:
|
779
122
|
# Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
|
780
123
|
try:
|