json-repair 0.29.1__py3-none-any.whl → 0.29.3__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- json_repair/json_context.py +69 -0
- json_repair/json_parser.py +598 -0
- json_repair/json_repair.py +2 -642
- json_repair/string_file_wrapper.py +98 -0
- {json_repair-0.29.1.dist-info → json_repair-0.29.3.dist-info}/METADATA +41 -16
- json_repair-0.29.3.dist-info/RECORD +13 -0
- {json_repair-0.29.1.dist-info → json_repair-0.29.3.dist-info}/WHEEL +1 -1
- json_repair-0.29.1.dist-info/RECORD +0 -10
- {json_repair-0.29.1.dist-info → json_repair-0.29.3.dist-info}/LICENSE +0 -0
- {json_repair-0.29.1.dist-info → json_repair-0.29.3.dist-info}/entry_points.txt +0 -0
- {json_repair-0.29.1.dist-info → json_repair-0.29.3.dist-info}/top_level.txt +0 -0
json_repair/json_repair.py
CHANGED
@@ -23,650 +23,10 @@ All supported use cases are in the unit tests
|
|
23
23
|
"""
|
24
24
|
|
25
25
|
import argparse
|
26
|
-
import os
|
27
26
|
import sys
|
28
27
|
import json
|
29
|
-
from typing import
|
30
|
-
|
31
|
-
|
32
|
-
class StringFileWrapper:
|
33
|
-
# This is a trick to simplify the code, transform the filedescriptor handling into a string handling
|
34
|
-
def __init__(self, fd: TextIO, CHUNK_LENGTH: int) -> None:
|
35
|
-
self.fd = fd
|
36
|
-
self.length: int = 0
|
37
|
-
# Buffers are 1MB strings that are read from the file
|
38
|
-
# and kept in memory to keep reads low
|
39
|
-
self.buffers: dict[int, str] = {}
|
40
|
-
# CHUNK_LENGTH is in bytes
|
41
|
-
if not CHUNK_LENGTH or CHUNK_LENGTH < 2:
|
42
|
-
CHUNK_LENGTH = 1_000_000
|
43
|
-
self.buffer_length = CHUNK_LENGTH
|
44
|
-
|
45
|
-
def get_buffer(self, index: int) -> str:
|
46
|
-
if self.buffers.get(index) is None:
|
47
|
-
self.fd.seek(index * self.buffer_length)
|
48
|
-
self.buffers[index] = self.fd.read(self.buffer_length)
|
49
|
-
# Save memory by keeping max 2MB buffer chunks and min 2 chunks
|
50
|
-
if len(self.buffers) > max(2, 2_000_000 / self.buffer_length):
|
51
|
-
oldest_key = next(iter(self.buffers))
|
52
|
-
if oldest_key != index:
|
53
|
-
self.buffers.pop(oldest_key)
|
54
|
-
return self.buffers[index]
|
55
|
-
|
56
|
-
def __getitem__(self, index: Union[int, slice]) -> str:
|
57
|
-
# The buffers are addressed like RAM, as a two-dimensional array:
|
58
|
-
# self.buffers[index]: the row in the array of length 1MB, index is `i` floor-divided by CHUNK_LENGTH
|
59
|
-
# self.buffers[index][j]: the column of the row, where `j` is the remainder of `i` divided by CHUNK_LENGTH
|
60
|
-
if isinstance(index, slice):
|
61
|
-
buffer_index = index.start // self.buffer_length
|
62
|
-
buffer_end = index.stop // self.buffer_length
|
63
|
-
if buffer_index == buffer_end:
|
64
|
-
return self.get_buffer(buffer_index)[
|
65
|
-
index.start % self.buffer_length : index.stop % self.buffer_length
|
66
|
-
]
|
67
|
-
else:
|
68
|
-
start_slice = self.get_buffer(buffer_index)[
|
69
|
-
index.start % self.buffer_length :
|
70
|
-
]
|
71
|
-
end_slice = self.get_buffer(buffer_end)[
|
72
|
-
: index.stop % self.buffer_length
|
73
|
-
]
|
74
|
-
middle_slices = [
|
75
|
-
self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)
|
76
|
-
]
|
77
|
-
return start_slice + "".join(middle_slices) + end_slice
|
78
|
-
else:
|
79
|
-
buffer_index = index // self.buffer_length
|
80
|
-
return self.get_buffer(buffer_index)[index % self.buffer_length]
|
81
|
-
|
82
|
-
def __len__(self) -> int:
|
83
|
-
if self.length < 1:
|
84
|
-
current_position = self.fd.tell()
|
85
|
-
self.fd.seek(0, os.SEEK_END)
|
86
|
-
self.length = self.fd.tell()
|
87
|
-
self.fd.seek(current_position)
|
88
|
-
return self.length
|
89
|
-
|
90
|
-
|
91
|
-
class LoggerConfig:
|
92
|
-
# This is a type class to simplify the declaration
|
93
|
-
def __init__(self, log_level: Optional[str]):
|
94
|
-
self.log: List[Dict[str, str]] = []
|
95
|
-
self.window: int = 10
|
96
|
-
self.log_level: str = log_level if log_level else "none"
|
97
|
-
|
98
|
-
|
99
|
-
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
|
100
|
-
|
101
|
-
|
102
|
-
class JSONParser:
|
103
|
-
def __init__(
|
104
|
-
self,
|
105
|
-
json_str: Union[str, StringFileWrapper],
|
106
|
-
json_fd: Optional[TextIO],
|
107
|
-
logging: Optional[bool],
|
108
|
-
json_fd_chunk_length: int = 0,
|
109
|
-
) -> None:
|
110
|
-
# The string to parse
|
111
|
-
self.json_str = json_str
|
112
|
-
# Alternatively, the file descriptor with a json file in it
|
113
|
-
if json_fd:
|
114
|
-
# This is a trick we do to treat the file wrapper as an array
|
115
|
-
self.json_str = StringFileWrapper(json_fd, json_fd_chunk_length)
|
116
|
-
# Index is our iterator that will keep track of which character we are looking at right now
|
117
|
-
self.index: int = 0
|
118
|
-
# This is used in the object member parsing to manage the special cases of missing quotes in key or value
|
119
|
-
self.context: list[str] = []
|
120
|
-
# Use this to log the activity, but only if logging is active
|
121
|
-
self.logger = LoggerConfig(log_level="info" if logging else None)
|
122
|
-
|
123
|
-
def parse(
|
124
|
-
self,
|
125
|
-
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
126
|
-
json = self.parse_json()
|
127
|
-
if self.index < len(self.json_str):
|
128
|
-
self.log(
|
129
|
-
"The parser returned early, checking if there's more json elements",
|
130
|
-
"info",
|
131
|
-
)
|
132
|
-
json = [json]
|
133
|
-
last_index = self.index
|
134
|
-
while self.index < len(self.json_str):
|
135
|
-
j = self.parse_json()
|
136
|
-
if j != "":
|
137
|
-
json.append(j)
|
138
|
-
if self.index == last_index:
|
139
|
-
self.index += 1
|
140
|
-
last_index = self.index
|
141
|
-
# If nothing extra was found, don't return an array
|
142
|
-
if len(json) == 1:
|
143
|
-
self.log(
|
144
|
-
"There were no more elements, returning the element without the array",
|
145
|
-
"info",
|
146
|
-
)
|
147
|
-
json = json[0]
|
148
|
-
if self.logger.log_level == "none":
|
149
|
-
return json
|
150
|
-
else:
|
151
|
-
return json, self.logger.log
|
152
|
-
|
153
|
-
def parse_json(
|
154
|
-
self,
|
155
|
-
) -> JSONReturnType:
|
156
|
-
while True:
|
157
|
-
char = self.get_char_at()
|
158
|
-
# This parser will ignore any basic element (string or number) that is not inside an array or object
|
159
|
-
is_in_context = len(self.context) > 0
|
160
|
-
# False means that we are at the end of the string provided
|
161
|
-
if char is False:
|
162
|
-
return ""
|
163
|
-
# <object> starts with '{'
|
164
|
-
elif char == "{":
|
165
|
-
self.index += 1
|
166
|
-
return self.parse_object()
|
167
|
-
# <array> starts with '['
|
168
|
-
elif char == "[":
|
169
|
-
self.index += 1
|
170
|
-
return self.parse_array()
|
171
|
-
# there can be an edge case in which a key is empty and at the end of an object
|
172
|
-
# like "key": }. We return an empty string here to close the object properly
|
173
|
-
elif char == "}":
|
174
|
-
self.log(
|
175
|
-
"At the end of an object we found a key with missing value, skipping",
|
176
|
-
"info",
|
177
|
-
)
|
178
|
-
return ""
|
179
|
-
# <string> starts with a quote
|
180
|
-
elif is_in_context and (char in ['"', "'", "“"] or char.isalpha()):
|
181
|
-
return self.parse_string()
|
182
|
-
# <number> starts with [0-9] or minus
|
183
|
-
elif is_in_context and (char.isdigit() or char == "-" or char == "."):
|
184
|
-
return self.parse_number()
|
185
|
-
# If everything else fails, we just ignore and move on
|
186
|
-
else:
|
187
|
-
self.index += 1
|
188
|
-
|
189
|
-
def parse_object(self) -> Dict[str, Any]:
|
190
|
-
# <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
|
191
|
-
obj = {}
|
192
|
-
# Stop when you either find the closing parentheses or you have iterated over the entire string
|
193
|
-
while (self.get_char_at() or "}") != "}":
|
194
|
-
# This is what we expect to find:
|
195
|
-
# <member> ::= <string> ': ' <json>
|
196
|
-
|
197
|
-
# Skip filler whitespaces
|
198
|
-
self.skip_whitespaces_at()
|
199
|
-
|
200
|
-
# Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
|
201
|
-
if (self.get_char_at() or "") == ":":
|
202
|
-
self.log(
|
203
|
-
"While parsing an object we found a : before a key, ignoring",
|
204
|
-
"info",
|
205
|
-
)
|
206
|
-
self.index += 1
|
207
|
-
|
208
|
-
# We are now searching for the string key
|
209
|
-
# Context is used in the string parser to manage the lack of quotes
|
210
|
-
self.set_context("object_key")
|
211
|
-
|
212
|
-
self.skip_whitespaces_at()
|
213
|
-
|
214
|
-
# <member> starts with a <string>
|
215
|
-
key = ""
|
216
|
-
while self.get_char_at():
|
217
|
-
key = str(self.parse_string())
|
218
|
-
|
219
|
-
if key != "" or (key == "" and self.get_char_at() == ":"):
|
220
|
-
# If the string is empty but there is a object divider, we are done here
|
221
|
-
break
|
222
|
-
|
223
|
-
self.skip_whitespaces_at()
|
224
|
-
|
225
|
-
# We reached the end here
|
226
|
-
if (self.get_char_at() or "}") == "}":
|
227
|
-
continue
|
228
|
-
|
229
|
-
self.skip_whitespaces_at()
|
230
|
-
|
231
|
-
# An extreme case of missing ":" after a key
|
232
|
-
if (self.get_char_at() or "") != ":":
|
233
|
-
self.log(
|
234
|
-
"While parsing an object we missed a : after a key",
|
235
|
-
"info",
|
236
|
-
)
|
237
|
-
|
238
|
-
self.index += 1
|
239
|
-
self.reset_context()
|
240
|
-
self.set_context("object_value")
|
241
|
-
# The value can be any valid json
|
242
|
-
value = self.parse_json()
|
243
|
-
|
244
|
-
# Reset context since our job is done
|
245
|
-
self.reset_context()
|
246
|
-
obj[key] = value
|
247
|
-
|
248
|
-
if (self.get_char_at() or "") in [",", "'", '"']:
|
249
|
-
self.index += 1
|
250
|
-
|
251
|
-
# Remove trailing spaces
|
252
|
-
self.skip_whitespaces_at()
|
253
|
-
|
254
|
-
self.index += 1
|
255
|
-
return obj
|
256
|
-
|
257
|
-
def parse_array(self) -> List[Any]:
|
258
|
-
# <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
|
259
|
-
arr = []
|
260
|
-
self.set_context("array")
|
261
|
-
# Stop when you either find the closing parentheses or you have iterated over the entire string
|
262
|
-
while (self.get_char_at() or "]") != "]":
|
263
|
-
self.skip_whitespaces_at()
|
264
|
-
value = self.parse_json()
|
265
|
-
|
266
|
-
# It is possible that parse_json() returns nothing valid, so we stop
|
267
|
-
if value == "":
|
268
|
-
break
|
269
|
-
|
270
|
-
if value == "..." and self.get_char_at(-1) == ".":
|
271
|
-
self.log(
|
272
|
-
"While parsing an array, found a stray '...'; ignoring it", "info"
|
273
|
-
)
|
274
|
-
else:
|
275
|
-
arr.append(value)
|
276
|
-
|
277
|
-
# skip over whitespace after a value but before closing ]
|
278
|
-
char = self.get_char_at()
|
279
|
-
while char and (char.isspace() or char == ","):
|
280
|
-
self.index += 1
|
281
|
-
char = self.get_char_at()
|
282
|
-
|
283
|
-
# Especially at the end of an LLM generated json you might miss the last "]"
|
284
|
-
char = self.get_char_at()
|
285
|
-
if char and char != "]":
|
286
|
-
self.log(
|
287
|
-
"While parsing an array we missed the closing ], adding it back", "info"
|
288
|
-
)
|
289
|
-
self.index -= 1
|
290
|
-
|
291
|
-
self.index += 1
|
292
|
-
self.reset_context()
|
293
|
-
return arr
|
294
|
-
|
295
|
-
def parse_string(self) -> Union[str, bool, None]:
|
296
|
-
# <string> is a string of valid characters enclosed in quotes
|
297
|
-
# i.e. { name: "John" }
|
298
|
-
# Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
|
299
|
-
|
300
|
-
# Flag to manage corner cases related to missing starting quote
|
301
|
-
missing_quotes = False
|
302
|
-
doubled_quotes = False
|
303
|
-
lstring_delimiter = rstring_delimiter = '"'
|
304
|
-
|
305
|
-
char = self.get_char_at()
|
306
|
-
# A valid string can only start with a valid quote or, in our case, with a literal
|
307
|
-
while char and char not in ['"', "'", "“"] and not char.isalnum():
|
308
|
-
self.index += 1
|
309
|
-
char = self.get_char_at()
|
310
|
-
|
311
|
-
if not char:
|
312
|
-
# This is an empty string
|
313
|
-
return ""
|
314
|
-
|
315
|
-
# Ensuring we use the right delimiter
|
316
|
-
if char == "'":
|
317
|
-
lstring_delimiter = rstring_delimiter = "'"
|
318
|
-
elif char == "“":
|
319
|
-
lstring_delimiter = "“"
|
320
|
-
rstring_delimiter = "”"
|
321
|
-
elif char.isalnum():
|
322
|
-
# This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
|
323
|
-
# But remember, object keys are only of type string
|
324
|
-
if char.lower() in ["t", "f", "n"] and self.get_context() != "object_key":
|
325
|
-
value = self.parse_boolean_or_null()
|
326
|
-
if value != "":
|
327
|
-
return value
|
328
|
-
self.log(
|
329
|
-
"While parsing a string, we found a literal instead of a quote",
|
330
|
-
"info",
|
331
|
-
)
|
332
|
-
self.log(
|
333
|
-
"While parsing a string, we found no starting quote. Will add the quote back",
|
334
|
-
"info",
|
335
|
-
)
|
336
|
-
missing_quotes = True
|
337
|
-
|
338
|
-
if not missing_quotes:
|
339
|
-
self.index += 1
|
340
|
-
|
341
|
-
# There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
|
342
|
-
if self.get_char_at() == lstring_delimiter:
|
343
|
-
# If it's an empty key, this was easy
|
344
|
-
if self.get_context() == "object_key" and self.get_char_at(1) == ":":
|
345
|
-
self.index += 1
|
346
|
-
return ""
|
347
|
-
# Find the next delimiter
|
348
|
-
i = 1
|
349
|
-
next_c = self.get_char_at(i)
|
350
|
-
while next_c and next_c != rstring_delimiter:
|
351
|
-
i += 1
|
352
|
-
next_c = self.get_char_at(i)
|
353
|
-
# Now check that the next character is also a delimiter to ensure that we have "".....""
|
354
|
-
# In that case we ignore this rstring delimiter
|
355
|
-
if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
|
356
|
-
self.log(
|
357
|
-
"While parsing a string, we found a valid starting doubled quote, ignoring it",
|
358
|
-
"info",
|
359
|
-
)
|
360
|
-
doubled_quotes = True
|
361
|
-
self.index += 1
|
362
|
-
else:
|
363
|
-
# Ok this is not a doubled quote, check if this is an empty string or not
|
364
|
-
i = 1
|
365
|
-
next_c = self.get_char_at(i)
|
366
|
-
while next_c and next_c.isspace():
|
367
|
-
i += 1
|
368
|
-
next_c = self.get_char_at(i)
|
369
|
-
if next_c not in [",", "]", "}"]:
|
370
|
-
self.log(
|
371
|
-
"While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
|
372
|
-
"info",
|
373
|
-
)
|
374
|
-
self.index += 1
|
375
|
-
|
376
|
-
# Initialize our return value
|
377
|
-
string_acc = ""
|
378
|
-
|
379
|
-
# Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
|
380
|
-
# In that case we need to use the ":|,|}" characters as terminators of the string
|
381
|
-
# So this will stop if:
|
382
|
-
# * It finds a closing quote
|
383
|
-
# * It iterated over the entire sequence
|
384
|
-
# * If we are fixing missing quotes in an object, when it finds the special terminators
|
385
|
-
char = self.get_char_at()
|
386
|
-
while char and char != rstring_delimiter:
|
387
|
-
if missing_quotes:
|
388
|
-
if self.get_context() == "object_key" and (
|
389
|
-
char == ":" or char.isspace()
|
390
|
-
):
|
391
|
-
self.log(
|
392
|
-
"While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
|
393
|
-
"info",
|
394
|
-
)
|
395
|
-
break
|
396
|
-
elif self.get_context() == "object_value" and char in [",", "}"]:
|
397
|
-
rstring_delimiter_missing = True
|
398
|
-
# check if this is a case in which the closing comma is NOT missing instead
|
399
|
-
i = 1
|
400
|
-
next_c = self.get_char_at(i)
|
401
|
-
while next_c and next_c != rstring_delimiter:
|
402
|
-
i += 1
|
403
|
-
next_c = self.get_char_at(i)
|
404
|
-
if next_c:
|
405
|
-
i += 1
|
406
|
-
next_c = self.get_char_at(i)
|
407
|
-
# found a delimiter, now we need to check that is followed strictly by a comma or brace
|
408
|
-
while next_c and next_c.isspace():
|
409
|
-
i += 1
|
410
|
-
next_c = self.get_char_at(i)
|
411
|
-
if next_c and next_c in [",", "}"]:
|
412
|
-
rstring_delimiter_missing = False
|
413
|
-
if rstring_delimiter_missing:
|
414
|
-
self.log(
|
415
|
-
"While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
|
416
|
-
"info",
|
417
|
-
)
|
418
|
-
break
|
419
|
-
string_acc += char
|
420
|
-
self.index += 1
|
421
|
-
char = self.get_char_at()
|
422
|
-
if char and len(string_acc) > 0 and string_acc[-1] == "\\":
|
423
|
-
# This is a special case, if people use real strings this might happen
|
424
|
-
self.log("Found a stray escape sequence, normalizing it", "info")
|
425
|
-
string_acc = string_acc[:-1]
|
426
|
-
if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
|
427
|
-
escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
|
428
|
-
string_acc += escape_seqs.get(char, char) or char
|
429
|
-
self.index += 1
|
430
|
-
char = self.get_char_at()
|
431
|
-
# ChatGPT sometimes forgets to quote stuff in html tags or markdown, so we do this whole thing here
|
432
|
-
if char == rstring_delimiter:
|
433
|
-
# Special case here, in case of double quotes one after another
|
434
|
-
if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
|
435
|
-
self.log(
|
436
|
-
"While parsing a string, we found a doubled quote, ignoring it",
|
437
|
-
"info",
|
438
|
-
)
|
439
|
-
self.index += 1
|
440
|
-
elif missing_quotes and self.get_context() == "object_value":
|
441
|
-
# In case of missing starting quote I need to check if the delimiter is the end or the beginning of a key
|
442
|
-
i = 1
|
443
|
-
next_c = self.get_char_at(i)
|
444
|
-
while next_c and next_c not in [
|
445
|
-
rstring_delimiter,
|
446
|
-
lstring_delimiter,
|
447
|
-
]:
|
448
|
-
i += 1
|
449
|
-
next_c = self.get_char_at(i)
|
450
|
-
if next_c:
|
451
|
-
# We found a quote, now let's make sure there's a ":" following
|
452
|
-
i += 1
|
453
|
-
next_c = self.get_char_at(i)
|
454
|
-
# found a delimiter, now we need to check that is followed strictly by a comma or brace
|
455
|
-
while next_c and next_c.isspace():
|
456
|
-
i += 1
|
457
|
-
next_c = self.get_char_at(i)
|
458
|
-
if next_c and next_c == ":":
|
459
|
-
# Reset the cursor
|
460
|
-
self.index -= 1
|
461
|
-
char = self.get_char_at()
|
462
|
-
self.log(
|
463
|
-
"In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
|
464
|
-
"info",
|
465
|
-
)
|
466
|
-
break
|
467
|
-
else:
|
468
|
-
# Check if eventually there is a rstring delimiter, otherwise we bail
|
469
|
-
i = 1
|
470
|
-
next_c = self.get_char_at(i)
|
471
|
-
check_comma_in_object_value = True
|
472
|
-
while next_c and next_c not in [
|
473
|
-
rstring_delimiter,
|
474
|
-
lstring_delimiter,
|
475
|
-
]:
|
476
|
-
# This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
|
477
|
-
# This is because the routine after will make sure to correct any bad guess and this solves a corner case
|
478
|
-
if check_comma_in_object_value and next_c.isalpha():
|
479
|
-
check_comma_in_object_value = False
|
480
|
-
# If we are in an object context, let's check for the right delimiters
|
481
|
-
if (
|
482
|
-
("object_key" in self.context and next_c in [":", "}"])
|
483
|
-
or ("object_value" in self.context and next_c == "}")
|
484
|
-
or ("array" in self.context and next_c in ["]", ","])
|
485
|
-
or (
|
486
|
-
check_comma_in_object_value
|
487
|
-
and self.get_context() == "object_value"
|
488
|
-
and next_c == ","
|
489
|
-
)
|
490
|
-
):
|
491
|
-
break
|
492
|
-
i += 1
|
493
|
-
next_c = self.get_char_at(i)
|
494
|
-
# If we stopped for a comma in object_value context, let's check if we find a "} at the end of the string
|
495
|
-
if next_c == "," and self.get_context() == "object_value":
|
496
|
-
i += 1
|
497
|
-
next_c = self.get_char_at(i)
|
498
|
-
while next_c and next_c != rstring_delimiter:
|
499
|
-
i += 1
|
500
|
-
next_c = self.get_char_at(i)
|
501
|
-
# Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
|
502
|
-
i += 1
|
503
|
-
next_c = self.get_char_at(i)
|
504
|
-
while next_c and next_c.isspace():
|
505
|
-
i += 1
|
506
|
-
next_c = self.get_char_at(i)
|
507
|
-
if next_c == "}":
|
508
|
-
# OK this is valid then
|
509
|
-
self.log(
|
510
|
-
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
|
511
|
-
"info",
|
512
|
-
)
|
513
|
-
string_acc += str(char)
|
514
|
-
self.index += 1
|
515
|
-
char = self.get_char_at()
|
516
|
-
elif next_c == rstring_delimiter:
|
517
|
-
if self.get_context() == "object_value":
|
518
|
-
# But this might not be it! This could be just a missing comma
|
519
|
-
# We found a delimiter and we need to check if this is a key
|
520
|
-
# so find a rstring_delimiter and a colon after
|
521
|
-
i += 1
|
522
|
-
next_c = self.get_char_at(i)
|
523
|
-
while next_c and next_c != rstring_delimiter:
|
524
|
-
i += 1
|
525
|
-
next_c = self.get_char_at(i)
|
526
|
-
i += 1
|
527
|
-
next_c = self.get_char_at(i)
|
528
|
-
while next_c and next_c != ":":
|
529
|
-
if next_c in [
|
530
|
-
lstring_delimiter,
|
531
|
-
rstring_delimiter,
|
532
|
-
",",
|
533
|
-
]:
|
534
|
-
break
|
535
|
-
i += 1
|
536
|
-
next_c = self.get_char_at(i)
|
537
|
-
# Only if we fail to find a ':' then we know this is misplaced quote
|
538
|
-
if next_c != ":":
|
539
|
-
self.log(
|
540
|
-
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
|
541
|
-
"info",
|
542
|
-
)
|
543
|
-
string_acc += str(char)
|
544
|
-
self.index += 1
|
545
|
-
char = self.get_char_at()
|
546
|
-
|
547
|
-
if (
|
548
|
-
char
|
549
|
-
and missing_quotes
|
550
|
-
and self.get_context() == "object_key"
|
551
|
-
and char.isspace()
|
552
|
-
):
|
553
|
-
self.log(
|
554
|
-
"While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
|
555
|
-
"info",
|
556
|
-
)
|
557
|
-
self.skip_whitespaces_at()
|
558
|
-
if self.get_char_at() not in [":", ","]:
|
559
|
-
return ""
|
560
|
-
|
561
|
-
# A fallout of the previous special case in the while loop,
|
562
|
-
# we need to update the index only if we had a closing quote
|
563
|
-
if char != rstring_delimiter:
|
564
|
-
self.log(
|
565
|
-
"While parsing a string, we missed the closing quote, ignoring",
|
566
|
-
"info",
|
567
|
-
)
|
568
|
-
else:
|
569
|
-
self.index += 1
|
570
|
-
|
571
|
-
return string_acc.rstrip()
|
572
|
-
|
573
|
-
def parse_number(self) -> Union[float, int, str, JSONReturnType]:
|
574
|
-
# <number> is a valid real number expressed in one of a number of given formats
|
575
|
-
number_str = ""
|
576
|
-
number_chars = set("0123456789-.eE/,")
|
577
|
-
char = self.get_char_at()
|
578
|
-
is_array = self.get_context() == "array"
|
579
|
-
while char and char in number_chars and (char != "," or not is_array):
|
580
|
-
number_str += char
|
581
|
-
self.index += 1
|
582
|
-
char = self.get_char_at()
|
583
|
-
if len(number_str) > 1 and number_str[-1] in "-eE/,":
|
584
|
-
# The number ends with a non valid character for a number/currency, rolling back one
|
585
|
-
number_str = number_str[:-1]
|
586
|
-
self.index -= 1
|
587
|
-
try:
|
588
|
-
if "," in number_str:
|
589
|
-
return str(number_str)
|
590
|
-
if "." in number_str or "e" in number_str or "E" in number_str:
|
591
|
-
return float(number_str)
|
592
|
-
elif number_str == "-":
|
593
|
-
# If there is a stray "-" this will throw an exception, throw away this character
|
594
|
-
return self.parse_json()
|
595
|
-
else:
|
596
|
-
return int(number_str)
|
597
|
-
except ValueError:
|
598
|
-
return number_str
|
599
|
-
|
600
|
-
def parse_boolean_or_null(self) -> Union[bool, str, None]:
|
601
|
-
# <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
|
602
|
-
starting_index = self.index
|
603
|
-
char = (self.get_char_at() or "").lower()
|
604
|
-
value: Optional[Tuple[str, Optional[bool]]]
|
605
|
-
if char == "t":
|
606
|
-
value = ("true", True)
|
607
|
-
elif char == "f":
|
608
|
-
value = ("false", False)
|
609
|
-
elif char == "n":
|
610
|
-
value = ("null", None)
|
611
|
-
|
612
|
-
if value:
|
613
|
-
i = 0
|
614
|
-
while char and i < len(value[0]) and char == value[0][i]:
|
615
|
-
i += 1
|
616
|
-
self.index += 1
|
617
|
-
char = (self.get_char_at() or "").lower()
|
618
|
-
if i == len(value[0]):
|
619
|
-
return value[1]
|
620
|
-
|
621
|
-
# If nothing works reset the index before returning
|
622
|
-
self.index = starting_index
|
623
|
-
return ""
|
624
|
-
|
625
|
-
def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
|
626
|
-
# Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
|
627
|
-
try:
|
628
|
-
return self.json_str[self.index + count]
|
629
|
-
except IndexError:
|
630
|
-
return False
|
631
|
-
|
632
|
-
def skip_whitespaces_at(self) -> None:
|
633
|
-
"""
|
634
|
-
This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
|
635
|
-
"""
|
636
|
-
try:
|
637
|
-
char = self.json_str[self.index]
|
638
|
-
except IndexError:
|
639
|
-
return
|
640
|
-
while char.isspace():
|
641
|
-
self.index += 1
|
642
|
-
try:
|
643
|
-
char = self.json_str[self.index]
|
644
|
-
except IndexError:
|
645
|
-
return
|
646
|
-
|
647
|
-
def set_context(self, value: str) -> None:
|
648
|
-
# If a value is provided update the context variable and save in stack
|
649
|
-
if value:
|
650
|
-
self.context.append(value)
|
651
|
-
|
652
|
-
def reset_context(self) -> None:
|
653
|
-
self.context.pop()
|
654
|
-
|
655
|
-
def get_context(self) -> str:
|
656
|
-
return self.context[-1]
|
657
|
-
|
658
|
-
def log(self, text: str, level: str) -> None:
|
659
|
-
if level == self.logger.log_level:
|
660
|
-
context = ""
|
661
|
-
start = max(self.index - self.logger.window, 0)
|
662
|
-
end = min(self.index + self.logger.window, len(self.json_str))
|
663
|
-
context = self.json_str[start:end]
|
664
|
-
self.logger.log.append(
|
665
|
-
{
|
666
|
-
"text": text,
|
667
|
-
"context": context,
|
668
|
-
}
|
669
|
-
)
|
28
|
+
from typing import Dict, List, Optional, Union, TextIO, Tuple
|
29
|
+
from .json_parser import JSONParser, JSONReturnType
|
670
30
|
|
671
31
|
|
672
32
|
def repair_json(
|