json-repair 0.29.2__py3-none-any.whl → 0.29.4__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,45 @@
1
+ from enum import Enum, auto
2
+ from typing import List, Optional
3
+
4
+
5
class ContextValues(Enum):
    """Kinds of position that can be pushed onto a JsonContext stack."""

    # The values are written out explicitly; they are the same numbers that
    # auto() hands out for the first three members of a plain Enum.
    OBJECT_KEY = 1
    OBJECT_VALUE = 2
    ARRAY = 3
9
+
10
+
11
class JsonContext:
    """
    A small stack of context values.

    Attributes:
        context: Every value pushed so far and not yet removed, oldest first.
        current: The value on top of the stack, or None when the stack is empty.
        empty: True until the first value is pushed, and again once reset()
            finds nothing left on the stack.
    """

    def __init__(self) -> None:
        self.context: List[ContextValues] = []
        self.current: Optional[ContextValues] = None
        self.empty: bool = True

    def set(self, value: ContextValues) -> None:
        """
        Push a context value and make it the current one.

        Args:
            value (ContextValues): The context value to be added. A falsy
                value is ignored and leaves the stack untouched.

        Returns:
            None
        """
        if not value:
            return
        self.context.append(value)
        self.current = value
        self.empty = False

    def reset(self) -> None:
        """
        Drop the most recent context value.

        The value underneath (if any) becomes the current one. When nothing
        is left, or nothing was there to begin with, the context goes back to
        its initial "empty" state.

        Returns:
            None
        """
        if self.context:
            self.context.pop()
        if self.context:
            self.current = self.context[-1]
        else:
            self.current = None
            self.empty = True
@@ -0,0 +1,584 @@
1
+ from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
2
+
3
+ from .string_file_wrapper import StringFileWrapper
4
+ from .json_context import JsonContext, ContextValues
5
+
6
+ JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
7
+
8
+
9
+ class JSONParser:
10
def __init__(
    self,
    json_str: Union[str, StringFileWrapper],
    json_fd: Optional[TextIO],
    logging: Optional[bool],
    json_fd_chunk_length: int = 0,
) -> None:
    """
    Prepare a parser over a string or over an open text file.

    Args:
        json_str: The text to parse. Ignored when json_fd is truthy.
        json_fd: Optional file object holding the JSON text. When given it is
            wrapped in a StringFileWrapper so the parser can index into it
            the same way it indexes into a string.
        logging: When truthy, log entries are collected in self.logger;
            otherwise self.log is a function that does nothing.
        json_fd_chunk_length: Passed through to StringFileWrapper.
    """
    # The file wrapper, when requested, takes the place of the plain string
    self.json_str: Union[str, StringFileWrapper] = (
        StringFileWrapper(json_fd, json_fd_chunk_length) if json_fd else json_str
    )
    # Position of the character currently being looked at
    self.index: int = 0
    # Tracks whether we are inside an object key, an object value or an array;
    # the string parser relies on it to cope with missing quotes
    self.context = JsonContext()

    # self.log is called all over the parser without any guard. Rather than
    # wrapping every call in an "if", the attribute itself is swapped for a
    # no-op when logging is off.
    self.logging = logging
    if not logging:
        self.log = lambda *args, **kwargs: None
        return
    self.logger: List[Dict[str, str]] = []
    self.log = self._log
39
+
40
def parse(
    self,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Parse the whole input and return the decoded value.

    If the first parse_json() call stops before the end of the input, parsing
    is repeated until the input is exhausted and every non-empty result is
    gathered into a list. A list holding a single element is unwrapped again.

    Returns:
        The parsed value, or a (value, log entries) tuple when logging is on.
    """
    result = self.parse_json()
    if self.index < len(self.json_str):
        self.log(
            "The parser returned early, checking if there's more json elements",
        )
        collected = [result]
        previous_index = self.index
        while self.index < len(self.json_str):
            element = self.parse_json()
            if element != "":
                collected.append(element)
            # Guarantee forward progress when the last call consumed nothing
            if self.index == previous_index:
                self.index += 1
            previous_index = self.index
        if len(collected) == 1:
            # Nothing extra was found, so do not return an array
            self.log(
                "There were no more elements, returning the element without the array",
            )
            result = collected[0]
        else:
            result = collected
    return (result, self.logger) if self.logging else result
67
+
68
def parse_json(
    self,
) -> JSONReturnType:
    """
    Find the next value starting at self.index and hand it to the right parser.

    Characters that cannot start a value are skipped one at a time. Strings
    and numbers are only recognised while a context is active (that is, inside
    an object or an array); with an empty context they are skipped as well.

    Returns:
        The parsed value, or "" when the end of the input or a stray "}" is
        reached first.
    """
    quote_starts = ('"', "'", "“")
    while True:
        char = self.get_char_at()
        # False means that we ran off the end of the input
        if char is False:
            return ""
        # <object> starts with '{'
        if char == "{":
            self.index += 1
            return self.parse_object()
        # <array> starts with '['
        if char == "[":
            self.index += 1
            return self.parse_array()
        # Edge case: a key with no value right before the end of an object,
        # like "key": }. Returning "" lets the object close properly.
        if char == "}":
            self.log(
                "At the end of an object we found a key with missing value, skipping",
            )
            return ""
        if not self.context.empty:
            # <string> starts with a quote (or a bare letter when quotes are missing)
            if char in quote_starts or char.isalpha():
                return self.parse_string()
            # <number> starts with [0-9], a minus sign or a dot
            if char.isdigit() or char == "-" or char == ".":
                return self.parse_number()
        # Nothing matched: ignore this character and move on
        self.index += 1
102
+
103
def parse_object(self) -> Dict[str, JSONReturnType]:
    """
    Parse the members of an object, starting just after its opening "{".

    <object> ::= '{' [ <member> *(', ' <member>) ] '}'
    <member> ::= <string> ': ' <json>

    The loop runs until a "}" or the end of the input is found and then steps
    over that closing character. Each member pushes OBJECT_KEY while the key
    is read and OBJECT_VALUE while the value is read; both are popped again
    before the next member, including when a key turns out to have no value.

    Returns:
        The dictionary built from the members that had a value.
    """
    obj = {}
    # Stop at the closing brace or when the whole input has been consumed
    while (self.get_char_at() or "}") != "}":
        # Skip filler whitespaces
        self.skip_whitespaces_at()

        # Sometimes LLMs do weird things: a ":" showing up before any key is
        # simply skipped
        if (self.get_char_at() or "") == ":":
            self.log(
                "While parsing an object we found a : before a key, ignoring",
            )
            self.index += 1

        # We are now searching for the string key.
        # The context lets the string parser cope with missing quotes.
        self.context.set(ContextValues.OBJECT_KEY)

        self.skip_whitespaces_at()

        # <member> starts with a <string>
        key = ""
        while self.get_char_at():
            key = str(self.parse_string())

            # An empty key is only accepted when the ":" divider follows it
            if key != "" or self.get_char_at() == ":":
                break

        self.skip_whitespaces_at()

        # The object (or the input) ends right after the key
        if (self.get_char_at() or "}") == "}":
            # Pop the OBJECT_KEY pushed above; without this the context stack
            # stays unbalanced after the object has been closed.
            self.context.reset()
            continue

        self.skip_whitespaces_at()

        # An extreme case of missing ":" after a key
        if (self.get_char_at() or "") != ":":
            self.log(
                "While parsing an object we missed a : after a key",
            )

        self.index += 1
        self.context.reset()
        self.context.set(ContextValues.OBJECT_VALUE)
        # The value can be any valid json
        value = self.parse_json()

        # Reset context since our job is done
        self.context.reset()
        obj[key] = value

        # Step over the member separator (or a leftover quote)
        if (self.get_char_at() or "") in [",", "'", '"']:
            self.index += 1

        # Remove trailing spaces
        self.skip_whitespaces_at()

    # Step over the closing "}"
    self.index += 1
    return obj
168
+
169
def parse_array(self) -> List[JSONReturnType]:
    """
    Parse the elements of an array, starting just after its opening "[".

    <array> ::= '[' [ <json> *(', ' <json>) ] ']'

    The ARRAY context is active for the whole call. Parsing stops at "]", at
    the end of the input, or as soon as parse_json() returns "".

    Returns:
        The list of parsed elements.
    """
    items = []
    self.context.set(ContextValues.ARRAY)
    # Stop at the closing bracket or when the whole input has been consumed
    while (self.get_char_at() or "]") != "]":
        self.skip_whitespaces_at()
        element = self.parse_json()

        # parse_json() may come back with nothing valid, in which case we stop
        if element == "":
            break

        is_stray_ellipsis = element == "..." and self.get_char_at(-1) == "."
        if is_stray_ellipsis:
            self.log(
                "While parsing an array, found a stray '...'; ignoring it",
            )
        else:
            items.append(element)

        # Skip whitespace and commas between this value and the next token
        while True:
            char = self.get_char_at()
            if not char or not (char.isspace() or char == ","):
                break
            self.index += 1

    # Especially at the end of an LLM generated json the last "]" can be
    # missing. In that case the index stays where it is; otherwise the
    # closing bracket (or the end-of-input position) is stepped over.
    char = self.get_char_at()
    if char and char != "]":
        self.log(
            "While parsing an array we missed the closing ], adding it back",
        )
    else:
        self.index += 1

    self.context.reset()
    return items
206
+
207
def parse_string(self) -> Union[str, bool, None]:
    """
    Parse a string starting at (or after) self.index, repairing common damage.

    <string> is a string of valid characters enclosed in quotes,
    i.e. { name: "John" }

    Besides well-formed strings this handles: a missing opening quote, a
    missing closing quote, doubled quotes, curly quotes, stray backslash
    escapes and quotes that appear inside the text. When the text starts with
    a bare t/f/n outside of an object key, parse_boolean_or_null() is tried
    first and its result is returned if it matched.

    Most of the repair heuristics for invalid JSON live in this function and
    they depend on the exact order of the checks, so change it with care.

    Returns:
        The accumulated string with trailing whitespace stripped, "" when no
        string could be read, or the bool/None produced by
        parse_boolean_or_null().
    """
    # Flags to manage corner cases related to a missing starting quote and to doubled quotes
    missing_quotes = False
    doubled_quotes = False
    lstring_delimiter = rstring_delimiter = '"'

    char = self.get_char_at()
    # A valid string can only start with a valid quote or, in our case, with a literal;
    # anything else in front of it is skipped
    while char and char not in ['"', "'", "“"] and not char.isalnum():
        self.index += 1
        char = self.get_char_at()

    if not char:
        # We ran out of input: this is an empty string
        return ""

    # Ensuring we use the right delimiter
    if char == "'":
        lstring_delimiter = rstring_delimiter = "'"
    elif char == "“":
        # Curly quotes open and close with different characters
        lstring_delimiter = "“"
        rstring_delimiter = "”"
    elif char.isalnum():
        # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
        # But remember, object keys are only of type string
        if (
            char.lower() in ["t", "f", "n"]
            and self.context.current != ContextValues.OBJECT_KEY
        ):
            value = self.parse_boolean_or_null()
            if value != "":
                return value
        self.log(
            "While parsing a string, we found a literal instead of a quote",
        )
        self.log(
            "While parsing a string, we found no starting quote. Will add the quote back",
        )
        missing_quotes = True

    # Step over the opening quote, if there was one
    if not missing_quotes:
        self.index += 1

    # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
    if self.get_char_at() == lstring_delimiter:
        # If it's an empty key, this was easy
        if (
            self.context.current == ContextValues.OBJECT_KEY
            and self.get_char_at(1) == ":"
        ):
            self.index += 1
            return ""
        # Find the next delimiter
        i = self.skip_to_character(character=rstring_delimiter, idx=1)
        next_c = self.get_char_at(i)
        # Now check that the next character is also a delimiter to ensure that we have "".....""
        # In that case we ignore this rstring delimiter
        if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
            self.log(
                "While parsing a string, we found a valid starting doubled quote, ignoring it",
            )
            doubled_quotes = True
            self.index += 1
        else:
            # Ok this is not a doubled quote, check if this is an empty string or not:
            # an empty string is followed (after whitespace) by "," "]" or "}"
            i = self.skip_whitespaces_at(idx=1, move_main_index=False)
            next_c = self.get_char_at(i)
            if next_c not in [",", "]", "}"]:
                self.log(
                    "While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
                )
                self.index += 1

    # Initialize our return value
    string_acc = ""

    # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
    # In that case we need to use the ":|,|}" characters as terminators of the string
    # So this will stop if:
    # * It finds a closing quote
    # * It iterated over the entire sequence
    # * If we are fixing missing quotes in an object, when it finds the special terminators
    char = self.get_char_at()
    while char and char != rstring_delimiter:
        # An unquoted object key ends at the first ":" or whitespace
        if (
            missing_quotes
            and self.context.current == ContextValues.OBJECT_KEY
            and (char == ":" or char.isspace())
        ):
            self.log(
                "While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
            )
            break
        if self.context.current == ContextValues.OBJECT_VALUE and char in [
            ",",
            "}",
        ]:
            rstring_delimiter_missing = True
            # check if this is a case in which the closing delimiter is NOT missing instead
            i = self.skip_to_character(character=rstring_delimiter, idx=1)
            next_c = self.get_char_at(i)
            if next_c:
                i += 1
                # found a delimiter, now we need to check that is followed strictly by a comma or brace
                i = self.skip_whitespaces_at(idx=i, move_main_index=False)
                next_c = self.get_char_at(i)
                if next_c and next_c in [",", "}"]:
                    rstring_delimiter_missing = False
            if rstring_delimiter_missing:
                self.log(
                    "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
                )
                break
        # Regular character: keep it and look at the next one
        string_acc += char
        self.index += 1
        char = self.get_char_at()
        if char and len(string_acc) > 0 and string_acc[-1] == "\\":
            # This is a special case, if people use real strings this might happen
            self.log("Found a stray escape sequence, normalizing it")
            # Drop the backslash; for a known escape, append the character it stands for
            string_acc = string_acc[:-1]
            if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
                escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
                string_acc += escape_seqs.get(char, char) or char
                self.index += 1
                char = self.get_char_at()
        # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here:
        # we are sitting on a closing delimiter and must decide whether it really closes the string
        if char == rstring_delimiter:
            # Special case here, in case of double quotes one after another
            if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
                self.log(
                    "While parsing a string, we found a doubled quote, ignoring it"
                )
                self.index += 1
            elif (
                missing_quotes
                and self.context.current == ContextValues.OBJECT_VALUE
            ):
                # In case of missing starting quote I need to check if the delimiter is the end or the beginning of a key
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c not in [
                    rstring_delimiter,
                    lstring_delimiter,
                ]:
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c:
                    # We found a quote, now let's make sure there's a ":" following
                    i += 1
                    # skip any whitespace after that quote and look at what comes next
                    i = self.skip_whitespaces_at(idx=i, move_main_index=False)
                    next_c = self.get_char_at(i)
                    if next_c and next_c == ":":
                        # Reset the cursor
                        self.index -= 1
                        char = self.get_char_at()
                        self.log(
                            "In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
                        )
                        break
            else:
                # Check if eventually there is a rstring delimiter, otherwise we bail
                i = 1
                next_c = self.get_char_at(i)
                check_comma_in_object_value = True
                while next_c and next_c not in [
                    rstring_delimiter,
                    lstring_delimiter,
                ]:
                    # This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
                    # This is because the routine after will make sure to correct any bad guess and this solves a corner case
                    if check_comma_in_object_value and next_c.isalpha():
                        check_comma_in_object_value = False
                    # If we are in an object context, let's check for the right delimiters
                    if (
                        (
                            ContextValues.OBJECT_KEY in self.context.context
                            and next_c in [":", "}"]
                        )
                        or (
                            ContextValues.OBJECT_VALUE in self.context.context
                            and next_c == "}"
                        )
                        or (
                            ContextValues.ARRAY in self.context.context
                            and next_c in ["]", ","]
                        )
                        or (
                            check_comma_in_object_value
                            and self.context.current == ContextValues.OBJECT_VALUE
                            and next_c == ","
                        )
                    ):
                        break
                    i += 1
                    next_c = self.get_char_at(i)
                # If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
                if (
                    next_c == ","
                    and self.context.current == ContextValues.OBJECT_VALUE
                ):
                    i += 1
                    i = self.skip_to_character(character=rstring_delimiter, idx=i)
                    next_c = self.get_char_at(i)
                    # Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
                    i += 1
                    i = self.skip_whitespaces_at(idx=i, move_main_index=False)
                    next_c = self.get_char_at(i)
                    if next_c == "}":
                        # OK this is valid then: keep the quote as part of the string
                        self.log(
                            "While parsing a string, we misplaced a quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
                        )
                        string_acc += str(char)
                        self.index += 1
                        char = self.get_char_at()
                elif next_c == rstring_delimiter:
                    if self.context.current == ContextValues.OBJECT_VALUE:
                        # But this might not be it! This could be just a missing comma
                        # We found a delimiter and we need to check if this is a key
                        # so find a rstring_delimiter and a colon after
                        i += 1
                        i = self.skip_to_character(
                            character=rstring_delimiter, idx=i
                        )
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c != ":":
                            if next_c in [
                                lstring_delimiter,
                                rstring_delimiter,
                                ",",
                            ]:
                                break
                            i += 1
                            next_c = self.get_char_at(i)
                        # Only if we fail to find a ':' then we know this is misplaced quote
                        if next_c != ":":
                            self.log(
                                "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                            )
                            string_acc += str(char)
                            self.index += 1
                            char = self.get_char_at()

    # An unquoted key that stopped at whitespace is only kept when a ":" or ","
    # follows the whitespace; otherwise it is treated as a stray comment
    if (
        char
        and missing_quotes
        and self.context.current == ContextValues.OBJECT_KEY
        and char.isspace()
    ):
        self.log(
            "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
        )
        self.skip_whitespaces_at()
        if self.get_char_at() not in [":", ","]:
            return ""

    # A fallout of the previous special case in the while loop,
    # we need to update the index only if we had a closing quote
    if char != rstring_delimiter:
        self.log(
            "While parsing a string, we missed the closing quote, ignoring",
        )
    else:
        self.index += 1

    return string_acc.rstrip()
479
+
480
def parse_number(self) -> Union[float, int, str, JSONReturnType]:
    """
    Parse a number (or something number-like) starting at self.index.

    Characters from "0123456789-.eE/," are consumed greedily, except that a
    comma ends the number when the current context is an array. A single
    trailing "-", "e", "E", "/" or "," is handed back to the input.

    Returns:
        An int or a float when the text converts cleanly; the raw text when it
        contains a comma or cannot be converted; the result of parse_json()
        when the text is only a stray "-".
    """
    allowed = frozenset("0123456789-.eE/,")
    pieces = []
    char = self.get_char_at()
    inside_array = self.context.current == ContextValues.ARRAY
    while char and char in allowed and not (char == "," and inside_array):
        pieces.append(char)
        self.index += 1
        char = self.get_char_at()
    number_str = "".join(pieces)

    if len(number_str) > 1 and number_str[-1] in "-eE/,":
        # The number ends with a character that cannot end a number/currency: roll back one
        number_str = number_str[:-1]
        self.index -= 1

    try:
        if "," in number_str:
            return number_str
        if number_str == "-":
            # A lone "-" is not a number: throw it away and parse what follows
            return self.parse_json()
        if any(marker in number_str for marker in ".eE"):
            return float(number_str)
        return int(number_str)
    except ValueError:
        return number_str
506
+
507
def parse_boolean_or_null(self) -> Union[bool, str, None]:
    """
    Try to read an unquoted true / false / null literal at self.index.

    Matching is case-insensitive. On success the index is left just past the
    literal. On failure the index is restored to where it was.

    Returns:
        True, False or None for a complete literal, "" when no literal starts
        at the current position or the literal is incomplete.
    """
    starting_index = self.index
    char = (self.get_char_at() or "").lower()

    # First letter -> (full spelling, resulting Python value)
    literals = {
        "t": ("true", True),
        "f": ("false", False),
        "n": ("null", None),
    }
    # None when the current character cannot start any literal. Looking the
    # candidate up this way keeps it defined on every path.
    value = literals.get(char)

    if value is not None:
        word, result = value
        matched = 0
        while char and matched < len(word) and char == word[matched]:
            matched += 1
            self.index += 1
            char = (self.get_char_at() or "").lower()
        if matched == len(word):
            return result

    # If nothing works reset the index before returning
    self.index = starting_index
    return ""
531
+
532
def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
    """
    Return the character `count` positions away from self.index.

    Returns:
        The character, or False when indexing the input raises IndexError.
    """
    position = self.index + count
    # try/except instead of a bounds check: the lookup succeeds almost every
    # time, and the happy path of a try block is cheaper than an "if" that is
    # nearly always true.
    try:
        char = self.json_str[position]
    except IndexError:
        return False
    return char
538
+
539
def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
    """
    Walk over whitespace starting at self.index + idx.

    With move_main_index=True the parser's own index is advanced and idx is
    returned unchanged. With move_main_index=False the parser's index is left
    alone and the returned offset points at the first non-whitespace
    character (or past the end of the input).
    """
    while True:
        try:
            char = self.json_str[self.index + idx]
        except IndexError:
            return idx
        if not char.isspace():
            return idx
        if move_main_index:
            self.index += 1
        else:
            idx += 1
557
+
558
def skip_to_character(self, character: str, idx: int = 0) -> int:
    """
    Look ahead for `character`, starting at self.index + idx.

    The parser's own index is never moved.

    Returns:
        The offset (relative to self.index) of the first match, or the offset
        at which the input ran out when there is no match.
    """
    while True:
        try:
            char = self.json_str[self.index + idx]
        except IndexError:
            return idx
        if char == character:
            return idx
        idx += 1
573
+
574
+ def _log(self, text: str) -> None:
575
+ window: int = 10
576
+ start: int = max(self.index - window, 0)
577
+ end: int = min(self.index + window, len(self.json_str))
578
+ context: str = self.json_str[start:end]
579
+ self.logger.append(
580
+ {
581
+ "text": text,
582
+ "context": context,
583
+ }
584
+ )