json-repair 0.29.2__py3-none-any.whl → 0.29.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,45 @@
1
+ from enum import Enum, auto
2
+ from typing import List, Optional
3
+
4
+
5
class ContextValues(Enum):
    """Kinds of JSON structure the parser can be positioned inside."""

    # Set by the object parser right before it reads a member key.
    OBJECT_KEY = auto()
    # Set by the object parser right before it reads a member value.
    OBJECT_VALUE = auto()
    # Set by the array parser for the whole duration of an array.
    ARRAY = auto()
9
+
10
+
11
class JsonContext:
    """Stack of the JSON structures the parser is currently nested in."""

    def __init__(self) -> None:
        # Innermost context is the last element of the list.
        self.context: List[ContextValues] = []
        # Copy of the top of the stack, or None when nothing is active.
        self.current: Optional[ContextValues] = None
        # True until a context is pushed, and again once the stack runs dry.
        self.empty: bool = True

    def set(self, value: ContextValues) -> None:
        """
        Push a context value and make it the current one.

        Args:
            value (ContextValues): The context being entered. Falsy values are ignored.

        Returns:
            None
        """
        if not value:
            return
        self.context.append(value)
        self.current = value
        self.empty = False

    def reset(self) -> None:
        """
        Drop the most recent context value and fall back to the one below it.

        When no context is left afterwards (or none existed to begin with),
        ``current`` becomes None and ``empty`` becomes True.

        Returns:
            None
        """
        if self.context:
            self.context.pop()
        if self.context:
            self.current = self.context[-1]
        else:
            self.current = None
            self.empty = True
@@ -0,0 +1,584 @@
1
+ from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
2
+
3
+ from .string_file_wrapper import StringFileWrapper
4
+ from .json_context import JsonContext, ContextValues
5
+
6
# Union of every value type the parsing methods are annotated to return.
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
7
+
8
+
9
+ class JSONParser:
10
def __init__(
    self,
    json_str: Union[str, StringFileWrapper],
    json_fd: Optional[TextIO],
    logging: Optional[bool],
    json_fd_chunk_length: int = 0,
) -> None:
    """
    Prepare a parser over a string or over an open text file.

    Args:
        json_str: The text to parse. Ignored when ``json_fd`` is truthy.
        json_fd: Optional file object; when given it is wrapped in a
            StringFileWrapper so it can be indexed like a string.
        logging: When truthy, repair actions are recorded in ``self.logger``.
        json_fd_chunk_length: Passed to StringFileWrapper along with ``json_fd``.
    """
    # A file descriptor takes precedence over the string: wrapping it lets the
    # rest of the parser index into the file the same way it indexes a string.
    self.json_str: Union[str, StringFileWrapper] = (
        StringFileWrapper(json_fd, json_fd_chunk_length) if json_fd else json_str
    )
    # Cursor: position of the character currently being examined.
    self.index: int = 0
    # Tells the string parser whether it sits in an object key, an object value
    # or an array, which drives the handling of missing quotes.
    self.context = JsonContext()
    self.logging = logging
    # self.log is called unconditionally all over the parser. Instead of guarding
    # every call site, bind it once: to the real logger when logging is on,
    # to a no-op otherwise.
    if not logging:
        self.log = lambda *args, **kwargs: None
    else:
        self.logger: List[Dict[str, str]] = []
        self.log = self._log
39
+
40
def parse(
    self,
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
    """
    Parse the whole input and return the repaired value.

    If input remains after the first value, parsing continues and every further
    non-empty value is collected into a list together with the first one.

    Returns:
        The parsed value, or a ``(value, log_entries)`` tuple when logging is on.
    """
    result = self.parse_json()
    if self.index < len(self.json_str):
        self.log(
            "The parser returned early, checking if there's more json elements",
        )
        documents = [result]
        previous_index = self.index
        while self.index < len(self.json_str):
            extra = self.parse_json()
            if extra != "":
                documents.append(extra)
            # Guarantee progress when parse_json consumed nothing.
            if self.index == previous_index:
                self.index += 1
            previous_index = self.index
        if len(documents) == 1:
            # Nothing extra was found: keep the bare first value, not a list.
            self.log(
                "There were no more elements, returning the element without the array",
            )
        else:
            result = documents
    return (result, self.logger) if self.logging else result
67
+
68
def parse_json(
    self,
) -> JSONReturnType:
    """
    Dispatch on the character under the cursor to the matching sub-parser.

    Characters that cannot start a value are skipped one at a time. Strings and
    numbers are only recognised while a context (object or array) is active.

    Returns:
        The parsed value, or "" at end of input or on a stray "}".
    """
    while True:
        char = self.get_char_at()
        # False means the end of the provided string was reached
        if char is False:
            return ""
        # <object> starts with '{'
        if char == "{":
            self.index += 1
            return self.parse_object()
        # <array> starts with '['
        if char == "[":
            self.index += 1
            return self.parse_array()
        # Edge case: a key without value at the end of an object, like "key": }.
        # Returning an empty string lets the caller close the object properly.
        if char == "}":
            self.log(
                "At the end of an object we found a key with missing value, skipping",
            )
            return ""
        if not self.context.empty:
            # <string> starts with a quote (or, when quotes are missing, a letter)
            if char in ['"', "'", "“"] or char.isalpha():
                return self.parse_string()
            # <number> starts with [0-9], a minus or a dot
            if char.isdigit() or char == "-" or char == ".":
                return self.parse_number()
        # Nothing matched: ignore this character and move on
        self.index += 1
102
+
103
def parse_object(self) -> Dict[str, JSONReturnType]:
    """
    Parse the members of an object; the opening "{" has already been consumed.

    Each iteration reads one ``key: value`` member, tolerating a stray leading
    ":", a missing ":" after the key and a stray quote after the value.

    Returns:
        A dict mapping each key (always converted with ``str``) to its parsed value.
    """
    # NOTE(review): the nesting below was reconstructed from a flattened listing;
    # confirm against the packaged source before relying on it.
    # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
    obj = {}
    # Stop when you either find the closing brace or you have iterated over the entire string
    while (self.get_char_at() or "}") != "}":
        # This is what we expect to find:
        # <member> ::= <string> ': ' <json>

        # Skip filler whitespaces
        self.skip_whitespaces_at()

        # Sometimes LLMs do weird things: if we find a ":" this early we simply skip it
        if (self.get_char_at() or "") == ":":
            self.log(
                "While parsing an object we found a : before a key, ignoring",
            )
            self.index += 1

        # We are now searching for the string key
        # Context is used in the string parser to manage the lack of quotes
        self.context.set(ContextValues.OBJECT_KEY)

        self.skip_whitespaces_at()

        # <member> starts with a <string>
        key = ""
        while self.get_char_at():
            key = str(self.parse_string())

            if key != "" or (key == "" and self.get_char_at() == ":"):
                # Either we have a key, or the key is empty but an object divider follows: done
                break

        self.skip_whitespaces_at()

        # We reached the end here
        # NOTE(review): this path skips the context.reset() below, so the OBJECT_KEY
        # pushed above appears to stay on the context stack — confirm this is intended.
        if (self.get_char_at() or "}") == "}":
            continue

        self.skip_whitespaces_at()

        # An extreme case of missing ":" after a key
        if (self.get_char_at() or "") != ":":
            self.log(
                "While parsing an object we missed a : after a key",
            )

        # The cursor advances by one whether or not a ":" was actually found
        self.index += 1
        self.context.reset()
        self.context.set(ContextValues.OBJECT_VALUE)
        # The value can be any valid json
        value = self.parse_json()

        # Reset context since our job is done
        self.context.reset()
        obj[key] = value

        # Skip the member separator, or a stray quote left behind after the value
        if (self.get_char_at() or "") in [",", "'", '"']:
            self.index += 1

        # Remove trailing spaces
        self.skip_whitespaces_at()

    # Step over the closing "}" (or past the end of the input)
    self.index += 1
    return obj
168
+
169
def parse_array(self) -> List[JSONReturnType]:
    """
    Parse the elements of an array; the opening "[" has already been consumed.

    Parsing stops at "]", at the end of the input, or as soon as an element
    parses to "". A stray "..." element is dropped.

    Returns:
        The list of parsed elements.
    """
    # <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
    items = []
    self.context.set(ContextValues.ARRAY)
    # Stop at the closing bracket or once the whole string has been consumed
    while (self.get_char_at() or "]") != "]":
        self.skip_whitespaces_at()
        element = self.parse_json()

        # parse_json() may return nothing valid, in which case we stop
        if element == "":
            break

        if element == "..." and self.get_char_at(-1) == ".":
            self.log(
                "While parsing an array, found a stray '...'; ignoring it",
            )
        else:
            items.append(element)

        # Skip whitespace and commas after a value but before the closing ]
        while True:
            separator = self.get_char_at()
            if not separator or not (separator.isspace() or separator == ","):
                break
            self.index += 1

    # Especially at the end of an LLM generated json the last "]" may be missing
    trailing = self.get_char_at()
    if trailing and trailing != "]":
        self.log(
            "While parsing an array we missed the closing ], adding it back",
        )
        # Cancels out the unconditional increment right below
        self.index -= 1

    self.index += 1
    self.context.reset()
    return items
206
+
207
def parse_string(self) -> Union[str, bool, None]:
    """
    Parse a string at the cursor, repairing missing, doubled or misplaced quotes.

    Accepts double quotes, single quotes and curly quotes as delimiters, and
    also unquoted text. Outside of an object key, unquoted text starting with
    t/f/n is first tried as a true/false/null literal.

    Returns:
        The string content with trailing whitespace stripped; a bool or None when
        a true/false/null literal was parsed instead; "" when nothing usable
        was found.
    """
    # NOTE(review): the nesting below was reconstructed from a flattened listing;
    # confirm against the packaged source before relying on it.
    # <string> is a string of valid characters enclosed in quotes
    # i.e. { name: "John" }
    # Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here

    # Flags to manage corner cases related to a missing starting quote / doubled quotes
    missing_quotes = False
    doubled_quotes = False
    lstring_delimiter = rstring_delimiter = '"'

    char = self.get_char_at()
    # A valid string can only start with a valid quote or, in our case, with a literal
    while char and char not in ['"', "'", "“"] and not char.isalnum():
        self.index += 1
        char = self.get_char_at()

    if not char:
        # This is an empty string
        return ""

    # Ensuring we use the right delimiter
    if char == "'":
        lstring_delimiter = rstring_delimiter = "'"
    elif char == "“":
        lstring_delimiter = "“"
        rstring_delimiter = "”"
    elif char.isalnum():
        # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
        # But remember, object keys are only of type string
        if (
            char.lower() in ["t", "f", "n"]
            and self.context.current != ContextValues.OBJECT_KEY
        ):
            value = self.parse_boolean_or_null()
            if value != "":
                return value
        self.log(
            "While parsing a string, we found a literal instead of a quote",
        )
        self.log(
            "While parsing a string, we found no starting quote. Will add the quote back",
        )
        missing_quotes = True

    # Step over the opening quote, if there was one
    if not missing_quotes:
        self.index += 1

    # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
    if self.get_char_at() == lstring_delimiter:
        # If it's an empty key, this was easy
        if (
            self.context.current == ContextValues.OBJECT_KEY
            and self.get_char_at(1) == ":"
        ):
            self.index += 1
            return ""
        # Find the next delimiter
        i = self.skip_to_character(character=rstring_delimiter, idx=1)
        next_c = self.get_char_at(i)
        # Now check that the next character is also a delimiter to ensure that we have "".....""
        # In that case we ignore this rstring delimiter
        if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
            self.log(
                "While parsing a string, we found a valid starting doubled quote, ignoring it",
            )
            doubled_quotes = True
            self.index += 1
        else:
            # Ok this is not a doubled quote, check if this is an empty string or not
            i = self.skip_whitespaces_at(idx=1, move_main_index=False)
            next_c = self.get_char_at(i)
            if next_c not in [",", "]", "}"]:
                self.log(
                    "While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
                )
                self.index += 1

    # Initialize our return value
    string_acc = ""

    # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
    # In that case we need to use the ":|,|}" characters as terminators of the string
    # So this will stop if:
    # * It finds a closing quote
    # * It iterated over the entire sequence
    # * If we are fixing missing quotes in an object, when it finds the special terminators
    char = self.get_char_at()
    while char and char != rstring_delimiter:
        if (
            missing_quotes
            and self.context.current == ContextValues.OBJECT_KEY
            and (char == ":" or char.isspace())
        ):
            self.log(
                "While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
            )
            break
        if self.context.current == ContextValues.OBJECT_VALUE and char in [
            ",",
            "}",
        ]:
            rstring_delimiter_missing = True
            # check if this is a case in which the closing delimiter is NOT missing after all
            i = self.skip_to_character(character=rstring_delimiter, idx=1)
            next_c = self.get_char_at(i)
            if next_c:
                i += 1
                # found a delimiter, now we need to check that is followed strictly by a comma or brace
                i = self.skip_whitespaces_at(idx=i, move_main_index=False)
                next_c = self.get_char_at(i)
                if next_c and next_c in [",", "}"]:
                    rstring_delimiter_missing = False
            if rstring_delimiter_missing:
                self.log(
                    "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
                )
                break
        string_acc += char
        self.index += 1
        char = self.get_char_at()
        if char and len(string_acc) > 0 and string_acc[-1] == "\\":
            # This is a special case, if people use real strings this might happen
            self.log("Found a stray escape sequence, normalizing it")
            # The backslash is dropped; a recognised escape is replaced by the character it denotes
            string_acc = string_acc[:-1]
            if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
                escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
                string_acc += escape_seqs.get(char, char) or char
                self.index += 1
                char = self.get_char_at()
        # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
        if char == rstring_delimiter:
            # Special case here, in case of double quotes one after another
            if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
                self.log(
                    "While parsing a string, we found a doubled quote, ignoring it"
                )
                self.index += 1
            elif (
                missing_quotes
                and self.context.current == ContextValues.OBJECT_VALUE
            ):
                # In case of missing starting quote I need to check if the delimiter is the end or the beginning of a key
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c not in [
                    rstring_delimiter,
                    lstring_delimiter,
                ]:
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c:
                    # We found a quote, now let's make sure there's a ":" following
                    i += 1
                    # skip any whitespace between that quote and the character we want to inspect
                    i = self.skip_whitespaces_at(idx=i, move_main_index=False)
                    next_c = self.get_char_at(i)
                    if next_c and next_c == ":":
                        # Reset the cursor
                        self.index -= 1
                        char = self.get_char_at()
                        self.log(
                            "In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
                        )
                        break
            else:
                # Check if eventually there is a rstring delimiter, otherwise we bail
                i = 1
                next_c = self.get_char_at(i)
                check_comma_in_object_value = True
                while next_c and next_c not in [
                    rstring_delimiter,
                    lstring_delimiter,
                ]:
                    # This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
                    # This is because the routine after will make sure to correct any bad guess and this solves a corner case
                    if check_comma_in_object_value and next_c.isalpha():
                        check_comma_in_object_value = False
                    # If we are in an object context, let's check for the right delimiters
                    if (
                        (
                            ContextValues.OBJECT_KEY in self.context.context
                            and next_c in [":", "}"]
                        )
                        or (
                            ContextValues.OBJECT_VALUE in self.context.context
                            and next_c == "}"
                        )
                        or (
                            ContextValues.ARRAY in self.context.context
                            and next_c in ["]", ","]
                        )
                        or (
                            check_comma_in_object_value
                            and self.context.current == ContextValues.OBJECT_VALUE
                            and next_c == ","
                        )
                    ):
                        break
                    i += 1
                    next_c = self.get_char_at(i)
                # If we stopped for a comma in object_value context, let's check if we find a "} at the end of the string
                if (
                    next_c == ","
                    and self.context.current == ContextValues.OBJECT_VALUE
                ):
                    i += 1
                    i = self.skip_to_character(character=rstring_delimiter, idx=i)
                    next_c = self.get_char_at(i)
                    # Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
                    i += 1
                    i = self.skip_whitespaces_at(idx=i, move_main_index=False)
                    next_c = self.get_char_at(i)
                    if next_c == "}":
                        # OK this is valid then
                        self.log(
                            "While parsing a string, we misplaced a quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
                        )
                        # Keep the quote as part of the string content
                        string_acc += str(char)
                        self.index += 1
                        char = self.get_char_at()
                elif next_c == rstring_delimiter:
                    if self.context.current == ContextValues.OBJECT_VALUE:
                        # But this might not be it! This could be just a missing comma
                        # We found a delimiter and we need to check if this is a key
                        # so find a rstring_delimiter and a colon after
                        i += 1
                        i = self.skip_to_character(
                            character=rstring_delimiter, idx=i
                        )
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c != ":":
                            if next_c in [
                                lstring_delimiter,
                                rstring_delimiter,
                                ",",
                            ]:
                                break
                            i += 1
                            next_c = self.get_char_at(i)
                        # Only if we fail to find a ':' then we know this is misplaced quote
                        if next_c != ":":
                            self.log(
                                "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                            )
                            # Keep the quote as part of the string content
                            string_acc += str(char)
                            self.index += 1
                            char = self.get_char_at()

    if (
        char
        and missing_quotes
        and self.context.current == ContextValues.OBJECT_KEY
        and char.isspace()
    ):
        self.log(
            "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
        )
        self.skip_whitespaces_at()
        if self.get_char_at() not in [":", ","]:
            return ""

    # A fallout of the previous special case in the while loop,
    # we need to update the index only if we had a closing quote
    if char != rstring_delimiter:
        self.log(
            "While parsing a string, we missed the closing quote, ignoring",
        )
    else:
        self.index += 1

    return string_acc.rstrip()
479
+
480
def parse_number(self) -> Union[float, int, str, JSONReturnType]:
    """
    Parse a number (or number-like token) starting at the cursor.

    Returns:
        An int or float when the collected text converts cleanly; the raw text
        when it contains a comma or fails to convert; whatever parse_json()
        yields next when the token was only a stray "-".
    """
    # <number> is a valid real number expressed in one of a number of given formats
    allowed = set("0123456789-.eE/,")
    in_array = self.context.current == ContextValues.ARRAY
    collected = ""
    char = self.get_char_at()
    while char and char in allowed:
        # Inside an array a comma separates elements, so it ends the number
        if in_array and char == ",":
            break
        collected += char
        self.index += 1
        char = self.get_char_at()
    if len(collected) > 1 and collected[-1] in "-eE/,":
        # The token ends with a character that cannot end a number/currency: roll back one
        collected = collected[:-1]
        self.index -= 1
    try:
        if "," in collected:
            return str(collected)
        if any(marker in collected for marker in (".", "e", "E")):
            return float(collected)
        if collected == "-":
            # A stray "-" cannot be converted; discard it and parse what follows
            return self.parse_json()
        return int(collected)
    except ValueError:
        return collected
506
+
507
def parse_boolean_or_null(self) -> Union[bool, str, None]:
    """
    Try to read an unquoted ``true``, ``false`` or ``null`` literal at the cursor.

    Matching is case-insensitive. On success the cursor is left right after the
    literal; on failure it is restored to where it started.

    Returns:
        True, False or None for a complete literal, "" when no literal is found.
    """
    # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
    starting_index = self.index
    char = (self.get_char_at() or "").lower()
    literals = {
        "t": ("true", True),
        "f": ("false", False),
        "n": ("null", None),
    }
    # .get() yields None for any other first character. Previously the variable
    # was left unbound in that case and the check below raised UnboundLocalError.
    value: Optional[Tuple[str, Optional[bool]]] = literals.get(char)

    if value:
        i = 0
        # Consume characters for as long as they keep matching the expected literal
        while char and i < len(value[0]) and char == value[0][i]:
            i += 1
            self.index += 1
            char = (self.get_char_at() or "").lower()
        if i == len(value[0]):
            return value[1]

    # If nothing works reset the index before returning
    self.index = starting_index
    return ""
531
+
532
def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
    """
    Return the character ``count`` positions away from the cursor.

    Returns:
        The character, or False when that position raises IndexError.
    """
    # try/except instead of a bounds check: the lookup succeeds far more often
    # than it fails, so the exception path is the cheap way to handle the rare miss
    try:
        found = self.json_str[self.index + count]
    except IndexError:
        return False
    return found
538
+
539
def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
    """
    Skip over whitespace starting ``idx`` characters after the cursor.

    With ``move_main_index`` true the cursor itself advances and ``idx`` is
    returned unchanged; otherwise the cursor stays put and the returned offset
    points at the first non-whitespace character (or past the end of the input).
    """
    while True:
        try:
            current = self.json_str[self.index + idx]
        except IndexError:
            break
        if not current.isspace():
            break
        if move_main_index:
            self.index += 1
        else:
            idx += 1
    return idx
557
+
558
def skip_to_character(self, character: str, idx: int = 0) -> int:
    """
    Find the next occurrence of ``character`` without moving the cursor.

    The search starts ``idx`` characters after the cursor. Returns the offset
    (relative to the cursor) of the match, or the first offset past the end of
    the input when there is none.
    """
    while True:
        try:
            current = self.json_str[self.index + idx]
        except IndexError:
            return idx
        if current == character:
            return idx
        idx += 1
573
+
574
+ def _log(self, text: str) -> None:
575
+ window: int = 10
576
+ start: int = max(self.index - window, 0)
577
+ end: int = min(self.index + window, len(self.json_str))
578
+ context: str = self.json_str[start:end]
579
+ self.logger.append(
580
+ {
581
+ "text": text,
582
+ "context": context,
583
+ }
584
+ )