json-repair 0.29.1__py3-none-any.whl → 0.29.3__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -23,650 +23,10 @@ All supported use cases are in the unit tests
23
23
  """
24
24
 
25
25
  import argparse
26
- import os
27
26
  import sys
28
27
  import json
29
- from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
30
-
31
-
32
- class StringFileWrapper:
33
- # This is a trick to simplify the code, transform the filedescriptor handling into a string handling
34
- def __init__(self, fd: TextIO, CHUNK_LENGTH: int) -> None:
35
- self.fd = fd
36
- self.length: int = 0
37
- # Buffers are 1MB strings that are read from the file
38
- # and kept in memory to keep reads low
39
- self.buffers: dict[int, str] = {}
40
- # CHUNK_LENGTH is in bytes
41
- if not CHUNK_LENGTH or CHUNK_LENGTH < 2:
42
- CHUNK_LENGTH = 1_000_000
43
- self.buffer_length = CHUNK_LENGTH
44
-
45
- def get_buffer(self, index: int) -> str:
46
- if self.buffers.get(index) is None:
47
- self.fd.seek(index * self.buffer_length)
48
- self.buffers[index] = self.fd.read(self.buffer_length)
49
- # Save memory by keeping max 2MB buffer chunks and min 2 chunks
50
- if len(self.buffers) > max(2, 2_000_000 / self.buffer_length):
51
- oldest_key = next(iter(self.buffers))
52
- if oldest_key != index:
53
- self.buffers.pop(oldest_key)
54
- return self.buffers[index]
55
-
56
- def __getitem__(self, index: Union[int, slice]) -> str:
57
- # The buffer is an array that is seek like a RAM:
58
- # self.buffers[index]: the row in the array of length 1MB, index is `i` modulo CHUNK_LENGTH
59
- # self.buffures[index][j]: the column of the row that is `i` remainder CHUNK_LENGTH
60
- if isinstance(index, slice):
61
- buffer_index = index.start // self.buffer_length
62
- buffer_end = index.stop // self.buffer_length
63
- if buffer_index == buffer_end:
64
- return self.get_buffer(buffer_index)[
65
- index.start % self.buffer_length : index.stop % self.buffer_length
66
- ]
67
- else:
68
- start_slice = self.get_buffer(buffer_index)[
69
- index.start % self.buffer_length :
70
- ]
71
- end_slice = self.get_buffer(buffer_end)[
72
- : index.stop % self.buffer_length
73
- ]
74
- middle_slices = [
75
- self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)
76
- ]
77
- return start_slice + "".join(middle_slices) + end_slice
78
- else:
79
- buffer_index = index // self.buffer_length
80
- return self.get_buffer(buffer_index)[index % self.buffer_length]
81
-
82
- def __len__(self) -> int:
83
- if self.length < 1:
84
- current_position = self.fd.tell()
85
- self.fd.seek(0, os.SEEK_END)
86
- self.length = self.fd.tell()
87
- self.fd.seek(current_position)
88
- return self.length
89
-
90
-
91
- class LoggerConfig:
92
- # This is a type class to simplify the declaration
93
- def __init__(self, log_level: Optional[str]):
94
- self.log: List[Dict[str, str]] = []
95
- self.window: int = 10
96
- self.log_level: str = log_level if log_level else "none"
97
-
98
-
99
- JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
100
-
101
-
102
- class JSONParser:
103
- def __init__(
104
- self,
105
- json_str: Union[str, StringFileWrapper],
106
- json_fd: Optional[TextIO],
107
- logging: Optional[bool],
108
- json_fd_chunk_length: int = 0,
109
- ) -> None:
110
- # The string to parse
111
- self.json_str = json_str
112
- # Alternatively, the file description with a json file in it
113
- if json_fd:
114
- # This is a trick we do to treat the file wrapper as an array
115
- self.json_str = StringFileWrapper(json_fd, json_fd_chunk_length)
116
- # Index is our iterator that will keep track of which character we are looking at right now
117
- self.index: int = 0
118
- # This is used in the object member parsing to manage the special cases of missing quotes in key or value
119
- self.context: list[str] = []
120
- # Use this to log the activity, but only if logging is active
121
- self.logger = LoggerConfig(log_level="info" if logging else None)
122
-
123
- def parse(
124
- self,
125
- ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
126
- json = self.parse_json()
127
- if self.index < len(self.json_str):
128
- self.log(
129
- "The parser returned early, checking if there's more json elements",
130
- "info",
131
- )
132
- json = [json]
133
- last_index = self.index
134
- while self.index < len(self.json_str):
135
- j = self.parse_json()
136
- if j != "":
137
- json.append(j)
138
- if self.index == last_index:
139
- self.index += 1
140
- last_index = self.index
141
- # If nothing extra was found, don't return an array
142
- if len(json) == 1:
143
- self.log(
144
- "There were no more elements, returning the element without the array",
145
- "info",
146
- )
147
- json = json[0]
148
- if self.logger.log_level == "none":
149
- return json
150
- else:
151
- return json, self.logger.log
152
-
153
- def parse_json(
154
- self,
155
- ) -> JSONReturnType:
156
- while True:
157
- char = self.get_char_at()
158
- # This parser will ignore any basic element (string or number) that is not inside an array or object
159
- is_in_context = len(self.context) > 0
160
- # False means that we are at the end of the string provided
161
- if char is False:
162
- return ""
163
- # <object> starts with '{'
164
- elif char == "{":
165
- self.index += 1
166
- return self.parse_object()
167
- # <array> starts with '['
168
- elif char == "[":
169
- self.index += 1
170
- return self.parse_array()
171
- # there can be an edge case in which a key is empty and at the end of an object
172
- # like "key": }. We return an empty string here to close the object properly
173
- elif char == "}":
174
- self.log(
175
- "At the end of an object we found a key with missing value, skipping",
176
- "info",
177
- )
178
- return ""
179
- # <string> starts with a quote
180
- elif is_in_context and (char in ['"', "'", "“"] or char.isalpha()):
181
- return self.parse_string()
182
- # <number> starts with [0-9] or minus
183
- elif is_in_context and (char.isdigit() or char == "-" or char == "."):
184
- return self.parse_number()
185
- # If everything else fails, we just ignore and move on
186
- else:
187
- self.index += 1
188
-
189
- def parse_object(self) -> Dict[str, Any]:
190
- # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
191
- obj = {}
192
- # Stop when you either find the closing parentheses or you have iterated over the entire string
193
- while (self.get_char_at() or "}") != "}":
194
- # This is what we expect to find:
195
- # <member> ::= <string> ': ' <json>
196
-
197
- # Skip filler whitespaces
198
- self.skip_whitespaces_at()
199
-
200
- # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
201
- if (self.get_char_at() or "") == ":":
202
- self.log(
203
- "While parsing an object we found a : before a key, ignoring",
204
- "info",
205
- )
206
- self.index += 1
207
-
208
- # We are now searching for they string key
209
- # Context is used in the string parser to manage the lack of quotes
210
- self.set_context("object_key")
211
-
212
- self.skip_whitespaces_at()
213
-
214
- # <member> starts with a <string>
215
- key = ""
216
- while self.get_char_at():
217
- key = str(self.parse_string())
218
-
219
- if key != "" or (key == "" and self.get_char_at() == ":"):
220
- # If the string is empty but there is a object divider, we are done here
221
- break
222
-
223
- self.skip_whitespaces_at()
224
-
225
- # We reached the end here
226
- if (self.get_char_at() or "}") == "}":
227
- continue
228
-
229
- self.skip_whitespaces_at()
230
-
231
- # An extreme case of missing ":" after a key
232
- if (self.get_char_at() or "") != ":":
233
- self.log(
234
- "While parsing an object we missed a : after a key",
235
- "info",
236
- )
237
-
238
- self.index += 1
239
- self.reset_context()
240
- self.set_context("object_value")
241
- # The value can be any valid json
242
- value = self.parse_json()
243
-
244
- # Reset context since our job is done
245
- self.reset_context()
246
- obj[key] = value
247
-
248
- if (self.get_char_at() or "") in [",", "'", '"']:
249
- self.index += 1
250
-
251
- # Remove trailing spaces
252
- self.skip_whitespaces_at()
253
-
254
- self.index += 1
255
- return obj
256
-
257
- def parse_array(self) -> List[Any]:
258
- # <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
259
- arr = []
260
- self.set_context("array")
261
- # Stop when you either find the closing parentheses or you have iterated over the entire string
262
- while (self.get_char_at() or "]") != "]":
263
- self.skip_whitespaces_at()
264
- value = self.parse_json()
265
-
266
- # It is possible that parse_json() returns nothing valid, so we stop
267
- if value == "":
268
- break
269
-
270
- if value == "..." and self.get_char_at(-1) == ".":
271
- self.log(
272
- "While parsing an array, found a stray '...'; ignoring it", "info"
273
- )
274
- else:
275
- arr.append(value)
276
-
277
- # skip over whitespace after a value but before closing ]
278
- char = self.get_char_at()
279
- while char and (char.isspace() or char == ","):
280
- self.index += 1
281
- char = self.get_char_at()
282
-
283
- # Especially at the end of an LLM generated json you might miss the last "]"
284
- char = self.get_char_at()
285
- if char and char != "]":
286
- self.log(
287
- "While parsing an array we missed the closing ], adding it back", "info"
288
- )
289
- self.index -= 1
290
-
291
- self.index += 1
292
- self.reset_context()
293
- return arr
294
-
295
- def parse_string(self) -> Union[str, bool, None]:
296
- # <string> is a string of valid characters enclosed in quotes
297
- # i.e. { name: "John" }
298
- # Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
299
-
300
- # Flag to manage corner cases related to missing starting quote
301
- missing_quotes = False
302
- doubled_quotes = False
303
- lstring_delimiter = rstring_delimiter = '"'
304
-
305
- char = self.get_char_at()
306
- # A valid string can only start with a valid quote or, in our case, with a literal
307
- while char and char not in ['"', "'", "“"] and not char.isalnum():
308
- self.index += 1
309
- char = self.get_char_at()
310
-
311
- if not char:
312
- # This is an empty string
313
- return ""
314
-
315
- # Ensuring we use the right delimiter
316
- if char == "'":
317
- lstring_delimiter = rstring_delimiter = "'"
318
- elif char == "“":
319
- lstring_delimiter = "“"
320
- rstring_delimiter = "”"
321
- elif char.isalnum():
322
- # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
323
- # But remember, object keys are only of type string
324
- if char.lower() in ["t", "f", "n"] and self.get_context() != "object_key":
325
- value = self.parse_boolean_or_null()
326
- if value != "":
327
- return value
328
- self.log(
329
- "While parsing a string, we found a literal instead of a quote",
330
- "info",
331
- )
332
- self.log(
333
- "While parsing a string, we found no starting quote. Will add the quote back",
334
- "info",
335
- )
336
- missing_quotes = True
337
-
338
- if not missing_quotes:
339
- self.index += 1
340
-
341
- # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
342
- if self.get_char_at() == lstring_delimiter:
343
- # If it's an empty key, this was easy
344
- if self.get_context() == "object_key" and self.get_char_at(1) == ":":
345
- self.index += 1
346
- return ""
347
- # Find the next delimiter
348
- i = 1
349
- next_c = self.get_char_at(i)
350
- while next_c and next_c != rstring_delimiter:
351
- i += 1
352
- next_c = self.get_char_at(i)
353
- # Now check that the next character is also a delimiter to ensure that we have "".....""
354
- # In that case we ignore this rstring delimiter
355
- if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
356
- self.log(
357
- "While parsing a string, we found a valid starting doubled quote, ignoring it",
358
- "info",
359
- )
360
- doubled_quotes = True
361
- self.index += 1
362
- else:
363
- # Ok this is not a doubled quote, check if this is an empty string or not
364
- i = 1
365
- next_c = self.get_char_at(i)
366
- while next_c and next_c.isspace():
367
- i += 1
368
- next_c = self.get_char_at(i)
369
- if next_c not in [",", "]", "}"]:
370
- self.log(
371
- "While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
372
- "info",
373
- )
374
- self.index += 1
375
-
376
- # Initialize our return value
377
- string_acc = ""
378
-
379
- # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
380
- # In that case we need to use the ":|,|}" characters as terminators of the string
381
- # So this will stop if:
382
- # * It finds a closing quote
383
- # * It iterated over the entire sequence
384
- # * If we are fixing missing quotes in an object, when it finds the special terminators
385
- char = self.get_char_at()
386
- while char and char != rstring_delimiter:
387
- if missing_quotes:
388
- if self.get_context() == "object_key" and (
389
- char == ":" or char.isspace()
390
- ):
391
- self.log(
392
- "While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
393
- "info",
394
- )
395
- break
396
- elif self.get_context() == "object_value" and char in [",", "}"]:
397
- rstring_delimiter_missing = True
398
- # check if this is a case in which the closing comma is NOT missing instead
399
- i = 1
400
- next_c = self.get_char_at(i)
401
- while next_c and next_c != rstring_delimiter:
402
- i += 1
403
- next_c = self.get_char_at(i)
404
- if next_c:
405
- i += 1
406
- next_c = self.get_char_at(i)
407
- # found a delimiter, now we need to check that is followed strictly by a comma or brace
408
- while next_c and next_c.isspace():
409
- i += 1
410
- next_c = self.get_char_at(i)
411
- if next_c and next_c in [",", "}"]:
412
- rstring_delimiter_missing = False
413
- if rstring_delimiter_missing:
414
- self.log(
415
- "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
416
- "info",
417
- )
418
- break
419
- string_acc += char
420
- self.index += 1
421
- char = self.get_char_at()
422
- if char and len(string_acc) > 0 and string_acc[-1] == "\\":
423
- # This is a special case, if people use real strings this might happen
424
- self.log("Found a stray escape sequence, normalizing it", "info")
425
- string_acc = string_acc[:-1]
426
- if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
427
- escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
428
- string_acc += escape_seqs.get(char, char) or char
429
- self.index += 1
430
- char = self.get_char_at()
431
- # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
432
- if char == rstring_delimiter:
433
- # Special case here, in case of double quotes one after another
434
- if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
435
- self.log(
436
- "While parsing a string, we found a doubled quote, ignoring it",
437
- "info",
438
- )
439
- self.index += 1
440
- elif missing_quotes and self.get_context() == "object_value":
441
- # In case of missing starting quote I need to check if the delimeter is the end or the beginning of a key
442
- i = 1
443
- next_c = self.get_char_at(i)
444
- while next_c and next_c not in [
445
- rstring_delimiter,
446
- lstring_delimiter,
447
- ]:
448
- i += 1
449
- next_c = self.get_char_at(i)
450
- if next_c:
451
- # We found a quote, now let's make sure there's a ":" following
452
- i += 1
453
- next_c = self.get_char_at(i)
454
- # found a delimiter, now we need to check that is followed strictly by a comma or brace
455
- while next_c and next_c.isspace():
456
- i += 1
457
- next_c = self.get_char_at(i)
458
- if next_c and next_c == ":":
459
- # Reset the cursor
460
- self.index -= 1
461
- char = self.get_char_at()
462
- self.log(
463
- "In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
464
- "info",
465
- )
466
- break
467
- else:
468
- # Check if eventually there is a rstring delimiter, otherwise we bail
469
- i = 1
470
- next_c = self.get_char_at(i)
471
- check_comma_in_object_value = True
472
- while next_c and next_c not in [
473
- rstring_delimiter,
474
- lstring_delimiter,
475
- ]:
476
- # This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
477
- # This is because the routine after will make sure to correct any bad guess and this solves a corner case
478
- if check_comma_in_object_value and next_c.isalpha():
479
- check_comma_in_object_value = False
480
- # If we are in an object context, let's check for the right delimiters
481
- if (
482
- ("object_key" in self.context and next_c in [":", "}"])
483
- or ("object_value" in self.context and next_c == "}")
484
- or ("array" in self.context and next_c in ["]", ","])
485
- or (
486
- check_comma_in_object_value
487
- and self.get_context() == "object_value"
488
- and next_c == ","
489
- )
490
- ):
491
- break
492
- i += 1
493
- next_c = self.get_char_at(i)
494
- # If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
495
- if next_c == "," and self.get_context() == "object_value":
496
- i += 1
497
- next_c = self.get_char_at(i)
498
- while next_c and next_c != rstring_delimiter:
499
- i += 1
500
- next_c = self.get_char_at(i)
501
- # Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
502
- i += 1
503
- next_c = self.get_char_at(i)
504
- while next_c and next_c.isspace():
505
- i += 1
506
- next_c = self.get_char_at(i)
507
- if next_c == "}":
508
- # OK this is valid then
509
- self.log(
510
- "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
511
- "info",
512
- )
513
- string_acc += str(char)
514
- self.index += 1
515
- char = self.get_char_at()
516
- elif next_c == rstring_delimiter:
517
- if self.get_context() == "object_value":
518
- # But this might not be it! This could be just a missing comma
519
- # We found a delimiter and we need to check if this is a key
520
- # so find a rstring_delimiter and a colon after
521
- i += 1
522
- next_c = self.get_char_at(i)
523
- while next_c and next_c != rstring_delimiter:
524
- i += 1
525
- next_c = self.get_char_at(i)
526
- i += 1
527
- next_c = self.get_char_at(i)
528
- while next_c and next_c != ":":
529
- if next_c in [
530
- lstring_delimiter,
531
- rstring_delimiter,
532
- ",",
533
- ]:
534
- break
535
- i += 1
536
- next_c = self.get_char_at(i)
537
- # Only if we fail to find a ':' then we know this is misplaced quote
538
- if next_c != ":":
539
- self.log(
540
- "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
541
- "info",
542
- )
543
- string_acc += str(char)
544
- self.index += 1
545
- char = self.get_char_at()
546
-
547
- if (
548
- char
549
- and missing_quotes
550
- and self.get_context() == "object_key"
551
- and char.isspace()
552
- ):
553
- self.log(
554
- "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
555
- "info",
556
- )
557
- self.skip_whitespaces_at()
558
- if self.get_char_at() not in [":", ","]:
559
- return ""
560
-
561
- # A fallout of the previous special case in the while loop,
562
- # we need to update the index only if we had a closing quote
563
- if char != rstring_delimiter:
564
- self.log(
565
- "While parsing a string, we missed the closing quote, ignoring",
566
- "info",
567
- )
568
- else:
569
- self.index += 1
570
-
571
- return string_acc.rstrip()
572
-
573
- def parse_number(self) -> Union[float, int, str, JSONReturnType]:
574
- # <number> is a valid real number expressed in one of a number of given formats
575
- number_str = ""
576
- number_chars = set("0123456789-.eE/,")
577
- char = self.get_char_at()
578
- is_array = self.get_context() == "array"
579
- while char and char in number_chars and (char != "," or not is_array):
580
- number_str += char
581
- self.index += 1
582
- char = self.get_char_at()
583
- if len(number_str) > 1 and number_str[-1] in "-eE/,":
584
- # The number ends with a non valid character for a number/currency, rolling back one
585
- number_str = number_str[:-1]
586
- self.index -= 1
587
- try:
588
- if "," in number_str:
589
- return str(number_str)
590
- if "." in number_str or "e" in number_str or "E" in number_str:
591
- return float(number_str)
592
- elif number_str == "-":
593
- # If there is a stray "-" this will throw an exception, throw away this character
594
- return self.parse_json()
595
- else:
596
- return int(number_str)
597
- except ValueError:
598
- return number_str
599
-
600
- def parse_boolean_or_null(self) -> Union[bool, str, None]:
601
- # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
602
- starting_index = self.index
603
- char = (self.get_char_at() or "").lower()
604
- value: Optional[Tuple[str, Optional[bool]]]
605
- if char == "t":
606
- value = ("true", True)
607
- elif char == "f":
608
- value = ("false", False)
609
- elif char == "n":
610
- value = ("null", None)
611
-
612
- if value:
613
- i = 0
614
- while char and i < len(value[0]) and char == value[0][i]:
615
- i += 1
616
- self.index += 1
617
- char = (self.get_char_at() or "").lower()
618
- if i == len(value[0]):
619
- return value[1]
620
-
621
- # If nothing works reset the index before returning
622
- self.index = starting_index
623
- return ""
624
-
625
- def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
626
- # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
627
- try:
628
- return self.json_str[self.index + count]
629
- except IndexError:
630
- return False
631
-
632
- def skip_whitespaces_at(self) -> None:
633
- """
634
- This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
635
- """
636
- try:
637
- char = self.json_str[self.index]
638
- except IndexError:
639
- return
640
- while char.isspace():
641
- self.index += 1
642
- try:
643
- char = self.json_str[self.index]
644
- except IndexError:
645
- return
646
-
647
- def set_context(self, value: str) -> None:
648
- # If a value is provided update the context variable and save in stack
649
- if value:
650
- self.context.append(value)
651
-
652
- def reset_context(self) -> None:
653
- self.context.pop()
654
-
655
- def get_context(self) -> str:
656
- return self.context[-1]
657
-
658
- def log(self, text: str, level: str) -> None:
659
- if level == self.logger.log_level:
660
- context = ""
661
- start = max(self.index - self.logger.window, 0)
662
- end = min(self.index + self.logger.window, len(self.json_str))
663
- context = self.json_str[start:end]
664
- self.logger.log.append(
665
- {
666
- "text": text,
667
- "context": context,
668
- }
669
- )
28
+ from typing import Dict, List, Optional, Union, TextIO, Tuple
29
+ from .json_parser import JSONParser, JSONReturnType
670
30
 
671
31
 
672
32
  def repair_json(