json-repair 0.29.2__py3-none-any.whl → 0.29.4__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -23,651 +23,10 @@ All supported use cases are in the unit tests
23
23
  """
24
24
 
25
25
  import argparse
26
- import os
27
26
  import sys
28
27
  import json
29
- from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
30
-
31
-
32
class StringFileWrapper:
    """Read-only, ``str``-like view over a seekable text file.

    The parser indexes and slices its input as if it were a string. This
    wrapper serves those operations from fixed-size chunks read lazily from
    ``fd`` and kept in a small cache, so the file is never loaded whole.

    NOTE(review): positions are translated to ``fd.seek(chunk * size)`` and
    the length is taken from ``fd.tell()``. For text files the io docs only
    guarantee ``seek`` offsets previously returned by ``tell``; confirm the
    behaviour for multi-byte encodings before relying on it.
    """

    def __init__(self, fd: TextIO, CHUNK_LENGTH: int) -> None:
        """Wrap ``fd``.

        Args:
            fd: Seekable text file object to read from.
            CHUNK_LENGTH: Size of each cached chunk; falsy values or values
                below 2 fall back to 1_000_000.
        """
        self.fd = fd
        # Cached total length; 0 means "not measured yet" (see __len__).
        self.length: int = 0
        # Chunk number -> chunk text, in insertion order (oldest first).
        self.buffers: dict[int, str] = {}
        if not CHUNK_LENGTH or CHUNK_LENGTH < 2:
            CHUNK_LENGTH = 1_000_000
        self.buffer_length = CHUNK_LENGTH

    def get_buffer(self, index: int) -> str:
        """Return chunk number ``index``, reading it from the file on a cache miss."""
        if self.buffers.get(index) is None:
            self.fd.seek(index * self.buffer_length)
            self.buffers[index] = self.fd.read(self.buffer_length)
            # Bound memory: keep roughly 2_000_000 characters of chunks, but
            # never fewer than two chunks.
            if len(self.buffers) > max(2, 2_000_000 / self.buffer_length):
                oldest_key = next(iter(self.buffers))
                # Never evict the chunk that was just requested.
                if oldest_key != index:
                    self.buffers.pop(oldest_key)
        return self.buffers[index]

    def __getitem__(self, index: Union[int, slice]) -> str:
        """Return the character or substring at ``index`` with ``str`` semantics.

        Supports negative positions, open-ended slices and slice steps.

        Raises:
            IndexError: If an integer ``index`` is outside the file.
        """
        chunk_size = self.buffer_length
        if isinstance(index, slice):
            # Normalise None / negative / out-of-range bounds exactly as str does.
            start, stop, step = index.indices(len(self))
            if step != 1:
                # Rare path: gather characters one by one.
                return "".join(self[i] for i in range(start, stop, step))
            if start >= stop:
                return ""
            first_chunk, start_offset = divmod(start, chunk_size)
            last_chunk, stop_offset = divmod(stop, chunk_size)
            if first_chunk == last_chunk:
                return self.get_buffer(first_chunk)[start_offset:stop_offset]
            parts = [self.get_buffer(first_chunk)[start_offset:]]
            parts.extend(
                self.get_buffer(i) for i in range(first_chunk + 1, last_chunk)
            )
            # A slice that ends on a chunk boundary needs nothing from the
            # next chunk, so do not read it.
            if stop_offset:
                parts.append(self.get_buffer(last_chunk)[:stop_offset])
            return "".join(parts)

        position = index
        if position < 0:
            position += len(self)
            if position < 0:
                raise IndexError("string index out of range")
        chunk_number, offset = divmod(position, chunk_size)
        # Past the end of the file the chunk is short (or empty), so this
        # raises IndexError just like indexing a str would.
        return self.get_buffer(chunk_number)[offset]

    def __len__(self) -> int:
        """Return the file length as reported by ``fd.tell()`` at end of file."""
        if self.length < 1:
            # Measure once, restoring the caller's file position afterwards.
            current_position = self.fd.tell()
            self.fd.seek(0, os.SEEK_END)
            self.length = self.fd.tell()
            self.fd.seek(current_position)
        return self.length
89
-
90
-
91
class LoggerConfig:
    """Small typed container holding the parser's logging state."""

    def __init__(self, log_level: Optional[str]):
        # A falsy level (None or "") means logging is disabled, spelled "none".
        self.log_level: str = log_level or "none"
        # How many characters around the cursor are captured with each entry.
        self.window: int = 10
        # Accumulated log entries.
        self.log: List[Dict[str, str]] = []
97
-
98
-
99
# Every Python value the parser can return for a JSON document: an object
# (dict), an array (list), a string, a number, a boolean or null (None).
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
100
-
101
-
102
class JSONParser:
    """Best-effort recursive-descent JSON parser that repairs input as it reads.

    The parser walks ``json_str`` with a single cursor (``index``) and keeps a
    stack of contexts (``"object_key"``, ``"object_value"``, ``"array"``) that
    the string and number parsers consult to decide how to recover from
    missing quotes, missing delimiters and similar damage.
    """

    def __init__(
        self,
        json_str: Union[str, "StringFileWrapper"],
        json_fd: Optional[TextIO],
        logging: Optional[bool],
        json_fd_chunk_length: int = 0,
    ) -> None:
        """Set up the parser.

        Args:
            json_str: The text to parse.
            json_fd: Optional file object; when given it replaces ``json_str``
                and is read through a ``StringFileWrapper``.
            logging: When truthy, repair actions are recorded and ``parse``
                returns them alongside the result.
            json_fd_chunk_length: Chunk size forwarded to ``StringFileWrapper``.
        """
        # The string to parse
        self.json_str = json_str
        # Alternatively, the file descriptor with a json file in it
        if json_fd:
            # This is a trick we do to treat the file wrapper as an array
            self.json_str = StringFileWrapper(json_fd, json_fd_chunk_length)
        # Index is our iterator that will keep track of which character we are looking at right now
        self.index: int = 0
        # This is used in the object member parsing to manage the special cases of missing quotes in key or value
        self.context: list[str] = []
        # Use this to log the activity, but only if logging is active
        self.logger = LoggerConfig(log_level="info" if logging else None)

    def parse(
        self,
    ) -> Union["JSONReturnType", Tuple["JSONReturnType", List[Dict[str, str]]]]:
        """Parse the whole input.

        If text remains after the first element, further elements are parsed
        and collected into a list. Returns the value alone when logging is
        off, otherwise a ``(value, log_entries)`` tuple.
        """
        # Named `parsed` rather than `json` so the imported json module is not shadowed.
        parsed = self.parse_json()
        if self.index < len(self.json_str):
            self.log(
                "The parser returned early, checking if there's more json elements",
                "info",
            )
            parsed = [parsed]
            last_index = self.index
            while self.index < len(self.json_str):
                j = self.parse_json()
                if j != "":
                    parsed.append(j)
                # Guarantee progress even when nothing was consumed
                if self.index == last_index:
                    self.index += 1
                last_index = self.index
            # If nothing extra was found, don't return an array
            if len(parsed) == 1:
                self.log(
                    "There were no more elements, returning the element without the array",
                    "info",
                )
                parsed = parsed[0]
        if self.logger.log_level == "none":
            return parsed
        else:
            return parsed, self.logger.log

    def parse_json(
        self,
    ) -> "JSONReturnType":
        """Parse the next JSON value at the cursor, skipping characters that cannot start one.

        Returns ``""`` at end of input or when a stray ``}`` is met.
        """
        while True:
            char = self.get_char_at()
            # This parser will ignore any basic element (string or number) that is not inside an array or object
            is_in_context = len(self.context) > 0
            # False means that we are at the end of the string provided
            if char is False:
                return ""
            # <object> starts with '{'
            elif char == "{":
                self.index += 1
                return self.parse_object()
            # <array> starts with '['
            elif char == "[":
                self.index += 1
                return self.parse_array()
            # there can be an edge case in which a key is empty and at the end of an object
            # like "key": }. We return an empty string here to close the object properly
            elif char == "}":
                self.log(
                    "At the end of an object we found a key with missing value, skipping",
                    "info",
                )
                return ""
            # <string> starts with a quote
            elif is_in_context and (char in ['"', "'", "“"] or char.isalpha()):
                return self.parse_string()
            # <number> starts with [0-9] or minus
            elif is_in_context and (char.isdigit() or char == "-" or char == "."):
                return self.parse_number()
            # If everything else fails, we just ignore and move on
            else:
                self.index += 1

    def parse_object(self) -> Dict[str, Any]:
        """Parse object members up to the closing ``}`` (or end of input).

        The cursor must already be past the opening ``{``.
        """
        # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
        obj = {}
        # Stop when you either find the closing parentheses or you have iterated over the entire string
        while (self.get_char_at() or "}") != "}":
            # This is what we expect to find:
            # <member> ::= <string> ': ' <json>

            # Skip filler whitespaces
            self.skip_whitespaces_at()

            # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
            if (self.get_char_at() or "") == ":":
                self.log(
                    "While parsing an object we found a : before a key, ignoring",
                    "info",
                )
                self.index += 1

            # We are now searching for the string key
            # Context is used in the string parser to manage the lack of quotes
            self.set_context("object_key")

            self.skip_whitespaces_at()

            # <member> starts with a <string>
            key = ""
            while self.get_char_at():
                key = str(self.parse_string())

                if key != "" or (key == "" and self.get_char_at() == ":"):
                    # If the string is empty but there is a object divider, we are done here
                    break

            self.skip_whitespaces_at()

            # We reached the end here
            if (self.get_char_at() or "}") == "}":
                # Pop the "object_key" context pushed above; leaving it on the
                # stack would make later top-level scalars look like they are
                # inside an object.
                self.reset_context()
                continue

            self.skip_whitespaces_at()

            # An extreme case of missing ":" after a key
            if (self.get_char_at() or "") != ":":
                self.log(
                    "While parsing an object we missed a : after a key",
                    "info",
                )

            self.index += 1
            self.reset_context()
            self.set_context("object_value")
            # The value can be any valid json
            value = self.parse_json()

            # Reset context since our job is done
            self.reset_context()
            obj[key] = value

            if (self.get_char_at() or "") in [",", "'", '"']:
                self.index += 1

            # Remove trailing spaces
            self.skip_whitespaces_at()

        # Step over the closing brace
        self.index += 1
        return obj

    def parse_array(self) -> List[Any]:
        """Parse array elements up to the closing ``]`` (or end of input).

        The cursor must already be past the opening ``[``.
        """
        # <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
        arr = []
        self.set_context("array")
        # Stop when you either find the closing parentheses or you have iterated over the entire string
        while (self.get_char_at() or "]") != "]":
            self.skip_whitespaces_at()
            value = self.parse_json()

            # It is possible that parse_json() returns nothing valid, so we stop
            if value == "":
                break

            if value == "..." and self.get_char_at(-1) == ".":
                self.log(
                    "While parsing an array, found a stray '...'; ignoring it", "info"
                )
            else:
                arr.append(value)

            # skip over whitespace after a value but before closing ]
            char = self.get_char_at()
            while char and (char.isspace() or char == ","):
                self.index += 1
                char = self.get_char_at()

        # Especially at the end of an LLM generated json you might miss the last "]"
        char = self.get_char_at()
        if char and char != "]":
            self.log(
                "While parsing an array we missed the closing ], adding it back", "info"
            )
            # Cancel out the increment below so the current character is not swallowed
            self.index -= 1

        self.index += 1
        self.reset_context()
        return arr

    def parse_string(self) -> Union[str, bool, None]:
        """Parse a string, tolerating missing, doubled or misplaced quotes.

        An unquoted literal that spells true/false/null outside an object key
        is returned as ``True``/``False``/``None`` instead of a string.
        """
        # <string> is a string of valid characters enclosed in quotes
        # i.e. { name: "John" }
        # Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here

        # Flag to manage corner cases related to missing starting quote
        missing_quotes = False
        doubled_quotes = False
        lstring_delimiter = rstring_delimiter = '"'

        char = self.get_char_at()
        # A valid string can only start with a valid quote or, in our case, with a literal
        while char and char not in ['"', "'", "“"] and not char.isalnum():
            self.index += 1
            char = self.get_char_at()

        if not char:
            # This is an empty string
            return ""

        # Ensuring we use the right delimiter
        if char == "'":
            lstring_delimiter = rstring_delimiter = "'"
        elif char == "“":
            lstring_delimiter = "“"
            rstring_delimiter = "”"
        elif char.isalnum():
            # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
            # But remember, object keys are only of type string
            if char.lower() in ["t", "f", "n"] and self.get_context() != "object_key":
                value = self.parse_boolean_or_null()
                if value != "":
                    return value
            self.log(
                "While parsing a string, we found a literal instead of a quote",
                "info",
            )
            self.log(
                "While parsing a string, we found no starting quote. Will add the quote back",
                "info",
            )
            missing_quotes = True

        if not missing_quotes:
            self.index += 1

        # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
        if self.get_char_at() == lstring_delimiter:
            # If it's an empty key, this was easy
            if self.get_context() == "object_key" and self.get_char_at(1) == ":":
                self.index += 1
                return ""
            # Find the next delimiter
            i = 1
            next_c = self.get_char_at(i)
            while next_c and next_c != rstring_delimiter:
                i += 1
                next_c = self.get_char_at(i)
            # Now check that the next character is also a delimiter to ensure that we have "".....""
            # In that case we ignore this rstring delimiter
            if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
                self.log(
                    "While parsing a string, we found a valid starting doubled quote, ignoring it",
                    "info",
                )
                doubled_quotes = True
                self.index += 1
            else:
                # Ok this is not a doubled quote, check if this is an empty string or not
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c.isspace():
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c not in [",", "]", "}"]:
                    self.log(
                        "While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
                        "info",
                    )
                    self.index += 1

        # Initialize our return value
        string_acc = ""

        # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
        # In that case we need to use the ":|,|}" characters as terminators of the string
        # So this will stop if:
        # * It finds a closing quote
        # * It iterated over the entire sequence
        # * If we are fixing missing quotes in an object, when it finds the special terminators
        char = self.get_char_at()
        while char and char != rstring_delimiter:
            if (
                missing_quotes
                and self.get_context() == "object_key"
                and (char == ":" or char.isspace())
            ):
                self.log(
                    "While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
                    "info",
                )
                break
            if self.get_context() == "object_value" and char in [",", "}"]:
                rstring_delimiter_missing = True
                # check if this is a case in which the closing comma is NOT missing instead
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c != rstring_delimiter:
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c:
                    i += 1
                    next_c = self.get_char_at(i)
                    # found a delimiter, now we need to check that is followed strictly by a comma or brace
                    while next_c and next_c.isspace():
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c and next_c in [",", "}"]:
                        rstring_delimiter_missing = False
                if rstring_delimiter_missing:
                    self.log(
                        "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
                        "info",
                    )
                    break
            string_acc += char
            self.index += 1
            char = self.get_char_at()
            if char and len(string_acc) > 0 and string_acc[-1] == "\\":
                # This is a special case, if people use real strings this might happen
                self.log("Found a stray escape sequence, normalizing it", "info")
                string_acc = string_acc[:-1]
                if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
                    escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
                    string_acc += escape_seqs.get(char, char) or char
                    self.index += 1
                    char = self.get_char_at()
            # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
            if char == rstring_delimiter:
                # Special case here, in case of double quotes one after another
                if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
                    self.log(
                        "While parsing a string, we found a doubled quote, ignoring it",
                        "info",
                    )
                    self.index += 1
                elif missing_quotes and self.get_context() == "object_value":
                    # In case of missing starting quote I need to check if the delimiter is the end or the beginning of a key
                    i = 1
                    next_c = self.get_char_at(i)
                    while next_c and next_c not in [
                        rstring_delimiter,
                        lstring_delimiter,
                    ]:
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c:
                        # We found a quote, now let's make sure there's a ":" following
                        i += 1
                        next_c = self.get_char_at(i)
                        # skip whitespace between the quote and a possible ":"
                        while next_c and next_c.isspace():
                            i += 1
                            next_c = self.get_char_at(i)
                        if next_c and next_c == ":":
                            # Reset the cursor
                            self.index -= 1
                            char = self.get_char_at()
                            self.log(
                                "In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
                                "info",
                            )
                            break
                else:
                    # Check if eventually there is a rstring delimiter, otherwise we bail
                    i = 1
                    next_c = self.get_char_at(i)
                    check_comma_in_object_value = True
                    while next_c and next_c not in [
                        rstring_delimiter,
                        lstring_delimiter,
                    ]:
                        # This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
                        # This is because the routine after will make sure to correct any bad guess and this solves a corner case
                        if check_comma_in_object_value and next_c.isalpha():
                            check_comma_in_object_value = False
                        # If we are in an object context, let's check for the right delimiters
                        if (
                            ("object_key" in self.context and next_c in [":", "}"])
                            or ("object_value" in self.context and next_c == "}")
                            or ("array" in self.context and next_c in ["]", ","])
                            or (
                                check_comma_in_object_value
                                and self.get_context() == "object_value"
                                and next_c == ","
                            )
                        ):
                            break
                        i += 1
                        next_c = self.get_char_at(i)
                    # If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
                    if next_c == "," and self.get_context() == "object_value":
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c != rstring_delimiter:
                            i += 1
                            next_c = self.get_char_at(i)
                        # Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c.isspace():
                            i += 1
                            next_c = self.get_char_at(i)
                        if next_c == "}":
                            # OK this is valid then
                            self.log(
                                "While parsing a string, we misplaced a quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
                                "info",
                            )
                            string_acc += str(char)
                            self.index += 1
                            char = self.get_char_at()
                    elif next_c == rstring_delimiter:
                        if self.get_context() == "object_value":
                            # But this might not be it! This could be just a missing comma
                            # We found a delimiter and we need to check if this is a key
                            # so find a rstring_delimiter and a colon after
                            i += 1
                            next_c = self.get_char_at(i)
                            while next_c and next_c != rstring_delimiter:
                                i += 1
                                next_c = self.get_char_at(i)
                            i += 1
                            next_c = self.get_char_at(i)
                            while next_c and next_c != ":":
                                if next_c in [
                                    lstring_delimiter,
                                    rstring_delimiter,
                                    ",",
                                ]:
                                    break
                                i += 1
                                next_c = self.get_char_at(i)
                            # Only if we fail to find a ':' then we know this is misplaced quote
                            if next_c != ":":
                                self.log(
                                    "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                                    "info",
                                )
                                string_acc += str(char)
                                self.index += 1
                                char = self.get_char_at()

        if (
            char
            and missing_quotes
            and self.get_context() == "object_key"
            and char.isspace()
        ):
            self.log(
                "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
                "info",
            )
            self.skip_whitespaces_at()
            if self.get_char_at() not in [":", ","]:
                return ""

        # A fallout of the previous special case in the while loop,
        # we need to update the index only if we had a closing quote
        if char != rstring_delimiter:
            self.log(
                "While parsing a string, we missed the closing quote, ignoring",
                "info",
            )
        else:
            self.index += 1

        return string_acc.rstrip()

    def parse_number(self) -> Union[float, int, str, "JSONReturnType"]:
        """Parse a number at the cursor.

        Returns an ``int`` or ``float`` when the text converts cleanly, the
        raw text when it contains a comma or fails to convert, and defers to
        ``parse_json`` when the text is a lone ``-``.
        """
        # <number> is a valid real number expressed in one of a number of given formats
        number_str = ""
        number_chars = set("0123456789-.eE/,")
        char = self.get_char_at()
        # Inside an array a comma separates elements, so it never belongs to the number
        is_array = self.get_context() == "array"
        while char and char in number_chars and (char != "," or not is_array):
            number_str += char
            self.index += 1
            char = self.get_char_at()
        if len(number_str) > 1 and number_str[-1] in "-eE/,":
            # The number ends with a non valid character for a number/currency, rolling back one
            number_str = number_str[:-1]
            self.index -= 1
        try:
            if "," in number_str:
                return str(number_str)
            if "." in number_str or "e" in number_str or "E" in number_str:
                return float(number_str)
            elif number_str == "-":
                # If there is a stray "-" this will throw an exception, throw away this character
                return self.parse_json()
            else:
                return int(number_str)
        except ValueError:
            return number_str

    def parse_boolean_or_null(self) -> Union[bool, str, None]:
        """Match a case-insensitive ``true``/``false``/``null`` at the cursor.

        Returns ``True``, ``False`` or ``None`` on a full match. Otherwise the
        cursor is restored and ``""`` is returned.
        """
        # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
        starting_index = self.index
        char = (self.get_char_at() or "").lower()
        # Must start as None: when the first character is not t/f/n no branch
        # below assigns it, and the check that follows would otherwise raise
        # UnboundLocalError.
        value: Optional[Tuple[str, Optional[bool]]] = None
        if char == "t":
            value = ("true", True)
        elif char == "f":
            value = ("false", False)
        elif char == "n":
            value = ("null", None)

        if value:
            i = 0
            while char and i < len(value[0]) and char == value[0][i]:
                i += 1
                self.index += 1
                char = (self.get_char_at() or "").lower()
            if i == len(value[0]):
                return value[1]

        # If nothing works reset the index before returning
        self.index = starting_index
        return ""

    def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
        """Return the character ``count`` positions from the cursor, or ``False`` when indexing raises ``IndexError``."""
        # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
        try:
            return self.json_str[self.index + count]
        except IndexError:
            return False

    def skip_whitespaces_at(self) -> None:
        """
        This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
        """
        try:
            char = self.json_str[self.index]
        except IndexError:
            return
        while char.isspace():
            self.index += 1
            try:
                char = self.json_str[self.index]
            except IndexError:
                return

    def set_context(self, value: str) -> None:
        """Push ``value`` onto the context stack; falsy values are ignored."""
        # If a value is provided update the context variable and save in stack
        if value:
            self.context.append(value)

    def reset_context(self) -> None:
        """Pop the most recent context (raises ``IndexError`` if the stack is empty)."""
        self.context.pop()

    def get_context(self) -> str:
        """Return the most recent context (raises ``IndexError`` if the stack is empty)."""
        return self.context[-1]

    def log(self, text: str, level: str) -> None:
        """Record ``text`` with the input surrounding the cursor when ``level`` equals the configured level."""
        if level == self.logger.log_level:
            context = ""
            start = max(self.index - self.logger.window, 0)
            end = min(self.index + self.logger.window, len(self.json_str))
            context = self.json_str[start:end]
            self.logger.log.append(
                {
                    "text": text,
                    "context": context,
                }
            )
28
+ from typing import Dict, List, Optional, Union, TextIO, Tuple
29
+ from .json_parser import JSONParser, JSONReturnType
671
30
 
672
31
 
673
32
  def repair_json(