json-repair 0.29.2__py3-none-any.whl → 0.29.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -23,651 +23,10 @@ All supported use cases are in the unit tests
23
23
  """
24
24
 
25
25
  import argparse
26
- import os
27
26
  import sys
28
27
  import json
29
- from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
30
-
31
-
32
class StringFileWrapper:
    """Expose a seekable text file through a read-only, ``str``-like interface.

    The parser indexes and slices its input as if it were a string. This
    wrapper serves those accesses from fixed-size chunks that are read lazily
    from the file descriptor and cached, so the whole file does not need to
    be loaded up front.

    NOTE(review): chunk ``k`` is located with ``fd.seek(k * buffer_length)``
    and the total length is taken from ``fd.tell()``. That presumably equals
    character offsets only when one character maps to one file position
    (e.g. ``io.StringIO`` or single-byte text) -- confirm before relying on
    this with multi-byte encodings.
    """

    def __init__(self, fd: TextIO, CHUNK_LENGTH: int) -> None:
        """Wrap *fd*.

        Args:
            fd: A seekable text stream.
            CHUNK_LENGTH: Size of each cached chunk. A falsy value or a value
                below 2 falls back to 1_000_000.
        """
        self.fd = fd
        # Cached total length; 0 means "not measured yet".
        self.length: int = 0
        # Chunk number -> chunk text. Insertion order doubles as age for eviction.
        self.buffers: dict[int, str] = {}
        if not CHUNK_LENGTH or CHUNK_LENGTH < 2:
            CHUNK_LENGTH = 1_000_000
        self.buffer_length = CHUNK_LENGTH

    def get_buffer(self, index: int) -> str:
        """Return chunk number *index*, reading and caching it on first use."""
        if self.buffers.get(index) is None:
            self.fd.seek(index * self.buffer_length)
            self.buffers[index] = self.fd.read(self.buffer_length)
            # Bound memory: keep roughly 2_000_000 characters of chunks,
            # but never fewer than two chunks.
            if len(self.buffers) > max(2, 2_000_000 // self.buffer_length):
                oldest_key = next(iter(self.buffers))
                if oldest_key != index:
                    self.buffers.pop(oldest_key)
        return self.buffers[index]

    def __getitem__(self, index: Union[int, slice]) -> str:
        """Return the character or substring at *index*, following ``str`` rules.

        Integer indices may be negative (counted from the end) and raise
        ``IndexError`` when out of range. Slices accept omitted or negative
        bounds and are clamped to the length reported by ``len(self)``.
        """
        chunk = self.buffer_length
        if isinstance(index, slice):
            # Normalise None / negative / out-of-range bounds exactly like str.
            start, stop, step = index.indices(len(self))
            if step != 1:
                return "".join(self[i] for i in range(start, stop, step))
            if start >= stop:
                return ""
            first = start // chunk
            # Locate the chunk of the last *included* character so a slice that
            # ends on a chunk boundary does not trigger an extra read.
            last = (stop - 1) // chunk
            tail_end = (stop - 1) % chunk + 1
            if first == last:
                return self.get_buffer(first)[start % chunk : tail_end]
            pieces = [self.get_buffer(first)[start % chunk :]]
            pieces.extend(self.get_buffer(i) for i in range(first + 1, last))
            pieces.append(self.get_buffer(last)[:tail_end])
            return "".join(pieces)

        if index < 0:
            index += len(self)
            if index < 0:
                raise IndexError("string index out of range")
        # Indexing past the end of a short (or empty) chunk raises IndexError,
        # which is what callers use to detect the end of input.
        return self.get_buffer(index // chunk)[index % chunk]

    def __len__(self) -> int:
        """Return the stream length as reported by ``tell()`` at end of file.

        The value is cached once it is non-zero, and the stream position is
        restored after measuring.
        """
        if self.length < 1:
            current_position = self.fd.tell()
            self.fd.seek(0, os.SEEK_END)
            self.length = self.fd.tell()
            self.fd.seek(current_position)
        return self.length
89
-
90
-
91
class LoggerConfig:
    """Plain holder for the parser's logging state."""

    def __init__(self, log_level: Optional[str]):
        # A falsy level (None or "") is stored as "none".
        self.log_level: str = log_level or "none"
        # Number of characters captured on each side of the cursor per entry.
        self.window: int = 10
        # Collected log entries, oldest first.
        self.log: List[Dict[str, str]] = []
97
-
98
-
99
# Every value the parser can produce for a JSON document.
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]


class JSONParser:
    """Best-effort recursive-descent parser that repairs malformed JSON.

    The parser walks ``json_str`` with a single cursor (``index``) and keeps a
    stack of contexts (``"object_key"``, ``"object_value"``, ``"array"``) that
    the string and number routines consult to decide where an unquoted or
    unterminated token ends.
    """

    def __init__(
        self,
        json_str: Union[str, "StringFileWrapper"],
        json_fd: Optional[TextIO],
        logging: Optional[bool],
        json_fd_chunk_length: int = 0,
    ) -> None:
        """Prepare a parser.

        Args:
            json_str: The text to parse.
            json_fd: Optional file descriptor; when truthy it replaces
                ``json_str`` with a ``StringFileWrapper`` around it.
            logging: When truthy, repair actions are recorded and ``parse``
                returns them alongside the result.
            json_fd_chunk_length: Chunk size forwarded to ``StringFileWrapper``.
        """
        # The string to parse
        self.json_str = json_str
        # Alternatively, a file descriptor: wrap it so it can be indexed like a string
        if json_fd:
            self.json_str = StringFileWrapper(json_fd, json_fd_chunk_length)
        # Cursor: position of the character currently being examined
        self.index: int = 0
        # Stack of contexts, used to handle missing quotes in keys and values
        self.context: list[str] = []
        # Log sink; entries are only recorded when logging is enabled
        self.logger = LoggerConfig(log_level="info" if logging else None)

    def parse(
        self,
    ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
        """Parse the whole input.

        Returns:
            The parsed value, or ``(value, log_entries)`` when logging is
            enabled. If more than one top-level element is found, the elements
            are returned as a list.
        """
        result = self.parse_json()
        if self.index < len(self.json_str):
            self.log(
                "The parser returned early, checking if there's more json elements",
                "info",
            )
            result = [result]
            last_index = self.index
            while self.index < len(self.json_str):
                extra = self.parse_json()
                if extra != "":
                    result.append(extra)
                # Guarantee progress when parse_json() did not consume anything
                if self.index == last_index:
                    self.index += 1
                last_index = self.index
            # If nothing extra was found, don't return an array
            if len(result) == 1:
                self.log(
                    "There were no more elements, returning the element without the array",
                    "info",
                )
                result = result[0]
        if self.logger.log_level == "none":
            return result
        return result, self.logger.log

    def parse_json(
        self,
    ) -> JSONReturnType:
        """Parse the next JSON value starting at the cursor.

        Characters that cannot start a value are skipped. Outside of any
        object/array context, bare strings and numbers are skipped as well.
        Returns ``""`` at end of input or when a stray ``}`` is found.
        """
        while True:
            char = self.get_char_at()
            # Basic elements (string or number) are only parsed inside an array or object
            is_in_context = len(self.context) > 0
            # False means that we are at the end of the string provided
            if char is False:
                return ""
            # <object> starts with '{'
            elif char == "{":
                self.index += 1
                return self.parse_object()
            # <array> starts with '['
            elif char == "[":
                self.index += 1
                return self.parse_array()
            # Edge case: a key without a value at the end of an object, like `"key": }`.
            # Return an empty string so the object can be closed properly.
            elif char == "}":
                self.log(
                    "At the end of an object we found a key with missing value, skipping",
                    "info",
                )
                return ""
            # <string> starts with a quote (or, when repairing, with a letter)
            elif is_in_context and (char in ['"', "'", "“"] or char.isalpha()):
                return self.parse_string()
            # <number> starts with [0-9], a minus or a dot
            elif is_in_context and (char.isdigit() or char == "-" or char == "."):
                return self.parse_number()
            # If everything else fails, we just ignore and move on
            else:
                self.index += 1

    def parse_object(self) -> Dict[str, Any]:
        """Parse an object body; the cursor is just past the opening ``{``.

        <object> ::= '{' [ <member> *(', ' <member>) ] '}'
        <member> ::= <string> ': ' <json>
        """
        obj = {}
        # Stop at the closing brace or at the end of the input
        while (self.get_char_at() or "}") != "}":
            # Skip filler whitespaces
            self.skip_whitespaces_at()

            # Sometimes LLMs do weird things: a ":" before any key is skipped
            if (self.get_char_at() or "") == ":":
                self.log(
                    "While parsing an object we found a : before a key, ignoring",
                    "info",
                )
                self.index += 1

            # We are now searching for the string key.
            # The context tells the string parser how to cope with missing quotes.
            self.set_context("object_key")

            self.skip_whitespaces_at()

            # <member> starts with a <string>
            key = ""
            while self.get_char_at():
                key = str(self.parse_string())

                if key != "" or (key == "" and self.get_char_at() == ":"):
                    # If the string is empty but there is an object divider, we are done here
                    break

            self.skip_whitespaces_at()

            # We reached the end here
            if (self.get_char_at() or "}") == "}":
                # Pop the "object_key" context pushed above. Leaving it on the
                # stack would make later values be parsed under a stale context.
                self.reset_context()
                continue

            self.skip_whitespaces_at()

            # An extreme case of missing ":" after a key
            if (self.get_char_at() or "") != ":":
                self.log(
                    "While parsing an object we missed a : after a key",
                    "info",
                )

            # NOTE(review): the cursor advances even when no ":" was found, so
            # one character is skipped in that case -- confirm this is intended.
            self.index += 1
            self.reset_context()
            self.set_context("object_value")
            # The value can be any valid json
            value = self.parse_json()

            # Reset context since our job is done
            self.reset_context()
            obj[key] = value

            if (self.get_char_at() or "") in [",", "'", '"']:
                self.index += 1

            # Remove trailing spaces
            self.skip_whitespaces_at()

        # Step over the closing brace (or past the end of the input)
        self.index += 1
        return obj

    def parse_array(self) -> List[Any]:
        """Parse an array body; the cursor is just past the opening ``[``.

        <array> ::= '[' [ <json> *(', ' <json>) ] ']'
        """
        arr = []
        self.set_context("array")
        # Stop at the closing bracket or at the end of the input
        while (self.get_char_at() or "]") != "]":
            self.skip_whitespaces_at()
            value = self.parse_json()

            # It is possible that parse_json() returns nothing valid, so we stop
            if value == "":
                break

            if value == "..." and self.get_char_at(-1) == ".":
                self.log(
                    "While parsing an array, found a stray '...'; ignoring it", "info"
                )
            else:
                arr.append(value)

            # Skip whitespace and commas after a value, before the next one or the closing ]
            char = self.get_char_at()
            while char and (char.isspace() or char == ","):
                self.index += 1
                char = self.get_char_at()

        # Especially at the end of an LLM generated json you might miss the last "]"
        char = self.get_char_at()
        if char and char != "]":
            self.log(
                "While parsing an array we missed the closing ], adding it back", "info"
            )
            # Cancel out the increment below so the current character is not consumed
            self.index -= 1

        self.index += 1
        self.reset_context()
        return arr

    def parse_string(self) -> Union[str, bool, None]:
        """Parse a string, or an unquoted literal, starting at the cursor.

        Handles the three quote styles, missing opening or closing quotes,
        doubled quotes, stray escape sequences and quotes misplaced inside a
        value. When the token is an unquoted ``true``/``false``/``null``
        outside of an object key, the corresponding Python value is returned.

        Most repair heuristics live here and depend on exact statement order,
        so change with care.
        """
        # Flags to manage corner cases related to missing or doubled quotes
        missing_quotes = False
        doubled_quotes = False
        lstring_delimiter = rstring_delimiter = '"'

        char = self.get_char_at()
        # A valid string can only start with a valid quote or, in our case, with a literal
        while char and char not in ['"', "'", "“"] and not char.isalnum():
            self.index += 1
            char = self.get_char_at()

        if not char:
            # This is an empty string
            return ""

        # Ensuring we use the right delimiter
        if char == "'":
            lstring_delimiter = rstring_delimiter = "'"
        elif char == "“":
            lstring_delimiter = "“"
            rstring_delimiter = "”"
        elif char.isalnum():
            # This could be a <boolean> or null rather than a string.
            # Object keys, however, are always strings.
            if char.lower() in ["t", "f", "n"] and self.get_context() != "object_key":
                value = self.parse_boolean_or_null()
                if value != "":
                    return value
            self.log(
                "While parsing a string, we found a literal instead of a quote",
                "info",
            )
            self.log(
                "While parsing a string, we found no starting quote. Will add the quote back",
                "info",
            )
            missing_quotes = True

        if not missing_quotes:
            # Step over the opening quote
            self.index += 1

        # There is sometimes a weird case of doubled quotes, also handled later in the main loop
        if self.get_char_at() == lstring_delimiter:
            # If it's an empty key, this was easy
            if self.get_context() == "object_key" and self.get_char_at(1) == ":":
                self.index += 1
                return ""
            # Find the next delimiter
            i = 1
            next_c = self.get_char_at(i)
            while next_c and next_c != rstring_delimiter:
                i += 1
                next_c = self.get_char_at(i)
            # If the next character is also a delimiter we have ""....."",
            # in which case the first of the doubled opening quotes is skipped
            if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
                self.log(
                    "While parsing a string, we found a valid starting doubled quote, ignoring it",
                    "info",
                )
                doubled_quotes = True
                self.index += 1
            else:
                # Not a doubled quote: check whether this is an empty string or not
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c.isspace():
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c not in [",", "]", "}"]:
                    self.log(
                        "While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
                        "info",
                    )
                    self.index += 1

        # Initialize our return value
        string_acc = ""

        # A string missing its final quote can also be a key or a value in an object,
        # in which case ":", "," and "}" act as terminators. The loop stops when:
        # * it finds a closing quote
        # * it has iterated over the entire input
        # * it is fixing missing quotes in an object and finds one of the special terminators
        char = self.get_char_at()
        while char and char != rstring_delimiter:
            if (
                missing_quotes
                and self.get_context() == "object_key"
                and (char == ":" or char.isspace())
            ):
                self.log(
                    "While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
                    "info",
                )
                break
            if self.get_context() == "object_value" and char in [",", "}"]:
                rstring_delimiter_missing = True
                # Check whether a closing quote does exist further on
                i = 1
                next_c = self.get_char_at(i)
                while next_c and next_c != rstring_delimiter:
                    i += 1
                    next_c = self.get_char_at(i)
                if next_c:
                    i += 1
                    next_c = self.get_char_at(i)
                    # Found a delimiter: it must be followed strictly by a comma or brace
                    while next_c and next_c.isspace():
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c and next_c in [",", "}"]:
                        rstring_delimiter_missing = False
                if rstring_delimiter_missing:
                    self.log(
                        "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
                        "info",
                    )
                    break
            string_acc += char
            self.index += 1
            char = self.get_char_at()
            if char and len(string_acc) > 0 and string_acc[-1] == "\\":
                # This is a special case, if people use real strings this might happen
                self.log("Found a stray escape sequence, normalizing it", "info")
                # The backslash is dropped whether or not the escape is recognised
                string_acc = string_acc[:-1]
                if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
                    escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
                    string_acc += escape_seqs.get(char, char) or char
                    self.index += 1
                    char = self.get_char_at()
            # ChatGPT sometimes forgets to quote stuff in html tags or markdown,
            # so a delimiter found here is not necessarily the end of the string
            if char == rstring_delimiter:
                # Special case: double quotes one after another
                if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
                    self.log(
                        "While parsing a string, we found a doubled quote, ignoring it",
                        "info",
                    )
                    self.index += 1
                elif missing_quotes and self.get_context() == "object_value":
                    # With a missing starting quote, check whether this delimiter
                    # is the end of the value or the beginning of the next key
                    i = 1
                    next_c = self.get_char_at(i)
                    while next_c and next_c not in [
                        rstring_delimiter,
                        lstring_delimiter,
                    ]:
                        i += 1
                        next_c = self.get_char_at(i)
                    if next_c:
                        # We found a quote, now let's make sure there's a ":" following
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c.isspace():
                            i += 1
                            next_c = self.get_char_at(i)
                        if next_c and next_c == ":":
                            # Reset the cursor
                            self.index -= 1
                            char = self.get_char_at()
                            self.log(
                                "In a string with missing quotes and object value context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.",
                                "info",
                            )
                            break
                else:
                    # Check if eventually there is a rstring delimiter, otherwise we bail
                    i = 1
                    next_c = self.get_char_at(i)
                    check_comma_in_object_value = True
                    while next_c and next_c not in [
                        rstring_delimiter,
                        lstring_delimiter,
                    ]:
                        # Workaround: in object_value context we don't always break on commas,
                        # because the routine below corrects a bad guess
                        if check_comma_in_object_value and next_c.isalpha():
                            check_comma_in_object_value = False
                        # If we are in an object context, let's check for the right delimiters
                        if (
                            ("object_key" in self.context and next_c in [":", "}"])
                            or ("object_value" in self.context and next_c == "}")
                            or ("array" in self.context and next_c in ["]", ","])
                            or (
                                check_comma_in_object_value
                                and self.get_context() == "object_value"
                                and next_c == ","
                            )
                        ):
                            break
                        i += 1
                        next_c = self.get_char_at(i)
                    # If we stopped for a comma in object_value context,
                    # check whether a `"}` closes the object further on
                    if next_c == "," and self.get_context() == "object_value":
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c != rstring_delimiter:
                            i += 1
                            next_c = self.get_char_at(i)
                        # Found a delimiter: skip whitespaces and see if a } follows
                        i += 1
                        next_c = self.get_char_at(i)
                        while next_c and next_c.isspace():
                            i += 1
                            next_c = self.get_char_at(i)
                        if next_c == "}":
                            # OK this is valid then: keep the quote as part of the string
                            self.log(
                                "While parsing a string, we misplaced a quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
                                "info",
                            )
                            string_acc += str(char)
                            self.index += 1
                            char = self.get_char_at()
                    elif next_c == rstring_delimiter:
                        if self.get_context() == "object_value":
                            # This might just be a missing comma: check whether the
                            # delimiter opens a key, i.e. is followed by a closing
                            # delimiter and then a colon
                            i += 1
                            next_c = self.get_char_at(i)
                            while next_c and next_c != rstring_delimiter:
                                i += 1
                                next_c = self.get_char_at(i)
                            i += 1
                            next_c = self.get_char_at(i)
                            while next_c and next_c != ":":
                                if next_c in [
                                    lstring_delimiter,
                                    rstring_delimiter,
                                    ",",
                                ]:
                                    break
                                i += 1
                                next_c = self.get_char_at(i)
                            # Only if we fail to find a ':' do we know this is a misplaced quote
                            if next_c != ":":
                                self.log(
                                    "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
                                    "info",
                                )
                                string_acc += str(char)
                                self.index += 1
                                char = self.get_char_at()

        if (
            char
            and missing_quotes
            and self.get_context() == "object_key"
            and char.isspace()
        ):
            self.log(
                "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
                "info",
            )
            self.skip_whitespaces_at()
            if self.get_char_at() not in [":", ","]:
                return ""

        # A fallout of the previous special cases in the while loop:
        # the cursor only moves past the delimiter if there was a closing quote
        if char != rstring_delimiter:
            self.log(
                "While parsing a string, we missed the closing quote, ignoring",
                "info",
            )
        else:
            self.index += 1

        return string_acc.rstrip()

    def parse_number(self) -> Union[float, int, str, JSONReturnType]:
        """Parse a number starting at the cursor.

        Consumes the characters ``0-9 - . e E / ,`` (a comma ends the token in
        array context). Returns an ``int`` or ``float`` when possible, the raw
        text when it contains a comma or cannot be converted, and the result
        of ``parse_json()`` when the token is a lone ``-``.
        """
        number_str = ""
        number_chars = set("0123456789-.eE/,")
        char = self.get_char_at()
        is_array = self.get_context() == "array"
        while char and char in number_chars and (char != "," or not is_array):
            number_str += char
            self.index += 1
            char = self.get_char_at()
        if len(number_str) > 1 and number_str[-1] in "-eE/,":
            # The number ends with a non valid character for a number/currency, rolling back one
            number_str = number_str[:-1]
            self.index -= 1
        try:
            if "," in number_str:
                return str(number_str)
            if "." in number_str or "e" in number_str or "E" in number_str:
                return float(number_str)
            elif number_str == "-":
                # A stray "-" cannot be converted: throw it away and parse what follows
                return self.parse_json()
            else:
                return int(number_str)
        except ValueError:
            return number_str

    def parse_boolean_or_null(self) -> Union[bool, str, None]:
        """Parse an unquoted ``true``, ``false`` or ``null`` (case-insensitive).

        Returns:
            ``True``, ``False`` or ``None`` on a full match; otherwise ``""``
            with the cursor restored to where it started.
        """
        starting_index = self.index
        char = (self.get_char_at() or "").lower()
        # Stays None when the current character cannot start any literal
        value: Optional[Tuple[str, Optional[bool]]] = None
        if char == "t":
            value = ("true", True)
        elif char == "f":
            value = ("false", False)
        elif char == "n":
            value = ("null", None)

        if value:
            i = 0
            while char and i < len(value[0]) and char == value[0][i]:
                i += 1
                self.index += 1
                char = (self.get_char_at() or "").lower()
            if i == len(value[0]):
                return value[1]

        # If nothing works reset the index before returning
        self.index = starting_index
        return ""

    def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
        """Return the character ``count`` positions from the cursor.

        Returns ``False`` when that position raises ``IndexError``.
        """
        # try/except instead of a bounds check: the lookup almost always succeeds
        try:
            return self.json_str[self.index + count]
        except IndexError:
            return False

    def skip_whitespaces_at(self) -> None:
        """Advance the cursor past any whitespace; stop at the end of the input."""
        try:
            char = self.json_str[self.index]
        except IndexError:
            return
        while char.isspace():
            self.index += 1
            try:
                char = self.json_str[self.index]
            except IndexError:
                return

    def set_context(self, value: str) -> None:
        """Push ``value`` onto the context stack; falsy values are ignored."""
        if value:
            self.context.append(value)

    def reset_context(self) -> None:
        """Pop the most recent context (``IndexError`` if the stack is empty)."""
        self.context.pop()

    def get_context(self) -> str:
        """Return the current context (``IndexError`` if the stack is empty)."""
        return self.context[-1]

    def log(self, text: str, level: str) -> None:
        """Record ``text`` with the input surrounding the cursor.

        Nothing is recorded unless ``level`` equals the configured log level.
        """
        if level == self.logger.log_level:
            start = max(self.index - self.logger.window, 0)
            end = min(self.index + self.logger.window, len(self.json_str))
            context = self.json_str[start:end]
            self.logger.log.append(
                {
                    "text": text,
                    "context": context,
                }
            )
- )
28
+ from typing import Dict, List, Optional, Union, TextIO, Tuple
29
+ from .json_parser import JSONParser, JSONReturnType
671
30
 
672
31
 
673
32
  def repair_json(