json-repair 0.46.1__py3-none-any.whl → 0.46.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/__init__.py +3 -4
- json_repair/json_parser.py +35 -90
- json_repair/json_repair.py +1 -4
- json_repair/object_comparer.py +1 -4
- json_repair/string_file_wrapper.py +9 -17
- {json_repair-0.46.1.dist-info → json_repair-0.46.2.dist-info}/METADATA +1 -1
- json_repair-0.46.2.dist-info/RECORD +14 -0
- json_repair-0.46.1.dist-info/RECORD +0 -14
- {json_repair-0.46.1.dist-info → json_repair-0.46.2.dist-info}/WHEEL +0 -0
- {json_repair-0.46.1.dist-info → json_repair-0.46.2.dist-info}/entry_points.txt +0 -0
- {json_repair-0.46.1.dist-info → json_repair-0.46.2.dist-info}/licenses/LICENSE +0 -0
- {json_repair-0.46.1.dist-info → json_repair-0.46.2.dist-info}/top_level.txt +0 -0
json_repair/__init__.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1
|
-
from .json_repair import from_file
|
2
|
-
|
3
|
-
|
4
|
-
from .json_repair import repair_json as repair_json
|
1
|
+
from .json_repair import from_file, load, loads, repair_json
|
2
|
+
|
3
|
+
__all__ = ["from_file", "load", "loads", "repair_json"]
|
json_repair/json_parser.py
CHANGED
@@ -105,14 +105,10 @@ class JSONParser:
|
|
105
105
|
)
|
106
106
|
return ""
|
107
107
|
# <string> starts with a quote
|
108
|
-
elif not self.context.empty and (
|
109
|
-
char in self.STRING_DELIMITERS or char.isalpha()
|
110
|
-
):
|
108
|
+
elif not self.context.empty and (char in self.STRING_DELIMITERS or char.isalpha()):
|
111
109
|
return self.parse_string()
|
112
110
|
# <number> starts with [0-9] or minus
|
113
|
-
elif not self.context.empty and (
|
114
|
-
char.isdigit() or char == "-" or char == "."
|
115
|
-
):
|
111
|
+
elif not self.context.empty and (char.isdigit() or char == "-" or char == "."):
|
116
112
|
return self.parse_number()
|
117
113
|
elif char in ["#", "/"]:
|
118
114
|
return self.parse_comment()
|
@@ -164,8 +160,7 @@ class JSONParser:
|
|
164
160
|
if isinstance(prev_value, list):
|
165
161
|
prev_value.extend(
|
166
162
|
new_array[0]
|
167
|
-
if len(new_array) == 1
|
168
|
-
and isinstance(new_array[0], list)
|
163
|
+
if len(new_array) == 1 and isinstance(new_array[0], list)
|
169
164
|
else new_array
|
170
165
|
)
|
171
166
|
self.skip_whitespaces_at()
|
@@ -185,11 +180,7 @@ class JSONParser:
|
|
185
180
|
)
|
186
181
|
self.index = rollback_index - 1
|
187
182
|
# add an opening curly brace to make this work
|
188
|
-
self.json_str = (
|
189
|
-
self.json_str[: self.index + 1]
|
190
|
-
+ "{"
|
191
|
-
+ self.json_str[self.index + 1 :]
|
192
|
-
)
|
183
|
+
self.json_str = self.json_str[: self.index + 1] + "{" + self.json_str[self.index + 1 :]
|
193
184
|
break
|
194
185
|
|
195
186
|
# Skip filler whitespaces
|
@@ -242,10 +233,7 @@ class JSONParser:
|
|
242
233
|
i = 1
|
243
234
|
i = self.skip_to_character(char, i)
|
244
235
|
i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
|
245
|
-
if self.get_char_at(i) == ":":
|
246
|
-
value = self.parse_object()
|
247
|
-
else:
|
248
|
-
value = self.parse_string()
|
236
|
+
value = self.parse_object() if self.get_char_at(i) == ":" else self.parse_string()
|
249
237
|
else:
|
250
238
|
value = self.parse_json()
|
251
239
|
|
@@ -307,10 +295,7 @@ class JSONParser:
|
|
307
295
|
elif char.isalnum():
|
308
296
|
# This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
|
309
297
|
# But remember, object keys are only of type string
|
310
|
-
if (
|
311
|
-
char.lower() in ["t", "f", "n"]
|
312
|
-
and self.context.current != ContextValues.OBJECT_KEY
|
313
|
-
):
|
298
|
+
if char.lower() in ["t", "f", "n"] and self.context.current != ContextValues.OBJECT_KEY:
|
314
299
|
value = self.parse_boolean_or_null()
|
315
300
|
if value != "":
|
316
301
|
return value
|
@@ -323,15 +308,9 @@ class JSONParser:
|
|
323
308
|
self.index += 1
|
324
309
|
|
325
310
|
# There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
|
326
|
-
if (
|
327
|
-
self.get_char_at() in self.STRING_DELIMITERS
|
328
|
-
and self.get_char_at() == lstring_delimiter
|
329
|
-
):
|
311
|
+
if self.get_char_at() in self.STRING_DELIMITERS and self.get_char_at() == lstring_delimiter:
|
330
312
|
# If it's an empty key, this was easy
|
331
|
-
if (
|
332
|
-
self.context.current == ContextValues.OBJECT_KEY
|
333
|
-
and self.get_char_at(1) == ":"
|
334
|
-
):
|
313
|
+
if self.context.current == ContextValues.OBJECT_KEY and self.get_char_at(1) == ":":
|
335
314
|
self.index += 1
|
336
315
|
return ""
|
337
316
|
if self.get_char_at(1) == lstring_delimiter:
|
@@ -380,11 +359,7 @@ class JSONParser:
|
|
380
359
|
char = self.get_char_at()
|
381
360
|
unmatched_delimiter = False
|
382
361
|
while char and char != rstring_delimiter:
|
383
|
-
if (
|
384
|
-
missing_quotes
|
385
|
-
and self.context.current == ContextValues.OBJECT_KEY
|
386
|
-
and (char == ":" or char.isspace())
|
387
|
-
):
|
362
|
+
if missing_quotes and self.context.current == ContextValues.OBJECT_KEY and (char == ":" or char.isspace()):
|
388
363
|
self.log(
|
389
364
|
"While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
|
390
365
|
)
|
@@ -421,9 +396,7 @@ class JSONParser:
|
|
421
396
|
else:
|
422
397
|
# But again, this could just be something a bit stupid like "lorem, "ipsum" sic"
|
423
398
|
# Check if we find a : afterwards (skipping space)
|
424
|
-
i = self.skip_whitespaces_at(
|
425
|
-
idx=i + 1, move_main_index=False
|
426
|
-
)
|
399
|
+
i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
|
427
400
|
next_c = self.get_char_at(i)
|
428
401
|
if next_c and next_c != ":":
|
429
402
|
rstring_delimiter_missing = False
|
@@ -486,12 +459,19 @@ class JSONParser:
|
|
486
459
|
string_acc += escape_seqs.get(char, char) or char
|
487
460
|
self.index += 1
|
488
461
|
char = self.get_char_at()
|
462
|
+
elif char in ["u", "x"]:
|
463
|
+
# If we find a unicode escape sequence, normalize it
|
464
|
+
num_chars = 4 if char == "u" else 2
|
465
|
+
next_chars = self.json_str[self.index + 1 : self.index + 1 + num_chars]
|
466
|
+
if len(next_chars) == num_chars and all(c in "0123456789abcdefABCDEF" for c in next_chars):
|
467
|
+
self.log("Found a unicode escape sequence, normalizing it")
|
468
|
+
string_acc = string_acc[:-1]
|
469
|
+
string_acc += chr(int(next_chars, 16))
|
470
|
+
self.index += 1 + num_chars
|
471
|
+
char = self.get_char_at()
|
472
|
+
continue
|
489
473
|
# If we are in object key context and we find a colon, it could be a missing right quote
|
490
|
-
if (
|
491
|
-
char == ":"
|
492
|
-
and not missing_quotes
|
493
|
-
and self.context.current == ContextValues.OBJECT_KEY
|
494
|
-
):
|
474
|
+
if char == ":" and not missing_quotes and self.context.current == ContextValues.OBJECT_KEY:
|
495
475
|
# Ok now we need to check if this is followed by a value like "..."
|
496
476
|
i = self.skip_to_character(character=lstring_delimiter, idx=1)
|
497
477
|
next_c = self.get_char_at(i)
|
@@ -522,14 +502,9 @@ class JSONParser:
|
|
522
502
|
if char == rstring_delimiter:
|
523
503
|
# Special case here, in case of double quotes one after another
|
524
504
|
if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
|
525
|
-
self.log(
|
526
|
-
"While parsing a string, we found a doubled quote, ignoring it"
|
527
|
-
)
|
505
|
+
self.log("While parsing a string, we found a doubled quote, ignoring it")
|
528
506
|
self.index += 1
|
529
|
-
elif (
|
530
|
-
missing_quotes
|
531
|
-
and self.context.current == ContextValues.OBJECT_VALUE
|
532
|
-
):
|
507
|
+
elif missing_quotes and self.context.current == ContextValues.OBJECT_VALUE:
|
533
508
|
# In case of missing starting quote I need to check if the delimeter is the end or the beginning of a key
|
534
509
|
i = 1
|
535
510
|
next_c = self.get_char_at(i)
|
@@ -573,18 +548,9 @@ class JSONParser:
|
|
573
548
|
check_comma_in_object_value = False
|
574
549
|
# If we are in an object context, let's check for the right delimiters
|
575
550
|
if (
|
576
|
-
(
|
577
|
-
|
578
|
-
|
579
|
-
)
|
580
|
-
or (
|
581
|
-
ContextValues.OBJECT_VALUE in self.context.context
|
582
|
-
and next_c == "}"
|
583
|
-
)
|
584
|
-
or (
|
585
|
-
ContextValues.ARRAY in self.context.context
|
586
|
-
and next_c in ["]", ","]
|
587
|
-
)
|
551
|
+
(ContextValues.OBJECT_KEY in self.context.context and next_c in [":", "}"])
|
552
|
+
or (ContextValues.OBJECT_VALUE in self.context.context and next_c == "}")
|
553
|
+
or (ContextValues.ARRAY in self.context.context and next_c in ["]", ","])
|
588
554
|
or (
|
589
555
|
check_comma_in_object_value
|
590
556
|
and self.context.current == ContextValues.OBJECT_VALUE
|
@@ -595,10 +561,7 @@ class JSONParser:
|
|
595
561
|
i += 1
|
596
562
|
next_c = self.get_char_at(i)
|
597
563
|
# If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
|
598
|
-
if (
|
599
|
-
next_c == ","
|
600
|
-
and self.context.current == ContextValues.OBJECT_VALUE
|
601
|
-
):
|
564
|
+
if next_c == "," and self.context.current == ContextValues.OBJECT_VALUE:
|
602
565
|
i += 1
|
603
566
|
i = self.skip_to_character(character=rstring_delimiter, idx=i)
|
604
567
|
next_c = self.get_char_at(i)
|
@@ -606,29 +569,20 @@ class JSONParser:
|
|
606
569
|
i += 1
|
607
570
|
i = self.skip_whitespaces_at(idx=i, move_main_index=False)
|
608
571
|
next_c = self.get_char_at(i)
|
609
|
-
elif (
|
610
|
-
next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\"
|
611
|
-
):
|
572
|
+
elif next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\":
|
612
573
|
# Check if self.index:self.index+i is only whitespaces, break if that's the case
|
613
|
-
if all(
|
614
|
-
str(self.get_char_at(j)).isspace()
|
615
|
-
for j in range(1, i)
|
616
|
-
if self.get_char_at(j)
|
617
|
-
):
|
574
|
+
if all(str(self.get_char_at(j)).isspace() for j in range(1, i) if self.get_char_at(j)):
|
618
575
|
break
|
619
576
|
if self.context.current == ContextValues.OBJECT_VALUE:
|
620
577
|
# But this might not be it! This could be just a missing comma
|
621
578
|
# We found a delimiter and we need to check if this is a key
|
622
579
|
# so find a rstring_delimiter and a colon after
|
623
|
-
i = self.skip_to_character(
|
624
|
-
character=rstring_delimiter, idx=i + 1
|
625
|
-
)
|
580
|
+
i = self.skip_to_character(character=rstring_delimiter, idx=i + 1)
|
626
581
|
i += 1
|
627
582
|
next_c = self.get_char_at(i)
|
628
583
|
while next_c and next_c != ":":
|
629
584
|
if next_c in [",", "]", "}"] or (
|
630
|
-
next_c == rstring_delimiter
|
631
|
-
and self.get_char_at(i - 1) != "\\"
|
585
|
+
next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\"
|
632
586
|
):
|
633
587
|
break
|
634
588
|
i += 1
|
@@ -661,12 +615,7 @@ class JSONParser:
|
|
661
615
|
string_acc += str(char)
|
662
616
|
self.index += 1
|
663
617
|
char = self.get_char_at()
|
664
|
-
if (
|
665
|
-
char
|
666
|
-
and missing_quotes
|
667
|
-
and self.context.current == ContextValues.OBJECT_KEY
|
668
|
-
and char.isspace()
|
669
|
-
):
|
618
|
+
if char and missing_quotes and self.context.current == ContextValues.OBJECT_KEY and char.isspace():
|
670
619
|
self.log(
|
671
620
|
"While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
|
672
621
|
)
|
@@ -686,9 +635,7 @@ class JSONParser:
|
|
686
635
|
else:
|
687
636
|
self.index += 1
|
688
637
|
|
689
|
-
if not self.stream_stable and (
|
690
|
-
missing_quotes or (string_acc and string_acc[-1] == "\n")
|
691
|
-
):
|
638
|
+
if not self.stream_stable and (missing_quotes or (string_acc and string_acc[-1] == "\n")):
|
692
639
|
# Clean the whitespaces for some corner cases
|
693
640
|
string_acc = string_acc.rstrip()
|
694
641
|
|
@@ -796,9 +743,7 @@ class JSONParser:
|
|
796
743
|
while True:
|
797
744
|
char = self.get_char_at()
|
798
745
|
if not char:
|
799
|
-
self.log(
|
800
|
-
"Reached end-of-string while parsing block comment; unclosed block comment."
|
801
|
-
)
|
746
|
+
self.log("Reached end-of-string while parsing block comment; unclosed block comment.")
|
802
747
|
break
|
803
748
|
comment += char
|
804
749
|
self.index += 1
|
json_repair/json_repair.py
CHANGED
@@ -236,10 +236,7 @@ def cli(inline_args: list[str] | None = None) -> int:
|
|
236
236
|
help="Number of spaces for indentation (Default 2)",
|
237
237
|
)
|
238
238
|
|
239
|
-
if inline_args is None:
|
240
|
-
args = parser.parse_args()
|
241
|
-
else:
|
242
|
-
args = parser.parse_args(inline_args)
|
239
|
+
args = parser.parse_args() if inline_args is None else parser.parse_args(inline_args)
|
243
240
|
|
244
241
|
# Inline mode requires a filename, so error out if none was provided.
|
245
242
|
if args.inline and not args.filename: # pragma: no cover
|
json_repair/object_comparer.py
CHANGED
@@ -30,10 +30,7 @@ class ObjectComparer: # pragma: no cover
|
|
30
30
|
elif isinstance(obj1, list):
|
31
31
|
if len(obj1) != len(obj2):
|
32
32
|
return False
|
33
|
-
for i in range(len(obj1)):
|
34
|
-
if not ObjectComparer.is_same_object(obj1[i], obj2[i]):
|
35
|
-
return False
|
36
|
-
return True
|
33
|
+
return all(ObjectComparer.is_same_object(obj1[i], obj2[i]) for i in range(len(obj1)))
|
37
34
|
|
38
35
|
# For atoms: types already match, so just return True
|
39
36
|
return True
|
json_repair/string_file_wrapper.py
CHANGED
@@ -4,7 +4,7 @@ from typing import TextIO
|
|
4
4
|
|
5
5
|
class StringFileWrapper:
|
6
6
|
# This is a trick to simplify the code, transform the filedescriptor handling into a string handling
|
7
|
-
def __init__(self, fd: TextIO,
|
7
|
+
def __init__(self, fd: TextIO, chunk_length: int) -> None:
|
8
8
|
"""
|
9
9
|
Initialize the StringFileWrapper with a file descriptor and chunk length.
|
10
10
|
|
@@ -23,10 +23,10 @@ class StringFileWrapper:
|
|
23
23
|
# Buffers are 1MB strings that are read from the file
|
24
24
|
# and kept in memory to keep reads low
|
25
25
|
self.buffers: dict[int, str] = {}
|
26
|
-
#
|
27
|
-
if not
|
28
|
-
|
29
|
-
self.buffer_length =
|
26
|
+
# chunk_length is in bytes
|
27
|
+
if not chunk_length or chunk_length < 2:
|
28
|
+
chunk_length = 1_000_000
|
29
|
+
self.buffer_length = chunk_length
|
30
30
|
|
31
31
|
def get_buffer(self, index: int) -> str:
|
32
32
|
"""
|
@@ -65,19 +65,11 @@ class StringFileWrapper:
|
|
65
65
|
buffer_index = index.start // self.buffer_length
|
66
66
|
buffer_end = index.stop // self.buffer_length
|
67
67
|
if buffer_index == buffer_end:
|
68
|
-
return self.get_buffer(buffer_index)[
|
69
|
-
index.start % self.buffer_length : index.stop % self.buffer_length
|
70
|
-
]
|
68
|
+
return self.get_buffer(buffer_index)[index.start % self.buffer_length : index.stop % self.buffer_length]
|
71
69
|
else:
|
72
|
-
start_slice = self.get_buffer(buffer_index)[
|
73
|
-
|
74
|
-
]
|
75
|
-
end_slice = self.get_buffer(buffer_end)[
|
76
|
-
: index.stop % self.buffer_length
|
77
|
-
]
|
78
|
-
middle_slices = [
|
79
|
-
self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)
|
80
|
-
]
|
70
|
+
start_slice = self.get_buffer(buffer_index)[index.start % self.buffer_length :]
|
71
|
+
end_slice = self.get_buffer(buffer_end)[: index.stop % self.buffer_length]
|
72
|
+
middle_slices = [self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)]
|
81
73
|
return start_slice + "".join(middle_slices) + end_slice
|
82
74
|
else:
|
83
75
|
buffer_index = index // self.buffer_length
|
@@ -0,0 +1,14 @@
|
|
1
|
+
json_repair/__init__.py,sha256=6FDD6dEVM5Pb5o4Zodgw4ex30Hzy-YvNRy0vts9SQ4I,118
|
2
|
+
json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
|
3
|
+
json_repair/json_context.py,sha256=WsMOjqpGSr6aaDONcrk8UFtTurzWon2Qq9AoBBYseoI,934
|
4
|
+
json_repair/json_parser.py,sha256=B-DgJfyQOMHQ3F0RIBnltUGnGw0DFM-J7xOcLmCylVs,39744
|
5
|
+
json_repair/json_repair.py,sha256=pyH5fCkS1lyNPVjkqXerQ91lBz3eTHDPgV1QtnvJm-Y,11243
|
6
|
+
json_repair/object_comparer.py,sha256=LlIF0MisRglzC-CiG5AxAEDCBWBHeJd-6uXYx0uRmCk,1175
|
7
|
+
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
+
json_repair/string_file_wrapper.py,sha256=tGkWBEUPE-CZPf4uSM5NE9oSDTpskX0myJiXsl-gbds,4333
|
9
|
+
json_repair-0.46.2.dist-info/licenses/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
10
|
+
json_repair-0.46.2.dist-info/METADATA,sha256=-EKRFk4rzF6I4EqFqEVfXJn7aPFrgFzdf1oCZfWgYLE,12208
|
11
|
+
json_repair-0.46.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
12
|
+
json_repair-0.46.2.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
|
13
|
+
json_repair-0.46.2.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
14
|
+
json_repair-0.46.2.dist-info/RECORD,,
|
@@ -1,14 +0,0 @@
|
|
1
|
-
json_repair/__init__.py,sha256=c4L2kZrHvWEKfj_ODU2naliNuvU6FlFVxtF0hbLe6s8,178
|
2
|
-
json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
|
3
|
-
json_repair/json_context.py,sha256=WsMOjqpGSr6aaDONcrk8UFtTurzWon2Qq9AoBBYseoI,934
|
4
|
-
json_repair/json_parser.py,sha256=7IPu-tin9jLX_y1F9tn3UVpqILARhZYFaTTvq9xrLnU,40451
|
5
|
-
json_repair/json_repair.py,sha256=9wxf0vVNfr_RNQI1rbVPvxQ9feEwwvgnvkiYXwGEBX8,11292
|
6
|
-
json_repair/object_comparer.py,sha256=5-LK-s_2MAHddTxqXSzSkaIFvPXKGLh6swC1gyN74Lk,1245
|
7
|
-
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
8
|
-
json_repair/string_file_wrapper.py,sha256=uwW4B1s9Cf-iF3ANsCz-RPu2ddCqDETrt8bdojh8ufA,4485
|
9
|
-
json_repair-0.46.1.dist-info/licenses/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
10
|
-
json_repair-0.46.1.dist-info/METADATA,sha256=y-p_aOKtX4eu7p-JNj6IO3s8svB06IityZRnRKEN_xE,12208
|
11
|
-
json_repair-0.46.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
12
|
-
json_repair-0.46.1.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
|
13
|
-
json_repair-0.46.1.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
14
|
-
json_repair-0.46.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|