json-repair 0.39.0__py3-none-any.whl → 0.40.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/json_parser.py +74 -81
- {json_repair-0.39.0.dist-info → json_repair-0.40.0.dist-info}/METADATA +10 -10
- {json_repair-0.39.0.dist-info → json_repair-0.40.0.dist-info}/RECORD +7 -7
- {json_repair-0.39.0.dist-info → json_repair-0.40.0.dist-info}/WHEEL +1 -1
- {json_repair-0.39.0.dist-info → json_repair-0.40.0.dist-info}/LICENSE +0 -0
- {json_repair-0.39.0.dist-info → json_repair-0.40.0.dist-info}/entry_points.txt +0 -0
- {json_repair-0.39.0.dist-info → json_repair-0.40.0.dist-info}/top_level.txt +0 -0
json_repair/json_parser.py
CHANGED
@@ -9,6 +9,7 @@ JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
|
|
9
9
|
class JSONParser:
|
10
10
|
# Constants
|
11
11
|
STRING_DELIMITERS = ['"', "'", "“", "”"]
|
12
|
+
NUMBER_CHARS = set("0123456789-.eE/,")
|
12
13
|
|
13
14
|
def __init__(
|
14
15
|
self,
|
@@ -129,8 +130,6 @@ class JSONParser:
|
|
129
130
|
# Context is used in the string parser to manage the lack of quotes
|
130
131
|
self.context.set(ContextValues.OBJECT_KEY)
|
131
132
|
|
132
|
-
self.skip_whitespaces_at()
|
133
|
-
|
134
133
|
# Save this index in case we need find a duplicate key
|
135
134
|
rollback_index = self.index
|
136
135
|
|
@@ -219,18 +218,13 @@ class JSONParser:
|
|
219
218
|
char = self.get_char_at()
|
220
219
|
|
221
220
|
# Especially at the end of an LLM generated json you might miss the last "]"
|
222
|
-
char = self.get_char_at()
|
223
221
|
if char and char != "]":
|
224
222
|
self.log(
|
225
|
-
"While parsing an array we missed the closing ],
|
226
|
-
)
|
227
|
-
self.index -= 1
|
228
|
-
# Add the missing closing bracket
|
229
|
-
self.json_str = (
|
230
|
-
self.json_str[: self.index + 1] + "]" + self.json_str[self.index + 1 :]
|
223
|
+
"While parsing an array we missed the closing ], ignoring it",
|
231
224
|
)
|
232
225
|
|
233
226
|
self.index += 1
|
227
|
+
|
234
228
|
self.context.reset()
|
235
229
|
return arr
|
236
230
|
|
@@ -275,15 +269,11 @@ class JSONParser:
|
|
275
269
|
self.log(
|
276
270
|
"While parsing a string, we found a literal instead of a quote",
|
277
271
|
)
|
278
|
-
self.log(
|
279
|
-
"While parsing a string, we found no starting quote. Will add the quote back",
|
280
|
-
)
|
281
272
|
missing_quotes = True
|
282
273
|
|
283
274
|
if not missing_quotes:
|
284
275
|
self.index += 1
|
285
276
|
|
286
|
-
self.skip_whitespaces_at()
|
287
277
|
# There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
|
288
278
|
if self.get_char_at() in self.STRING_DELIMITERS:
|
289
279
|
# If the next character is the same type of quote, then we manage it as double quotes
|
@@ -583,6 +573,13 @@ class JSONParser:
|
|
583
573
|
elif (
|
584
574
|
next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\"
|
585
575
|
):
|
576
|
+
# Check if self.index:self.index+i is only whitespaces, break if that's the case
|
577
|
+
if all(
|
578
|
+
str(self.get_char_at(j)).isspace()
|
579
|
+
for j in range(1, i)
|
580
|
+
if self.get_char_at(j)
|
581
|
+
):
|
582
|
+
break
|
586
583
|
if self.context.current == ContextValues.OBJECT_VALUE:
|
587
584
|
# But this might not be it! This could be just a missing comma
|
588
585
|
# We found a delimiter and we need to check if this is a key
|
@@ -610,27 +607,24 @@ class JSONParser:
|
|
610
607
|
self.index += 1
|
611
608
|
char = self.get_char_at()
|
612
609
|
elif self.context.current == ContextValues.ARRAY:
|
613
|
-
#
|
614
|
-
#
|
615
|
-
|
616
|
-
|
617
|
-
|
618
|
-
|
619
|
-
|
620
|
-
|
621
|
-
|
622
|
-
|
623
|
-
|
624
|
-
|
625
|
-
|
626
|
-
|
627
|
-
|
628
|
-
|
629
|
-
|
630
|
-
|
631
|
-
self.index += 1
|
632
|
-
char = self.get_char_at()
|
633
|
-
|
610
|
+
# If we got up to here it means that this is a situation like this:
|
611
|
+
# ["bla bla bla "puppy" bla bla bla "kitty" bla bla"]
|
612
|
+
# So we need to ignore this quote
|
613
|
+
self.log(
|
614
|
+
"While parsing a string in Array context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
|
615
|
+
)
|
616
|
+
unmatched_delimiter = not unmatched_delimiter
|
617
|
+
string_acc += str(char)
|
618
|
+
self.index += 1
|
619
|
+
char = self.get_char_at()
|
620
|
+
elif self.context.current == ContextValues.OBJECT_KEY:
|
621
|
+
# In this case we just ignore this and move on
|
622
|
+
self.log(
|
623
|
+
"While parsing a string in Object Key context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
|
624
|
+
)
|
625
|
+
string_acc += str(char)
|
626
|
+
self.index += 1
|
627
|
+
char = self.get_char_at()
|
634
628
|
if (
|
635
629
|
char
|
636
630
|
and missing_quotes
|
@@ -663,10 +657,9 @@ class JSONParser:
|
|
663
657
|
def parse_number(self) -> Union[float, int, str, JSONReturnType]:
|
664
658
|
# <number> is a valid real number expressed in one of a number of given formats
|
665
659
|
number_str = ""
|
666
|
-
number_chars = set("0123456789-.eE/,")
|
667
660
|
char = self.get_char_at()
|
668
661
|
is_array = self.context.current == ContextValues.ARRAY
|
669
|
-
while char and char in number_chars and (not is_array or char != ","):
|
662
|
+
while char and char in self.NUMBER_CHARS and (not is_array or char != ","):
|
670
663
|
number_str += char
|
671
664
|
self.index += 1
|
672
665
|
char = self.get_char_at()
|
@@ -712,51 +705,6 @@ class JSONParser:
|
|
712
705
|
self.index = starting_index
|
713
706
|
return ""
|
714
707
|
|
715
|
-
def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
|
716
|
-
# Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
|
717
|
-
try:
|
718
|
-
return self.json_str[self.index + count]
|
719
|
-
except IndexError:
|
720
|
-
return False
|
721
|
-
|
722
|
-
def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
|
723
|
-
"""
|
724
|
-
This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
|
725
|
-
"""
|
726
|
-
try:
|
727
|
-
char = self.json_str[self.index + idx]
|
728
|
-
except IndexError:
|
729
|
-
return idx
|
730
|
-
while char.isspace():
|
731
|
-
if move_main_index:
|
732
|
-
self.index += 1
|
733
|
-
else:
|
734
|
-
idx += 1
|
735
|
-
try:
|
736
|
-
char = self.json_str[self.index + idx]
|
737
|
-
except IndexError:
|
738
|
-
return idx
|
739
|
-
return idx
|
740
|
-
|
741
|
-
def skip_to_character(self, character: str, idx: int = 0) -> int:
|
742
|
-
"""
|
743
|
-
This function quickly iterates to find a character, syntactic sugar to make the code more concise
|
744
|
-
"""
|
745
|
-
try:
|
746
|
-
char = self.json_str[self.index + idx]
|
747
|
-
except IndexError:
|
748
|
-
return idx
|
749
|
-
while char != character:
|
750
|
-
idx += 1
|
751
|
-
try:
|
752
|
-
char = self.json_str[self.index + idx]
|
753
|
-
except IndexError:
|
754
|
-
return idx
|
755
|
-
if self.index + idx > 0 and self.json_str[self.index + idx - 1] == "\\":
|
756
|
-
# Ah this is an escaped character, try again
|
757
|
-
return self.skip_to_character(character=character, idx=idx + 1)
|
758
|
-
return idx
|
759
|
-
|
760
708
|
def parse_comment(self) -> str:
|
761
709
|
"""
|
762
710
|
Parse code-like comments:
|
@@ -827,6 +775,51 @@ class JSONParser:
|
|
827
775
|
self.index += 1
|
828
776
|
return ""
|
829
777
|
|
778
|
+
def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
|
779
|
+
# Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
|
780
|
+
try:
|
781
|
+
return self.json_str[self.index + count]
|
782
|
+
except IndexError:
|
783
|
+
return False
|
784
|
+
|
785
|
+
def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
|
786
|
+
"""
|
787
|
+
This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
|
788
|
+
"""
|
789
|
+
try:
|
790
|
+
char = self.json_str[self.index + idx]
|
791
|
+
except IndexError:
|
792
|
+
return idx
|
793
|
+
while char.isspace():
|
794
|
+
if move_main_index:
|
795
|
+
self.index += 1
|
796
|
+
else:
|
797
|
+
idx += 1
|
798
|
+
try:
|
799
|
+
char = self.json_str[self.index + idx]
|
800
|
+
except IndexError:
|
801
|
+
return idx
|
802
|
+
return idx
|
803
|
+
|
804
|
+
def skip_to_character(self, character: str, idx: int = 0) -> int:
|
805
|
+
"""
|
806
|
+
This function quickly iterates to find a character, syntactic sugar to make the code more concise
|
807
|
+
"""
|
808
|
+
try:
|
809
|
+
char = self.json_str[self.index + idx]
|
810
|
+
except IndexError:
|
811
|
+
return idx
|
812
|
+
while char != character:
|
813
|
+
idx += 1
|
814
|
+
try:
|
815
|
+
char = self.json_str[self.index + idx]
|
816
|
+
except IndexError:
|
817
|
+
return idx
|
818
|
+
if self.index + idx > 0 and self.json_str[self.index + idx - 1] == "\\":
|
819
|
+
# Ah this is an escaped character, try again
|
820
|
+
return self.skip_to_character(character=character, idx=idx + 1)
|
821
|
+
return idx
|
822
|
+
|
830
823
|
def _log(self, text: str) -> None:
|
831
824
|
window: int = 10
|
832
825
|
start: int = max(self.index - window, 0)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: json_repair
|
3
|
-
Version: 0.39.0
|
3
|
+
Version: 0.40.0
|
4
4
|
Summary: A package to repair broken json strings
|
5
5
|
Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
|
6
6
|
License: MIT License
|
@@ -196,12 +196,12 @@ pipx install json-repair
|
|
196
196
|
to know all options available:
|
197
197
|
```
|
198
198
|
$ json_repair -h
|
199
|
-
usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] filename
|
199
|
+
usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] [filename]
|
200
200
|
|
201
201
|
Repair and parse JSON files.
|
202
202
|
|
203
203
|
positional arguments:
|
204
|
-
filename The JSON file to repair
|
204
|
+
filename The JSON file to repair (if omitted, reads from stdin)
|
205
205
|
|
206
206
|
options:
|
207
207
|
-h, --help show this help message and exit
|
@@ -226,13 +226,13 @@ In this example, any version that starts with `0.` will be acceptable, allowing
|
|
226
226
|
# How to cite
|
227
227
|
If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
|
228
228
|
|
229
|
-
@software{Baccianella_JSON_Repair_-
|
230
|
-
author
|
231
|
-
month
|
232
|
-
title
|
233
|
-
url
|
234
|
-
version =
|
235
|
-
year
|
229
|
+
@software{Baccianella_JSON_Repair_-_2025,
|
230
|
+
author = "Stefano {Baccianella}",
|
231
|
+
month = "feb",
|
232
|
+
title = "JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs",
|
233
|
+
url = "https://github.com/mangiucugna/json_repair",
|
234
|
+
version = "0.39.1",
|
235
|
+
year = 2025
|
236
236
|
}
|
237
237
|
|
238
238
|
Thank you for citing my work and please send me a link to the paper if you can!
|
@@ -1,13 +1,13 @@
|
|
1
1
|
json_repair/__init__.py,sha256=c4L2kZrHvWEKfj_ODU2naliNuvU6FlFVxtF0hbLe6s8,178
|
2
2
|
json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
|
3
3
|
json_repair/json_context.py,sha256=mm6dOyrPJ1sDskTORZSXCW7W9-5veMlUKqXQ3Hw3EG4,971
|
4
|
-
json_repair/json_parser.py,sha256=
|
4
|
+
json_repair/json_parser.py,sha256=aw-iCtblc9iL24w5zljHbbblK7Ao6G49MPoj513D2KE,38750
|
5
5
|
json_repair/json_repair.py,sha256=k-5HRRlCqrxNmJi0u1KE3IUeL4HXqi1XZ7oAL-NFDLo,10314
|
6
6
|
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
7
|
json_repair/string_file_wrapper.py,sha256=koZmdq2-Z5K7XF1bDqX6dEbNaVMJYcMTjq-aGe6NQvA,4526
|
8
|
-
json_repair-0.
|
9
|
-
json_repair-0.
|
10
|
-
json_repair-0.
|
11
|
-
json_repair-0.
|
12
|
-
json_repair-0.
|
13
|
-
json_repair-0.
|
8
|
+
json_repair-0.40.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
9
|
+
json_repair-0.40.0.dist-info/METADATA,sha256=i43pAASjiIvd0XJ3CMO1nqaV14JNE2MjPx0U8lMJVYc,11838
|
10
|
+
json_repair-0.40.0.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
|
11
|
+
json_repair-0.40.0.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
|
12
|
+
json_repair-0.40.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
13
|
+
json_repair-0.40.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|