json-repair 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/json_repair.py +45 -29
- {json_repair-0.3.0.dist-info → json_repair-0.4.1.dist-info}/METADATA +17 -2
- json_repair-0.4.1.dist-info/RECORD +7 -0
- json_repair-0.3.0.dist-info/RECORD +0 -7
- {json_repair-0.3.0.dist-info → json_repair-0.4.1.dist-info}/LICENSE +0 -0
- {json_repair-0.3.0.dist-info → json_repair-0.4.1.dist-info}/WHEEL +0 -0
- {json_repair-0.3.0.dist-info → json_repair-0.4.1.dist-info}/top_level.txt +0 -0
json_repair/json_repair.py
CHANGED
@@ -52,13 +52,17 @@ class JSONParser:
|
|
52
52
|
# <array> starts with '['
|
53
53
|
elif char == "[":
|
54
54
|
return self.parse_array()
|
55
|
+
# there can be an edge case in which a value is missing at the end of an object
|
56
|
+
# like "key": }. We return an empty string here to close the object properly
|
57
|
+
elif char == "}" and self.context == "object_value":
|
58
|
+
return ""
|
55
59
|
# <string> starts with '"'
|
56
60
|
elif char == '"':
|
57
61
|
return self.parse_string()
|
58
62
|
# <number> starts with [0-9] or minus
|
59
63
|
elif char.isdigit() or char == "-":
|
60
64
|
return self.parse_number()
|
61
|
-
# <boolean> could (T)rue or (F)alse or (N)ull
|
65
|
+
# <boolean> could be (T)rue or (F)alse or (N)ull
|
62
66
|
elif char == "t" or char == "f" or char == "n":
|
63
67
|
return self.parse_boolean_or_null()
|
64
68
|
# This might be a <string> that is missing the starting '"'
|
@@ -67,6 +71,7 @@ class JSONParser:
|
|
67
71
|
# Ignore whitespaces outside of strings
|
68
72
|
elif char.isspace():
|
69
73
|
self.index += 1
|
74
|
+
self.skip_whitespaces_at()
|
70
75
|
return self.parse_json()
|
71
76
|
# If everything else fails, then we give up and return an exception
|
72
77
|
else:
|
@@ -87,7 +92,7 @@ class JSONParser:
|
|
87
92
|
# <member> ::= <string> ': ' <json>
|
88
93
|
|
89
94
|
# Skip filler whitespaces
|
90
|
-
self.
|
95
|
+
self.skip_whitespaces_at()
|
91
96
|
|
92
97
|
# Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
|
93
98
|
if self.get_char_at() == ":":
|
@@ -100,6 +105,7 @@ class JSONParser:
|
|
100
105
|
self.context = "object_key"
|
101
106
|
|
102
107
|
# <member> starts with a <string>
|
108
|
+
self.skip_whitespaces_at()
|
103
109
|
key = self.parse_string()
|
104
110
|
while key == "":
|
105
111
|
key = self.parse_string()
|
@@ -124,7 +130,7 @@ class JSONParser:
|
|
124
130
|
self.index += 1
|
125
131
|
|
126
132
|
# Remove trailing spaces
|
127
|
-
self.
|
133
|
+
self.skip_whitespaces_at()
|
128
134
|
|
129
135
|
# Especially at the end of an LLM generated json you might miss the last "}"
|
130
136
|
if self.get_char_at() and self.get_char_at() != "}":
|
@@ -172,8 +178,6 @@ class JSONParser:
|
|
172
178
|
# Flag to manage corner cases related to missing starting quote
|
173
179
|
fixed_quotes = False
|
174
180
|
# i.e. { name: "John" }
|
175
|
-
# Remove any trailing space
|
176
|
-
self.trim()
|
177
181
|
if self.get_char_at() != '"':
|
178
182
|
self.insert_char_at('"')
|
179
183
|
fixed_quotes = True
|
@@ -211,14 +215,10 @@ class JSONParser:
|
|
211
215
|
and self.get_char_at().isspace()
|
212
216
|
):
|
213
217
|
# skip whitespaces
|
214
|
-
self.
|
218
|
+
self.skip_whitespaces_at()
|
215
219
|
# This string is invalid if there's no valid termination afterwards
|
216
220
|
|
217
|
-
if (
|
218
|
-
self.get_char_at() != ":"
|
219
|
-
or self.get_char_at() != ","
|
220
|
-
or self.get_char_at() != "}"
|
221
|
-
):
|
221
|
+
if self.get_char_at() not in [":", ","]:
|
222
222
|
return ""
|
223
223
|
|
224
224
|
end = self.index
|
@@ -232,11 +232,13 @@ class JSONParser:
|
|
232
232
|
|
233
233
|
def parse_number(self) -> Union[float, int]:
|
234
234
|
# <number> is a valid real number expressed in one of a number of given formats
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
number_str
|
239
|
-
self.index +=
|
235
|
+
number_str = ""
|
236
|
+
char = self.get_char_at()
|
237
|
+
while char and (char.isdigit() or char in "-.eE"):
|
238
|
+
number_str += char
|
239
|
+
self.index += 1
|
240
|
+
char = self.get_char_at()
|
241
|
+
if number_str:
|
240
242
|
if "." in number_str or "e" in number_str or "E" in number_str:
|
241
243
|
return float(number_str)
|
242
244
|
else:
|
@@ -264,23 +266,34 @@ class JSONParser:
|
|
264
266
|
self.json_str = self.json_str[: self.index] + char + self.json_str[self.index :]
|
265
267
|
self.index += 1
|
266
268
|
|
267
|
-
def get_char_at(self
|
269
|
+
def get_char_at(self) -> Union[str, bool]:
|
268
270
|
# Why not use something simpler? Because we might be out of bounds and doing this check all the time is annoying
|
269
|
-
|
270
|
-
|
271
|
+
try:
|
272
|
+
return self.json_str[self.index]
|
273
|
+
except IndexError:
|
274
|
+
return False
|
271
275
|
|
272
|
-
def remove_char_at(self
|
273
|
-
|
274
|
-
self.json_str = self.json_str[: self.index] + self.json_str[self.index + idx :]
|
276
|
+
def remove_char_at(self) -> None:
|
277
|
+
self.json_str = self.json_str[: self.index] + self.json_str[self.index + 1 :]
|
275
278
|
|
276
|
-
def
|
279
|
+
def skip_whitespaces_at(self) -> None:
|
277
280
|
# Remove trailing spaces
|
278
|
-
|
281
|
+
# I'd rather not do this BUT this method is called so many times that it makes sense to expand get_char_at
|
282
|
+
# At least this is what the profiler said and I believe in our lord and savior the profiler
|
283
|
+
try:
|
284
|
+
char = self.json_str[self.index]
|
285
|
+
except IndexError:
|
286
|
+
return
|
287
|
+
while char and char.isspace():
|
279
288
|
self.index += 1
|
289
|
+
try:
|
290
|
+
char = self.json_str[self.index]
|
291
|
+
except IndexError:
|
292
|
+
return
|
280
293
|
|
281
294
|
|
282
295
|
def repair_json(
|
283
|
-
json_str: str, return_objects: bool = False
|
296
|
+
json_str: str, return_objects: bool = False, skip_json_loads: bool = False
|
284
297
|
) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
|
285
298
|
"""
|
286
299
|
Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
|
@@ -290,11 +303,14 @@ def repair_json(
|
|
290
303
|
json_str = re.sub(r"^\s+", "", json_str)
|
291
304
|
json_str = re.sub(r"\s+$", "", json_str)
|
292
305
|
json_str = re.sub(r"/\*.*?\*/", "", json_str)
|
293
|
-
|
294
|
-
|
295
|
-
except Exception:
|
296
|
-
parser = JSONParser(json_str)
|
306
|
+
parser = JSONParser(json_str)
|
307
|
+
if skip_json_loads:
|
297
308
|
parsed_json = parser.parse()
|
309
|
+
else:
|
310
|
+
try:
|
311
|
+
parsed_json = json.loads(json_str)
|
312
|
+
except json.JSONDecodeError:
|
313
|
+
parsed_json = parser.parse()
|
298
314
|
# It's useful to return the actual object instead of the json string, it allows this lib to be a replacement of the json library
|
299
315
|
if return_objects:
|
300
316
|
return parsed_json
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: json-repair
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.4.1
|
4
4
|
Summary: A package to repair broken json strings
|
5
5
|
Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
|
6
6
|
License: MIT License
|
@@ -59,7 +59,7 @@ You can look how I used it by checking out this demo: https://huggingface.co/spa
|
|
59
59
|
# Not even this library could fix this JSON
|
60
60
|
|
61
61
|
You can use this library to completely replace `json.loads()`:
|
62
|
-
|
62
|
+
|
63
63
|
import json_repair
|
64
64
|
try:
|
65
65
|
decoded_object = json_repair.loads(json_string)
|
@@ -74,6 +74,21 @@ or just
|
|
74
74
|
except Exception:
|
75
75
|
# Manage Exception
|
76
76
|
|
77
|
+
## Performance
|
78
|
+
If you find this library too slow because it is using `json.loads()`, you can skip that step by passing `skip_json_loads=True` to `repair_json`, like this:
|
79
|
+
|
80
|
+
from json_repair import repair_json
|
81
|
+
try:
|
82
|
+
good_json_string = repair_json(bad_json_string, skip_json_loads=True)
|
83
|
+
except Exception:
|
84
|
+
# Not even this library could fix this JSON
|
85
|
+
|
86
|
+
I chose not to use any fast JSON library in order to avoid external dependencies, so that anybody can use it regardless of their stack.
|
87
|
+
|
88
|
+
Some rules of thumb to use:
|
89
|
+
- Setting `return_objects=True` will always be faster because the parser already returns an object and it doesn't have to serialize that object to JSON
|
90
|
+
- `skip_json_loads` is faster only if you know with 100% certainty that the string is not valid JSON
|
91
|
+
|
77
92
|
# How it works
|
78
93
|
This module will parse the JSON file following the BNF definition:
|
79
94
|
|
@@ -0,0 +1,7 @@
|
|
1
|
+
json_repair/__init__.py,sha256=p9mZnte8Bg18NcxqgJ7vopH2gQv_XbZ0dRnk686QuRE,92
|
2
|
+
json_repair/json_repair.py,sha256=Qxy8eQpkm9e1qVUDvhYOuID7flLHIzAsr5cB1NWb3Y4,12974
|
3
|
+
json_repair-0.4.1.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
4
|
+
json_repair-0.4.1.dist-info/METADATA,sha256=MXc1lBLV9wr-DNTqqkzuH8W-VYiDNY9UVsvsHJGqMw4,6155
|
5
|
+
json_repair-0.4.1.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
|
6
|
+
json_repair-0.4.1.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
7
|
+
json_repair-0.4.1.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
json_repair/__init__.py,sha256=p9mZnte8Bg18NcxqgJ7vopH2gQv_XbZ0dRnk686QuRE,92
|
2
|
-
json_repair/json_repair.py,sha256=QdgEnpE4bDqy9nCR9Gw61MB6igfI4K0YFOrBI_dh5mc,12224
|
3
|
-
json_repair-0.3.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
4
|
-
json_repair-0.3.0.dist-info/METADATA,sha256=EECNyro1Jtknlrtf3vqhl3StFJ_8drVdpTOodvZl7Us,5386
|
5
|
-
json_repair-0.3.0.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
|
6
|
-
json_repair-0.3.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
7
|
-
json_repair-0.3.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|