json-repair 0.16.3__tar.gz → 0.17.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {json_repair-0.16.3/src/json_repair.egg-info → json_repair-0.17.0}/PKG-INFO +1 -1
- {json_repair-0.16.3 → json_repair-0.17.0}/pyproject.toml +1 -1
- {json_repair-0.16.3 → json_repair-0.17.0}/src/json_repair/json_repair.py +151 -116
- {json_repair-0.16.3 → json_repair-0.17.0/src/json_repair.egg-info}/PKG-INFO +1 -1
- json_repair-0.17.0/tests/test_json_repair.py +578 -0
- {json_repair-0.16.3 → json_repair-0.17.0}/tests/test_performance.py +18 -17
- json_repair-0.16.3/tests/test_json_repair.py +0 -305
- {json_repair-0.16.3 → json_repair-0.17.0}/LICENSE +0 -0
- {json_repair-0.16.3 → json_repair-0.17.0}/README.md +0 -0
- {json_repair-0.16.3 → json_repair-0.17.0}/setup.cfg +0 -0
- {json_repair-0.16.3 → json_repair-0.17.0}/src/json_repair/__init__.py +0 -0
- {json_repair-0.16.3 → json_repair-0.17.0}/src/json_repair.egg-info/SOURCES.txt +0 -0
- {json_repair-0.16.3 → json_repair-0.17.0}/src/json_repair.egg-info/dependency_links.txt +0 -0
- {json_repair-0.16.3 → json_repair-0.17.0}/src/json_repair.egg-info/top_level.txt +0 -0
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
|
|
3
3
|
build-backend = "setuptools.build_meta"
|
4
4
|
[project]
|
5
5
|
name = "json_repair"
|
6
|
-
version = "0.
|
6
|
+
version = "0.17.0"
|
7
7
|
license = {file = "LICENSE"}
|
8
8
|
authors = [
|
9
9
|
{ name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },
|
@@ -11,7 +11,7 @@ This module will parse the JSON file following the BNF definition:
|
|
11
11
|
|
12
12
|
<container> ::= <object> | <array>
|
13
13
|
<array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
|
14
|
-
<object> ::= '{' [ <
|
14
|
+
<object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
|
15
15
|
<member> ::= <string> ': ' <json> ; A pair consisting of a name, and a JSON value
|
16
16
|
|
17
17
|
If something is wrong (a missing parenthesis or quote, for example) it will use a few simple heuristics to fix the JSON string:
|
@@ -27,9 +27,11 @@ from typing import Any, Dict, List, Union, TextIO
|
|
27
27
|
|
28
28
|
|
29
29
|
class JSONParser:
|
30
|
-
def __init__(self, json_str: str, logging: bool = False) -> None:
|
30
|
+
def __init__(self, json_str: str, json_fd: TextIO, logging: bool = False) -> None:
|
31
31
|
# The string to parse
|
32
32
|
self.json_str = json_str
|
33
|
+
# Alternatively, the file descriptor with a json file in it
|
34
|
+
self.json_fd = json_fd
|
33
35
|
# Index is our iterator that will keep track of which character we are looking at right now
|
34
36
|
self.index = 0
|
35
37
|
# This is used in the object member parsing to manage the special cases of missing quotes in key or value
|
@@ -56,48 +58,28 @@ class JSONParser:
|
|
56
58
|
return ""
|
57
59
|
# <object> starts with '{'
|
58
60
|
# but an object key must be a string
|
59
|
-
elif
|
61
|
+
elif char == "{":
|
60
62
|
self.index += 1
|
61
63
|
return self.parse_object()
|
62
64
|
# <array> starts with '['
|
63
65
|
# but an object key must be a string
|
64
|
-
elif
|
66
|
+
elif char == "[":
|
65
67
|
self.index += 1
|
66
68
|
return self.parse_array()
|
67
69
|
# there can be an edge case in which a key is empty and at the end of an object
|
68
70
|
# like "key": }. We return an empty string here to close the object properly
|
69
|
-
elif
|
71
|
+
elif char == "}":
|
70
72
|
self.log(
|
71
73
|
"At the end of an object we found a key with missing value, skipping",
|
72
74
|
"info",
|
73
75
|
)
|
74
76
|
return ""
|
75
|
-
# <string> starts with
|
76
|
-
elif char
|
77
|
+
# <string> starts with a quote
|
78
|
+
elif char in ['"', "'", "“"] or char.isalpha():
|
77
79
|
return self.parse_string()
|
78
|
-
elif char == "'":
|
79
|
-
return self.parse_string(string_quotes="'")
|
80
|
-
elif char == "“":
|
81
|
-
return self.parse_string(string_quotes=["“", "”"])
|
82
80
|
# <number> starts with [0-9] or minus
|
83
|
-
elif (
|
84
|
-
self.get_context() != ""
|
85
|
-
and self.get_context() != "object_key"
|
86
|
-
and char.isdigit()
|
87
|
-
or char == "-"
|
88
|
-
or char == "."
|
89
|
-
):
|
81
|
+
elif char.isdigit() or char == "-" or char == ".":
|
90
82
|
return self.parse_number()
|
91
|
-
# <boolean> could be (T)rue or (F)alse or (N)ull
|
92
|
-
elif (
|
93
|
-
self.get_context() != ""
|
94
|
-
and self.get_context() != "object_key"
|
95
|
-
and char.lower() in ["t", "f", "n"]
|
96
|
-
):
|
97
|
-
return self.parse_boolean_or_null()
|
98
|
-
# This might be a <string> that is missing the starting '"'
|
99
|
-
elif self.get_context() != "" and char.isalpha():
|
100
|
-
return self.parse_string()
|
101
83
|
# If everything else fails, we just ignore and move on
|
102
84
|
else:
|
103
85
|
self.index += 1
|
@@ -117,11 +99,9 @@ class JSONParser:
|
|
117
99
|
# Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
|
118
100
|
if (self.get_char_at() or "") == ":":
|
119
101
|
self.log(
|
120
|
-
"While parsing an object we found a : before a key,
|
102
|
+
"While parsing an object we found a : before a key, ignoring",
|
121
103
|
"info",
|
122
104
|
)
|
123
|
-
self.remove_char_at()
|
124
|
-
self.insert_char_at(",")
|
125
105
|
self.index += 1
|
126
106
|
|
127
107
|
# We are now searching for the string key
|
@@ -133,7 +113,7 @@ class JSONParser:
|
|
133
113
|
# <member> starts with a <string>
|
134
114
|
key = ""
|
135
115
|
while key == "" and self.get_char_at():
|
136
|
-
key = self.
|
116
|
+
key = self.parse_string()
|
137
117
|
|
138
118
|
# This can happen sometimes like { "": "value" }
|
139
119
|
if key == "" and self.get_char_at() == ":":
|
@@ -153,10 +133,10 @@ class JSONParser:
|
|
153
133
|
# An extreme case of missing ":" after a key
|
154
134
|
if (self.get_char_at() or "") != ":":
|
155
135
|
self.log(
|
156
|
-
"While parsing an object we missed a : after a key
|
136
|
+
"While parsing an object we missed a : after a key",
|
157
137
|
"info",
|
158
138
|
)
|
159
|
-
|
139
|
+
|
160
140
|
self.index += 1
|
161
141
|
self.reset_context()
|
162
142
|
self.set_context("object_value")
|
@@ -176,10 +156,10 @@ class JSONParser:
|
|
176
156
|
# Especially at the end of an LLM generated json you might miss the last "}"
|
177
157
|
if (self.get_char_at() or "}") != "}":
|
178
158
|
self.log(
|
179
|
-
"While parsing an object, we couldn't find the closing },
|
159
|
+
"While parsing an object, we couldn't find the closing }, ignoring",
|
180
160
|
"info",
|
181
161
|
)
|
182
|
-
|
162
|
+
|
183
163
|
self.index += 1
|
184
164
|
return obj
|
185
165
|
|
@@ -205,6 +185,10 @@ class JSONParser:
|
|
205
185
|
char = self.get_char_at()
|
206
186
|
# If this is the right value of an object and we are closing the object, it means the array is over
|
207
187
|
if self.get_context() == "object_value" and char == "}":
|
188
|
+
self.log(
|
189
|
+
"While parsing an array inside an object, we got to the end without finding a ]. Stopped parsing",
|
190
|
+
"info",
|
191
|
+
)
|
208
192
|
break
|
209
193
|
|
210
194
|
# Especially at the end of an LLM generated json you might miss the last "]"
|
@@ -217,35 +201,68 @@ class JSONParser:
|
|
217
201
|
if char == ",":
|
218
202
|
# Remove trailing "," before adding the "]"
|
219
203
|
self.log(
|
220
|
-
"While parsing an array,
|
204
|
+
"While parsing an array, found a trailing , before adding ]",
|
221
205
|
"info",
|
222
206
|
)
|
223
|
-
|
224
|
-
self.insert_char_at("]")
|
207
|
+
|
225
208
|
self.index -= 1
|
226
209
|
|
227
210
|
self.index += 1
|
228
211
|
self.reset_context()
|
229
212
|
return arr
|
230
213
|
|
231
|
-
def parse_string(self
|
214
|
+
def parse_string(self) -> str:
|
232
215
|
# <string> is a string of valid characters enclosed in quotes
|
233
216
|
# i.e. { name: "John" }
|
234
217
|
# Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
|
235
218
|
|
236
219
|
# Flag to manage corner cases related to missing starting quote
|
237
|
-
|
220
|
+
missing_quotes = False
|
238
221
|
doubled_quotes = False
|
239
222
|
lstring_delimiter = rstring_delimiter = '"'
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
223
|
+
|
224
|
+
char = self.get_char_at()
|
225
|
+
# A valid string can only start with a valid quote or, in our case, with a literal
|
226
|
+
while char and char not in ['"', "'", "“"] and not char.isalpha():
|
227
|
+
self.index += 1
|
228
|
+
char = self.get_char_at()
|
229
|
+
|
230
|
+
# Ensuring we use the right delimiter
|
231
|
+
if char == "'":
|
232
|
+
lstring_delimiter = rstring_delimiter = "'"
|
233
|
+
elif char == "“":
|
234
|
+
lstring_delimiter = "“"
|
235
|
+
rstring_delimiter = "”"
|
236
|
+
elif char.isalpha():
|
237
|
+
# This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
|
238
|
+
if char.lower() in ["t", "f", "n"]:
|
239
|
+
value = self.parse_boolean_or_null()
|
240
|
+
if value != "":
|
241
|
+
return value
|
242
|
+
self.log(
|
243
|
+
"While parsing a string, we found a literal instead of a quote",
|
244
|
+
"info",
|
245
|
+
)
|
246
|
+
if self.get_context() == "":
|
247
|
+
# A string literal in the wild isn't a valid json and not something we can fix
|
248
|
+
self.log(
|
249
|
+
"While parsing a string, we found a literal outside of context, ignoring it",
|
250
|
+
"info",
|
251
|
+
)
|
252
|
+
self.index += 1
|
253
|
+
return self.parse_json()
|
254
|
+
self.log(
|
255
|
+
"While parsing a string, we found no starting quote, ignoring", "info"
|
256
|
+
)
|
257
|
+
missing_quotes = True
|
258
|
+
|
259
|
+
if not missing_quotes:
|
260
|
+
self.index += 1
|
261
|
+
|
245
262
|
# There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
|
246
|
-
if self.get_char_at(
|
263
|
+
if self.get_char_at() == lstring_delimiter:
|
247
264
|
# This is a valid exception only if it's closed by a double delimiter again
|
248
|
-
i =
|
265
|
+
i = 1
|
249
266
|
next_c = self.get_char_at(i)
|
250
267
|
while next_c and next_c != rstring_delimiter:
|
251
268
|
i += 1
|
@@ -259,18 +276,9 @@ class JSONParser:
|
|
259
276
|
)
|
260
277
|
doubled_quotes = True
|
261
278
|
self.index += 1
|
262
|
-
char = self.get_char_at()
|
263
|
-
if char != lstring_delimiter:
|
264
|
-
self.log(
|
265
|
-
"While parsing a string, we found no starting quote, adding it", "info"
|
266
|
-
)
|
267
|
-
self.insert_char_at(lstring_delimiter)
|
268
|
-
fixed_quotes = True
|
269
|
-
else:
|
270
|
-
self.index += 1
|
271
279
|
|
272
|
-
#
|
273
|
-
|
280
|
+
# Initialize our return value
|
281
|
+
string_acc = ""
|
274
282
|
|
275
283
|
# Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
|
276
284
|
# In that case we need to use the ":|,|}" characters as terminators of the string
|
@@ -280,22 +288,25 @@ class JSONParser:
|
|
280
288
|
# * If we are fixing missing quotes in an object, when it finds the special terminators
|
281
289
|
char = self.get_char_at()
|
282
290
|
while char and char != rstring_delimiter:
|
283
|
-
if
|
291
|
+
if missing_quotes:
|
284
292
|
if self.get_context() == "object_key" and (
|
285
293
|
char == ":" or char.isspace()
|
286
294
|
):
|
287
295
|
break
|
288
296
|
elif self.get_context() == "object_value" and char in [",", "}"]:
|
289
297
|
break
|
298
|
+
string_acc += char
|
290
299
|
self.index += 1
|
291
300
|
char = self.get_char_at()
|
292
301
|
# If the string contains an escaped character we should respect that or remove the escape
|
293
302
|
if self.get_char_at(-1) == "\\":
|
294
303
|
if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
|
304
|
+
string_acc += char
|
295
305
|
self.index += 1
|
296
306
|
char = self.get_char_at()
|
297
307
|
else:
|
298
|
-
|
308
|
+
# Remove this character from the final output
|
309
|
+
string_acc = string_acc[:-2] + string_acc[-1:]
|
299
310
|
self.index -= 1
|
300
311
|
# ChatGPT sometimes forgets to quote stuff in html tags or markdown, so we do this whole thing here
|
301
312
|
if char == rstring_delimiter:
|
@@ -305,8 +316,6 @@ class JSONParser:
|
|
305
316
|
"While parsing a string, we found a doubled quote, ignoring it",
|
306
317
|
"info",
|
307
318
|
)
|
308
|
-
# self destruct this character
|
309
|
-
self.remove_char_at()
|
310
319
|
else:
|
311
320
|
# Check if eventually there is a rstring delimiter, otherwise we bail
|
312
321
|
i = 1
|
@@ -343,12 +352,13 @@ class JSONParser:
|
|
343
352
|
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
|
344
353
|
"info",
|
345
354
|
)
|
355
|
+
string_acc += char
|
346
356
|
self.index += 1
|
347
357
|
char = self.get_char_at()
|
348
358
|
|
349
359
|
if (
|
350
360
|
char
|
351
|
-
and
|
361
|
+
and missing_quotes
|
352
362
|
and self.get_context() == "object_key"
|
353
363
|
and char.isspace()
|
354
364
|
):
|
@@ -360,19 +370,16 @@ class JSONParser:
|
|
360
370
|
if self.get_char_at() not in [":", ","]:
|
361
371
|
return ""
|
362
372
|
|
363
|
-
end = self.index
|
364
|
-
|
365
373
|
# A fallout of the previous special case in the while loop, we need to update the index only if we had a closing quote
|
366
374
|
if char != rstring_delimiter:
|
367
375
|
self.log(
|
368
|
-
"While parsing a string, we missed the closing quote,
|
376
|
+
"While parsing a string, we missed the closing quote, ignoring",
|
369
377
|
"info",
|
370
378
|
)
|
371
|
-
self.insert_char_at(rstring_delimiter)
|
372
379
|
else:
|
373
380
|
self.index += 1
|
374
381
|
|
375
|
-
return
|
382
|
+
return string_acc.rstrip()
|
376
383
|
|
377
384
|
def parse_number(self) -> Union[float, int, str]:
|
378
385
|
# <number> is a valid real number expressed in one of a number of given formats
|
@@ -395,51 +402,57 @@ class JSONParser:
|
|
395
402
|
except ValueError:
|
396
403
|
return number_str
|
397
404
|
else:
|
398
|
-
#
|
399
|
-
return self.
|
405
|
+
# If nothing works, let's skip and keep parsing
|
406
|
+
return self.parse_json()
|
400
407
|
|
401
408
|
def parse_boolean_or_null(self) -> Union[bool, str, None]:
|
402
409
|
# <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
|
403
|
-
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
|
410
|
+
starting_index = self.index
|
411
|
+
value = ""
|
412
|
+
char = self.get_char_at().lower()
|
413
|
+
if char == "t":
|
414
|
+
value = ("true", True)
|
415
|
+
elif char == "f":
|
416
|
+
value = ("false", False)
|
417
|
+
elif char == "n":
|
418
|
+
value = ("null", None)
|
419
|
+
|
420
|
+
if len(value):
|
421
|
+
i = 0
|
422
|
+
while char and i < len(value[0]) and char == value[0][i]:
|
423
|
+
i += 1
|
424
|
+
self.index += 1
|
425
|
+
char = self.get_char_at().lower()
|
426
|
+
if i == len(value[0]):
|
427
|
+
return value[1]
|
408
428
|
|
409
|
-
#
|
410
|
-
|
411
|
-
|
412
|
-
def insert_char_at(self, char: str) -> None:
|
413
|
-
self.json_str = self.json_str[: self.index] + char + self.json_str[self.index :]
|
414
|
-
self.index += 1
|
429
|
+
# If nothing works reset the index before returning
|
430
|
+
self.index = starting_index
|
431
|
+
return ""
|
415
432
|
|
416
433
|
def get_char_at(self, count: int = 0) -> Union[str, bool]:
|
417
|
-
|
418
|
-
|
419
|
-
|
420
|
-
|
421
|
-
|
422
|
-
|
423
|
-
|
424
|
-
|
425
|
-
|
426
|
-
|
427
|
-
|
434
|
+
if self.json_fd:
|
435
|
+
self.json_fd.seek(self.index + count)
|
436
|
+
char = self.json_fd.read(1)
|
437
|
+
if char == "":
|
438
|
+
return False
|
439
|
+
return char
|
440
|
+
else:
|
441
|
+
# Why not use something simpler? Because we might be out of bounds and doing this check all the time is annoying
|
442
|
+
try:
|
443
|
+
return self.json_str[self.index + count]
|
444
|
+
except IndexError:
|
445
|
+
return False
|
428
446
|
|
429
447
|
def skip_whitespaces_at(self) -> None:
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
434
|
-
|
435
|
-
except IndexError:
|
436
|
-
return
|
448
|
+
"""
|
449
|
+
This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
|
450
|
+
"""
|
451
|
+
|
452
|
+
char = self.get_char_at()
|
437
453
|
while char and char.isspace():
|
438
454
|
self.index += 1
|
439
|
-
|
440
|
-
char = self.json_str[self.index]
|
441
|
-
except IndexError:
|
442
|
-
return
|
455
|
+
char = self.get_char_at()
|
443
456
|
|
444
457
|
def set_context(self, value: str) -> None:
|
445
458
|
# If a value is provided update the context variable and save in stack
|
@@ -460,23 +473,31 @@ class JSONParser:
|
|
460
473
|
|
461
474
|
def log(self, text: str, level: str) -> None:
|
462
475
|
if level == self.logger["log_level"]:
|
476
|
+
context = ""
|
477
|
+
if self.json_fd:
|
478
|
+
self.json_fd.seek(self.index - self.logger["window"])
|
479
|
+
context = self.json_fd.read(self.logger["window"] * 2)
|
480
|
+
self.json_fd.seek(self.index)
|
481
|
+
else:
|
482
|
+
context = self.json_str[
|
483
|
+
self.index
|
484
|
+
- self.logger["window"] : self.index
|
485
|
+
+ self.logger["window"]
|
486
|
+
]
|
463
487
|
self.logger["log"].append(
|
464
488
|
{
|
465
489
|
"text": text,
|
466
|
-
"context":
|
467
|
-
self.index
|
468
|
-
- self.logger["window"] : self.index
|
469
|
-
+ self.logger["window"]
|
470
|
-
],
|
490
|
+
"context": context,
|
471
491
|
}
|
472
492
|
)
|
473
493
|
|
474
494
|
|
475
495
|
def repair_json(
|
476
|
-
json_str: str,
|
496
|
+
json_str: str = "",
|
477
497
|
return_objects: bool = False,
|
478
498
|
skip_json_loads: bool = False,
|
479
499
|
logging: bool = False,
|
500
|
+
json_fd: TextIO = None,
|
480
501
|
) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
|
481
502
|
"""
|
482
503
|
Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
|
@@ -485,13 +506,15 @@ def repair_json(
|
|
485
506
|
When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
|
486
507
|
When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
|
487
508
|
"""
|
488
|
-
|
489
|
-
parser = JSONParser(json_str, logging)
|
509
|
+
parser = JSONParser(json_str, json_fd, logging)
|
490
510
|
if skip_json_loads:
|
491
511
|
parsed_json = parser.parse()
|
492
512
|
else:
|
493
513
|
try:
|
494
|
-
|
514
|
+
if json_fd:
|
515
|
+
parsed_json = json.load(json_fd)
|
516
|
+
else:
|
517
|
+
parsed_json = json.loads(json_str)
|
495
518
|
except json.JSONDecodeError:
|
496
519
|
parsed_json = parser.parse()
|
497
520
|
# It's useful to return the actual object instead of the json string, it allows this lib to be a replacement of the json library
|
@@ -507,18 +530,30 @@ def loads(
|
|
507
530
|
This function works like `json.loads()` except that it will fix your JSON in the process.
|
508
531
|
It is a wrapper around the `repair_json()` function with `return_objects=True`.
|
509
532
|
"""
|
510
|
-
return repair_json(
|
533
|
+
return repair_json(
|
534
|
+
json_str=json_str,
|
535
|
+
return_objects=True,
|
536
|
+
skip_json_loads=skip_json_loads,
|
537
|
+
logging=logging,
|
538
|
+
)
|
511
539
|
|
512
540
|
|
513
541
|
def load(
|
514
|
-
|
542
|
+
fd: TextIO, skip_json_loads: bool = False, logging: bool = False
|
515
543
|
) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
|
516
|
-
|
544
|
+
"""
|
545
|
+
This function works like `json.load()` except that it will fix your JSON in the process.
|
546
|
+
It is a wrapper around the `repair_json()` function with `json_fd=fd` and `return_objects=True`.
|
547
|
+
"""
|
548
|
+
return repair_json(json_fd=fd, skip_json_loads=skip_json_loads, logging=logging)
|
517
549
|
|
518
550
|
|
519
551
|
def from_file(
|
520
552
|
filename: str, skip_json_loads: bool = False, logging: bool = False
|
521
553
|
) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
|
554
|
+
"""
|
555
|
+
This function is a wrapper around `load()` so you can pass the filename as a string
|
556
|
+
"""
|
522
557
|
fd = open(filename)
|
523
558
|
jsonobj = load(fd, skip_json_loads, logging)
|
524
559
|
fd.close()
|