json-repair 0.30.0__py3-none-any.whl → 0.30.2__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- json_repair/json_context.py +3 -5
- json_repair/json_parser.py +39 -21
- json_repair/json_repair.py +68 -16
- {json_repair-0.30.0.dist-info → json_repair-0.30.2.dist-info}/METADATA +34 -5
- json_repair-0.30.2.dist-info/RECORD +13 -0
- {json_repair-0.30.0.dist-info → json_repair-0.30.2.dist-info}/WHEEL +1 -1
- json_repair-0.30.0.dist-info/RECORD +0 -13
- {json_repair-0.30.0.dist-info → json_repair-0.30.2.dist-info}/LICENSE +0 -0
- {json_repair-0.30.0.dist-info → json_repair-0.30.2.dist-info}/entry_points.txt +0 -0
- {json_repair-0.30.0.dist-info → json_repair-0.30.2.dist-info}/top_level.txt +0 -0
json_repair/json_context.py
CHANGED
@@ -24,11 +24,9 @@ class JsonContext:
|
|
24
24
|
Returns:
|
25
25
|
None
|
26
26
|
"""
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
self.current = value
|
31
|
-
self.empty = False
|
27
|
+
self.context.append(value)
|
28
|
+
self.current = value
|
29
|
+
self.empty = False
|
32
30
|
|
33
31
|
def reset(self) -> None:
|
34
32
|
"""
|
json_repair/json_parser.py
CHANGED
@@ -314,31 +314,49 @@ class JSONParser:
|
|
314
314
|
if next_c:
|
315
315
|
i += 1
|
316
316
|
# found a delimiter, now we need to check that is followed strictly by a comma or brace
|
317
|
+
# or the string ended
|
317
318
|
i = self.skip_whitespaces_at(idx=i, move_main_index=False)
|
318
319
|
next_c = self.get_char_at(i)
|
319
|
-
if next_c
|
320
|
+
if not next_c or next_c in [",", "}"]:
|
320
321
|
rstring_delimiter_missing = False
|
322
|
+
else:
|
323
|
+
# OK but this could still be some garbage at the end of the string
|
324
|
+
# So we need to check if we find a new lstring_delimiter afterwards
|
325
|
+
# If we do, this is a missing delimiter
|
326
|
+
i = self.skip_to_character(character=lstring_delimiter, idx=i)
|
327
|
+
next_c = self.get_char_at(i)
|
328
|
+
if not next_c:
|
329
|
+
rstring_delimiter_missing = False
|
321
330
|
else:
|
322
|
-
#
|
323
|
-
|
324
|
-
#
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
#
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
331
|
+
# There could be a case in which even the next key:value is missing delimiters
|
332
|
+
# because it might be a systemic issue with the output
|
333
|
+
# So let's check if we can find a : in the string instead
|
334
|
+
i = self.skip_to_character(character=":", idx=1)
|
335
|
+
next_c = self.get_char_at(i)
|
336
|
+
if next_c:
|
337
|
+
# OK then this is a systemic issue with the output
|
338
|
+
break
|
339
|
+
else:
|
340
|
+
# skip any whitespace first
|
341
|
+
i = self.skip_whitespaces_at(idx=1, move_main_index=False)
|
342
|
+
# We couldn't find any rstring_delimiter before the end of the string
|
343
|
+
# check if this is the last string of an object and therefore we can keep going
|
344
|
+
# make an exception if this is the last char before the closing brace
|
345
|
+
j = self.skip_to_character(character="}", idx=i)
|
346
|
+
if j - i > 1:
|
347
|
+
# Ok it's not right after the comma
|
348
|
+
# Let's ignore
|
349
|
+
rstring_delimiter_missing = False
|
350
|
+
# Check that j was not out of bound
|
351
|
+
elif self.get_char_at(j):
|
352
|
+
# Check for an unmatched opening brace in string_acc
|
353
|
+
for c in reversed(string_acc):
|
354
|
+
if c == "{":
|
355
|
+
# Ok then this is part of the string
|
356
|
+
rstring_delimiter_missing = False
|
357
|
+
break
|
358
|
+
elif c == "}":
|
359
|
+
break
|
342
360
|
if rstring_delimiter_missing:
|
343
361
|
self.log(
|
344
362
|
"While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here",
|
json_repair/json_repair.py
CHANGED
@@ -41,10 +41,18 @@ def repair_json(
|
|
41
41
|
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
42
42
|
"""
|
43
43
|
Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
44
|
+
|
45
|
+
Args:
|
46
|
+
json_str (str, optional): The JSON string to repair. Defaults to an empty string.
|
47
|
+
return_objects (bool, optional): If True, return the decoded data structure. Defaults to False.
|
48
|
+
skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
|
49
|
+
logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
|
50
|
+
json_fd (Optional[TextIO], optional): File descriptor for JSON input. Do not use! Use `from_file` or `load` instead. Defaults to None.
|
51
|
+
ensure_ascii (bool, optional): Set to False to avoid converting non-latin characters to ascii (for example when using chinese characters). Defaults to True. Ignored if `skip_json_loads` is True.
|
52
|
+
chunk_length (int, optional): Size in bytes of the file chunks to read at once. Ignored if `json_fd` is None. Do not use! Use `from_file` or `load` instead. Defaults to 1MB.
|
53
|
+
|
54
|
+
Returns:
|
55
|
+
Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON or a tuple with the repaired JSON and repair log.
|
48
56
|
"""
|
49
57
|
parser = JSONParser(json_str, json_fd, logging, chunk_length)
|
50
58
|
if skip_json_loads:
|
@@ -72,6 +80,14 @@ def loads(
|
|
72
80
|
"""
|
73
81
|
This function works like `json.loads()` except that it will fix your JSON in the process.
|
74
82
|
It is a wrapper around the `repair_json()` function with `return_objects=True`.
|
83
|
+
|
84
|
+
Args:
|
85
|
+
json_str (str): The JSON string to load and repair.
|
86
|
+
skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
|
87
|
+
logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
|
88
|
+
|
89
|
+
Returns:
|
90
|
+
Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
|
75
91
|
"""
|
76
92
|
return repair_json(
|
77
93
|
json_str=json_str,
|
@@ -90,6 +106,15 @@ def load(
|
|
90
106
|
"""
|
91
107
|
This function works like `json.load()` except that it will fix your JSON in the process.
|
92
108
|
It is a wrapper around the `repair_json()` function with `json_fd=fd` and `return_objects=True`.
|
109
|
+
|
110
|
+
Args:
|
111
|
+
fd (TextIO): File descriptor for JSON input.
|
112
|
+
skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
|
113
|
+
logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
|
114
|
+
chunk_length (int, optional): Size in bytes of the file chunks to read at once. Defaults to 1MB.
|
115
|
+
|
116
|
+
Returns:
|
117
|
+
Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
|
93
118
|
"""
|
94
119
|
return repair_json(
|
95
120
|
json_fd=fd,
|
@@ -108,20 +133,48 @@ def from_file(
|
|
108
133
|
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
109
134
|
"""
|
110
135
|
This function is a wrapper around `load()` so you can pass the filename as string
|
136
|
+
|
137
|
+
Args:
|
138
|
+
filename (str): The name of the file containing JSON data to load and repair.
|
139
|
+
skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
|
140
|
+
logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
|
141
|
+
chunk_length (int, optional): Size in bytes of the file chunks to read at once. Defaults to 1MB.
|
142
|
+
|
143
|
+
Returns:
|
144
|
+
Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
|
111
145
|
"""
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
fd.close()
|
146
|
+
with open(filename) as fd:
|
147
|
+
jsonobj = load(
|
148
|
+
fd=fd,
|
149
|
+
skip_json_loads=skip_json_loads,
|
150
|
+
logging=logging,
|
151
|
+
chunk_length=chunk_length,
|
152
|
+
)
|
120
153
|
|
121
154
|
return jsonobj
|
122
155
|
|
123
156
|
|
124
157
|
def cli(inline_args: Optional[List[str]] = None) -> int:
|
158
|
+
"""
|
159
|
+
Command-line interface for repairing and parsing JSON files.
|
160
|
+
|
161
|
+
Args:
|
162
|
+
inline_args (Optional[List[str]]): List of command-line arguments for testing purposes. Defaults to None.
|
163
|
+
- filename (str): The JSON file to repair
|
164
|
+
- -i, --inline (bool): Replace the file inline instead of returning the output to stdout.
|
165
|
+
- -o, --output TARGET (str): If specified, the output will be written to TARGET filename instead of stdout.
|
166
|
+
- --ensure_ascii (bool): Pass ensure_ascii=True to json.dumps(). Will pass False otherwise.
|
167
|
+
- --indent INDENT (int): Number of spaces for indentation (Default 2).
|
168
|
+
|
169
|
+
Returns:
|
170
|
+
int: Exit code of the CLI operation.
|
171
|
+
|
172
|
+
Raises:
|
173
|
+
Exception: Any exception that occurs during file processing.
|
174
|
+
|
175
|
+
Example:
|
176
|
+
>>> cli(['example.json', '--indent', '4'])
|
177
|
+
"""
|
125
178
|
parser = argparse.ArgumentParser(description="Repair and parse JSON files.")
|
126
179
|
parser.add_argument("filename", help="The JSON file to repair")
|
127
180
|
parser.add_argument(
|
@@ -167,14 +220,13 @@ def cli(inline_args: Optional[List[str]] = None) -> int:
|
|
167
220
|
result = from_file(args.filename)
|
168
221
|
|
169
222
|
if args.inline or args.output:
|
170
|
-
|
171
|
-
|
172
|
-
fd.close()
|
223
|
+
with open(args.output or args.filename, mode="w") as fd:
|
224
|
+
json.dump(result, fd, indent=args.indent, ensure_ascii=ensure_ascii)
|
173
225
|
else:
|
174
226
|
print(json.dumps(result, indent=args.indent, ensure_ascii=ensure_ascii))
|
175
227
|
except Exception as e: # pragma: no cover
|
176
228
|
print(f"Error: {str(e)}", file=sys.stderr)
|
177
|
-
|
229
|
+
return 1
|
178
230
|
|
179
231
|
return 0 # Success
|
180
232
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: json_repair
|
3
|
-
Version: 0.30.0
|
3
|
+
Version: 0.30.2
|
4
4
|
Summary: A package to repair broken json strings
|
5
5
|
Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
|
6
6
|
License: MIT License
|
@@ -32,18 +32,20 @@ Keywords: JSON,REPAIR,LLM,PARSER
|
|
32
32
|
Classifier: Programming Language :: Python :: 3
|
33
33
|
Classifier: License :: OSI Approved :: MIT License
|
34
34
|
Classifier: Operating System :: OS Independent
|
35
|
-
Requires-Python: >=3.
|
35
|
+
Requires-Python: >=3.9
|
36
36
|
Description-Content-Type: text/markdown
|
37
37
|
License-File: LICENSE
|
38
38
|
|
39
39
|
[](https://pypi.org/project/json-repair/)
|
40
|
-

|
41
41
|
[](https://pypi.org/project/json-repair/)
|
42
42
|
[](https://github.com/sponsors/mangiucugna)
|
43
|
+
[](https://github.com/mangiucugna/json_repair/stargazers)
|
44
|
+
|
43
45
|
|
44
46
|
This simple package can be used to fix an invalid json string. To know all cases in which this package will work, check out the unit test.
|
45
47
|
|
46
|
-
|
48
|
+

|
47
49
|
|
48
50
|
---
|
49
51
|
# Offer me a beer
|
@@ -54,6 +56,8 @@ If you find this library useful, you can help me by donating toward my monthly b
|
|
54
56
|
# Demo
|
55
57
|
If you are unsure if this library will fix your specific problem, or simply want your json validated online, you can visit the demo site on GitHub pages: https://mangiucugna.github.io/json_repair/
|
56
58
|
|
59
|
+
Or hear an [audio deep dive generated by Google's NotebookLM](https://notebooklm.google.com/notebook/05312bb3-f6f3-4e49-a99b-bd51db64520b/audio) for an introduction to the module
|
60
|
+
|
57
61
|
---
|
58
62
|
|
59
63
|
# Motivation
|
@@ -64,6 +68,11 @@ I searched for a lightweight python package that was able to reliably fix this p
|
|
64
68
|
|
65
69
|
*So I wrote one*
|
66
70
|
|
71
|
+
### Wouldn't GPT-4o Structured Output make this library outdated?
|
72
|
+
|
73
|
+
As part of my job we use OpenAI APIs and we noticed that even with structured output sometimes the result isn't a fully valid json.
|
74
|
+
So we still use this library to cover those outliers.
|
75
|
+
|
67
76
|
# Supported use cases
|
68
77
|
|
69
78
|
### Fixing Syntax Errors in JSON
|
@@ -144,6 +153,26 @@ and another method to read from a file:
|
|
144
153
|
|
145
154
|
Keep in mind that the library will not catch any IO-related exception and those will need to be managed by you
|
146
155
|
|
156
|
+
### Non-Latin characters
|
157
|
+
|
158
|
+
When working with non-Latin characters (such as Chinese, Japanese, or Korean), you need to pass `ensure_ascii=False` to `repair_json()` in order to preserve the non-Latin characters in the output.
|
159
|
+
|
160
|
+
Here's an example using Chinese characters:
|
161
|
+
|
162
|
+
repair_json("{'test_chinese_ascii':'统一码'}")
|
163
|
+
|
164
|
+
will return
|
165
|
+
|
166
|
+
{"test_chinese_ascii": "\u7edf\u4e00\u7801"}
|
167
|
+
|
168
|
+
Instead, passing `ensure_ascii=False`:
|
169
|
+
|
170
|
+
repair_json("{'test_chinese_ascii':'统一码'}", ensure_ascii=False)
|
171
|
+
|
172
|
+
will return
|
173
|
+
|
174
|
+
{"test_chinese_ascii": "统一码"}
|
175
|
+
|
147
176
|
### Performance considerations
|
148
177
|
If you find this library too slow because it is using `json.loads()`, you can skip that by passing `skip_json_loads=True` to `repair_json`. Like:
|
149
178
|
|
@@ -226,7 +255,7 @@ This module will parse the JSON file following the BNF definition:
|
|
226
255
|
<object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
|
227
256
|
<member> ::= <string> ': ' <json> ; A pair consisting of a name, and a JSON value
|
228
257
|
|
229
|
-
If something is wrong (a missing
|
258
|
+
If something is wrong (a missing parentheses or quotes for example) it will use a few simple heuristics to fix the JSON string:
|
230
259
|
- Add the missing parentheses if the parser believes that the array or object should be closed
|
231
260
|
- Quote strings or add missing single quotes
|
232
261
|
- Adjust whitespaces and remove line breaks
|
@@ -0,0 +1,13 @@
|
|
1
|
+
json_repair/__init__.py,sha256=c4L2kZrHvWEKfj_ODU2naliNuvU6FlFVxtF0hbLe6s8,178
|
2
|
+
json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
|
3
|
+
json_repair/json_context.py,sha256=mm6dOyrPJ1sDskTORZSXCW7W9-5veMlUKqXQ3Hw3EG4,971
|
4
|
+
json_repair/json_parser.py,sha256=7qVtBWheWPrLDJxej4lwdgi0zRH_TBReOXGsy18ZfZs,28698
|
5
|
+
json_repair/json_repair.py,sha256=LINLSJBs3cJMfs1YRDaIpfWR5PJLs87Oe06G5yQjY18,9729
|
6
|
+
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
+
json_repair/string_file_wrapper.py,sha256=EHLhNBWoyUitzT08thytYJiNZh_klEFwfT8zutPSdb4,3905
|
8
|
+
json_repair-0.30.2.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
9
|
+
json_repair-0.30.2.dist-info/METADATA,sha256=SLQn7rdr1YWIy49JdefVozK6yiz2Y61mh0UDde8omSI,11794
|
10
|
+
json_repair-0.30.2.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
|
11
|
+
json_repair-0.30.2.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
|
12
|
+
json_repair-0.30.2.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
13
|
+
json_repair-0.30.2.dist-info/RECORD,,
|
@@ -1,13 +0,0 @@
|
|
1
|
-
json_repair/__init__.py,sha256=c4L2kZrHvWEKfj_ODU2naliNuvU6FlFVxtF0hbLe6s8,178
|
2
|
-
json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
|
3
|
-
json_repair/json_context.py,sha256=DdJu3DJR-ANvr8KrWfJqdtOE3uI6_B0VQidKvE3PjJA,1080
|
4
|
-
json_repair/json_parser.py,sha256=UQgXtXTRo0oLb4N7GhPAELWtS0E9zGSPBXfnnGOCgfo,27527
|
5
|
-
json_repair/json_repair.py,sha256=Er6klw5GgmdnLmNM9GXD9gfTi8Mn9cvvTUiVITFA-1E,6101
|
6
|
-
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
json_repair/string_file_wrapper.py,sha256=EHLhNBWoyUitzT08thytYJiNZh_klEFwfT8zutPSdb4,3905
|
8
|
-
json_repair-0.30.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
9
|
-
json_repair-0.30.0.dist-info/METADATA,sha256=72pBek4v2f-1zqIwWXjaPcG8reqdX9zpy1dkjT9yspI,10686
|
10
|
-
json_repair-0.30.0.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
11
|
-
json_repair-0.30.0.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
|
12
|
-
json_repair-0.30.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
13
|
-
json_repair-0.30.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|