json-repair 0.28.3__py3-none-any.whl → 0.29.0__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- json_repair/__main__.py +4 -0
- json_repair/json_repair.py +103 -11
- {json_repair-0.28.3.dist-info → json_repair-0.29.0.dist-info}/METADATA +54 -1
- json_repair-0.29.0.dist-info/RECORD +10 -0
- {json_repair-0.28.3.dist-info → json_repair-0.29.0.dist-info}/WHEEL +1 -1
- json_repair-0.29.0.dist-info/entry_points.txt +2 -0
- json_repair-0.28.3.dist-info/RECORD +0 -8
- {json_repair-0.28.3.dist-info → json_repair-0.29.0.dist-info}/LICENSE +0 -0
- {json_repair-0.28.3.dist-info → json_repair-0.29.0.dist-info}/top_level.txt +0 -0
json_repair/__main__.py
ADDED
json_repair/json_repair.py
CHANGED
@@ -22,26 +22,62 @@ If something is wrong (a missing parentheses or quotes for example) it will use
|
|
22
22
|
All supported use cases are in the unit tests
|
23
23
|
"""
|
24
24
|
|
25
|
+
import argparse
|
25
26
|
import os
|
27
|
+
import sys
|
26
28
|
import json
|
27
29
|
from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
|
28
30
|
|
29
31
|
|
30
32
|
class StringFileWrapper:
|
31
33
|
# This is a trick to simplify the code: it turns file descriptor handling into string handling
|
32
|
-
def __init__(self, fd: TextIO) -> None:
|
34
|
+
def __init__(self, fd: TextIO, CHUNK_LENGTH: int) -> None:
|
33
35
|
self.fd = fd
|
34
36
|
self.length: int = 0
|
37
|
+
# Buffers are strings of CHUNK_LENGTH (1MB by default) that are read from the file
|
38
|
+
# and kept in memory to keep reads low
|
39
|
+
self.buffers: dict[int, str] = {}
|
40
|
+
# CHUNK_LENGTH is in bytes
|
41
|
+
if not CHUNK_LENGTH or CHUNK_LENGTH < 2:
|
42
|
+
CHUNK_LENGTH = 1_000_000
|
43
|
+
self.buffer_length = CHUNK_LENGTH
|
44
|
+
|
45
|
+
def get_buffer(self, index: int) -> str:
|
46
|
+
if self.buffers.get(index) is None:
|
47
|
+
self.fd.seek(index * self.buffer_length)
|
48
|
+
self.buffers[index] = self.fd.read(self.buffer_length)
|
49
|
+
# Save memory by keeping at most 2MB worth of buffered chunks, but never fewer than 2 chunks
|
50
|
+
if len(self.buffers) > max(2, 2_000_000 / self.buffer_length):
|
51
|
+
oldest_key = next(iter(self.buffers))
|
52
|
+
if oldest_key != index:
|
53
|
+
self.buffers.pop(oldest_key)
|
54
|
+
return self.buffers[index]
|
35
55
|
|
36
56
|
def __getitem__(self, index: Union[int, slice]) -> str:
|
57
|
+
# The buffers are addressed like RAM, i.e. as a two-dimensional array:
|
58
|
+
# self.buffers[index]: the row in the array, of length CHUNK_LENGTH (1MB by default); index is `i` integer-divided by CHUNK_LENGTH
|
59
|
+
# self.buffers[index][j]: the column within that row; j is the remainder of `i` divided by CHUNK_LENGTH
|
37
60
|
if isinstance(index, slice):
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
61
|
+
buffer_index = index.start // self.buffer_length
|
62
|
+
buffer_end = index.stop // self.buffer_length
|
63
|
+
if buffer_index == buffer_end:
|
64
|
+
return self.get_buffer(buffer_index)[
|
65
|
+
index.start % self.buffer_length : index.stop % self.buffer_length
|
66
|
+
]
|
67
|
+
else:
|
68
|
+
start_slice = self.get_buffer(buffer_index)[
|
69
|
+
index.start % self.buffer_length :
|
70
|
+
]
|
71
|
+
end_slice = self.get_buffer(buffer_end)[
|
72
|
+
: index.stop % self.buffer_length
|
73
|
+
]
|
74
|
+
middle_slices = [
|
75
|
+
self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)
|
76
|
+
]
|
77
|
+
return start_slice + "".join(middle_slices) + end_slice
|
42
78
|
else:
|
43
|
-
self.
|
44
|
-
return self.
|
79
|
+
buffer_index = index // self.buffer_length
|
80
|
+
return self.get_buffer(buffer_index)[index % self.buffer_length]
|
45
81
|
|
46
82
|
def __len__(self) -> int:
|
47
83
|
if self.length < 1:
|
@@ -69,13 +105,14 @@ class JSONParser:
|
|
69
105
|
json_str: Union[str, StringFileWrapper],
|
70
106
|
json_fd: Optional[TextIO],
|
71
107
|
logging: Optional[bool],
|
108
|
+
json_fd_chunk_length: int = 0,
|
72
109
|
) -> None:
|
73
110
|
# The string to parse
|
74
111
|
self.json_str = json_str
|
75
112
|
# Alternatively, the file descriptor with a json file in it
|
76
113
|
if json_fd:
|
77
114
|
# This is a trick we do to treat the file wrapper as an array
|
78
|
-
self.json_str = StringFileWrapper(json_fd)
|
115
|
+
self.json_str = StringFileWrapper(json_fd, json_fd_chunk_length)
|
79
116
|
# Index is our iterator that will keep track of which character we are looking at right now
|
80
117
|
self.index: int = 0
|
81
118
|
# This is used in the object member parsing to manage the special cases of missing quotes in key or value
|
@@ -639,6 +676,7 @@ def repair_json(
|
|
639
676
|
logging: bool = False,
|
640
677
|
json_fd: Optional[TextIO] = None,
|
641
678
|
ensure_ascii: bool = True,
|
679
|
+
chunk_length: int = 0,
|
642
680
|
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
643
681
|
"""
|
644
682
|
Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
|
@@ -647,7 +685,7 @@ def repair_json(
|
|
647
685
|
When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
|
648
686
|
When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
|
649
687
|
"""
|
650
|
-
parser = JSONParser(json_str, json_fd, logging)
|
688
|
+
parser = JSONParser(json_str, json_fd, logging, chunk_length)
|
651
689
|
if skip_json_loads:
|
652
690
|
parsed_json = parser.parse()
|
653
691
|
else:
|
@@ -683,7 +721,10 @@ def loads(
|
|
683
721
|
|
684
722
|
|
685
723
|
def load(
|
686
|
-
fd: TextIO,
|
724
|
+
fd: TextIO,
|
725
|
+
skip_json_loads: bool = False,
|
726
|
+
logging: bool = False,
|
727
|
+
chunk_length: int = 0,
|
687
728
|
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
688
729
|
"""
|
689
730
|
This function works like `json.load()` except that it will fix your JSON in the process.
|
@@ -691,6 +732,7 @@ def load(
|
|
691
732
|
"""
|
692
733
|
return repair_json(
|
693
734
|
json_fd=fd,
|
735
|
+
chunk_length=chunk_length,
|
694
736
|
return_objects=True,
|
695
737
|
skip_json_loads=skip_json_loads,
|
696
738
|
logging=logging,
|
@@ -701,12 +743,62 @@ def from_file(
|
|
701
743
|
filename: str,
|
702
744
|
skip_json_loads: bool = False,
|
703
745
|
logging: bool = False,
|
746
|
+
chunk_length: int = 0,
|
704
747
|
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
705
748
|
"""
|
706
749
|
This function is a wrapper around `load()` so you can pass the filename as string
|
707
750
|
"""
|
708
751
|
fd = open(filename)
|
709
|
-
jsonobj = load(
|
752
|
+
jsonobj = load(
|
753
|
+
fd=fd,
|
754
|
+
skip_json_loads=skip_json_loads,
|
755
|
+
logging=logging,
|
756
|
+
chunk_length=chunk_length,
|
757
|
+
)
|
710
758
|
fd.close()
|
711
759
|
|
712
760
|
return jsonobj
|
761
|
+
|
762
|
+
|
763
|
+
def cli(): # pragma: no cover
|
764
|
+
parser = argparse.ArgumentParser(description="Repair and parse JSON files.")
|
765
|
+
parser.add_argument("filename", help="The JSON file to repair")
|
766
|
+
parser.add_argument(
|
767
|
+
"-i",
|
768
|
+
"--inline",
|
769
|
+
action="store_true",
|
770
|
+
help="Replace the file inline instead of returning the output to stdout",
|
771
|
+
)
|
772
|
+
parser.add_argument(
|
773
|
+
"--ensure_ascii",
|
774
|
+
action="store_true",
|
775
|
+
help="Pass the ensure_ascii parameter to json.dumps()",
|
776
|
+
)
|
777
|
+
parser.add_argument(
|
778
|
+
"--indent",
|
779
|
+
type=int,
|
780
|
+
default=2,
|
781
|
+
help="Number of spaces for indentation (Default 2)",
|
782
|
+
)
|
783
|
+
|
784
|
+
args = parser.parse_args()
|
785
|
+
|
786
|
+
ensure_ascii = False
|
787
|
+
if args.ensure_ascii:
|
788
|
+
ensure_ascii = True
|
789
|
+
try:
|
790
|
+
result = from_file(args.filename)
|
791
|
+
|
792
|
+
if args.inline:
|
793
|
+
fd = open(args.filename, mode="w")
|
794
|
+
json.dump(result, fd, indent=args.indent, ensure_ascii=ensure_ascii)
|
795
|
+
fd.close()
|
796
|
+
else:
|
797
|
+
print(json.dumps(result, indent=args.indent, ensure_ascii=ensure_ascii))
|
798
|
+
except Exception as e:
|
799
|
+
print(f"Error: {str(e)}", file=sys.stderr)
|
800
|
+
sys.exit(1)
|
801
|
+
|
802
|
+
|
803
|
+
if __name__ == "__main__": # pragma: no cover
|
804
|
+
cli()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: json_repair
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.29.0
|
4
4
|
Summary: A package to repair broken json strings
|
5
5
|
Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
|
6
6
|
License: MIT License
|
@@ -45,6 +45,21 @@ This simple package can be used to fix an invalid json string. To know all cases
|
|
45
45
|
|
46
46
|
Inspired by https://github.com/josdejong/jsonrepair
|
47
47
|
|
48
|
+
---
|
49
|
+
# How to cite
|
50
|
+
If you are using this library in your academic work (as I know many folks are) please find the BibTeX here:
|
51
|
+
|
52
|
+
@software{Baccianella_JSON_Repair_-_2024,
|
53
|
+
author = {Baccianella, Stefano},
|
54
|
+
month = aug,
|
55
|
+
title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
|
56
|
+
url = {https://github.com/mangiucugna/json_repair},
|
57
|
+
version = {0.28.3},
|
58
|
+
year = {2024}
|
59
|
+
}
|
60
|
+
|
61
|
+
Thank you for citing my work and please send me a link to the paper if you can!
|
62
|
+
|
48
63
|
---
|
49
64
|
# Offer me a beer
|
50
65
|
If you find this library useful, you can help me by donating toward my monthly beer budget here: https://github.com/sponsors/mangiucugna
|
@@ -82,6 +97,18 @@ or just
|
|
82
97
|
|
83
98
|
decoded_object = json_repair.repair_json(json_string, return_objects=True)
|
84
99
|
|
100
|
+
### Avoid this antipattern
|
101
|
+
Some users of this library adopt the following pattern:
|
102
|
+
|
103
|
+
obj = {}
|
104
|
+
try:
|
105
|
+
obj = json.loads(string)
|
106
|
+
except json.JSONDecodeError as e:
|
107
|
+
obj = json_repair.loads(string)
|
108
|
+
...
|
109
|
+
|
110
|
+
This is wasteful because `json_repair` will already verify for you if the JSON is valid. If you still want to do that, then add `skip_json_loads=True` to the call as explained in the section below.
|
111
|
+
|
85
112
|
### Read json from a file or file descriptor
|
86
113
|
|
87
114
|
JSON repair also provides a drop-in replacement for `json.load()`:
|
@@ -122,6 +149,32 @@ Some rules of thumb to use:
|
|
122
149
|
- Setting `return_objects=True` will always be faster because the parser returns an object already and it doesn't have to serialize that object to JSON
|
123
150
|
- `skip_json_loads` is faster only if you 100% know that the string is not a valid JSON
|
124
151
|
- If you are having issues with escaping pass the string as **raw** string like: `r"string with escaping\""`
|
152
|
+
|
153
|
+
### Use json_repair from CLI
|
154
|
+
|
155
|
+
Install the library for command-line with:
|
156
|
+
```
|
157
|
+
pipx install json-repair
|
158
|
+
```
|
159
|
+
then run
|
160
|
+
```
|
161
|
+
$ json_repair -h
|
162
|
+
|
163
|
+
usage: json_repair [-h] [-i] [--ensure_ascii] [--indent INDENT] filename
|
164
|
+
|
165
|
+
Repair and parse JSON files.
|
166
|
+
|
167
|
+
positional arguments:
|
168
|
+
filename The JSON file to repair
|
169
|
+
|
170
|
+
options:
|
171
|
+
-h, --help show this help message and exit
|
172
|
+
-i, --inline Replace the file inline instead of returning the output to stdout
|
173
|
+
--ensure_ascii Pass the ensure_ascii parameter to json.dumps()
|
174
|
+
--indent INDENT Number of spaces for indentation (Default 2)
|
175
|
+
```
|
176
|
+
to learn how to use it.
|
177
|
+
|
125
178
|
## Adding to requirements
|
126
179
|
**Please pin this library only on the major version!**
|
127
180
|
|
@@ -0,0 +1,10 @@
|
|
1
|
+
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
+
json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
|
3
|
+
json_repair/json_repair.py,sha256=hltJ3Qa4qFbUD3mVKkYvFWksnCcIZqx8zamKfBpjeNs,33538
|
4
|
+
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
json_repair-0.29.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
6
|
+
json_repair-0.29.0.dist-info/METADATA,sha256=yh0EJo-I1u0R6X-Gq9ETz0WbgmuGIhzR7Icw9W4Kee0,9630
|
7
|
+
json_repair-0.29.0.dist-info/WHEEL,sha256=uCRv0ZEik_232NlR4YDw4Pv3Ajt5bKvMH13NUU7hFuI,91
|
8
|
+
json_repair-0.29.0.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
|
9
|
+
json_repair-0.29.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
10
|
+
json_repair-0.29.0.dist-info/RECORD,,
|
@@ -1,8 +0,0 @@
|
|
1
|
-
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
-
json_repair/json_repair.py,sha256=QShXijcgNG3ejW_rBbmk0RMjJE1KlGvYBhXcQnMZcHo,30129
|
3
|
-
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
-
json_repair-0.28.3.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
5
|
-
json_repair-0.28.3.dist-info/METADATA,sha256=ZIm82pnDJX68089RoprQJq-HrL2LF1LVr4xDTh_6VJI,8043
|
6
|
-
json_repair-0.28.3.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
|
7
|
-
json_repair-0.28.3.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
8
|
-
json_repair-0.28.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|