json-repair 0.28.3__py3-none-any.whl → 0.29.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/__main__.py +4 -0
- json_repair/json_repair.py +103 -11
- {json_repair-0.28.3.dist-info → json_repair-0.29.0.dist-info}/METADATA +54 -1
- json_repair-0.29.0.dist-info/RECORD +10 -0
- {json_repair-0.28.3.dist-info → json_repair-0.29.0.dist-info}/WHEEL +1 -1
- json_repair-0.29.0.dist-info/entry_points.txt +2 -0
- json_repair-0.28.3.dist-info/RECORD +0 -8
- {json_repair-0.28.3.dist-info → json_repair-0.29.0.dist-info}/LICENSE +0 -0
- {json_repair-0.28.3.dist-info → json_repair-0.29.0.dist-info}/top_level.txt +0 -0
json_repair/__main__.py
ADDED
json_repair/json_repair.py
CHANGED
@@ -22,26 +22,62 @@ If something is wrong (missing parentheses or quotes, for example) it will use
|
|
22
22
|
All supported use cases are in the unit tests
|
23
23
|
"""
|
24
24
|
|
25
|
+
import argparse
|
25
26
|
import os
|
27
|
+
import sys
|
26
28
|
import json
|
27
29
|
from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
|
28
30
|
|
29
31
|
|
30
32
|
class StringFileWrapper:
|
31
33
|
# This is a trick to simplify the code, transform the filedescriptor handling into a string handling
|
32
|
-
def __init__(self, fd: TextIO) -> None:
|
34
|
+
def __init__(self, fd: TextIO, CHUNK_LENGTH: int) -> None:
|
33
35
|
self.fd = fd
|
34
36
|
self.length: int = 0
|
37
|
+
# Buffers are 1MB strings that are read from the file
|
38
|
+
# and kept in memory to keep reads low
|
39
|
+
self.buffers: dict[int, str] = {}
|
40
|
+
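# CHUNK_LENGTH is the size handed to fd.read(); NOTE(review): fd is a text stream, so this counts characters rather than bytes - confirm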
|
41
|
+
if not CHUNK_LENGTH or CHUNK_LENGTH < 2:
|
42
|
+
CHUNK_LENGTH = 1_000_000
|
43
|
+
self.buffer_length = CHUNK_LENGTH
|
44
|
+
|
45
|
+
def get_buffer(self, index: int) -> str:
|
46
|
+
if self.buffers.get(index) is None:
|
47
|
+
self.fd.seek(index * self.buffer_length)
|
48
|
+
self.buffers[index] = self.fd.read(self.buffer_length)
|
49
|
+
# Save memory: cache at most about 2MB worth of chunks (but always allow at least 2 chunks), evicting the oldest one
|
50
|
+
if len(self.buffers) > max(2, 2_000_000 / self.buffer_length):
|
51
|
+
oldest_key = next(iter(self.buffers))
|
52
|
+
if oldest_key != index:
|
53
|
+
self.buffers.pop(oldest_key)
|
54
|
+
return self.buffers[index]
|
35
55
|
|
36
56
|
def __getitem__(self, index: Union[int, slice]) -> str:
|
57
|
+
# The buffers are addressed like rows and columns of RAM:
|
58
|
+
# self.buffers[index]: the row (a chunk of CHUNK_LENGTH, 1MB by default), where index is `i // CHUNK_LENGTH` (integer division)
|
59
|
+
# self.buffers[index][j]: the column within that row, where j is `i % CHUNK_LENGTH` (the remainder)
|
37
60
|
if isinstance(index, slice):
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
61
|
+
buffer_index = index.start // self.buffer_length
|
62
|
+
buffer_end = index.stop // self.buffer_length
|
63
|
+
if buffer_index == buffer_end:
|
64
|
+
return self.get_buffer(buffer_index)[
|
65
|
+
index.start % self.buffer_length : index.stop % self.buffer_length
|
66
|
+
]
|
67
|
+
else:
|
68
|
+
start_slice = self.get_buffer(buffer_index)[
|
69
|
+
index.start % self.buffer_length :
|
70
|
+
]
|
71
|
+
end_slice = self.get_buffer(buffer_end)[
|
72
|
+
: index.stop % self.buffer_length
|
73
|
+
]
|
74
|
+
middle_slices = [
|
75
|
+
self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)
|
76
|
+
]
|
77
|
+
return start_slice + "".join(middle_slices) + end_slice
|
42
78
|
else:
|
43
|
-
self.
|
44
|
-
return self.
|
79
|
+
buffer_index = index // self.buffer_length
|
80
|
+
return self.get_buffer(buffer_index)[index % self.buffer_length]
|
45
81
|
|
46
82
|
def __len__(self) -> int:
|
47
83
|
if self.length < 1:
|
@@ -69,13 +105,14 @@ class JSONParser:
|
|
69
105
|
json_str: Union[str, StringFileWrapper],
|
70
106
|
json_fd: Optional[TextIO],
|
71
107
|
logging: Optional[bool],
|
108
|
+
json_fd_chunk_length: int = 0,
|
72
109
|
) -> None:
|
73
110
|
# The string to parse
|
74
111
|
self.json_str = json_str
|
75
112
|
# Alternatively, the file descriptor with a json file in it
|
76
113
|
if json_fd:
|
77
114
|
# This is a trick we do to treat the file wrapper as an array
|
78
|
-
self.json_str = StringFileWrapper(json_fd)
|
115
|
+
self.json_str = StringFileWrapper(json_fd, json_fd_chunk_length)
|
79
116
|
# Index is our iterator that will keep track of which character we are looking at right now
|
80
117
|
self.index: int = 0
|
81
118
|
# This is used in the object member parsing to manage the special cases of missing quotes in key or value
|
@@ -639,6 +676,7 @@ def repair_json(
|
|
639
676
|
logging: bool = False,
|
640
677
|
json_fd: Optional[TextIO] = None,
|
641
678
|
ensure_ascii: bool = True,
|
679
|
+
chunk_length: int = 0,
|
642
680
|
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
643
681
|
"""
|
644
682
|
Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
|
@@ -647,7 +685,7 @@ def repair_json(
|
|
647
685
|
When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
|
648
686
|
When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
|
649
687
|
"""
|
650
|
-
parser = JSONParser(json_str, json_fd, logging)
|
688
|
+
parser = JSONParser(json_str, json_fd, logging, chunk_length)
|
651
689
|
if skip_json_loads:
|
652
690
|
parsed_json = parser.parse()
|
653
691
|
else:
|
@@ -683,7 +721,10 @@ def loads(
|
|
683
721
|
|
684
722
|
|
685
723
|
def load(
|
686
|
-
fd: TextIO,
|
724
|
+
fd: TextIO,
|
725
|
+
skip_json_loads: bool = False,
|
726
|
+
logging: bool = False,
|
727
|
+
chunk_length: int = 0,
|
687
728
|
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
688
729
|
"""
|
689
730
|
This function works like `json.load()` except that it will fix your JSON in the process.
|
@@ -691,6 +732,7 @@ def load(
|
|
691
732
|
"""
|
692
733
|
return repair_json(
|
693
734
|
json_fd=fd,
|
735
|
+
chunk_length=chunk_length,
|
694
736
|
return_objects=True,
|
695
737
|
skip_json_loads=skip_json_loads,
|
696
738
|
logging=logging,
|
@@ -701,12 +743,62 @@ def from_file(
|
|
701
743
|
filename: str,
|
702
744
|
skip_json_loads: bool = False,
|
703
745
|
logging: bool = False,
|
746
|
+
chunk_length: int = 0,
|
704
747
|
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
705
748
|
"""
|
706
749
|
This function is a wrapper around `load()` so you can pass the filename as string
|
707
750
|
"""
|
708
751
|
fd = open(filename)
|
709
|
-
jsonobj = load(
|
752
|
+
jsonobj = load(
|
753
|
+
fd=fd,
|
754
|
+
skip_json_loads=skip_json_loads,
|
755
|
+
logging=logging,
|
756
|
+
chunk_length=chunk_length,
|
757
|
+
)
|
710
758
|
fd.close()
|
711
759
|
|
712
760
|
return jsonobj
|
761
|
+
|
762
|
+
|
763
|
+
def cli(): # pragma: no cover
|
764
|
+
parser = argparse.ArgumentParser(description="Repair and parse JSON files.")
|
765
|
+
parser.add_argument("filename", help="The JSON file to repair")
|
766
|
+
parser.add_argument(
|
767
|
+
"-i",
|
768
|
+
"--inline",
|
769
|
+
action="store_true",
|
770
|
+
help="Replace the file inline instead of returning the output to stdout",
|
771
|
+
)
|
772
|
+
parser.add_argument(
|
773
|
+
"--ensure_ascii",
|
774
|
+
action="store_true",
|
775
|
+
help="Pass the ensure_ascii parameter to json.dumps()",
|
776
|
+
)
|
777
|
+
parser.add_argument(
|
778
|
+
"--indent",
|
779
|
+
type=int,
|
780
|
+
default=2,
|
781
|
+
help="Number of spaces for indentation (Default 2)",
|
782
|
+
)
|
783
|
+
|
784
|
+
args = parser.parse_args()
|
785
|
+
|
786
|
+
ensure_ascii = False
|
787
|
+
if args.ensure_ascii:
|
788
|
+
ensure_ascii = True
|
789
|
+
try:
|
790
|
+
result = from_file(args.filename)
|
791
|
+
|
792
|
+
if args.inline:
|
793
|
+
fd = open(args.filename, mode="w")
|
794
|
+
json.dump(result, fd, indent=args.indent, ensure_ascii=ensure_ascii)
|
795
|
+
fd.close()
|
796
|
+
else:
|
797
|
+
print(json.dumps(result, indent=args.indent, ensure_ascii=ensure_ascii))
|
798
|
+
except Exception as e:
|
799
|
+
print(f"Error: {str(e)}", file=sys.stderr)
|
800
|
+
sys.exit(1)
|
801
|
+
|
802
|
+
|
803
|
+
if __name__ == "__main__": # pragma: no cover
|
804
|
+
cli()
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: json_repair
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.29.0
|
4
4
|
Summary: A package to repair broken json strings
|
5
5
|
Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
|
6
6
|
License: MIT License
|
@@ -45,6 +45,21 @@ This simple package can be used to fix an invalid json string. To know all cases
|
|
45
45
|
|
46
46
|
Inspired by https://github.com/josdejong/jsonrepair
|
47
47
|
|
48
|
+
---
|
49
|
+
# How to cite
|
50
|
+
If you are using this library in your academic work (as I know many folks are) please find the BibTeX here:
|
51
|
+
|
52
|
+
@software{Baccianella_JSON_Repair_-_2024,
|
53
|
+
author = {Baccianella, Stefano},
|
54
|
+
month = aug,
|
55
|
+
title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
|
56
|
+
url = {https://github.com/mangiucugna/json_repair},
|
57
|
+
version = {0.28.3},
|
58
|
+
year = {2024}
|
59
|
+
}
|
60
|
+
|
61
|
+
Thank you for citing my work and please send me a link to the paper if you can!
|
62
|
+
|
48
63
|
---
|
49
64
|
# Offer me a beer
|
50
65
|
If you find this library useful, you can help me by donating toward my monthly beer budget here: https://github.com/sponsors/mangiucugna
|
@@ -82,6 +97,18 @@ or just
|
|
82
97
|
|
83
98
|
decoded_object = json_repair.repair_json(json_string, return_objects=True)
|
84
99
|
|
100
|
+
### Avoid this antipattern
|
101
|
+
Some users of this library adopt the following pattern:
|
102
|
+
|
103
|
+
obj = {}
|
104
|
+
try:
|
105
|
+
obj = json.loads(string)
|
106
|
+
except json.JSONDecodeError as e:
|
107
|
+
obj = json_repair.loads(string)
|
108
|
+
...
|
109
|
+
|
110
|
+
This is wasteful because `json_repair` will already verify for you if the JSON is valid. If you still want to do that, then add `skip_json_loads=True` to the call as explained in the section below.
|
111
|
+
|
85
112
|
### Read json from a file or file descriptor
|
86
113
|
|
87
114
|
JSON repair also provides a drop-in replacement for `json.load()`:
|
@@ -122,6 +149,32 @@ Some rules of thumb to use:
|
|
122
149
|
- Setting `return_objects=True` will always be faster because the parser returns an object already and it doesn't have to serialize that object to JSON
|
123
150
|
- `skip_json_loads` is faster only if you 100% know that the string is not a valid JSON
|
124
151
|
- If you are having issues with escaping pass the string as **raw** string like: `r"string with escaping\""`
|
152
|
+
|
153
|
+
### Use json_repair from CLI
|
154
|
+
|
155
|
+
Install the library for command-line use with:
|
156
|
+
```
|
157
|
+
pipx install json-repair
|
158
|
+
```
|
159
|
+
then run
|
160
|
+
```
|
161
|
+
$ json_repair -h
|
162
|
+
|
163
|
+
usage: json_repair [-h] [-i] [--ensure_ascii] [--indent INDENT] filename
|
164
|
+
|
165
|
+
Repair and parse JSON files.
|
166
|
+
|
167
|
+
positional arguments:
|
168
|
+
filename The JSON file to repair
|
169
|
+
|
170
|
+
options:
|
171
|
+
-h, --help show this help message and exit
|
172
|
+
-i, --inline Replace the file inline instead of returning the output to stdout
|
173
|
+
--ensure_ascii Pass the ensure_ascii parameter to json.dumps()
|
174
|
+
--indent INDENT Number of spaces for indentation (Default 2)
|
175
|
+
```
|
176
|
+
to learn how to use it
|
177
|
+
|
125
178
|
## Adding to requirements
|
126
179
|
**Please pin this library only on the major version!**
|
127
180
|
|
@@ -0,0 +1,10 @@
|
|
1
|
+
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
+
json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
|
3
|
+
json_repair/json_repair.py,sha256=hltJ3Qa4qFbUD3mVKkYvFWksnCcIZqx8zamKfBpjeNs,33538
|
4
|
+
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
+
json_repair-0.29.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
6
|
+
json_repair-0.29.0.dist-info/METADATA,sha256=yh0EJo-I1u0R6X-Gq9ETz0WbgmuGIhzR7Icw9W4Kee0,9630
|
7
|
+
json_repair-0.29.0.dist-info/WHEEL,sha256=uCRv0ZEik_232NlR4YDw4Pv3Ajt5bKvMH13NUU7hFuI,91
|
8
|
+
json_repair-0.29.0.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
|
9
|
+
json_repair-0.29.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
10
|
+
json_repair-0.29.0.dist-info/RECORD,,
|
@@ -1,8 +0,0 @@
|
|
1
|
-
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
-
json_repair/json_repair.py,sha256=QShXijcgNG3ejW_rBbmk0RMjJE1KlGvYBhXcQnMZcHo,30129
|
3
|
-
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
-
json_repair-0.28.3.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
5
|
-
json_repair-0.28.3.dist-info/METADATA,sha256=ZIm82pnDJX68089RoprQJq-HrL2LF1LVr4xDTh_6VJI,8043
|
6
|
-
json_repair-0.28.3.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
|
7
|
-
json_repair-0.28.3.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
8
|
-
json_repair-0.28.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|