json-repair 0.28.2__py3-none-any.whl → 0.28.4__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- json_repair/json_repair.py +58 -13
- {json_repair-0.28.2.dist-info → json_repair-0.28.4.dist-info}/METADATA +29 -1
- json_repair-0.28.4.dist-info/RECORD +8 -0
- {json_repair-0.28.2.dist-info → json_repair-0.28.4.dist-info}/WHEEL +1 -1
- json_repair-0.28.2.dist-info/RECORD +0 -8
- {json_repair-0.28.2.dist-info → json_repair-0.28.4.dist-info}/LICENSE +0 -0
- {json_repair-0.28.2.dist-info → json_repair-0.28.4.dist-info}/top_level.txt +0 -0
json_repair/json_repair.py
CHANGED
@@ -29,19 +29,52 @@ from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
|
|
29
29
|
|
30
30
|
class StringFileWrapper:
|
31
31
|
# This is a trick to simplify the code: it turns file descriptor handling into string handling
|
32
|
-
def __init__(self, fd: TextIO) -> None:
|
32
|
+
def __init__(self, fd: TextIO, CHUNK_LENGTH: int) -> None:
|
33
33
|
self.fd = fd
|
34
34
|
self.length: int = 0
|
35
|
-
|
36
|
-
|
35
|
+
# Buffers are fixed-size strings (1MB by default) that are read from the file
|
36
|
+
# and kept in memory to keep reads low
|
37
|
+
self.buffers: dict[int, str] = {}
|
38
|
+
# CHUNK_LENGTH is the size passed to fd.read(); for a text-mode file that counts characters, not bytes
|
39
|
+
if not CHUNK_LENGTH or CHUNK_LENGTH < 2:
|
40
|
+
CHUNK_LENGTH = 1_000_000
|
41
|
+
self.buffer_length = CHUNK_LENGTH
|
42
|
+
|
43
|
+
def fill_buffer(self, index: int) -> None:
|
44
|
+
if self.buffers.get(index) is None:
|
45
|
+
self.fd.seek(index * self.buffer_length)
|
46
|
+
self.buffers[index] = self.fd.read(self.buffer_length)
|
47
|
+
# Save memory by capping the cache at about 2MB of chunks, while always allowing at least 2 chunks
|
48
|
+
if len(self.buffers) > max(2, 2_000_000 / self.buffer_length):
|
49
|
+
oldest_key = next(iter(self.buffers))
|
50
|
+
self.buffers.pop(oldest_key)
|
51
|
+
|
52
|
+
def __getitem__(self, index: Union[int, slice]) -> str:
|
53
|
+
# The buffers are addressed like RAM, as a two-dimensional array:
|
54
|
+
# self.buffers[index]: the row of the array, CHUNK_LENGTH long (1MB by default); index is `i` // CHUNK_LENGTH (integer division)
|
55
|
+
# self.buffers[index][j]: the column within the row; j is `i` % CHUNK_LENGTH (the remainder)
|
37
56
|
if isinstance(index, slice):
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
57
|
+
buffer_index = index.start // self.buffer_length
|
58
|
+
buffer_end = index.stop // self.buffer_length
|
59
|
+
for i in range(buffer_index, buffer_end + 1):
|
60
|
+
self.fill_buffer(i)
|
61
|
+
if buffer_index == buffer_end:
|
62
|
+
return self.buffers[buffer_index][
|
63
|
+
index.start % self.buffer_length : index.stop % self.buffer_length
|
64
|
+
]
|
65
|
+
else:
|
66
|
+
start_slice = self.buffers[buffer_index][
|
67
|
+
index.start % self.buffer_length :
|
68
|
+
]
|
69
|
+
end_slice = self.buffers[buffer_end][: index.stop % self.buffer_length]
|
70
|
+
middle_slices = [
|
71
|
+
self.buffers[i] for i in range(buffer_index + 1, buffer_end)
|
72
|
+
]
|
73
|
+
return start_slice + "".join(middle_slices) + end_slice
|
42
74
|
else:
|
43
|
-
self.
|
44
|
-
|
75
|
+
buffer_index = index // self.buffer_length
|
76
|
+
self.fill_buffer(buffer_index)
|
77
|
+
return self.buffers[buffer_index][index % self.buffer_length]
|
45
78
|
|
46
79
|
def __len__(self) -> int:
|
47
80
|
if self.length < 1:
|
@@ -69,13 +102,14 @@ class JSONParser:
|
|
69
102
|
json_str: Union[str, StringFileWrapper],
|
70
103
|
json_fd: Optional[TextIO],
|
71
104
|
logging: Optional[bool],
|
105
|
+
json_fd_chunk_length: int = 0,
|
72
106
|
) -> None:
|
73
107
|
# The string to parse
|
74
108
|
self.json_str = json_str
|
75
109
|
# Alternatively, the file descriptor with a json file in it
|
76
110
|
if json_fd:
|
77
111
|
# This is a trick we do to treat the file wrapper as an array
|
78
|
-
self.json_str = StringFileWrapper(json_fd)
|
112
|
+
self.json_str = StringFileWrapper(json_fd, json_fd_chunk_length)
|
79
113
|
# Index is our iterator that will keep track of which character we are looking at right now
|
80
114
|
self.index: int = 0
|
81
115
|
# This is used in the object member parsing to manage the special cases of missing quotes in key or value
|
@@ -639,6 +673,7 @@ def repair_json(
|
|
639
673
|
logging: bool = False,
|
640
674
|
json_fd: Optional[TextIO] = None,
|
641
675
|
ensure_ascii: bool = True,
|
676
|
+
chunk_length: int = 0,
|
642
677
|
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
643
678
|
"""
|
644
679
|
Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
|
@@ -647,7 +682,7 @@ def repair_json(
|
|
647
682
|
When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
|
648
683
|
When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
|
649
684
|
"""
|
650
|
-
parser = JSONParser(json_str, json_fd, logging)
|
685
|
+
parser = JSONParser(json_str, json_fd, logging, chunk_length)
|
651
686
|
if skip_json_loads:
|
652
687
|
parsed_json = parser.parse()
|
653
688
|
else:
|
@@ -683,7 +718,10 @@ def loads(
|
|
683
718
|
|
684
719
|
|
685
720
|
def load(
|
686
|
-
fd: TextIO,
|
721
|
+
fd: TextIO,
|
722
|
+
skip_json_loads: bool = False,
|
723
|
+
logging: bool = False,
|
724
|
+
chunk_length: int = 0,
|
687
725
|
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
688
726
|
"""
|
689
727
|
This function works like `json.load()` except that it will fix your JSON in the process.
|
@@ -691,6 +729,7 @@ def load(
|
|
691
729
|
"""
|
692
730
|
return repair_json(
|
693
731
|
json_fd=fd,
|
732
|
+
chunk_length=chunk_length,
|
694
733
|
return_objects=True,
|
695
734
|
skip_json_loads=skip_json_loads,
|
696
735
|
logging=logging,
|
@@ -701,12 +740,18 @@ def from_file(
|
|
701
740
|
filename: str,
|
702
741
|
skip_json_loads: bool = False,
|
703
742
|
logging: bool = False,
|
743
|
+
chunk_length: int = 0,
|
704
744
|
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
705
745
|
"""
|
706
746
|
This function is a wrapper around `load()` so you can pass the filename as a string
|
707
747
|
"""
|
708
748
|
fd = open(filename)
|
709
|
-
jsonobj = load(
|
749
|
+
jsonobj = load(
|
750
|
+
fd=fd,
|
751
|
+
skip_json_loads=skip_json_loads,
|
752
|
+
logging=logging,
|
753
|
+
chunk_length=chunk_length,
|
754
|
+
)
|
710
755
|
fd.close()
|
711
756
|
|
712
757
|
return jsonobj
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: json_repair
|
3
|
-
Version: 0.28.2
|
3
|
+
Version: 0.28.4
|
4
4
|
Summary: A package to repair broken json strings
|
5
5
|
Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
|
6
6
|
License: MIT License
|
@@ -45,6 +45,21 @@ This simple package can be used to fix an invalid json string. To know all cases
|
|
45
45
|
|
46
46
|
Inspired by https://github.com/josdejong/jsonrepair
|
47
47
|
|
48
|
+
---
|
49
|
+
# How to cite
|
50
|
+
If you are using this library in your academic work (as I know many folks are), please find the BibTeX here:
|
51
|
+
|
52
|
+
@software{Baccianella_JSON_Repair_-_2024,
|
53
|
+
author = {Baccianella, Stefano},
|
54
|
+
month = aug,
|
55
|
+
title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
|
56
|
+
url = {https://github.com/mangiucugna/json_repair},
|
57
|
+
version = {0.28.3},
|
58
|
+
year = {2024}
|
59
|
+
}
|
60
|
+
|
61
|
+
Thank you for citing my work and please send me a link to the paper if you can!
|
62
|
+
|
48
63
|
---
|
49
64
|
# Offer me a beer
|
50
65
|
If you find this library useful, you can help me by donating toward my monthly beer budget here: https://github.com/sponsors/mangiucugna
|
@@ -82,6 +97,18 @@ or just
|
|
82
97
|
|
83
98
|
decoded_object = json_repair.repair_json(json_string, return_objects=True)
|
84
99
|
|
100
|
+
### Avoid this antipattern
|
101
|
+
Some users of this library adopt the following pattern:
|
102
|
+
|
103
|
+
obj = {}
|
104
|
+
try:
|
105
|
+
obj = json.loads(string)
|
106
|
+
except json.JSONDecodeError as e:
|
107
|
+
obj = json_repair.loads(string)
|
108
|
+
...
|
109
|
+
|
110
|
+
This is wasteful because `json_repair` already verifies for you whether the JSON is valid. If you still want to do that, add `skip_json_loads=True` to the call, as explained in the section below.
|
111
|
+
|
85
112
|
### Read json from a file or file descriptor
|
86
113
|
|
87
114
|
JSON repair also provides a drop-in replacement for `json.load()`:
|
@@ -122,6 +149,7 @@ Some rules of thumb to use:
|
|
122
149
|
- Setting `return_objects=True` will always be faster because the parser already returns an object and it doesn't have to serialize that object to JSON
|
123
150
|
- `skip_json_loads` is faster only if you 100% know that the string is not a valid JSON
|
124
151
|
- If you are having issues with escaping, pass the string as a **raw** string, like: `r"string with escaping\""`
|
152
|
+
|
125
153
|
## Adding to requirements
|
126
154
|
**Please pin this library only on the major version!**
|
127
155
|
|
@@ -0,0 +1,8 @@
|
|
1
|
+
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
+
json_repair/json_repair.py,sha256=odtRiFJ-u8mbdw_3Djx4jADxGoBeQvot3536D6Y6K0c,32266
|
3
|
+
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
+
json_repair-0.28.4.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
5
|
+
json_repair-0.28.4.dist-info/METADATA,sha256=2JB2TM0mrFC7OejTtgFrpyr2qQOw8xwX0KmLTSzephk,9019
|
6
|
+
json_repair-0.28.4.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
|
7
|
+
json_repair-0.28.4.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
8
|
+
json_repair-0.28.4.dist-info/RECORD,,
|
@@ -1,8 +0,0 @@
|
|
1
|
-
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
-
json_repair/json_repair.py,sha256=v43na-l2g34pwTZH5FDljI_r5ArIaZfCeHW_LbB8puw,30123
|
3
|
-
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
-
json_repair-0.28.2.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
5
|
-
json_repair-0.28.2.dist-info/METADATA,sha256=llPJ1A8UePeGKbPSkC6-b2kE2somdgSMiq-wuPANGZ8,8043
|
6
|
-
json_repair-0.28.2.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
|
7
|
-
json_repair-0.28.2.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
8
|
-
json_repair-0.28.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|