json-repair 0.28.2__py3-none-any.whl → 0.28.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/json_repair.py +58 -13
- {json_repair-0.28.2.dist-info → json_repair-0.28.4.dist-info}/METADATA +29 -1
- json_repair-0.28.4.dist-info/RECORD +8 -0
- {json_repair-0.28.2.dist-info → json_repair-0.28.4.dist-info}/WHEEL +1 -1
- json_repair-0.28.2.dist-info/RECORD +0 -8
- {json_repair-0.28.2.dist-info → json_repair-0.28.4.dist-info}/LICENSE +0 -0
- {json_repair-0.28.2.dist-info → json_repair-0.28.4.dist-info}/top_level.txt +0 -0
json_repair/json_repair.py
CHANGED
@@ -29,19 +29,52 @@ from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
|
|
29
29
|
|
30
30
|
class StringFileWrapper:
|
31
31
|
# This is a trick to simplify the code: it turns file descriptor handling into string handling
|
32
|
-
def __init__(self, fd: TextIO) -> None:
|
32
|
+
def __init__(self, fd: TextIO, CHUNK_LENGTH: int) -> None:
|
33
33
|
self.fd = fd
|
34
34
|
self.length: int = 0
|
35
|
-
|
36
|
-
|
35
|
+
# Buffers are 1MB strings that are read from the file
|
36
|
+
# and kept in memory to keep reads low
|
37
|
+
self.buffers: dict[int, str] = {}
|
38
|
+
# CHUNK_LENGTH is in bytes
|
39
|
+
if not CHUNK_LENGTH or CHUNK_LENGTH < 2:
|
40
|
+
CHUNK_LENGTH = 1_000_000
|
41
|
+
self.buffer_length = CHUNK_LENGTH
|
42
|
+
|
43
|
+
def fill_buffer(self, index: int) -> None:
|
44
|
+
if self.buffers.get(index) is None:
|
45
|
+
self.fd.seek(index * self.buffer_length)
|
46
|
+
self.buffers[index] = self.fd.read(self.buffer_length)
|
47
|
+
# Save memory by keeping at most about 2MB worth of buffered chunks, but never fewer than 2 chunks
|
48
|
+
if len(self.buffers) > max(2, 2_000_000 / self.buffer_length):
|
49
|
+
oldest_key = next(iter(self.buffers))
|
50
|
+
self.buffers.pop(oldest_key)
|
51
|
+
|
52
|
+
def __getitem__(self, index: Union[int, slice]) -> str:
|
53
|
+
# The buffers form a two-dimensional array that is addressed like RAM:
|
54
|
+
# self.buffers[index]: the row in the array (1MB long by default), where index is `i` // CHUNK_LENGTH (integer division)
|
55
|
+
# self.buffers[index][j]: the column within that row, where j is `i` % CHUNK_LENGTH (the remainder)
|
37
56
|
if isinstance(index, slice):
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
57
|
+
buffer_index = index.start // self.buffer_length
|
58
|
+
buffer_end = index.stop // self.buffer_length
|
59
|
+
for i in range(buffer_index, buffer_end + 1):
|
60
|
+
self.fill_buffer(i)
|
61
|
+
if buffer_index == buffer_end:
|
62
|
+
return self.buffers[buffer_index][
|
63
|
+
index.start % self.buffer_length : index.stop % self.buffer_length
|
64
|
+
]
|
65
|
+
else:
|
66
|
+
start_slice = self.buffers[buffer_index][
|
67
|
+
index.start % self.buffer_length :
|
68
|
+
]
|
69
|
+
end_slice = self.buffers[buffer_end][: index.stop % self.buffer_length]
|
70
|
+
middle_slices = [
|
71
|
+
self.buffers[i] for i in range(buffer_index + 1, buffer_end)
|
72
|
+
]
|
73
|
+
return start_slice + "".join(middle_slices) + end_slice
|
42
74
|
else:
|
43
|
-
self.
|
44
|
-
|
75
|
+
buffer_index = index // self.buffer_length
|
76
|
+
self.fill_buffer(buffer_index)
|
77
|
+
return self.buffers[buffer_index][index % self.buffer_length]
|
45
78
|
|
46
79
|
def __len__(self) -> int:
|
47
80
|
if self.length < 1:
|
@@ -69,13 +102,14 @@ class JSONParser:
|
|
69
102
|
json_str: Union[str, StringFileWrapper],
|
70
103
|
json_fd: Optional[TextIO],
|
71
104
|
logging: Optional[bool],
|
105
|
+
json_fd_chunk_length: int = 0,
|
72
106
|
) -> None:
|
73
107
|
# The string to parse
|
74
108
|
self.json_str = json_str
|
75
109
|
# Alternatively, the file descriptor with a json file in it
|
76
110
|
if json_fd:
|
77
111
|
# This is a trick we do to treat the file wrapper as an array
|
78
|
-
self.json_str = StringFileWrapper(json_fd)
|
112
|
+
self.json_str = StringFileWrapper(json_fd, json_fd_chunk_length)
|
79
113
|
# Index is our iterator that will keep track of which character we are looking at right now
|
80
114
|
self.index: int = 0
|
81
115
|
# This is used in the object member parsing to manage the special cases of missing quotes in key or value
|
@@ -639,6 +673,7 @@ def repair_json(
|
|
639
673
|
logging: bool = False,
|
640
674
|
json_fd: Optional[TextIO] = None,
|
641
675
|
ensure_ascii: bool = True,
|
676
|
+
chunk_length: int = 0,
|
642
677
|
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
643
678
|
"""
|
644
679
|
Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
|
@@ -647,7 +682,7 @@ def repair_json(
|
|
647
682
|
When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
|
648
683
|
When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
|
649
684
|
"""
|
650
|
-
parser = JSONParser(json_str, json_fd, logging)
|
685
|
+
parser = JSONParser(json_str, json_fd, logging, chunk_length)
|
651
686
|
if skip_json_loads:
|
652
687
|
parsed_json = parser.parse()
|
653
688
|
else:
|
@@ -683,7 +718,10 @@ def loads(
|
|
683
718
|
|
684
719
|
|
685
720
|
def load(
|
686
|
-
fd: TextIO,
|
721
|
+
fd: TextIO,
|
722
|
+
skip_json_loads: bool = False,
|
723
|
+
logging: bool = False,
|
724
|
+
chunk_length: int = 0,
|
687
725
|
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
688
726
|
"""
|
689
727
|
This function works like `json.load()` except that it will fix your JSON in the process.
|
@@ -691,6 +729,7 @@ def load(
|
|
691
729
|
"""
|
692
730
|
return repair_json(
|
693
731
|
json_fd=fd,
|
732
|
+
chunk_length=chunk_length,
|
694
733
|
return_objects=True,
|
695
734
|
skip_json_loads=skip_json_loads,
|
696
735
|
logging=logging,
|
@@ -701,12 +740,18 @@ def from_file(
|
|
701
740
|
filename: str,
|
702
741
|
skip_json_loads: bool = False,
|
703
742
|
logging: bool = False,
|
743
|
+
chunk_length: int = 0,
|
704
744
|
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
705
745
|
"""
|
706
746
|
This function is a wrapper around `load()` so you can pass the filename as a string
|
707
747
|
"""
|
708
748
|
fd = open(filename)
|
709
|
-
jsonobj = load(
|
749
|
+
jsonobj = load(
|
750
|
+
fd=fd,
|
751
|
+
skip_json_loads=skip_json_loads,
|
752
|
+
logging=logging,
|
753
|
+
chunk_length=chunk_length,
|
754
|
+
)
|
710
755
|
fd.close()
|
711
756
|
|
712
757
|
return jsonobj
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: json_repair
|
3
|
-
Version: 0.28.
|
3
|
+
Version: 0.28.4
|
4
4
|
Summary: A package to repair broken json strings
|
5
5
|
Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
|
6
6
|
License: MIT License
|
@@ -45,6 +45,21 @@ This simple package can be used to fix an invalid json string. To know all cases
|
|
45
45
|
|
46
46
|
Inspired by https://github.com/josdejong/jsonrepair
|
47
47
|
|
48
|
+
---
|
49
|
+
# How to cite
|
50
|
+
If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
|
51
|
+
|
52
|
+
@software{Baccianella_JSON_Repair_-_2024,
|
53
|
+
author = {Baccianella, Stefano},
|
54
|
+
month = aug,
|
55
|
+
title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
|
56
|
+
url = {https://github.com/mangiucugna/json_repair},
|
57
|
+
version = {0.28.3},
|
58
|
+
year = {2024}
|
59
|
+
}
|
60
|
+
|
61
|
+
Thank you for citing my work and please send me a link to the paper if you can!
|
62
|
+
|
48
63
|
---
|
49
64
|
# Offer me a beer
|
50
65
|
If you find this library useful, you can help me by donating toward my monthly beer budget here: https://github.com/sponsors/mangiucugna
|
@@ -82,6 +97,18 @@ or just
|
|
82
97
|
|
83
98
|
decoded_object = json_repair.repair_json(json_string, return_objects=True)
|
84
99
|
|
100
|
+
### Avoid this antipattern
|
101
|
+
Some users of this library adopt the following pattern:
|
102
|
+
|
103
|
+
obj = {}
|
104
|
+
try:
|
105
|
+
obj = json.loads(string)
|
106
|
+
except json.JSONDecodeError as e:
|
107
|
+
obj = json_repair.loads(string)
|
108
|
+
...
|
109
|
+
|
110
|
+
This is wasteful because `json_repair` will already verify for you if the JSON is valid. If you still want to do that, then add `skip_json_loads=True` to the call as explained in the section below.
|
111
|
+
|
85
112
|
### Read json from a file or file descriptor
|
86
113
|
|
87
114
|
JSON repair also provides a drop-in replacement for `json.load()`:
|
@@ -122,6 +149,7 @@ Some rules of thumb to use:
|
|
122
149
|
- Setting `return_objects=True` will always be faster because the parser returns an object already and it doesn't have to serialize that object to JSON
|
123
150
|
- `skip_json_loads` is faster only if you 100% know that the string is not a valid JSON
|
124
151
|
- If you are having issues with escaping pass the string as **raw** string like: `r"string with escaping\""`
|
152
|
+
|
125
153
|
## Adding to requirements
|
126
154
|
**Please pin this library only on the major version!**
|
127
155
|
|
@@ -0,0 +1,8 @@
|
|
1
|
+
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
+
json_repair/json_repair.py,sha256=odtRiFJ-u8mbdw_3Djx4jADxGoBeQvot3536D6Y6K0c,32266
|
3
|
+
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
+
json_repair-0.28.4.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
5
|
+
json_repair-0.28.4.dist-info/METADATA,sha256=2JB2TM0mrFC7OejTtgFrpyr2qQOw8xwX0KmLTSzephk,9019
|
6
|
+
json_repair-0.28.4.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
|
7
|
+
json_repair-0.28.4.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
8
|
+
json_repair-0.28.4.dist-info/RECORD,,
|
@@ -1,8 +0,0 @@
|
|
1
|
-
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
-
json_repair/json_repair.py,sha256=v43na-l2g34pwTZH5FDljI_r5ArIaZfCeHW_LbB8puw,30123
|
3
|
-
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
4
|
-
json_repair-0.28.2.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
5
|
-
json_repair-0.28.2.dist-info/METADATA,sha256=llPJ1A8UePeGKbPSkC6-b2kE2somdgSMiq-wuPANGZ8,8043
|
6
|
-
json_repair-0.28.2.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
|
7
|
-
json_repair-0.28.2.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
8
|
-
json_repair-0.28.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|