json-repair 0.29.2__py3-none-any.whl → 0.29.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/json_context.py +45 -0
- json_repair/json_parser.py +584 -0
- json_repair/json_repair.py +2 -643
- json_repair/string_file_wrapper.py +98 -0
- {json_repair-0.29.2.dist-info → json_repair-0.29.4.dist-info}/METADATA +41 -16
- json_repair-0.29.4.dist-info/RECORD +13 -0
- {json_repair-0.29.2.dist-info → json_repair-0.29.4.dist-info}/WHEEL +1 -1
- json_repair-0.29.2.dist-info/RECORD +0 -10
- {json_repair-0.29.2.dist-info → json_repair-0.29.4.dist-info}/LICENSE +0 -0
- {json_repair-0.29.2.dist-info → json_repair-0.29.4.dist-info}/entry_points.txt +0 -0
- {json_repair-0.29.2.dist-info → json_repair-0.29.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,98 @@
|
|
1
|
+
import os
|
2
|
+
from typing import TextIO, Union
|
3
|
+
|
4
|
+
|
5
|
+
class StringFileWrapper:
|
6
|
+
# This is a trick to simplify the code, transform the filedescriptor handling into a string handling
|
7
|
+
def __init__(self, fd: TextIO, CHUNK_LENGTH: int) -> None:
|
8
|
+
"""
|
9
|
+
Initialize the StringFileWrapper with a file descriptor and chunk length.
|
10
|
+
|
11
|
+
Args:
|
12
|
+
fd (TextIO): The file descriptor to wrap.
|
13
|
+
CHUNK_LENGTH (int): The length of each chunk to read from the file.
|
14
|
+
|
15
|
+
Attributes:
|
16
|
+
fd (TextIO): The wrapped file descriptor.
|
17
|
+
length (int): The total length of the file content.
|
18
|
+
buffers (dict[int, str]): Dictionary to store chunks of file content.
|
19
|
+
buffer_length (int): The length of each buffer chunk.
|
20
|
+
"""
|
21
|
+
self.fd = fd
|
22
|
+
self.length: int = 0
|
23
|
+
# Buffers are 1MB strings that are read from the file
|
24
|
+
# and kept in memory to keep reads low
|
25
|
+
self.buffers: dict[int, str] = {}
|
26
|
+
# CHUNK_LENGTH is in bytes
|
27
|
+
if not CHUNK_LENGTH or CHUNK_LENGTH < 2:
|
28
|
+
CHUNK_LENGTH = 1_000_000
|
29
|
+
self.buffer_length = CHUNK_LENGTH
|
30
|
+
|
31
|
+
def get_buffer(self, index: int) -> str:
|
32
|
+
"""
|
33
|
+
Retrieve or load a buffer chunk from the file.
|
34
|
+
|
35
|
+
Args:
|
36
|
+
index (int): The index of the buffer chunk to retrieve.
|
37
|
+
|
38
|
+
Returns:
|
39
|
+
str: The buffer chunk at the specified index.
|
40
|
+
"""
|
41
|
+
if self.buffers.get(index) is None:
|
42
|
+
self.fd.seek(index * self.buffer_length)
|
43
|
+
self.buffers[index] = self.fd.read(self.buffer_length)
|
44
|
+
# Save memory by keeping max 2MB buffer chunks and min 2 chunks
|
45
|
+
if len(self.buffers) > max(2, 2_000_000 / self.buffer_length):
|
46
|
+
oldest_key = next(iter(self.buffers))
|
47
|
+
if oldest_key != index:
|
48
|
+
self.buffers.pop(oldest_key)
|
49
|
+
return self.buffers[index]
|
50
|
+
|
51
|
+
def __getitem__(self, index: Union[int, slice]) -> str:
|
52
|
+
"""
|
53
|
+
Retrieve a character or a slice of characters from the file.
|
54
|
+
|
55
|
+
Args:
|
56
|
+
index (Union[int, slice]): The index or slice of characters to retrieve.
|
57
|
+
|
58
|
+
Returns:
|
59
|
+
str: The character(s) at the specified index or slice.
|
60
|
+
"""
|
61
|
+
# The buffers behave like an array that is addressed like RAM:
|
62
|
+
# self.buffers[index]: the row in the array of length 1MB, index is `i` // CHUNK_LENGTH (integer division)
|
63
|
+
# self.buffers[index][j]: the column of the row, j is `i` % CHUNK_LENGTH (the remainder)
|
64
|
+
if isinstance(index, slice):
|
65
|
+
buffer_index = index.start // self.buffer_length
|
66
|
+
buffer_end = index.stop // self.buffer_length
|
67
|
+
if buffer_index == buffer_end:
|
68
|
+
return self.get_buffer(buffer_index)[
|
69
|
+
index.start % self.buffer_length : index.stop % self.buffer_length
|
70
|
+
]
|
71
|
+
else:
|
72
|
+
start_slice = self.get_buffer(buffer_index)[
|
73
|
+
index.start % self.buffer_length :
|
74
|
+
]
|
75
|
+
end_slice = self.get_buffer(buffer_end)[
|
76
|
+
: index.stop % self.buffer_length
|
77
|
+
]
|
78
|
+
middle_slices = [
|
79
|
+
self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)
|
80
|
+
]
|
81
|
+
return start_slice + "".join(middle_slices) + end_slice
|
82
|
+
else:
|
83
|
+
buffer_index = index // self.buffer_length
|
84
|
+
return self.get_buffer(buffer_index)[index % self.buffer_length]
|
85
|
+
|
86
|
+
def __len__(self) -> int:
|
87
|
+
"""
|
88
|
+
Get the total length of the file.
|
89
|
+
|
90
|
+
Returns:
|
91
|
+
int: The total number of characters in the file.
|
92
|
+
"""
|
93
|
+
if self.length < 1:
|
94
|
+
current_position = self.fd.tell()
|
95
|
+
self.fd.seek(0, os.SEEK_END)
|
96
|
+
self.length = self.fd.tell()
|
97
|
+
self.fd.seek(current_position)
|
98
|
+
return self.length
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: json_repair
|
3
|
-
Version: 0.29.2
|
3
|
+
Version: 0.29.4
|
4
4
|
Summary: A package to repair broken json strings
|
5
5
|
Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
|
6
6
|
License: MIT License
|
@@ -45,21 +45,6 @@ This simple package can be used to fix an invalid json string. To know all cases
|
|
45
45
|
|
46
46
|
Inspired by https://github.com/josdejong/jsonrepair
|
47
47
|
|
48
|
-
---
|
49
|
-
# How to cite
|
50
|
-
If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
|
51
|
-
|
52
|
-
@software{Baccianella_JSON_Repair_-_2024,
|
53
|
-
author = {Baccianella, Stefano},
|
54
|
-
month = aug,
|
55
|
-
title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
|
56
|
-
url = {https://github.com/mangiucugna/json_repair},
|
57
|
-
version = {0.28.3},
|
58
|
-
year = {2024}
|
59
|
-
}
|
60
|
-
|
61
|
-
Thank you for citing my work and please send me a link to the paper if you can!
|
62
|
-
|
63
48
|
---
|
64
49
|
# Offer me a beer
|
65
50
|
If you find this library useful, you can help me by donating toward my monthly beer budget here: https://github.com/sponsors/mangiucugna
|
@@ -79,7 +64,30 @@ I searched for a lightweight python package that was able to reliably fix this p
|
|
79
64
|
|
80
65
|
*So I wrote one*
|
81
66
|
|
67
|
+
# Supported use cases
|
68
|
+
|
69
|
+
### Fixing Syntax Errors in JSON
|
70
|
+
|
71
|
+
- Missing quotes, misplaced commas, unescaped characters, and incomplete key-value pairs.
|
72
|
+
- Missing quotation marks, improperly formatted values (true, false, null), and corrupted key-value structures.
|
73
|
+
|
74
|
+
### Repairing Malformed JSON Arrays and Objects
|
75
|
+
|
76
|
+
- Repairs incomplete or broken arrays/objects by adding the necessary elements (e.g., commas, brackets) or default values (null, "").
|
77
|
+
- The library can process JSON that includes extra non-JSON characters like comments or improperly placed characters, cleaning them up while maintaining valid structure.
|
78
|
+
|
79
|
+
### Auto-Completion for Missing JSON Values
|
80
|
+
|
81
|
+
- Automatically completes missing values in JSON fields with reasonable defaults (like empty strings or null), ensuring validity.
|
82
|
+
|
82
83
|
# How to use
|
84
|
+
|
85
|
+
Install the library with pip
|
86
|
+
|
87
|
+
pip install json-repair
|
88
|
+
|
89
|
+
then you can use it in your code like this
|
90
|
+
|
83
91
|
from json_repair import repair_json
|
84
92
|
|
85
93
|
good_json_string = repair_json(bad_json_string)
|
@@ -185,6 +193,23 @@ To ensure that you only pin the major version of this library in your `requireme
|
|
185
193
|
|
186
194
|
In this example, any version that starts with `0.` will be acceptable, allowing for updates on minor and patch versions.
|
187
195
|
|
196
|
+
---
|
197
|
+
# How to cite
|
198
|
+
If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
|
199
|
+
|
200
|
+
@software{Baccianella_JSON_Repair_-_2024,
|
201
|
+
author = {Baccianella, Stefano},
|
202
|
+
month = aug,
|
203
|
+
title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
|
204
|
+
url = {https://github.com/mangiucugna/json_repair},
|
205
|
+
version = {0.28.3},
|
206
|
+
year = {2024}
|
207
|
+
}
|
208
|
+
|
209
|
+
Thank you for citing my work and please send me a link to the paper if you can!
|
210
|
+
|
211
|
+
---
|
212
|
+
|
188
213
|
# How it works
|
189
214
|
This module will parse the JSON file following the BNF definition:
|
190
215
|
|
@@ -0,0 +1,13 @@
|
|
1
|
+
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
+
json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
|
3
|
+
json_repair/json_context.py,sha256=DdJu3DJR-ANvr8KrWfJqdtOE3uI6_B0VQidKvE3PjJA,1080
|
4
|
+
json_repair/json_parser.py,sha256=BUPyAsb7wzkjNrBmsZgxgoOM9JhksCN-8cHcbJQpcPU,25525
|
5
|
+
json_repair/json_repair.py,sha256=GTg3OAXRbAJAHWs8oiQDqUHh4h6qKDVvWPXcrqafzLY,6100
|
6
|
+
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
+
json_repair/string_file_wrapper.py,sha256=EHLhNBWoyUitzT08thytYJiNZh_klEFwfT8zutPSdb4,3905
|
8
|
+
json_repair-0.29.4.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
9
|
+
json_repair-0.29.4.dist-info/METADATA,sha256=dBmPfg4wBTxOFXklH4V38aiO4pUks5FS7HcvQlZ4NIg,10686
|
10
|
+
json_repair-0.29.4.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
11
|
+
json_repair-0.29.4.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
|
12
|
+
json_repair-0.29.4.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
13
|
+
json_repair-0.29.4.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
-
json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
|
3
|
-
json_repair/json_repair.py,sha256=anGQI5RxauBnZUO9QKoPU7JgN_sUaIddyiR4ecpMmm8,34060
|
4
|
-
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
json_repair-0.29.2.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
6
|
-
json_repair-0.29.2.dist-info/METADATA,sha256=Jtwl047L79Xj0CmA363Xc2EemzttgMWqYW0abi4a7fA,9787
|
7
|
-
json_repair-0.29.2.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
|
8
|
-
json_repair-0.29.2.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
|
9
|
-
json_repair-0.29.2.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
10
|
-
json_repair-0.29.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|