json-repair 0.29.2__py3-none-any.whl → 0.29.3__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- json_repair/json_context.py +69 -0
- json_repair/json_parser.py +598 -0
- json_repair/json_repair.py +2 -643
- json_repair/string_file_wrapper.py +98 -0
- {json_repair-0.29.2.dist-info → json_repair-0.29.3.dist-info}/METADATA +41 -16
- json_repair-0.29.3.dist-info/RECORD +13 -0
- {json_repair-0.29.2.dist-info → json_repair-0.29.3.dist-info}/WHEEL +1 -1
- json_repair-0.29.2.dist-info/RECORD +0 -10
- {json_repair-0.29.2.dist-info → json_repair-0.29.3.dist-info}/LICENSE +0 -0
- {json_repair-0.29.2.dist-info → json_repair-0.29.3.dist-info}/entry_points.txt +0 -0
- {json_repair-0.29.2.dist-info → json_repair-0.29.3.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,98 @@
|
|
1
|
+
import os
|
2
|
+
from typing import TextIO, Union
|
3
|
+
|
4
|
+
|
5
|
+
class StringFileWrapper:
    """Expose a seekable text stream through a read-only, string-like interface.

    This is a trick to simplify the calling code: instead of handling a file
    descriptor, callers index and slice the wrapper as if it were a ``str``.
    Content is read lazily in fixed-size chunks that are cached in memory.
    """

    def __init__(self, fd: TextIO, CHUNK_LENGTH: int) -> None:
        """
        Initialize the StringFileWrapper with a file descriptor and chunk length.

        Args:
            fd (TextIO): The file descriptor to wrap. It must support
                ``seek``, ``tell`` and ``read``.
            CHUNK_LENGTH (int): The length of each chunk to read from the file.
                Falsy values and values below 2 fall back to 1_000_000.

        Attributes:
            fd (TextIO): The wrapped file descriptor.
            length (int): Cached total length of the file content (0 until
                ``len()`` is first computed).
            buffers (dict[int, str]): Chunk index -> chunk text cache.
            buffer_length (int): The length of each buffer chunk.
        """
        self.fd = fd
        self.length: int = 0
        # Buffers are chunks read from the file and kept in memory
        # to keep the number of reads low.
        self.buffers: dict[int, str] = {}
        # Reject unusable chunk sizes (None, 0, 1, negatives) and use the
        # 1MB default instead.
        if not CHUNK_LENGTH or CHUNK_LENGTH < 2:
            CHUNK_LENGTH = 1_000_000
        self.buffer_length = CHUNK_LENGTH

    def get_buffer(self, index: int) -> str:
        """
        Retrieve or load a buffer chunk from the file.

        Args:
            index (int): The index of the buffer chunk to retrieve.

        Returns:
            str: The buffer chunk at the specified index. A chunk past the end
            of the stream is whatever ``fd.read`` returns there (an empty
            string for standard text streams).
        """
        if self.buffers.get(index) is None:
            self.fd.seek(index * self.buffer_length)
            self.buffers[index] = self.fd.read(self.buffer_length)
            # Save memory by keeping max 2MB of buffer chunks and min 2 chunks.
            # Integer division: the limit is compared against an integer count.
            if len(self.buffers) > max(2, 2_000_000 // self.buffer_length):
                # dicts preserve insertion order, so the first key is the
                # chunk that was loaded the longest time ago.
                oldest_key = next(iter(self.buffers))
                if oldest_key != index:
                    self.buffers.pop(oldest_key)
        return self.buffers[index]

    def _read_range(self, start: int, stop: int) -> str:
        """Return the characters in ``[start, stop)``; both bounds must be >= 0."""
        if stop <= start:
            return ""
        size = self.buffer_length
        first_chunk = start // size
        # Use stop - 1 so a range ending exactly on a chunk boundary does not
        # load the following chunk just to take nothing from it.
        last_chunk = (stop - 1) // size
        pieces = []
        for chunk_index in range(first_chunk, last_chunk + 1):
            chunk = self.get_buffer(chunk_index)
            lo = start - chunk_index * size if chunk_index == first_chunk else 0
            hi = stop - chunk_index * size if chunk_index == last_chunk else size
            pieces.append(chunk[lo:hi])
            if len(chunk) < size:
                # A short chunk means the end of the stream was reached;
                # later chunks cannot contain anything.
                break
        return "".join(pieces)

    def __getitem__(self, index: Union[int, slice]) -> str:
        """
        Retrieve a character or a slice of characters from the file.

        Follows ``str`` indexing rules: negative positions count from the end,
        omitted slice bounds default to the ends, out-of-range slice bounds are
        clamped and an out-of-range integer index raises ``IndexError``.

        Args:
            index (Union[int, slice]): The index or slice of characters to retrieve.

        Returns:
            str: The character(s) at the specified index or slice.

        Raises:
            IndexError: If an integer index is outside the content.
        """
        # The buffers are addressed like rows of RAM:
        # self.buffers[row]: the chunk, where row is `i` // buffer_length
        # self.buffers[row][col]: the character, where col is `i` % buffer_length
        if isinstance(index, slice):
            # NOTE: bounds are resolved against len(self), which comes from
            # fd.tell(); for on-disk text files that may be a byte offset
            # rather than a character count, so negative bounds are only exact
            # when the two coincide.
            start, stop, step = index.indices(len(self))
            if step == 1:
                return self._read_range(start, stop)
            if step > 1:
                return self._read_range(start, stop)[::step]
            # Negative step: read the covered span forwards, then walk it
            # backwards with the requested stride.
            return self._read_range(stop + 1, start + 1)[::-1][::-step]
        if index < 0:
            index += len(self)
            if index < 0:
                raise IndexError("string index out of range")
        buffer_index = index // self.buffer_length
        return self.get_buffer(buffer_index)[index % self.buffer_length]

    def __len__(self) -> int:
        """
        Get the total length of the file.

        The value is the stream position reported by ``fd.tell()`` after
        seeking to the end; it is cached once it is at least 1, and the
        original stream position is restored afterwards.

        Returns:
            int: The total length of the file as reported by the stream.
        """
        if self.length < 1:
            current_position = self.fd.tell()
            self.fd.seek(0, os.SEEK_END)
            self.length = self.fd.tell()
            self.fd.seek(current_position)
        return self.length
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: json_repair
|
3
|
-
Version: 0.29.2
|
3
|
+
Version: 0.29.3
|
4
4
|
Summary: A package to repair broken json strings
|
5
5
|
Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
|
6
6
|
License: MIT License
|
@@ -45,21 +45,6 @@ This simple package can be used to fix an invalid json string. To know all cases
|
|
45
45
|
|
46
46
|
Inspired by https://github.com/josdejong/jsonrepair
|
47
47
|
|
48
|
-
---
|
49
|
-
# How to cite
|
50
|
-
If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
|
51
|
-
|
52
|
-
@software{Baccianella_JSON_Repair_-_2024,
|
53
|
-
author = {Baccianella, Stefano},
|
54
|
-
month = aug,
|
55
|
-
title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
|
56
|
-
url = {https://github.com/mangiucugna/json_repair},
|
57
|
-
version = {0.28.3},
|
58
|
-
year = {2024}
|
59
|
-
}
|
60
|
-
|
61
|
-
Thank you for citing my work and please send me a link to the paper if you can!
|
62
|
-
|
63
48
|
---
|
64
49
|
# Offer me a beer
|
65
50
|
If you find this library useful, you can help me by donating toward my monthly beer budget here: https://github.com/sponsors/mangiucugna
|
@@ -79,7 +64,30 @@ I searched for a lightweight python package that was able to reliably fix this p
|
|
79
64
|
|
80
65
|
*So I wrote one*
|
81
66
|
|
67
|
+
# Supported use cases
|
68
|
+
|
69
|
+
### Fixing Syntax Errors in JSON
|
70
|
+
|
71
|
+
- Missing quotes, misplaced commas, unescaped characters, and incomplete key-value pairs.
|
72
|
+
- Missing quotation marks, improperly formatted values (true, false, null), and corrupted key-value structures.
|
73
|
+
|
74
|
+
### Repairing Malformed JSON Arrays and Objects
|
75
|
+
|
76
|
+
- Incomplete or broken arrays/objects by adding necessary elements (e.g., commas, brackets) or default values (null, "").
|
77
|
+
- The library can process JSON that includes extra non-JSON characters like comments or improperly placed characters, cleaning them up while maintaining valid structure.
|
78
|
+
|
79
|
+
### Auto-Completion for Missing JSON Values
|
80
|
+
|
81
|
+
- Automatically completes missing values in JSON fields with reasonable defaults (like empty strings or null), ensuring validity.
|
82
|
+
|
82
83
|
# How to use
|
84
|
+
|
85
|
+
Install the library with pip
|
86
|
+
|
87
|
+
pip install json-repair
|
88
|
+
|
89
|
+
then you can use it in your code like this
|
90
|
+
|
83
91
|
from json_repair import repair_json
|
84
92
|
|
85
93
|
good_json_string = repair_json(bad_json_string)
|
@@ -185,6 +193,23 @@ To ensure that you only pin the major version of this library in your `requireme
|
|
185
193
|
|
186
194
|
In this example, any version that starts with `0.` will be acceptable, allowing for updates on minor and patch versions.
|
187
195
|
|
196
|
+
---
|
197
|
+
# How to cite
|
198
|
+
If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
|
199
|
+
|
200
|
+
@software{Baccianella_JSON_Repair_-_2024,
|
201
|
+
author = {Baccianella, Stefano},
|
202
|
+
month = aug,
|
203
|
+
title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
|
204
|
+
url = {https://github.com/mangiucugna/json_repair},
|
205
|
+
version = {0.28.3},
|
206
|
+
year = {2024}
|
207
|
+
}
|
208
|
+
|
209
|
+
Thank you for citing my work and please send me a link to the paper if you can!
|
210
|
+
|
211
|
+
---
|
212
|
+
|
188
213
|
# How it works
|
189
214
|
This module will parse the JSON file following the BNF definition:
|
190
215
|
|
@@ -0,0 +1,13 @@
|
|
1
|
+
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
+
json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
|
3
|
+
json_repair/json_context.py,sha256=MOzT0z4Pc03SWhggwwEpDNXyeHm04kLfvDBOBd3xkVU,1782
|
4
|
+
json_repair/json_parser.py,sha256=Gimn0LFUTpdGCFo9rOGjH3W39PEj00_Lrj4mPOSnBFU,25949
|
5
|
+
json_repair/json_repair.py,sha256=GTg3OAXRbAJAHWs8oiQDqUHh4h6qKDVvWPXcrqafzLY,6100
|
6
|
+
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
+
json_repair/string_file_wrapper.py,sha256=EHLhNBWoyUitzT08thytYJiNZh_klEFwfT8zutPSdb4,3905
|
8
|
+
json_repair-0.29.3.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
9
|
+
json_repair-0.29.3.dist-info/METADATA,sha256=tQ_crOtYbu3fseCXoc-VDIHIJq0HtGRA9SvmCIldvUE,10686
|
10
|
+
json_repair-0.29.3.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
|
11
|
+
json_repair-0.29.3.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
|
12
|
+
json_repair-0.29.3.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
13
|
+
json_repair-0.29.3.dist-info/RECORD,,
|
@@ -1,10 +0,0 @@
|
|
1
|
-
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
-
json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
|
3
|
-
json_repair/json_repair.py,sha256=anGQI5RxauBnZUO9QKoPU7JgN_sUaIddyiR4ecpMmm8,34060
|
4
|
-
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
5
|
-
json_repair-0.29.2.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
6
|
-
json_repair-0.29.2.dist-info/METADATA,sha256=Jtwl047L79Xj0CmA363Xc2EemzttgMWqYW0abi4a7fA,9787
|
7
|
-
json_repair-0.29.2.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
|
8
|
-
json_repair-0.29.2.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
|
9
|
-
json_repair-0.29.2.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
10
|
-
json_repair-0.29.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|