json-repair 0.53.1__py3-none-any.whl → 0.54__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/json_parser.py +10 -0
- json_repair/json_repair.py +21 -3
- json_repair/parse_object.py +33 -10
- json_repair/parse_string.py +11 -2
- json_repair/utils/string_file_wrapper.py +95 -27
- {json_repair-0.53.1.dist-info → json_repair-0.54.dist-info}/METADATA +19 -1
- {json_repair-0.53.1.dist-info → json_repair-0.54.dist-info}/RECORD +11 -11
- {json_repair-0.53.1.dist-info → json_repair-0.54.dist-info}/WHEEL +0 -0
- {json_repair-0.53.1.dist-info → json_repair-0.54.dist-info}/entry_points.txt +0 -0
- {json_repair-0.53.1.dist-info → json_repair-0.54.dist-info}/licenses/LICENSE +0 -0
- {json_repair-0.53.1.dist-info → json_repair-0.54.dist-info}/top_level.txt +0 -0
json_repair/json_parser.py
CHANGED
|
@@ -35,6 +35,7 @@ class JSONParser:
|
|
|
35
35
|
logging: bool | None,
|
|
36
36
|
json_fd_chunk_length: int = 0,
|
|
37
37
|
stream_stable: bool = False,
|
|
38
|
+
strict: bool = False,
|
|
38
39
|
) -> None:
|
|
39
40
|
# The string to parse
|
|
40
41
|
self.json_str: str | StringFileWrapper = json_str
|
|
@@ -66,6 +67,10 @@ class JSONParser:
|
|
|
66
67
|
# case 3: '{"key": "val\\n123,`key2:value2' => '{"key": "val\\n123,`key2:value2"}'
|
|
67
68
|
# case 4: '{"key": "val\\n123,`key2:value2`"}' => '{"key": "val\\n123,`key2:value2`"}'
|
|
68
69
|
self.stream_stable = stream_stable
|
|
70
|
+
# Over time the library got more and more complex heuristics to repair JSON. Some of these heuristics
|
|
71
|
+
# may not be desirable in some use cases, where the user would prefer json_repair to raise an exception.
|
|
72
|
+
# So strict mode was added to disable some of those heuristics.
|
|
73
|
+
self.strict = strict
|
|
69
74
|
|
|
70
75
|
def parse(
|
|
71
76
|
self,
|
|
@@ -93,6 +98,11 @@ class JSONParser:
|
|
|
93
98
|
"There were no more elements, returning the element without the array",
|
|
94
99
|
)
|
|
95
100
|
json = json[0]
|
|
101
|
+
elif self.strict:
|
|
102
|
+
self.log(
|
|
103
|
+
"Multiple top-level JSON elements found in strict mode, raising an error",
|
|
104
|
+
)
|
|
105
|
+
raise ValueError("Multiple top-level JSON elements found in strict mode.")
|
|
96
106
|
if self.logging:
|
|
97
107
|
return json, self.logger
|
|
98
108
|
else:
|
json_repair/json_repair.py
CHANGED
|
@@ -40,6 +40,7 @@ def repair_json(
|
|
|
40
40
|
json_fd: TextIO | None = None,
|
|
41
41
|
chunk_length: int = 0,
|
|
42
42
|
stream_stable: bool = False,
|
|
43
|
+
strict: bool = False,
|
|
43
44
|
**json_dumps_args: Any,
|
|
44
45
|
) -> str: ...
|
|
45
46
|
|
|
@@ -53,6 +54,7 @@ def repair_json(
|
|
|
53
54
|
json_fd: TextIO | None = None,
|
|
54
55
|
chunk_length: int = 0,
|
|
55
56
|
stream_stable: bool = False,
|
|
57
|
+
strict: bool = False,
|
|
56
58
|
**json_dumps_args: Any,
|
|
57
59
|
) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]: ...
|
|
58
60
|
|
|
@@ -65,6 +67,7 @@ def repair_json(
|
|
|
65
67
|
json_fd: TextIO | None = None,
|
|
66
68
|
chunk_length: int = 0,
|
|
67
69
|
stream_stable: bool = False,
|
|
70
|
+
strict: bool = False,
|
|
68
71
|
**json_dumps_args: Any,
|
|
69
72
|
) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
|
|
70
73
|
"""
|
|
@@ -79,10 +82,11 @@ def repair_json(
|
|
|
79
82
|
ensure_ascii (bool, optional): Set to False to avoid converting non-latin characters to ascii (for example when using chinese characters). Defaults to True. Ignored if `skip_json_loads` is True.
|
|
80
83
|
chunk_length (int, optional): Size in characters of the file chunks to read at once. Ignored if `json_fd` is None. Do not use! Use `from_file` or `load` instead. Defaults to 1,000,000 characters.
|
|
81
84
|
stream_stable (bool, optional): Use when the json to be repaired is the accumulation of streaming json at a certain moment. If this parameter is set to True, the repair results will be kept stable.
|
|
85
|
+
strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
|
|
82
86
|
Returns:
|
|
83
87
|
Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON or a tuple with the repaired JSON and repair log when logging is True.
|
|
84
88
|
"""
|
|
85
|
-
parser = JSONParser(json_str, json_fd, logging, chunk_length, stream_stable)
|
|
89
|
+
parser = JSONParser(json_str, json_fd, logging, chunk_length, stream_stable, strict)
|
|
86
90
|
if skip_json_loads:
|
|
87
91
|
parsed_json = parser.parse()
|
|
88
92
|
else:
|
|
@@ -109,6 +113,7 @@ def loads(
|
|
|
109
113
|
skip_json_loads: bool = False,
|
|
110
114
|
logging: bool = False,
|
|
111
115
|
stream_stable: bool = False,
|
|
116
|
+
strict: bool = False,
|
|
112
117
|
) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | str:
|
|
113
118
|
"""
|
|
114
119
|
This function works like `json.loads()` except that it will fix your JSON in the process.
|
|
@@ -118,6 +123,7 @@ def loads(
|
|
|
118
123
|
json_str (str): The JSON string to load and repair.
|
|
119
124
|
skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
|
|
120
125
|
logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
|
|
126
|
+
strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
|
|
121
127
|
|
|
122
128
|
Returns:
|
|
123
129
|
Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]], str]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
|
|
@@ -128,6 +134,7 @@ def loads(
|
|
|
128
134
|
skip_json_loads=skip_json_loads,
|
|
129
135
|
logging=logging,
|
|
130
136
|
stream_stable=stream_stable,
|
|
137
|
+
strict=strict,
|
|
131
138
|
)
|
|
132
139
|
|
|
133
140
|
|
|
@@ -136,6 +143,7 @@ def load(
|
|
|
136
143
|
skip_json_loads: bool = False,
|
|
137
144
|
logging: bool = False,
|
|
138
145
|
chunk_length: int = 0,
|
|
146
|
+
strict: bool = False,
|
|
139
147
|
) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
|
|
140
148
|
"""
|
|
141
149
|
This function works like `json.load()` except that it will fix your JSON in the process.
|
|
@@ -146,6 +154,7 @@ def load(
|
|
|
146
154
|
skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
|
|
147
155
|
logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
|
|
148
156
|
chunk_length (int, optional): Size in characters of the file chunks to read at once. Defaults to 1,000,000 characters.
|
|
157
|
+
strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
|
|
149
158
|
|
|
150
159
|
Returns:
|
|
151
160
|
Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
|
|
@@ -156,6 +165,7 @@ def load(
|
|
|
156
165
|
return_objects=True,
|
|
157
166
|
skip_json_loads=skip_json_loads,
|
|
158
167
|
logging=logging,
|
|
168
|
+
strict=strict,
|
|
159
169
|
)
|
|
160
170
|
|
|
161
171
|
|
|
@@ -164,6 +174,7 @@ def from_file(
|
|
|
164
174
|
skip_json_loads: bool = False,
|
|
165
175
|
logging: bool = False,
|
|
166
176
|
chunk_length: int = 0,
|
|
177
|
+
strict: bool = False,
|
|
167
178
|
) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
|
|
168
179
|
"""
|
|
169
180
|
This function is a wrapper around `load()` so you can pass the filename as string
|
|
@@ -173,6 +184,7 @@ def from_file(
|
|
|
173
184
|
skip_json_loads (bool, optional): If True, skip calling the built-in json.loads() function to verify that the json is valid before attempting to repair. Defaults to False.
|
|
174
185
|
logging (bool, optional): If True, return a tuple with the repaired json and a log of all repair actions. Defaults to False.
|
|
175
186
|
chunk_length (int, optional): Size in characters of the file chunks to read at once. Defaults to 1,000,000 characters.
|
|
187
|
+
strict (bool, optional): If True, surface structural problems (duplicate keys, missing separators, empty keys/values, etc.) as ValueError instead of repairing them.
|
|
176
188
|
|
|
177
189
|
Returns:
|
|
178
190
|
Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]: The repaired JSON object or a tuple with the repaired JSON object and repair log.
|
|
@@ -183,6 +195,7 @@ def from_file(
|
|
|
183
195
|
skip_json_loads=skip_json_loads,
|
|
184
196
|
logging=logging,
|
|
185
197
|
chunk_length=chunk_length,
|
|
198
|
+
strict=strict,
|
|
186
199
|
)
|
|
187
200
|
|
|
188
201
|
return jsonobj
|
|
@@ -240,6 +253,11 @@ def cli(inline_args: list[str] | None = None) -> int:
|
|
|
240
253
|
default=2,
|
|
241
254
|
help="Number of spaces for indentation (Default 2)",
|
|
242
255
|
)
|
|
256
|
+
parser.add_argument(
|
|
257
|
+
"--strict",
|
|
258
|
+
action="store_true",
|
|
259
|
+
help="Raise on duplicate keys, missing separators, empty keys/values, and other unrecoverable structures instead of repairing them",
|
|
260
|
+
)
|
|
243
261
|
|
|
244
262
|
args = parser.parse_args() if inline_args is None else parser.parse_args(inline_args)
|
|
245
263
|
|
|
@@ -259,10 +277,10 @@ def cli(inline_args: list[str] | None = None) -> int:
|
|
|
259
277
|
try:
|
|
260
278
|
# Use from_file if a filename is provided; otherwise read from stdin.
|
|
261
279
|
if args.filename:
|
|
262
|
-
result = from_file(args.filename)
|
|
280
|
+
result = from_file(args.filename, strict=args.strict)
|
|
263
281
|
else:
|
|
264
282
|
data = sys.stdin.read()
|
|
265
|
-
result = loads(data)
|
|
283
|
+
result = loads(data, strict=args.strict)
|
|
266
284
|
if args.inline or args.output:
|
|
267
285
|
with open(args.output or args.filename, mode="w") as fd:
|
|
268
286
|
json.dump(result, fd, indent=args.indent, ensure_ascii=ensure_ascii)
|
json_repair/parse_object.py
CHANGED
|
@@ -62,9 +62,17 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
|
|
|
62
62
|
if key == "":
|
|
63
63
|
self.skip_whitespaces()
|
|
64
64
|
if key != "" or (key == "" and self.get_char_at() in [":", "}"]):
|
|
65
|
-
#
|
|
65
|
+
# Empty keys now raise a ValueError in strict mode; otherwise we keep repairing as before
|
|
66
|
+
if key == "" and self.strict:
|
|
67
|
+
self.log(
|
|
68
|
+
"Empty key found in strict mode while parsing object, raising an error",
|
|
69
|
+
)
|
|
70
|
+
raise ValueError("Empty key found in strict mode while parsing object.")
|
|
66
71
|
break
|
|
67
72
|
if ContextValues.ARRAY in self.context.context and key in obj:
|
|
73
|
+
if self.strict:
|
|
74
|
+
self.log("Duplicate key found in strict mode while parsing object, raising an error")
|
|
75
|
+
raise ValueError("Duplicate key found in strict mode while parsing object.")
|
|
68
76
|
self.log(
|
|
69
77
|
"While parsing an object we found a duplicate key, closing the object here and rolling back the index",
|
|
70
78
|
)
|
|
@@ -84,6 +92,11 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
|
|
|
84
92
|
|
|
85
93
|
# An extreme case of missing ":" after a key
|
|
86
94
|
if self.get_char_at() != ":":
|
|
95
|
+
if self.strict:
|
|
96
|
+
self.log(
|
|
97
|
+
"Missing ':' after key in strict mode while parsing object, raising an error",
|
|
98
|
+
)
|
|
99
|
+
raise ValueError("Missing ':' after key in strict mode while parsing object.")
|
|
87
100
|
self.log(
|
|
88
101
|
"While parsing an object we missed a : after a key",
|
|
89
102
|
)
|
|
@@ -91,17 +104,21 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
|
|
|
91
104
|
self.index += 1
|
|
92
105
|
self.context.reset()
|
|
93
106
|
self.context.set(ContextValues.OBJECT_VALUE)
|
|
94
|
-
# The value can be any valid json
|
|
107
|
+
# The value can be any valid json; strict mode will refuse repaired empties
|
|
95
108
|
self.skip_whitespaces()
|
|
96
109
|
# Corner case, a lone comma
|
|
97
110
|
value: JSONReturnType = ""
|
|
98
111
|
if self.get_char_at() in [",", "}"]:
|
|
99
112
|
self.log(
|
|
100
|
-
"While parsing an object value we found a stray , ignoring it",
|
|
113
|
+
"While parsing an object value we found a stray " + str(self.get_char_at()) + ", ignoring it",
|
|
101
114
|
)
|
|
102
115
|
else:
|
|
103
116
|
value = self.parse_json()
|
|
104
|
-
|
|
117
|
+
if value == "" and self.strict and self.get_char_at(-1) not in STRING_DELIMITERS:
|
|
118
|
+
self.log(
|
|
119
|
+
"Parsed value is empty in strict mode while parsing object, raising an error",
|
|
120
|
+
)
|
|
121
|
+
raise ValueError("Parsed value is empty in strict mode while parsing object.")
|
|
105
122
|
# Reset context since our job is done
|
|
106
123
|
self.context.reset()
|
|
107
124
|
obj[key] = value
|
|
@@ -116,6 +133,11 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
|
|
|
116
133
|
|
|
117
134
|
# If the object is empty but also isn't just {}
|
|
118
135
|
if not obj and self.index - start_index > 2:
|
|
136
|
+
if self.strict:
|
|
137
|
+
self.log(
|
|
138
|
+
"Parsed object is empty but contains extra characters in strict mode, raising an error",
|
|
139
|
+
)
|
|
140
|
+
raise ValueError("Parsed object is empty but contains extra characters in strict mode.")
|
|
119
141
|
self.log("Parsed object is empty, we will try to parse this as an array instead")
|
|
120
142
|
self.index = start_index
|
|
121
143
|
return self.parse_array()
|
|
@@ -133,11 +155,12 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
|
|
|
133
155
|
self.skip_whitespaces()
|
|
134
156
|
if self.get_char_at() not in STRING_DELIMITERS:
|
|
135
157
|
return obj
|
|
136
|
-
self.
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
158
|
+
if not self.strict:
|
|
159
|
+
self.log(
|
|
160
|
+
"Found a comma and string delimiter after object closing brace, checking for additional key-value pairs",
|
|
161
|
+
)
|
|
162
|
+
additional_obj = self.parse_object()
|
|
163
|
+
if isinstance(additional_obj, dict):
|
|
164
|
+
obj.update(additional_obj)
|
|
142
165
|
|
|
143
166
|
return obj
|
json_repair/parse_string.py
CHANGED
|
@@ -81,7 +81,10 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
81
81
|
self.log(
|
|
82
82
|
"While parsing a string, we found a doubled quote and then a quote again, ignoring it",
|
|
83
83
|
)
|
|
84
|
-
|
|
84
|
+
if self.strict:
|
|
85
|
+
raise ValueError("Found doubled quotes followed by another quote.")
|
|
86
|
+
else:
|
|
87
|
+
return ""
|
|
85
88
|
# Find the next delimiter
|
|
86
89
|
i = self.skip_to_character(character=rstring_delimiter, idx=1)
|
|
87
90
|
next_c = self.get_char_at(i)
|
|
@@ -102,6 +105,10 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
102
105
|
self.log(
|
|
103
106
|
"While parsing a string, we found a doubled quote but also another quote afterwards, ignoring it",
|
|
104
107
|
)
|
|
108
|
+
if self.strict:
|
|
109
|
+
raise ValueError(
|
|
110
|
+
"Found doubled quotes followed by another quote while parsing a string.",
|
|
111
|
+
)
|
|
105
112
|
self.index += 1
|
|
106
113
|
return ""
|
|
107
114
|
elif next_c not in [",", "]", "}"]:
|
|
@@ -292,7 +299,9 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
|
|
|
292
299
|
if self.get_char_at(i) in [",", "}"]:
|
|
293
300
|
# Ok then this is a missing right quote
|
|
294
301
|
self.log(
|
|
295
|
-
"While parsing a string missing the right delimiter in object key context, we found a
|
|
302
|
+
"While parsing a string missing the right delimiter in object key context, we found a "
|
|
303
|
+
+ str(self.get_char_at(i))
|
|
304
|
+
+ " stopping here",
|
|
296
305
|
)
|
|
297
306
|
break
|
|
298
307
|
else:
|
|
@@ -19,14 +19,16 @@ class StringFileWrapper:
|
|
|
19
19
|
buffer_length (int): The length of each buffer chunk.
|
|
20
20
|
"""
|
|
21
21
|
self.fd = fd
|
|
22
|
-
|
|
23
|
-
# Buffers are 1MB strings that are read from the file
|
|
24
|
-
# and kept in memory to keep reads low
|
|
22
|
+
# Buffers are chunks of text read from the file and cached to reduce disk access.
|
|
25
23
|
self.buffers: dict[int, str] = {}
|
|
26
|
-
# chunk_length is in bytes
|
|
27
24
|
if not chunk_length or chunk_length < 2:
|
|
28
25
|
chunk_length = 1_000_000
|
|
26
|
+
# chunk_length now refers to the number of characters per chunk.
|
|
29
27
|
self.buffer_length = chunk_length
|
|
28
|
+
# Keep track of the starting file position ("cookie") for each chunk so we can
|
|
29
|
+
# seek safely without landing in the middle of a multibyte code point.
|
|
30
|
+
self._chunk_positions: list[int] = [0]
|
|
31
|
+
self.length: int | None = None
|
|
30
32
|
|
|
31
33
|
def get_buffer(self, index: int) -> str:
|
|
32
34
|
"""
|
|
@@ -38,15 +40,33 @@ class StringFileWrapper:
|
|
|
38
40
|
Returns:
|
|
39
41
|
str: The buffer chunk at the specified index.
|
|
40
42
|
"""
|
|
41
|
-
if
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
43
|
+
if index < 0:
|
|
44
|
+
raise IndexError("Negative indexing is not supported")
|
|
45
|
+
|
|
46
|
+
cached = self.buffers.get(index)
|
|
47
|
+
if cached is not None:
|
|
48
|
+
return cached
|
|
49
|
+
|
|
50
|
+
self._ensure_chunk_position(index)
|
|
51
|
+
start_pos = self._chunk_positions[index]
|
|
52
|
+
self.fd.seek(start_pos)
|
|
53
|
+
chunk = self.fd.read(self.buffer_length)
|
|
54
|
+
if not chunk:
|
|
55
|
+
raise IndexError("Chunk index out of range")
|
|
56
|
+
end_pos = self.fd.tell()
|
|
57
|
+
if len(self._chunk_positions) <= index + 1:
|
|
58
|
+
self._chunk_positions.append(end_pos)
|
|
59
|
+
if len(chunk) < self.buffer_length:
|
|
60
|
+
self.length = index * self.buffer_length + len(chunk)
|
|
61
|
+
|
|
62
|
+
self.buffers[index] = chunk
|
|
63
|
+
# Save memory by caching at most about 2,000,000 characters' worth of chunks, but never fewer than 2 chunks
|
|
64
|
+
max_buffers = max(2, int(2_000_000 / self.buffer_length))
|
|
65
|
+
if len(self.buffers) > max_buffers:
|
|
66
|
+
oldest_key = next(iter(self.buffers))
|
|
67
|
+
if oldest_key != index:
|
|
68
|
+
self.buffers.pop(oldest_key)
|
|
69
|
+
return chunk
|
|
50
70
|
|
|
51
71
|
def __getitem__(self, index: int | slice) -> str:
|
|
52
72
|
"""
|
|
@@ -62,18 +82,49 @@ class StringFileWrapper:
|
|
|
62
82
|
# self.buffers[index]: the row in the array (a chunk of CHUNK_LENGTH characters), index is `i` floor-divided by CHUNK_LENGTH
|
|
63
83
|
# self.buffers[index][j]: the column of the row, j is `i` modulo CHUNK_LENGTH
|
|
64
84
|
if isinstance(index, slice):
|
|
65
|
-
|
|
66
|
-
|
|
85
|
+
total_len = len(self)
|
|
86
|
+
start = 0 if index.start is None else index.start
|
|
87
|
+
stop = total_len if index.stop is None else index.stop
|
|
88
|
+
step = 1 if index.step is None else index.step
|
|
89
|
+
|
|
90
|
+
if start < 0:
|
|
91
|
+
start += total_len
|
|
92
|
+
if stop < 0:
|
|
93
|
+
stop += total_len
|
|
94
|
+
|
|
95
|
+
start = max(start, 0)
|
|
96
|
+
stop = min(stop, total_len)
|
|
97
|
+
|
|
98
|
+
if step == 0:
|
|
99
|
+
raise ValueError("slice step cannot be zero")
|
|
100
|
+
if step != 1:
|
|
101
|
+
return "".join(self[i] for i in range(start, stop, step))
|
|
102
|
+
|
|
103
|
+
if start >= stop:
|
|
104
|
+
return ""
|
|
105
|
+
|
|
106
|
+
buffer_index = start // self.buffer_length
|
|
107
|
+
buffer_end = (stop - 1) // self.buffer_length
|
|
108
|
+
start_mod = start % self.buffer_length
|
|
109
|
+
stop_mod = stop % self.buffer_length
|
|
110
|
+
if stop_mod == 0 and stop > start:
|
|
111
|
+
stop_mod = self.buffer_length
|
|
67
112
|
if buffer_index == buffer_end:
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
113
|
+
buffer = self.get_buffer(buffer_index)
|
|
114
|
+
return buffer[start_mod:stop_mod]
|
|
115
|
+
|
|
116
|
+
start_slice = self.get_buffer(buffer_index)[start_mod:]
|
|
117
|
+
end_slice = self.get_buffer(buffer_end)[:stop_mod]
|
|
118
|
+
middle_slices = [self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)]
|
|
119
|
+
return start_slice + "".join(middle_slices) + end_slice
|
|
74
120
|
else:
|
|
121
|
+
if index < 0:
|
|
122
|
+
index += len(self)
|
|
123
|
+
if index < 0:
|
|
124
|
+
raise IndexError("string index out of range")
|
|
75
125
|
buffer_index = index // self.buffer_length
|
|
76
|
-
|
|
126
|
+
buffer = self.get_buffer(buffer_index)
|
|
127
|
+
return buffer[index % self.buffer_length]
|
|
77
128
|
|
|
78
129
|
def __len__(self) -> int:
|
|
79
130
|
"""
|
|
@@ -82,11 +133,10 @@ class StringFileWrapper:
|
|
|
82
133
|
Returns:
|
|
83
134
|
int: The total number of characters in the file.
|
|
84
135
|
"""
|
|
85
|
-
if self.length
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
self.fd.seek(current_position)
|
|
136
|
+
if self.length is None:
|
|
137
|
+
while self.length is None:
|
|
138
|
+
chunk_index = len(self._chunk_positions)
|
|
139
|
+
self._ensure_chunk_position(chunk_index)
|
|
90
140
|
return self.length
|
|
91
141
|
|
|
92
142
|
def __setitem__(self, index: int | slice, value: str) -> None: # pragma: no cover
|
|
@@ -106,3 +156,21 @@ class StringFileWrapper:
|
|
|
106
156
|
self.fd.seek(start)
|
|
107
157
|
self.fd.write(value)
|
|
108
158
|
self.fd.seek(current_position)
|
|
159
|
+
|
|
160
|
+
def _ensure_chunk_position(self, chunk_index: int) -> None:
|
|
161
|
+
"""
|
|
162
|
+
Ensure that we know the starting file position for the given chunk index.
|
|
163
|
+
"""
|
|
164
|
+
while len(self._chunk_positions) <= chunk_index:
|
|
165
|
+
prev_index = len(self._chunk_positions) - 1
|
|
166
|
+
start_pos = self._chunk_positions[-1]
|
|
167
|
+
self.fd.seek(start_pos, os.SEEK_SET)
|
|
168
|
+
chunk = self.fd.read(self.buffer_length)
|
|
169
|
+
end_pos = self.fd.tell()
|
|
170
|
+
if len(chunk) < self.buffer_length:
|
|
171
|
+
self.length = prev_index * self.buffer_length + len(chunk)
|
|
172
|
+
self._chunk_positions.append(end_pos)
|
|
173
|
+
if not chunk:
|
|
174
|
+
break
|
|
175
|
+
if len(self._chunk_positions) <= chunk_index:
|
|
176
|
+
raise IndexError("Chunk index out of range")
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: json_repair
|
|
3
|
-
Version: 0.53.1
|
|
3
|
+
Version: 0.54
|
|
4
4
|
Summary: A package to repair broken json strings
|
|
5
5
|
Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -167,6 +167,23 @@ Some rules of thumb to use:
|
|
|
167
167
|
- `skip_json_loads` is faster only if you 100% know that the string is not a valid JSON
|
|
168
168
|
- If you are having issues with escaping pass the string as **raw** string like: `r"string with escaping\""`
|
|
169
169
|
|
|
170
|
+
### Strict mode
|
|
171
|
+
|
|
172
|
+
By default `json_repair` does its best to “fix” input, even when the JSON is far from valid.
|
|
173
|
+
In some scenarios you want the opposite behavior and need the parser to error out instead of repairing; pass `strict=True` to `repair_json`, `loads`, `load`, or `from_file` to enable that mode:
|
|
174
|
+
|
|
175
|
+
```
|
|
176
|
+
from json_repair import repair_json
|
|
177
|
+
|
|
178
|
+
repair_json(bad_json_string, strict=True)
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
The CLI exposes the same behavior with `json_repair --strict input.json` (or by piping data via stdin).
|
|
182
|
+
|
|
183
|
+
In strict mode the parser raises `ValueError` as soon as it encounters structural issues such as duplicate keys, missing `:` separators, empty keys/values introduced by stray commas, multiple top-level elements, or other ambiguous constructs. This is useful when you just need validation with friendlier error messages while still benefiting from json_repair’s resilience elsewhere in your stack.
|
|
184
|
+
|
|
185
|
+
Strict mode still honors `skip_json_loads=True`; combining them lets you skip the initial `json.loads` check but still enforce strict parsing rules.
|
|
186
|
+
|
|
170
187
|
### Use json_repair with streaming
|
|
171
188
|
|
|
172
189
|
Sometimes you are streaming some data and want to repair the JSON coming from it. Normally this won't work but you can pass `stream_stable` to `repair_json()` or `loads()` to make it work:
|
|
@@ -198,6 +215,7 @@ options:
|
|
|
198
215
|
If specified, the output will be written to TARGET filename instead of stdout
|
|
199
216
|
--ensure_ascii Pass ensure_ascii=True to json.dumps()
|
|
200
217
|
--indent INDENT Number of spaces for indentation (Default 2)
|
|
218
|
+
--strict Raise on duplicate keys, missing separators, empty keys/values, and similar structural issues instead of repairing them
|
|
201
219
|
```
|
|
202
220
|
|
|
203
221
|
## Adding to requirements
|
|
@@ -1,22 +1,22 @@
|
|
|
1
1
|
json_repair/__init__.py,sha256=JQ4Nm8YzR8Id2a527Ql0Az-rKapTp8DCMPKybLtQ620,180
|
|
2
2
|
json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
|
|
3
|
-
json_repair/json_parser.py,sha256=
|
|
4
|
-
json_repair/json_repair.py,sha256=
|
|
3
|
+
json_repair/json_parser.py,sha256=nATFDlcEnPD8G2NDSKj2nme_v1la_cCcFZrdQvEjTZs,8495
|
|
4
|
+
json_repair/json_repair.py,sha256=iT-OJgpBnKUJVIV4IUlXmMUkOyW6bNnKCZLB7Fys8hk,12758
|
|
5
5
|
json_repair/parse_array.py,sha256=rZfnRiS86vBATOUHqSx2T5fE79Ndlk2NoTsg9Wek7l4,2239
|
|
6
6
|
json_repair/parse_comment.py,sha256=MUDxrx8BFfAaKvx6x4gWviJNvwRi2yv5qnrR6honmas,2660
|
|
7
7
|
json_repair/parse_number.py,sha256=Ddv3Dih1VYfdasUe5DxQWAqy7YAE3aZJ7iePCfdi1EQ,1292
|
|
8
|
-
json_repair/parse_object.py,sha256=
|
|
9
|
-
json_repair/parse_string.py,sha256=
|
|
8
|
+
json_repair/parse_object.py,sha256=noaiP10kzl-jA-1jc6tMmtFoJMIputpB3zFxcAuYQvY,6986
|
|
9
|
+
json_repair/parse_string.py,sha256=L4McLWzRkbW_7Xx_hSGOmfpoPMwbYTGEKBAjqwanLEs,26146
|
|
10
10
|
json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
11
|
json_repair/parse_string_helpers/parse_boolean_or_null.py,sha256=pGmH1QATBls70kTvUlJv4F8NiPaBWcyGhRL03sTOnto,871
|
|
12
12
|
json_repair/parse_string_helpers/parse_json_llm_block.py,sha256=wPSm-8RY30Ek8HxzjCkCRtdLq4-Cez-PJB3vOk_vP3w,670
|
|
13
13
|
json_repair/utils/constants.py,sha256=cv2gvyosuq0me0600WyTysM9avrtfXPuXYR26tawcuo,158
|
|
14
14
|
json_repair/utils/json_context.py,sha256=WsMOjqpGSr6aaDONcrk8UFtTurzWon2Qq9AoBBYseoI,934
|
|
15
15
|
json_repair/utils/object_comparer.py,sha256=XKV3MRab8H7_v4sm-wpEa5le0XX9OeycWo5S-MFm-GI,1716
|
|
16
|
-
json_repair/utils/string_file_wrapper.py,sha256=
|
|
17
|
-
json_repair-0.
|
|
18
|
-
json_repair-0.
|
|
19
|
-
json_repair-0.
|
|
20
|
-
json_repair-0.
|
|
21
|
-
json_repair-0.
|
|
22
|
-
json_repair-0.
|
|
16
|
+
json_repair/utils/string_file_wrapper.py,sha256=Zlm0ZfJAw_VPlIy-QldL_OKYrPk3TYGq1JVAFPv7SnQ,6862
|
|
17
|
+
json_repair-0.54.dist-info/licenses/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
|
18
|
+
json_repair-0.54.dist-info/METADATA,sha256=xoD5G1EZ7muIRVbzdjsgD10OQbxS-K06sNGqlNDvvdQ,12220
|
|
19
|
+
json_repair-0.54.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
20
|
+
json_repair-0.54.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
|
|
21
|
+
json_repair-0.54.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
|
22
|
+
json_repair-0.54.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|