json-repair 0.33.0__tar.gz → 0.35.0__tar.gz
Sign up to get free protection for your applications and to get access to all the features.
- {json_repair-0.33.0/src/json_repair.egg-info → json_repair-0.35.0}/PKG-INFO +1 -1
- {json_repair-0.33.0 → json_repair-0.35.0}/pyproject.toml +1 -1
- {json_repair-0.33.0 → json_repair-0.35.0}/src/json_repair/json_parser.py +97 -26
- {json_repair-0.33.0 → json_repair-0.35.0}/src/json_repair/string_file_wrapper.py +21 -0
- {json_repair-0.33.0 → json_repair-0.35.0/src/json_repair.egg-info}/PKG-INFO +1 -1
- {json_repair-0.33.0 → json_repair-0.35.0}/tests/test_json_repair.py +4 -2
- {json_repair-0.33.0 → json_repair-0.35.0}/tests/test_performance.py +1 -1
- {json_repair-0.33.0 → json_repair-0.35.0}/LICENSE +0 -0
- {json_repair-0.33.0 → json_repair-0.35.0}/README.md +0 -0
- {json_repair-0.33.0 → json_repair-0.35.0}/setup.cfg +0 -0
- {json_repair-0.33.0 → json_repair-0.35.0}/src/json_repair/__init__.py +0 -0
- {json_repair-0.33.0 → json_repair-0.35.0}/src/json_repair/__main__.py +0 -0
- {json_repair-0.33.0 → json_repair-0.35.0}/src/json_repair/json_context.py +0 -0
- {json_repair-0.33.0 → json_repair-0.35.0}/src/json_repair/json_repair.py +0 -0
- {json_repair-0.33.0 → json_repair-0.35.0}/src/json_repair/py.typed +0 -0
- {json_repair-0.33.0 → json_repair-0.35.0}/src/json_repair.egg-info/SOURCES.txt +0 -0
- {json_repair-0.33.0 → json_repair-0.35.0}/src/json_repair.egg-info/dependency_links.txt +0 -0
- {json_repair-0.33.0 → json_repair-0.35.0}/src/json_repair.egg-info/entry_points.txt +0 -0
- {json_repair-0.33.0 → json_repair-0.35.0}/src/json_repair.egg-info/top_level.txt +0 -0
- {json_repair-0.33.0 → json_repair-0.35.0}/tests/test_coverage.py +0 -0
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
|
|
3
3
|
build-backend = "setuptools.build_meta"
|
4
4
|
[project]
|
5
5
|
name = "json_repair"
|
6
|
-
version = "0.33.0"
|
6
|
+
version = "0.35.0"
|
7
7
|
license = {file = "LICENSE"}
|
8
8
|
authors = [
|
9
9
|
{ name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },
|
@@ -7,6 +7,9 @@ JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
|
|
7
7
|
|
8
8
|
|
9
9
|
class JSONParser:
|
10
|
+
# Constants
|
11
|
+
STRING_DELIMITERS = ['"', "'", "“", "”"]
|
12
|
+
|
10
13
|
def __init__(
|
11
14
|
self,
|
12
15
|
json_str: Union[str, StringFileWrapper],
|
@@ -89,7 +92,9 @@ class JSONParser:
|
|
89
92
|
)
|
90
93
|
return ""
|
91
94
|
# <string> starts with a quote
|
92
|
-
elif not self.context.empty and (
|
95
|
+
elif not self.context.empty and (
|
96
|
+
char in self.STRING_DELIMITERS or char.isalpha()
|
97
|
+
):
|
93
98
|
return self.parse_string()
|
94
99
|
# <number> starts with [0-9] or minus
|
95
100
|
elif not self.context.empty and (
|
@@ -130,6 +135,8 @@ class JSONParser:
|
|
130
135
|
# <member> starts with a <string>
|
131
136
|
key = ""
|
132
137
|
while self.get_char_at():
|
138
|
+
# The rollback index needs to be updated here in case the key is empty
|
139
|
+
rollback_index = self.index
|
133
140
|
key = str(self.parse_string())
|
134
141
|
|
135
142
|
if key != "" or (key == "" and self.get_char_at() == ":"):
|
@@ -140,6 +147,12 @@ class JSONParser:
|
|
140
147
|
"While parsing an object we found a duplicate key, closing the object here and rolling back the index",
|
141
148
|
)
|
142
149
|
self.index = rollback_index - 1
|
150
|
+
# add an opening curly brace to make this work
|
151
|
+
self.json_str = (
|
152
|
+
self.json_str[: self.index + 1]
|
153
|
+
+ "{"
|
154
|
+
+ self.json_str[self.index + 1 :]
|
155
|
+
)
|
143
156
|
break
|
144
157
|
|
145
158
|
# Skip filler whitespaces
|
@@ -227,7 +240,7 @@ class JSONParser:
|
|
227
240
|
|
228
241
|
char = self.get_char_at()
|
229
242
|
# A valid string can only start with a valid quote or, in our case, with a literal
|
230
|
-
while char and char not in
|
243
|
+
while char and char not in self.STRING_DELIMITERS and not char.isalnum():
|
231
244
|
self.index += 1
|
232
245
|
char = self.get_char_at()
|
233
246
|
|
@@ -262,35 +275,61 @@ class JSONParser:
|
|
262
275
|
if not missing_quotes:
|
263
276
|
self.index += 1
|
264
277
|
|
278
|
+
self.skip_whitespaces_at()
|
265
279
|
# There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
|
266
|
-
if self.get_char_at()
|
267
|
-
# If
|
268
|
-
if (
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
|
284
|
-
self.index += 1
|
285
|
-
else:
|
286
|
-
# Ok this is not a doubled quote, check if this is an empty string or not
|
287
|
-
i = self.skip_whitespaces_at(idx=1, move_main_index=False)
|
280
|
+
if self.get_char_at() in self.STRING_DELIMITERS:
|
281
|
+
# If the next character is the same type of quote, then we manage it as double quotes
|
282
|
+
if self.get_char_at() == lstring_delimiter:
|
283
|
+
# If it's an empty key, this was easy
|
284
|
+
if (
|
285
|
+
self.context.current == ContextValues.OBJECT_KEY
|
286
|
+
and self.get_char_at(1) == ":"
|
287
|
+
):
|
288
|
+
self.index += 1
|
289
|
+
return ""
|
290
|
+
if self.get_char_at(1) == lstring_delimiter:
|
291
|
+
# There's something fishy about this, we found doubled quotes and then again quotes
|
292
|
+
self.log(
|
293
|
+
"While parsing a string, we found a doubled quote and then a quote again, ignoring it",
|
294
|
+
)
|
295
|
+
return ""
|
296
|
+
# Find the next delimiter
|
297
|
+
i = self.skip_to_character(character=rstring_delimiter, idx=1)
|
288
298
|
next_c = self.get_char_at(i)
|
289
|
-
|
299
|
+
# Now check that the next character is also a delimiter to ensure that we have "".....""
|
300
|
+
# In that case we ignore this rstring delimiter
|
301
|
+
if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
|
290
302
|
self.log(
|
291
|
-
"While parsing a string, we found a
|
303
|
+
"While parsing a string, we found a valid starting doubled quote",
|
292
304
|
)
|
305
|
+
doubled_quotes = True
|
293
306
|
self.index += 1
|
307
|
+
else:
|
308
|
+
# Ok this is not a doubled quote, check if this is an empty string or not
|
309
|
+
i = self.skip_whitespaces_at(idx=1, move_main_index=False)
|
310
|
+
next_c = self.get_char_at(i)
|
311
|
+
if next_c in self.STRING_DELIMITERS + ["{", "["]:
|
312
|
+
# something fishy is going on here
|
313
|
+
self.log(
|
314
|
+
"While parsing a string, we found a doubled quote but also another quote afterwards, ignoring it",
|
315
|
+
)
|
316
|
+
self.index += 1
|
317
|
+
return ""
|
318
|
+
elif next_c not in [",", "]", "}"]:
|
319
|
+
self.log(
|
320
|
+
"While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
|
321
|
+
)
|
322
|
+
self.index += 1
|
323
|
+
else:
|
324
|
+
# Otherwise we need to do another check before continuing
|
325
|
+
i = self.skip_to_character(character=rstring_delimiter, idx=1)
|
326
|
+
next_c = self.get_char_at(i)
|
327
|
+
if not next_c:
|
328
|
+
# mmmm that delimiter never appears again, this is a mistake
|
329
|
+
self.log(
|
330
|
+
"While parsing a string, we found a quote but it was a mistake, ignoring it",
|
331
|
+
)
|
332
|
+
return ""
|
294
333
|
|
295
334
|
# Initialize our return value
|
296
335
|
string_acc = ""
|
@@ -404,6 +443,38 @@ class JSONParser:
|
|
404
443
|
string_acc += escape_seqs.get(char, char) or char
|
405
444
|
self.index += 1
|
406
445
|
char = self.get_char_at()
|
446
|
+
# If we are in object key context and we find a colon, it could be a missing right quote
|
447
|
+
if (
|
448
|
+
char == ":"
|
449
|
+
and not missing_quotes
|
450
|
+
and self.context.current == ContextValues.OBJECT_KEY
|
451
|
+
):
|
452
|
+
# Ok now we need to check if this is followed by a value like "..."
|
453
|
+
i = self.skip_to_character(character=lstring_delimiter, idx=1)
|
454
|
+
next_c = self.get_char_at(i)
|
455
|
+
if next_c:
|
456
|
+
i += 1
|
457
|
+
# found the first delimiter
|
458
|
+
i = self.skip_to_character(character=rstring_delimiter, idx=i)
|
459
|
+
next_c = self.get_char_at(i)
|
460
|
+
if next_c:
|
461
|
+
# found a second delimiter
|
462
|
+
i += 1
|
463
|
+
# Skip spaces
|
464
|
+
i = self.skip_whitespaces_at(idx=i, move_main_index=False)
|
465
|
+
next_c = self.get_char_at(i)
|
466
|
+
if next_c and next_c in [",", "}"]:
|
467
|
+
# Ok then this is a missing right quote
|
468
|
+
self.log(
|
469
|
+
"While parsing a string missing the right delimiter in object key context, we found a :, stopping here",
|
470
|
+
)
|
471
|
+
break
|
472
|
+
else:
|
473
|
+
# The string ended without finding a lstring_delimiter, I will assume this is a missing right quote
|
474
|
+
self.log(
|
475
|
+
"While parsing a string missing the right delimiter in object key context, we found a :, stopping here",
|
476
|
+
)
|
477
|
+
break
|
407
478
|
# ChatGPT sometimes forgets to quote stuff in HTML tags or Markdown, so we do this whole thing here
|
408
479
|
if char == rstring_delimiter:
|
409
480
|
# Special case here, in case of double quotes one after another
|
@@ -96,3 +96,24 @@ class StringFileWrapper:
|
|
96
96
|
self.length = self.fd.tell()
|
97
97
|
self.fd.seek(current_position)
|
98
98
|
return self.length
|
99
|
+
|
100
|
+
def __setitem__(self, index: Union[int, slice], value: str) -> None:
|
101
|
+
"""
|
102
|
+
Set a character or a slice of characters in the file.
|
103
|
+
|
104
|
+
Args:
|
105
|
+
index (Union[int, slice]): The index or slice of characters to set.
|
106
|
+
value (str): The value to set at the specified index or slice.
|
107
|
+
"""
|
108
|
+
if isinstance(index, slice):
|
109
|
+
start = index.start or 0
|
110
|
+
else:
|
111
|
+
start = index or 0
|
112
|
+
|
113
|
+
if start < 0:
|
114
|
+
start += len(self)
|
115
|
+
|
116
|
+
current_position = self.fd.tell()
|
117
|
+
self.fd.seek(start)
|
118
|
+
self.fd.write(value)
|
119
|
+
self.fd.seek(current_position)
|
@@ -146,14 +146,16 @@ def test_object_edge_cases():
|
|
146
146
|
assert repair_json('{"lorem": ipsum, sic, datum.",}') == '{"lorem": "ipsum, sic, datum."}'
|
147
147
|
assert repair_json('{"lorem": sic tamet. "ipsum": sic tamet, quick brown fox. "sic": ipsum}') == '{"lorem": "sic tamet.", "ipsum": "sic tamet", "sic": "ipsum"}'
|
148
148
|
assert repair_json('{"lorem_ipsum": "sic tamet, quick brown fox. }') == '{"lorem_ipsum": "sic tamet, quick brown fox."}'
|
149
|
-
assert repair_json('{"key":value, " key2":"value2" }') == '{"key": "value", " key2": "value2"}'
|
149
|
+
assert repair_json('{"key":value, " key2":"value2" }') == '{"key": "value", "key2": "value2"}'
|
150
150
|
assert repair_json('{"key":value "key2":"value2" }') == '{"key": "value", "key2": "value2"}'
|
151
151
|
assert repair_json("{'text': 'words{words in brackets}more words'}") == '{"text": "words{words in brackets}more words"}'
|
152
152
|
assert repair_json('{text:words{words in brackets}}') == '{"text": "words{words in brackets}"}'
|
153
153
|
assert repair_json('{text:words{words in brackets}m}') == '{"text": "words{words in brackets}m"}'
|
154
154
|
assert repair_json('{"key": "value, value2"```') == '{"key": "value, value2"}'
|
155
155
|
assert repair_json('{key:value,key2:value2}') == '{"key": "value", "key2": "value2"}'
|
156
|
-
assert repair_json('
|
156
|
+
assert repair_json('{"key:"value"}') == '{"key": "value"}'
|
157
|
+
assert repair_json('{"key:value}') == '{"key": "value"}'
|
158
|
+
assert repair_json('[{"lorem": {"ipsum": "sic"}, """" "lorem": {"ipsum": "sic"}]') == '[{"lorem": {"ipsum": "sic"}}, {"lorem": {"ipsum": "sic"}}]'
|
157
159
|
|
158
160
|
def test_number_edge_cases():
|
159
161
|
assert repair_json(' - { "test_key": ["test_value", "test_value2"] }') == '{"test_key": ["test_value", "test_value2"]}'
|
@@ -97,7 +97,7 @@ def test_false_false_incorrect(benchmark):
|
|
97
97
|
mean_time = benchmark.stats.get("median")
|
98
98
|
|
99
99
|
# Define your time threshold in seconds
|
100
|
-
max_time =
|
100
|
+
max_time = 2 / 10 ** 3 # 2 milliseconds
|
101
101
|
|
102
102
|
# Assert that the median time is below the threshold
|
103
103
|
assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|