json-repair 0.32.0__tar.gz → 0.34.0__tar.gz
- {json_repair-0.32.0/src/json_repair.egg-info → json_repair-0.34.0}/PKG-INFO +1 -1
- {json_repair-0.32.0 → json_repair-0.34.0}/pyproject.toml +1 -1
- {json_repair-0.32.0 → json_repair-0.34.0}/src/json_repair/json_parser.py +87 -28
- {json_repair-0.32.0 → json_repair-0.34.0}/src/json_repair/string_file_wrapper.py +21 -0
- {json_repair-0.32.0 → json_repair-0.34.0/src/json_repair.egg-info}/PKG-INFO +1 -1
- {json_repair-0.32.0 → json_repair-0.34.0}/tests/test_json_repair.py +3 -2
- {json_repair-0.32.0 → json_repair-0.34.0}/LICENSE +0 -0
- {json_repair-0.32.0 → json_repair-0.34.0}/README.md +0 -0
- {json_repair-0.32.0 → json_repair-0.34.0}/setup.cfg +0 -0
- {json_repair-0.32.0 → json_repair-0.34.0}/src/json_repair/__init__.py +0 -0
- {json_repair-0.32.0 → json_repair-0.34.0}/src/json_repair/__main__.py +0 -0
- {json_repair-0.32.0 → json_repair-0.34.0}/src/json_repair/json_context.py +0 -0
- {json_repair-0.32.0 → json_repair-0.34.0}/src/json_repair/json_repair.py +0 -0
- {json_repair-0.32.0 → json_repair-0.34.0}/src/json_repair/py.typed +0 -0
- {json_repair-0.32.0 → json_repair-0.34.0}/src/json_repair.egg-info/SOURCES.txt +0 -0
- {json_repair-0.32.0 → json_repair-0.34.0}/src/json_repair.egg-info/dependency_links.txt +0 -0
- {json_repair-0.32.0 → json_repair-0.34.0}/src/json_repair.egg-info/entry_points.txt +0 -0
- {json_repair-0.32.0 → json_repair-0.34.0}/src/json_repair.egg-info/top_level.txt +0 -0
- {json_repair-0.32.0 → json_repair-0.34.0}/tests/test_coverage.py +0 -0
- {json_repair-0.32.0 → json_repair-0.34.0}/tests/test_performance.py +0 -0

pyproject.toml

@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
 build-backend = "setuptools.build_meta"
 [project]
 name = "json_repair"
-version = "0.32.0"
+version = "0.34.0"
 license = {file = "LICENSE"}
 authors = [
   { name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },

src/json_repair/json_parser.py

@@ -7,6 +7,9 @@ JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
 
 
 class JSONParser:
+    # Constants
+    STRING_DELIMITERS = ['"', "'", "“", "”"]
+
     def __init__(
         self,
         json_str: Union[str, StringFileWrapper],
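
For reference, the new STRING_DELIMITERS constant lists every character the parser accepts as a string delimiter: standard double and single quotes plus the curly "smart" quotes. Below is a minimal usage sketch of the normalization this supports; the inputs are illustrative and the expected outputs in the comments are assumptions about repair_json's behavior, not assertions taken from this diff:

from json_repair import repair_json

# Assumed outputs (not asserted by this diff): non-standard quotes are
# normalized to plain JSON double quotes.
print(repair_json("{'key': 'value'}"))   # expected: {"key": "value"}
print(repair_json("{“key”: “value”}"))   # expected: {"key": "value"}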

@@ -89,7 +92,9 @@ class JSONParser:
             )
             return ""
         # <string> starts with a quote
-        elif not self.context.empty and (
+        elif not self.context.empty and (
+            char in self.STRING_DELIMITERS or char.isalpha()
+        ):
             return self.parse_string()
         # <number> starts with [0-9] or minus
         elif not self.context.empty and (
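
The widened condition above also lets any alphabetic character open an unquoted string literal. The object tests shown later in this diff already cover that case; as a quick usage sketch:

from json_repair import repair_json

# Expected output taken from tests/test_json_repair.py shown further down in this diff.
print(repair_json('{key:value,key2:value2}'))  # {"key": "value", "key2": "value2"}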

@@ -130,6 +135,8 @@ class JSONParser:
         # <member> starts with a <string>
         key = ""
         while self.get_char_at():
+            # The rollback index needs to be updated here in case the key is empty
+            rollback_index = self.index
             key = str(self.parse_string())
 
             if key != "" or (key == "" and self.get_char_at() == ":"):

@@ -140,6 +147,12 @@ class JSONParser:
                     "While parsing an object we found a duplicate key, closing the object here and rolling back the index",
                 )
                 self.index = rollback_index - 1
+                # add an opening curly brace to make this work
+                self.json_str = (
+                    self.json_str[: self.index + 1]
+                    + "{"
+                    + self.json_str[self.index + 1 :]
+                )
                 break
 
         # Skip filler whitespaces
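
When a duplicate key forces a rollback, the parser now splices an opening brace into json_str so the repeated member starts a fresh object instead of being dropped. The updated object test later in this diff appears to exercise this path; a usage sketch based on it:

from json_repair import repair_json

# Expected output taken from the updated test in tests/test_json_repair.py below.
print(repair_json('[{"lorem": {"ipsum": "sic"}, """" "lorem": {"ipsum": "sic"}]'))
# [{"lorem": {"ipsum": "sic"}}, {"lorem": {"ipsum": "sic"}}]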

@@ -227,7 +240,7 @@ class JSONParser:
 
         char = self.get_char_at()
         # A valid string can only start with a valid quote or, in our case, with a literal
-        while char and char not in
+        while char and char not in self.STRING_DELIMITERS and not char.isalnum():
             self.index += 1
             char = self.get_char_at()
 

@@ -262,35 +275,61 @@
         if not missing_quotes:
             self.index += 1
 
+        self.skip_whitespaces_at()
         # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
-        if self.get_char_at()
-        # If
-        if (
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            self.index += 1
-        else:
-            # Ok this is not a doubled quote, check if this is an empty string or not
-            i = self.skip_whitespaces_at(idx=1, move_main_index=False)
+        if self.get_char_at() in self.STRING_DELIMITERS:
+            # If the next character is the same type of quote, then we manage it as double quotes
+            if self.get_char_at() == lstring_delimiter:
+                # If it's an empty key, this was easy
+                if (
+                    self.context.current == ContextValues.OBJECT_KEY
+                    and self.get_char_at(1) == ":"
+                ):
+                    self.index += 1
+                    return ""
+                if self.get_char_at(1) == lstring_delimiter:
+                    # There's something fishy about this, we found doubled quotes and then again quotes
+                    self.log(
+                        "While parsing a string, we found a doubled quote and then a quote again, ignoring it",
+                    )
+                    return ""
+                # Find the next delimiter
+                i = self.skip_to_character(character=rstring_delimiter, idx=1)
                 next_c = self.get_char_at(i)
-
+                # Now check that the next character is also a delimiter to ensure that we have "".....""
+                # In that case we ignore this rstring delimiter
+                if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
                     self.log(
-                        "While parsing a string, we found a
+                        "While parsing a string, we found a valid starting doubled quote",
                     )
+                    doubled_quotes = True
                     self.index += 1
+                else:
+                    # Ok this is not a doubled quote, check if this is an empty string or not
+                    i = self.skip_whitespaces_at(idx=1, move_main_index=False)
+                    next_c = self.get_char_at(i)
+                    if next_c in self.STRING_DELIMITERS + ["{", "["]:
+                        # something fishy is going on here
+                        self.log(
+                            "While parsing a string, we found a doubled quote but also another quote afterwards, ignoring it",
+                        )
+                        self.index += 1
+                        return ""
+                    elif next_c not in [",", "]", "}"]:
+                        self.log(
+                            "While parsing a string, we found a doubled quote but it was a mistake, removing one quote",
+                        )
+                        self.index += 1
+            else:
+                # Otherwise we need to do another check before continuing
+                i = self.skip_to_character(character=rstring_delimiter, idx=1)
+                next_c = self.get_char_at(i)
+                if not next_c:
+                    # mmmm that delimiter never appears again, this is a mistake
+                    self.log(
+                        "While parsing a string, we found a quote but it was a mistake, ignoring it",
+                    )
+                    return ""
 
         # Initialize our return value
         string_acc = ""
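
The reworked block above deals with strings that open on a doubled quote ("" … "") and decides whether the second quote is a genuine doubled delimiter, an empty string, or a stray character. A hedged sketch of the kind of input this path targets; the outputs in the comments are assumptions, not assertions taken from this diff:

from json_repair import repair_json

# Assumed outputs (not asserted by this diff).
print(repair_json('{""key"": ""value""}'))  # likely: {"key": "value"}
print(repair_json('{"": "value"}'))         # already valid, an empty key is kept: {"": "value"}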

@@ -508,9 +547,8 @@
                     # But this might not be it! This could be just a missing comma
                     # We found a delimiter and we need to check if this is a key
                     # so find a rstring_delimiter and a colon after
-                    i += 1
                     i = self.skip_to_character(
-                        character=rstring_delimiter, idx=i
+                        character=rstring_delimiter, idx=i + 1
                     )
                     i += 1
                     next_c = self.get_char_at(i)
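
The index fix above matters when deciding whether a quote inside a string really marks a missing comma before the next key. The unchanged test shown later in this diff covers that scenario:

from json_repair import repair_json

# Expected output taken from tests/test_json_repair.py (context line in this diff).
print(repair_json('{"key":value "key2":"value2" }'))  # {"key": "value", "key2": "value2"}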

@@ -531,6 +569,27 @@
                     string_acc += str(char)
                     self.index += 1
                     char = self.get_char_at()
+                elif self.context.current == ContextValues.ARRAY:
+                    # In array context this could be something like "lorem "ipsum" sic"
+                    # So let's check if we find a rstring_delimiter forward otherwise end early
+                    i = self.skip_to_character(rstring_delimiter, idx=i + 1)
+                    next_c = self.get_char_at(i)
+                    if next_c and next_c == rstring_delimiter:
+                        # Ok now if I find a comma or a closing ], that can be have also an optional rstring_delimiter before them
+                        # We can consider this a misplaced quote
+                        i += 1
+                        i = self.skip_whitespaces_at(
+                            idx=i, move_main_index=False
+                        )
+                        next_c = self.get_char_at(i)
+                        if next_c and next_c in [",", "]"]:
+                            self.log(
+                                "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
+                            )
+                            unmatched_delimiter = not unmatched_delimiter
+                            string_acc += str(char)
+                            self.index += 1
+                            char = self.get_char_at()
 
                 if (
                     char
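
In array context the parser now treats a quote that would close the string too early as a misplaced quote and keeps it as part of the value, escaping it in the output. The new array test in this diff covers exactly this; as a usage sketch:

from json_repair import repair_json

# Expected output taken from the new test in tests/test_json_repair.py below.
print(repair_json('["lorem "ipsum" sic"]'))  # ["lorem \"ipsum\" sic"]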

src/json_repair/string_file_wrapper.py

@@ -96,3 +96,24 @@ class StringFileWrapper:
         self.length = self.fd.tell()
         self.fd.seek(current_position)
         return self.length
+
+    def __setitem__(self, index: Union[int, slice], value: str) -> None:
+        """
+        Set a character or a slice of characters in the file.
+
+        Args:
+            index (slice): The slice of characters to set.
+            value (str): The value to set at the specified index or slice.
+        """
+        if isinstance(index, slice):
+            start = index.start or 0
+        else:
+            start = index or 0
+
+        if start < 0:
+            start += len(self)
+
+        current_position = self.fd.tell()
+        self.fd.seek(start)
+        self.fd.write(value)
+        self.fd.seek(current_position)
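
__setitem__ gives the parser a way to write a repair (for example the opening brace spliced in by the duplicate-key fix above) back into file-backed input, mirroring what string slicing already allows for plain strings. A minimal standalone sketch of the same seek/overwrite/restore pattern on an io.StringIO, independent of json_repair's internals:

import io

def overwrite_at(fd, start: int, value: str) -> None:
    # Remember the current position, overwrite in place, then restore the
    # position, just like the wrapper's __setitem__ does.
    current_position = fd.tell()
    fd.seek(start)
    fd.write(value)
    fd.seek(current_position)

buf = io.StringIO('{"key": "value"}')
overwrite_at(buf, 2, "KEY")
print(buf.getvalue())  # {"KEY": "value"}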

tests/test_json_repair.py

@@ -121,6 +121,7 @@ def test_array_edge_cases():
     assert repair_json('{"employees":["John", "Anna", "Peter') == '{"employees": ["John", "Anna", "Peter"]}'
     assert repair_json('{"key1": {"key2": [1, 2, 3') == '{"key1": {"key2": [1, 2, 3]}}'
     assert repair_json('{"key": ["value]}') == '{"key": ["value"]}'
+    assert repair_json('["lorem "ipsum" sic"]') == '["lorem \\"ipsum\\" sic"]'
 
 def test_escaping():
     assert repair_json("'\"'") == '""'

@@ -145,14 +146,14 @@ def test_object_edge_cases():
     assert repair_json('{"lorem": ipsum, sic, datum.",}') == '{"lorem": "ipsum, sic, datum."}'
     assert repair_json('{"lorem": sic tamet. "ipsum": sic tamet, quick brown fox. "sic": ipsum}') == '{"lorem": "sic tamet.", "ipsum": "sic tamet", "sic": "ipsum"}'
     assert repair_json('{"lorem_ipsum": "sic tamet, quick brown fox. }') == '{"lorem_ipsum": "sic tamet, quick brown fox."}'
-    assert repair_json('{"key":value, " key2":"value2" }') == '{"key": "value", "
+    assert repair_json('{"key":value, " key2":"value2" }') == '{"key": "value", "key2": "value2"}'
     assert repair_json('{"key":value "key2":"value2" }') == '{"key": "value", "key2": "value2"}'
     assert repair_json("{'text': 'words{words in brackets}more words'}") == '{"text": "words{words in brackets}more words"}'
     assert repair_json('{text:words{words in brackets}}') == '{"text": "words{words in brackets}"}'
     assert repair_json('{text:words{words in brackets}m}') == '{"text": "words{words in brackets}m"}'
     assert repair_json('{"key": "value, value2"```') == '{"key": "value, value2"}'
     assert repair_json('{key:value,key2:value2}') == '{"key": "value", "key2": "value2"}'
-    assert repair_json('[{"lorem": {"ipsum": "sic"}, "lorem": {"ipsum": "sic"}]') == '[{"lorem": {"ipsum": "sic"}}, "lorem"
+    assert repair_json('[{"lorem": {"ipsum": "sic"}, """" "lorem": {"ipsum": "sic"}]') == '[{"lorem": {"ipsum": "sic"}}, {"lorem": {"ipsum": "sic"}}]'
 
 def test_number_edge_cases():
     assert repair_json(' - { "test_key": ["test_value", "test_value2"] }') == '{"test_key": ["test_value", "test_value2"]}'