json-repair 0.19.0__tar.gz → 0.19.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {json_repair-0.19.0/src/json_repair.egg-info → json_repair-0.19.2}/PKG-INFO +1 -1
- {json_repair-0.19.0 → json_repair-0.19.2}/pyproject.toml +1 -1
- {json_repair-0.19.0 → json_repair-0.19.2}/src/json_repair/json_repair.py +55 -40
- {json_repair-0.19.0 → json_repair-0.19.2/src/json_repair.egg-info}/PKG-INFO +1 -1
- {json_repair-0.19.0 → json_repair-0.19.2}/tests/test_json_repair.py +98 -175
- {json_repair-0.19.0 → json_repair-0.19.2}/tests/test_performance.py +4 -4
- {json_repair-0.19.0 → json_repair-0.19.2}/LICENSE +0 -0
- {json_repair-0.19.0 → json_repair-0.19.2}/README.md +0 -0
- {json_repair-0.19.0 → json_repair-0.19.2}/setup.cfg +0 -0
- {json_repair-0.19.0 → json_repair-0.19.2}/src/json_repair/__init__.py +0 -0
- {json_repair-0.19.0 → json_repair-0.19.2}/src/json_repair.egg-info/SOURCES.txt +0 -0
- {json_repair-0.19.0 → json_repair-0.19.2}/src/json_repair.egg-info/dependency_links.txt +0 -0
- {json_repair-0.19.0 → json_repair-0.19.2}/src/json_repair.egg-info/top_level.txt +0 -0
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
|
|
3
3
|
build-backend = "setuptools.build_meta"
|
4
4
|
[project]
|
5
5
|
name = "json_repair"
|
6
|
-
version = "0.19.0"
|
6
|
+
version = "0.19.2"
|
7
7
|
license = {file = "LICENSE"}
|
8
8
|
authors = [
|
9
9
|
{ name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },
|
@@ -22,6 +22,7 @@ If something is wrong (a missing parantheses or quotes for example) it will use
|
|
22
22
|
All supported use cases are in the unit tests
|
23
23
|
"""
|
24
24
|
|
25
|
+
import os
|
25
26
|
import json
|
26
27
|
from typing import Any, Dict, List, Union, TextIO
|
27
28
|
|
@@ -31,7 +32,9 @@ class JSONParser:
|
|
31
32
|
# The string to parse
|
32
33
|
self.json_str = json_str
|
33
34
|
# Alternatively, the file description with a json file in it
|
34
|
-
|
35
|
+
if json_fd:
|
36
|
+
# This is a trick we do to treat the file wrapper as an array
|
37
|
+
self.json_str = StringFileWrapper(json_fd)
|
35
38
|
# Index is our iterator that will keep track of which character we are looking at right now
|
36
39
|
self.index = 0
|
37
40
|
# This is used in the object member parsing to manage the special cases of missing quotes in key or value
|
@@ -246,7 +249,8 @@ class JSONParser:
|
|
246
249
|
rstring_delimiter = "”"
|
247
250
|
elif char.isalpha():
|
248
251
|
# This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
|
249
|
-
|
252
|
+
# But remember, object keys are only of type string
|
253
|
+
if char.lower() in ["t", "f", "n"] and self.get_context() != "object_key":
|
250
254
|
value = self.parse_boolean_or_null()
|
251
255
|
if value != "":
|
252
256
|
return value
|
@@ -263,7 +267,8 @@ class JSONParser:
|
|
263
267
|
self.index += 1
|
264
268
|
return self.parse_json()
|
265
269
|
self.log(
|
266
|
-
"While parsing a string, we found no starting quote
|
270
|
+
"While parsing a string, we found no starting quote. Will add the quote back",
|
271
|
+
"info",
|
267
272
|
)
|
268
273
|
missing_quotes = True
|
269
274
|
|
@@ -309,6 +314,15 @@ class JSONParser:
|
|
309
314
|
string_acc += char
|
310
315
|
self.index += 1
|
311
316
|
char = self.get_char_at()
|
317
|
+
if len(string_acc) > 1 and string_acc[-1] == "\\":
|
318
|
+
# This is a special case, if people use real strings this might happen
|
319
|
+
self.log("Found a stray escape sequence, normalizing it", "info")
|
320
|
+
string_acc = string_acc[:-1]
|
321
|
+
if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
|
322
|
+
escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
|
323
|
+
string_acc += escape_seqs.get(char, char)
|
324
|
+
self.index += 1
|
325
|
+
char = self.get_char_at()
|
312
326
|
# ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
|
313
327
|
if char == rstring_delimiter:
|
314
328
|
# Special case here, in case of double quotes one after another
|
@@ -442,36 +456,22 @@ class JSONParser:
|
|
442
456
|
try:
|
443
457
|
return self.json_str[self.index + count]
|
444
458
|
except IndexError:
|
445
|
-
|
446
|
-
self.json_fd.seek(self.index + count)
|
447
|
-
char = self.json_fd.read(1)
|
448
|
-
if char == "":
|
449
|
-
return False
|
450
|
-
return char
|
451
|
-
else:
|
452
|
-
return False
|
459
|
+
return False
|
453
460
|
|
454
461
|
def skip_whitespaces_at(self) -> None:
|
455
462
|
"""
|
456
463
|
This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
|
457
464
|
"""
|
458
|
-
|
459
|
-
char = self.
|
460
|
-
|
461
|
-
|
462
|
-
|
463
|
-
|
464
|
-
# If this is not a file stream, we do this monster here to make this function much much faster
|
465
|
+
try:
|
466
|
+
char = self.json_str[self.index]
|
467
|
+
except IndexError:
|
468
|
+
return
|
469
|
+
while char.isspace():
|
470
|
+
self.index += 1
|
465
471
|
try:
|
466
472
|
char = self.json_str[self.index]
|
467
473
|
except IndexError:
|
468
474
|
return
|
469
|
-
while char.isspace():
|
470
|
-
self.index += 1
|
471
|
-
try:
|
472
|
-
char = self.json_str[self.index]
|
473
|
-
except IndexError:
|
474
|
-
return
|
475
475
|
|
476
476
|
def set_context(self, value: str) -> None:
|
477
477
|
# If a value is provided update the context variable and save in stack
|
@@ -493,22 +493,9 @@ class JSONParser:
|
|
493
493
|
def log(self, text: str, level: str) -> None:
|
494
494
|
if level == self.logger["log_level"]:
|
495
495
|
context = ""
|
496
|
-
|
497
|
-
|
498
|
-
|
499
|
-
self.json_fd.seek(self.index)
|
500
|
-
else:
|
501
|
-
start = (
|
502
|
-
self.index - self.logger["window"]
|
503
|
-
if (self.index - self.logger["window"]) >= 0
|
504
|
-
else 0
|
505
|
-
)
|
506
|
-
end = (
|
507
|
-
self.index + self.logger["window"]
|
508
|
-
if (self.index + self.logger["window"]) <= len(self.json_str)
|
509
|
-
else len(self.json_str)
|
510
|
-
)
|
511
|
-
context = self.json_str[start:end]
|
496
|
+
start = max(self.index - self.logger["window"], 0)
|
497
|
+
end = min(self.index + self.logger["window"], len(self.json_str))
|
498
|
+
context = self.json_str[start:end]
|
512
499
|
self.logger["log"].append(
|
513
500
|
{
|
514
501
|
"text": text,
|
@@ -584,3 +571,31 @@ def from_file(
|
|
584
571
|
fd.close()
|
585
572
|
|
586
573
|
return jsonobj
|
574
|
+
|
575
|
+
|
576
|
+
class StringFileWrapper:
|
577
|
+
# This is a trick to simplify the code above, transform the filedescriptor handling into an array handling
|
578
|
+
def __init__(self, fd: TextIO) -> None:
|
579
|
+
self.fd = fd
|
580
|
+
self.length = None
|
581
|
+
|
582
|
+
def __getitem__(self, index: int) -> Any:
|
583
|
+
if isinstance(index, slice):
|
584
|
+
self.fd.seek(index.start)
|
585
|
+
value = self.fd.read(index.stop - index.start)
|
586
|
+
self.fd.seek(index.start)
|
587
|
+
return value
|
588
|
+
else:
|
589
|
+
self.fd.seek(index)
|
590
|
+
return self.fd.read(1)
|
591
|
+
|
592
|
+
def __len__(self) -> int:
|
593
|
+
if not self.length:
|
594
|
+
current_position = self.fd.tell()
|
595
|
+
self.fd.seek(0, os.SEEK_END)
|
596
|
+
self.length = self.fd.tell()
|
597
|
+
self.fd.seek(current_position)
|
598
|
+
return self.length
|
599
|
+
|
600
|
+
def __setitem__(self):
|
601
|
+
raise Exception("This is read-only!")
|
@@ -1,27 +1,61 @@
|
|
1
1
|
from src.json_repair.json_repair import from_file, repair_json, loads
|
2
2
|
|
3
3
|
|
4
|
-
def test_repair_json():
|
5
|
-
# Test with valid JSON strings
|
4
|
+
def test_valid_json():
|
6
5
|
assert repair_json("[]") == "[]"
|
7
|
-
assert repair_json("[
|
8
|
-
assert repair_json(" { } ") == "{}"
|
9
|
-
assert repair_json("\"") == '""'
|
10
|
-
assert repair_json("\n") == '""'
|
11
|
-
assert repair_json(' {"key": true, "key2": false, "key3": null}') == '{"key": true, "key2": false, "key3": null}'
|
12
|
-
assert repair_json('{"key": TRUE, "key2": FALSE, "key3": Null} ') == '{"key": true, "key2": false, "key3": null}'
|
13
|
-
assert repair_json("{'key': 'string', 'key2': false, \"key3\": null, \"key4\": unquoted}") == '{"key": "string", "key2": false, "key3": null, "key4": "unquoted"}'
|
6
|
+
assert repair_json("[1, 2, 3, 4]") == "[1, 2, 3, 4]"
|
14
7
|
assert (
|
15
8
|
repair_json('{"name": "John", "age": 30, "city": "New York"}')
|
16
9
|
== '{"name": "John", "age": 30, "city": "New York"}'
|
17
10
|
)
|
18
|
-
assert repair_json("[1, 2, 3, 4]") == "[1, 2, 3, 4]"
|
19
11
|
assert (
|
20
12
|
repair_json('{"employees":["John", "Anna", "Peter"]} ')
|
21
13
|
== '{"employees": ["John", "Anna", "Peter"]}'
|
22
14
|
)
|
15
|
+
assert repair_json('{"key": "value:value"}') == '{"key": "value:value"}'
|
16
|
+
assert (
|
17
|
+
repair_json('{"text": "The quick brown fox,"}')
|
18
|
+
== '{"text": "The quick brown fox,"}'
|
19
|
+
)
|
20
|
+
assert (
|
21
|
+
repair_json('{"text": "The quick brown fox won\'t jump"}')
|
22
|
+
== '{"text": "The quick brown fox won\'t jump"}'
|
23
|
+
)
|
24
|
+
assert repair_json('{"key": ""') == '{"key": ""}'
|
25
|
+
assert (
|
26
|
+
repair_json('{"key1": {"key2": [1, 2, 3]}}') == '{"key1": {"key2": [1, 2, 3]}}'
|
27
|
+
)
|
28
|
+
assert (
|
29
|
+
repair_json('{"key": 12345678901234567890}') == '{"key": 12345678901234567890}'
|
30
|
+
)
|
31
|
+
assert repair_json('{"key": "value\u263A"}') == '{"key": "value\\u263a"}'
|
32
|
+
assert repair_json('{"key": "value\\nvalue"}') == '{"key": "value\\nvalue"}'
|
23
33
|
|
24
|
-
|
34
|
+
def test_brackets_edge_cases():
|
35
|
+
assert repair_json("[{]") == "[]"
|
36
|
+
assert repair_json(" { } ") == "{}"
|
37
|
+
assert repair_json("[") == "[]"
|
38
|
+
assert repair_json("]") == '""'
|
39
|
+
assert repair_json("{") == "{}"
|
40
|
+
assert repair_json("}") == '""'
|
41
|
+
assert repair_json('{"') == '{}'
|
42
|
+
assert repair_json('["') == '[]'
|
43
|
+
assert repair_json('{foo: [}') == '{"foo": []}'
|
44
|
+
|
45
|
+
def test_general_edge_cases():
|
46
|
+
assert repair_json("\"") == '""'
|
47
|
+
assert repair_json("\n") == '""'
|
48
|
+
assert repair_json(" ") == '""'
|
49
|
+
assert repair_json("[[1\n\n]") == "[[1]]"
|
50
|
+
assert repair_json("string") == '""'
|
51
|
+
assert repair_json("stringbeforeobject {}") == '{}'
|
52
|
+
|
53
|
+
def test_mixed_data_types():
|
54
|
+
assert repair_json(' {"key": true, "key2": false, "key3": null}') == '{"key": true, "key2": false, "key3": null}'
|
55
|
+
assert repair_json('{"key": TRUE, "key2": FALSE, "key3": Null} ') == '{"key": true, "key2": false, "key3": null}'
|
56
|
+
|
57
|
+
def test_missing_and_mixed_quotes():
|
58
|
+
assert repair_json("{'key': 'string', 'key2': false, \"key3\": null, \"key4\": unquoted}") == '{"key": "string", "key2": false, "key3": null, "key4": "unquoted"}'
|
25
59
|
assert (
|
26
60
|
repair_json('{"name": "John", "age": 30, "city": "New York')
|
27
61
|
== '{"name": "John", "age": 30, "city": "New York"}'
|
@@ -38,6 +72,13 @@ def test_repair_json():
|
|
38
72
|
repair_json('{"name": John, "age": 30, "city": "New York"}')
|
39
73
|
== '{"name": "John", "age": 30, "city": "New York"}'
|
40
74
|
)
|
75
|
+
assert repair_json('{“slanted_delimiter”: "value"}') == '{"slanted_delimiter": "value"}'
|
76
|
+
assert (
|
77
|
+
repair_json('{"name": "John", "age": 30, "city": "New')
|
78
|
+
== '{"name": "John", "age": 30, "city": "New"}'
|
79
|
+
)
|
80
|
+
|
81
|
+
def test_array_edge_cases():
|
41
82
|
assert repair_json("[1, 2, 3,") == "[1, 2, 3]"
|
42
83
|
assert repair_json("[1, 2, 3, ...]") == "[1, 2, 3]"
|
43
84
|
assert repair_json("[1, 2, ... , 3]") == "[1, 2, 3]"
|
@@ -46,51 +87,28 @@ def test_repair_json():
|
|
46
87
|
repair_json('{"employees":["John", "Anna",')
|
47
88
|
== '{"employees": ["John", "Anna"]}'
|
48
89
|
)
|
49
|
-
|
50
|
-
# Test with edge cases
|
51
|
-
assert repair_json(" ") == '""'
|
52
|
-
assert repair_json("[") == "[]"
|
53
|
-
assert repair_json("]") == '""'
|
54
|
-
assert repair_json("[[1\n\n]") == "[[1]]"
|
55
|
-
assert repair_json("{") == "{}"
|
56
|
-
assert repair_json("}") == '""'
|
57
|
-
assert repair_json("string") == '""'
|
58
|
-
assert repair_json("stringbeforeobject {}") == '{}'
|
59
|
-
assert repair_json('{"') == '{}'
|
60
|
-
assert repair_json('["') == '[]'
|
61
|
-
assert repair_json("'\"'") == '"\\\""'
|
62
|
-
assert repair_json("{\"key\": 'string\"\n\t\le'") == '{"key": "string\\"\\n\\t\\\\le"}'
|
63
|
-
assert repair_json('{foo: [}') == '{"foo": []}'
|
64
|
-
assert repair_json('''{ "a": "{ b": {} }" }''') == '{"a": "{ b"}'
|
65
|
-
assert repair_json('{"key": "value:value"}') == '{"key": "value:value"}'
|
66
|
-
assert repair_json('{“slanted_delimiter”: "value"}') == '{"slanted_delimiter": "value"}'
|
67
|
-
assert (
|
68
|
-
repair_json('{"name": "John", "age": 30, "city": "New')
|
69
|
-
== '{"name": "John", "age": 30, "city": "New"}'
|
70
|
-
)
|
71
90
|
assert (
|
72
91
|
repair_json('{"employees":["John", "Anna", "Peter')
|
73
92
|
== '{"employees": ["John", "Anna", "Peter"]}'
|
74
93
|
)
|
75
|
-
assert (
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
assert (
|
80
|
-
|
81
|
-
|
82
|
-
)
|
83
|
-
assert (
|
84
|
-
|
85
|
-
|
86
|
-
|
94
|
+
assert repair_json('{"key1": {"key2": [1, 2, 3') == '{"key1": {"key2": [1, 2, 3]}}'
|
95
|
+
|
96
|
+
|
97
|
+
def test_escaping():
|
98
|
+
assert repair_json("'\"'") == '"\\\""'
|
99
|
+
assert repair_json("{\"key\": 'string\"\n\t\le'") == '{"key": "string\\"\\n\\tle"}'
|
100
|
+
assert repair_json(r'{"real_content": "Some string: Some other string \t Some string <a href=\"https://domain.com\">Some link</a>"') == r'{"real_content": "Some string: Some other string \t Some string <a href=\"https://domain.com\">Some link</a>"}'
|
101
|
+
assert repair_json('{"key_1\n": "value"}') == '{"key_1": "value"}'
|
102
|
+
assert repair_json('{"key\t_": "value"}') == '{"key\\t_": "value"}'
|
103
|
+
|
104
|
+
|
105
|
+
def test_object_edge_cases():
|
87
106
|
assert {
|
88
107
|
repair_json('{"value_1": "value_2": "data"}') == '{"value_1": "value_2", "data": ""}'
|
89
108
|
}
|
90
109
|
assert {
|
91
110
|
repair_json('{"value_1": true, COMMENT "value_2": "data"}') == '{"value_1": "value_2", "": "data"}'
|
92
111
|
}
|
93
|
-
# Test with garbage comments
|
94
112
|
assert repair_json('{"value_1": true, SHOULD_NOT_EXIST "value_2": "data" AAAA }') == '{"value_1": true, "value_2": "data"}'
|
95
113
|
assert {
|
96
114
|
repair_json('{"" : true, "key2": "value2"}') == '{" ": true, "key2": "value_2"}'
|
@@ -98,35 +116,37 @@ def test_repair_json():
|
|
98
116
|
assert {
|
99
117
|
repair_json('{"": true, "key2": "value2"}') == '{"empty_placeholder": true, "key2": "value_2"}'
|
100
118
|
}
|
101
|
-
# Test a nasty corner case
|
102
|
-
assert repair_json(' - { "test_key": ["test_value", "test_value2"] }') == '{"test_key": ["test_value", "test_value2"]}'
|
103
|
-
|
104
|
-
#Test markdown stupidities from ChatGPT
|
105
|
-
assert repair_json('{ "content": "[LINK]("https://google.com")" }') == '{"content": "[LINK](\\"https://google.com\\")"}'
|
106
|
-
assert repair_json('{ "content": "[LINK](" }') == '{"content": "[LINK]("}'
|
107
|
-
assert repair_json('{ "content": "[LINK](", "key": true }') == '{"content": "[LINK](", "key": true}'
|
108
|
-
assert repair_json("""
|
109
|
-
```json
|
110
|
-
{ "key": "value" }
|
111
|
-
```""") == '{"key": "value"}'
|
112
|
-
assert repair_json('````{ "key": "value" }```') == '{"key": "value"}'
|
113
|
-
assert repair_json(r'{"real_content": "Some string: Some other string Some string <a href=\"https://domain.com\">Some link</a>"') == r'{"real_content": "Some string: Some other string Some string <a href=\\\"https://domain.com\\\">Some link</a>"}'
|
114
|
-
assert repair_json('{"key_1\n": "value"}') == '{"key_1": "value"}'
|
115
|
-
assert repair_json('{"key\t_": "value"}') == '{"key\\t_": "value"}'
|
116
119
|
assert repair_json('{""answer"":[{""traits"":''Female aged 60+'',""answer1"":""5""}]}') == '{"answer": [{"traits": "Female aged 60+", "answer1": "5"}]}'
|
117
|
-
assert repair_json('{"key":""') == '{"key": ""}'
|
118
120
|
assert repair_json('{ "words": abcdef", "numbers": 12345", "words2": ghijkl" }') == '{"words": "abcdef", "numbers": 12345, "words2": "ghijkl"}'
|
121
|
+
assert repair_json('''{"number": 1,"reason": "According...""ans": "YES"}''') == '{"number": 1, "reason": "According...", "ans": "YES"}'
|
122
|
+
assert repair_json('''{ "a": "{ b": {} }" }''') == '{"a": "{ b"}'
|
123
|
+
assert repair_json("""{"b": "xxxxx" true}""") == '{"b": "xxxxx"}'
|
124
|
+
|
125
|
+
def test_number_edge_cases():
|
126
|
+
assert repair_json(' - { "test_key": ["test_value", "test_value2"] }') == '{"test_key": ["test_value", "test_value2"]}'
|
119
127
|
assert repair_json('{"key": 1/3}') == '{"key": "1/3"}'
|
120
128
|
assert repair_json('{"key": .25}') == '{"key": 0.25}'
|
121
|
-
assert repair_json("""{ "a": "", "b": [ { "c": 1} ] \n}```""") == '{"a": "", "b": [{"c": 1}]}'
|
122
|
-
assert repair_json("Based on the information extracted, here is the filled JSON output: ```json { 'a': 'b' } ```") == '{"a": "b"}'
|
123
|
-
assert repair_json('''{"number": 1,"reason": "According...""ans": "YES"}''') == '{"number": 1, "reason": "According...", "ans": "YES"}'
|
124
|
-
assert repair_json('{"key": 1/3, "foo": "bar"}') == '{"key": "1/3", "foo": "bar"}'
|
125
129
|
assert repair_json('{"here": "now", "key": 1/3, "foo": "bar"}') == '{"here": "now", "key": "1/3", "foo": "bar"}'
|
126
130
|
assert repair_json('{"key": 12345/67890}') == '{"key": "12345/67890"}'
|
127
131
|
assert repair_json('[105,12') == '["105,12"]'
|
128
132
|
assert repair_json('{"key", 105,12,') == '{"key": "105,12"}'
|
133
|
+
assert repair_json('{"key": 1/3, "foo": "bar"}') == '{"key": "1/3", "foo": "bar"}'
|
134
|
+
assert repair_json('{"key": 10-20}') == '{"key": "10-20"}'
|
135
|
+
assert repair_json('{"key": 1.1.1}') == '{"key": "1.1.1"}'
|
129
136
|
|
137
|
+
def test_markdown():
|
138
|
+
assert repair_json('{ "content": "[LINK]("https://google.com")" }') == '{"content": "[LINK](\\"https://google.com\\")"}'
|
139
|
+
assert repair_json('{ "content": "[LINK](" }') == '{"content": "[LINK]("}'
|
140
|
+
assert repair_json('{ "content": "[LINK](", "key": true }') == '{"content": "[LINK](", "key": true}'
|
141
|
+
|
142
|
+
def test_leading_trailing_characters():
|
143
|
+
assert repair_json('````{ "key": "value" }```') == '{"key": "value"}'
|
144
|
+
assert repair_json("""{ "a": "", "b": [ { "c": 1} ] \n}```""") == '{"a": "", "b": [{"c": 1}]}'
|
145
|
+
assert repair_json("Based on the information extracted, here is the filled JSON output: ```json { 'a': 'b' } ```") == '{"a": "b"}'
|
146
|
+
assert repair_json("""
|
147
|
+
```json
|
148
|
+
{ "key": "value" }
|
149
|
+
```""") == '{"key": "value"}'
|
130
150
|
|
131
151
|
def test_repair_json_with_objects():
|
132
152
|
# Test with valid JSON strings
|
@@ -142,47 +162,6 @@ def test_repair_json_with_objects():
|
|
142
162
|
assert repair_json('{"employees":["John", "Anna", "Peter"]} ', return_objects=True) == {
|
143
163
|
"employees": ["John", "Anna", "Peter"]
|
144
164
|
}
|
145
|
-
|
146
|
-
# Test with invalid JSON strings
|
147
|
-
assert repair_json('{"name": "John", "age": 30, "city": "New York', return_objects=True) == {
|
148
|
-
"name": "John",
|
149
|
-
"age": 30,
|
150
|
-
"city": "New York",
|
151
|
-
}
|
152
|
-
assert repair_json('{"name": "John", "age": 30, city: "New York"}', return_objects=True) == {
|
153
|
-
"name": "John",
|
154
|
-
"age": 30,
|
155
|
-
"city": "New York",
|
156
|
-
}
|
157
|
-
assert repair_json('{"name": "John", "age": 30, "city": New York}', return_objects=True) == {
|
158
|
-
"name": "John",
|
159
|
-
"age": 30,
|
160
|
-
"city": "New York",
|
161
|
-
}
|
162
|
-
assert repair_json('{"employees":["John", "Anna",', return_objects=True) == {
|
163
|
-
"employees": ["John", "Anna"]
|
164
|
-
}
|
165
|
-
|
166
|
-
# Test with edge cases
|
167
|
-
assert repair_json(" ", return_objects=True) == ""
|
168
|
-
assert repair_json("[", return_objects=True) == []
|
169
|
-
assert repair_json("{", return_objects=True) == {}
|
170
|
-
assert repair_json('{"key": "value:value"}', return_objects=True) == {"key": "value:value"}
|
171
|
-
assert repair_json("{'key': 'string', 'key2': false, \"key3\": null, \"key4\": unquoted}", return_objects=True) == {"key": "string", "key2": False, "key3": None, "key4": "unquoted"}
|
172
|
-
assert repair_json('{"name": "John", "age": 30, "city": "New', return_objects=True) == {
|
173
|
-
"name": "John",
|
174
|
-
"age": 30,
|
175
|
-
"city": "New",
|
176
|
-
}
|
177
|
-
assert repair_json('{"employees":["John", "Anna", "Peter', return_objects=True) == {
|
178
|
-
"employees": ["John", "Anna", "Peter"]
|
179
|
-
}
|
180
|
-
|
181
|
-
#Test with garbage comments
|
182
|
-
assert repair_json('{"value_1": true, SHOULD_NOT_EXIST "value_2": "data" AAAA }', return_objects=True) == {'value_1': True, 'value_2': 'data'}
|
183
|
-
|
184
|
-
#Test markdown stupidities from ChatGPT
|
185
|
-
assert repair_json('{ "content": "[LINK]("https://google.com")" }', return_objects=True) == { "content": "[LINK](\"https://google.com\")"}
|
186
165
|
assert repair_json('''
|
187
166
|
{
|
188
167
|
"resourceType": "Bundle",
|
@@ -216,73 +195,6 @@ def test_repair_json_with_objects():
|
|
216
195
|
]
|
217
196
|
""", return_objects=True) == [{"foo": "Foo bar baz", "tag": "#foo-bar-baz"},{"foo": "foo bar \"foobar\" foo bar baz.", "tag": "#foo-bar-foobar" }]
|
218
197
|
|
219
|
-
|
220
|
-
def test_repair_json_corner_cases_generate_by_gpt():
|
221
|
-
# Test with nested JSON
|
222
|
-
assert (
|
223
|
-
repair_json('{"key1": {"key2": [1, 2, 3]}}') == '{"key1": {"key2": [1, 2, 3]}}'
|
224
|
-
)
|
225
|
-
assert repair_json('{"key1": {"key2": [1, 2, 3') == '{"key1": {"key2": [1, 2, 3]}}'
|
226
|
-
|
227
|
-
# Test with empty keys
|
228
|
-
assert repair_json('{"": "value"}') == '{"": "value"}'
|
229
|
-
|
230
|
-
# Test with Unicode characters
|
231
|
-
assert repair_json('{"key": "value\u263A"}') == '{"key": "value\\u263a"}'
|
232
|
-
|
233
|
-
# Test with special characters
|
234
|
-
assert repair_json('{"key": "value\\nvalue"}') == '{"key": "value\\nvalue"}'
|
235
|
-
|
236
|
-
# Test with large numbers
|
237
|
-
assert (
|
238
|
-
repair_json('{"key": 12345678901234567890}') == '{"key": 12345678901234567890}'
|
239
|
-
)
|
240
|
-
|
241
|
-
# Test with whitespace
|
242
|
-
assert repair_json(' { "key" : "value" } ') == '{"key": "value"}'
|
243
|
-
|
244
|
-
# Test with null values
|
245
|
-
assert repair_json('{"key": null}') == '{"key": null}'
|
246
|
-
|
247
|
-
# Test with numeric-like values
|
248
|
-
assert repair_json('{"key": 10-20}') == '{"key": "10-20"}'
|
249
|
-
assert repair_json('{"key": 1.1.1}') == '{"key": "1.1.1"}'
|
250
|
-
|
251
|
-
|
252
|
-
def test_repair_json_corner_cases_generate_by_gpt_with_objects():
|
253
|
-
# Test with nested JSON
|
254
|
-
assert repair_json('{"key1": {"key2": [1, 2, 3]}}', return_objects=True) == {
|
255
|
-
"key1": {"key2": [1, 2, 3]}
|
256
|
-
}
|
257
|
-
assert repair_json('{"key1": {"key2": [1, 2, 3', return_objects=True) == {
|
258
|
-
"key1": {"key2": [1, 2, 3]}
|
259
|
-
}
|
260
|
-
|
261
|
-
# Test with empty keys
|
262
|
-
assert repair_json('{"": "value"}', return_objects=True) == {"": "value"}
|
263
|
-
|
264
|
-
# Test with Unicode characters
|
265
|
-
assert repair_json('{"key": "value\u263A"}', return_objects=True) == {"key": "value☺"}
|
266
|
-
|
267
|
-
# Test with special characters
|
268
|
-
assert repair_json('{"key": "value\\nvalue"}', return_objects=True) == {"key": "value\nvalue"}
|
269
|
-
|
270
|
-
# Test with large numbers
|
271
|
-
assert repair_json('{"key": 12345678901234567890}', return_objects=True) == {
|
272
|
-
"key": 12345678901234567890
|
273
|
-
}
|
274
|
-
|
275
|
-
# Test with whitespace
|
276
|
-
assert repair_json(' { "key" : "value" } ', return_objects=True) == {"key": "value"}
|
277
|
-
|
278
|
-
# Test with null values
|
279
|
-
assert repair_json('{"key": null}', return_objects=True) == {"key": None}
|
280
|
-
|
281
|
-
# Test with numeric-like values
|
282
|
-
assert repair_json('{"key": 10-20}', return_objects=True) == {"key": "10-20"}
|
283
|
-
assert repair_json('{"key": 1.1.1}', return_objects=True) == {"key": "1.1.1"}
|
284
|
-
|
285
|
-
|
286
198
|
def test_repair_json_skip_json_loads():
|
287
199
|
assert repair_json('{"key": true, "key2": false, "key3": null}', skip_json_loads=True) == '{"key": true, "key2": false, "key3": null}'
|
288
200
|
assert repair_json('{"key": true, "key2": false, "key3": null}', return_objects=True, skip_json_loads=True) == {"key": True, "key2": False, "key3": None}
|
@@ -291,9 +203,20 @@ def test_repair_json_skip_json_loads():
|
|
291
203
|
|
292
204
|
|
293
205
|
def test_repair_json_from_file():
|
294
|
-
|
295
206
|
import os.path
|
296
207
|
import pathlib
|
297
208
|
path = pathlib.Path(__file__).parent.resolve()
|
298
209
|
|
299
210
|
assert(from_file(os.path.join(path,"invalid.json"))) == '[{"_id": "655b66256574f09bdae8abe8", "index": 0, "guid": "31082ae3-b0f3-4406-90f4-cc450bd4379d", "isActive": false, "balance": "$2,562.78", "picture": "http://placehold.it/32x32", "age": 32, "eyeColor": "brown", "name": "Glover Rivas", "gender": "male", "company": "EMPIRICA", "email": "gloverrivas@empirica.com", "phone": "+1 (842) 507-3063", "address": "536 Montague Terrace, Jenkinsville, Kentucky, 2235", "about": "Mollit consectetur excepteur voluptate tempor dolore ullamco enim irure ullamco non enim officia. Voluptate occaecat proident laboris ea Lorem cupidatat reprehenderit nisi nisi aliqua. Amet nulla ipsum deserunt excepteur amet ad aute aute ex. Et enim minim sit veniam est quis dolor nisi sunt quis eiusmod in. Amet eiusmod cillum sunt occaecat dolor laboris voluptate in eiusmod irure aliqua duis.", "registered": "2023-11-18T09:32:36 -01:00", "latitude": 36.26102, "longitude": -91.304608, "tags": ["non", "tempor", "do", "ullamco", "dolore", "sunt", "ipsum"], "friends": [{"id": 0, "name": "Cara Shepherd"}, {"id": 1, "name": "Mason Farley"}, {"id": 2, "name": "Harriet Cochran"}], "greeting": "Hello, Glover Rivas! You have 7 unread messages.", "favoriteFruit": "strawberry"}, {"_id": "655b662585364bc57278bb6f", "index": 1, "guid": "0dea7a3a-f812-4dde-b78d-7a9b58e5da05", "isActive": true, "balance": "$1,359.48", "picture": "http://placehold.it/32x32", "age": 38, "eyeColor": "brown", "name": "Brandi Moreno", "gender": "female", "company": "MARQET", "email": "brandimoreno@marqet.com", "phone": "+1 (850) 434-2077", "address": "537 Doone Court, Waiohinu, Michigan, 3215", "about": "Irure proident adipisicing do Lorem do incididunt in laborum in eiusmod eiusmod ad elit proident. Eiusmod dolor ex magna magna occaecat. Nulla deserunt velit ex exercitation et irure sunt. Cupidatat ut excepteur ea quis labore sint cupidatat incididunt amet eu consectetur cillum ipsum proident. 
Occaecat exercitation aute laborum dolor proident reprehenderit laborum in voluptate culpa. Exercitation nulla adipisicing culpa aute est deserunt ea nisi deserunt consequat occaecat ut et non. Incididunt ex exercitation dolor dolor anim cillum dolore.", "registered": "2015-09-03T11:47:15 -02:00", "latitude": -19.768953, "longitude": 8.948458, "tags": ["laboris", "occaecat", "laborum", "laborum", "ex", "cillum", "occaecat"], "friends": [{"id": 0, "name": "Erna Kelly"}, {"id": 1, "name": "Black Mays"}, {"id": 2, "name": "Davis Buck"}], "greeting": "Hello, Brandi Moreno! You have 1 unread messages.", "favoriteFruit": "apple"}, {"_id": "655b6625870da431bcf5e0c2", "index": 2, "guid": "b17f6e3f-c898-4334-abbf-05cf222f143b", "isActive": false, "balance": "$1,493.77", "picture": "http://placehold.it/32x32", "age": 20, "eyeColor": "brown", "name": "Moody Meadows", "gender": "male", "company": "OPTIQUE", "email": "moodymeadows@optique.com", "phone": "+1 (993) 566-3041", "address": "766 Osborn Street, Bath, Maine, 7666", "about": "Non commodo excepteur nostrud qui adipisicing aliquip dolor minim nulla culpa proident. In ad cupidatat ea mollit ex est do deserunt proident nostrud. Cillum id id eiusmod amet exercitation nostrud cillum sunt deserunt dolore deserunt eiusmod mollit. Ut ex tempor ad laboris voluptate labore id officia fugiat exercitation amet.", "registered": "2015-01-16T02:48:28 -01:00", "latitude": -25.847327, "longitude": 63.95991, "tags": ["aute", "commodo", "adipisicing", "nostrud", "duis", "mollit", "ut"], "friends": [{"id": 0, "name": "Lacey Cash"}, {"id": 1, "name": "Gabrielle Harmon"}, {"id": 2, "name": "Ellis Lambert"}], "greeting": "Hello, Moody Meadows! 
You have 4 unread messages.", "favoriteFruit": "strawberry"}, {"_id": "655b6625f3e1bf422220854e", "index": 3, "guid": "92229883-2bfd-4974-a08c-1b506b372e46", "isActive": false, "balance": "$2,215.34", "picture": "http://placehold.it/32x32", "age": 22, "eyeColor": "brown", "name": "Heath Nguyen", "gender": "male", "company": "BLEENDOT", "email": "heathnguyen@bleendot.com", "phone": "+1 (989) 512-2797", "address": "135 Milton Street, Graniteville, Nebraska, 276", "about": "Consequat aliquip irure Lorem cupidatat nulla magna ullamco nulla voluptate adipisicing anim consectetur tempor aliquip. Magna aliqua nulla eu tempor esse proident. Proident fugiat ad ex Lorem reprehenderit dolor aliquip labore labore aliquip. Deserunt aute enim ea minim officia anim culpa sint commodo. Cillum consectetur excepteur aliqua exercitation Lorem veniam voluptate.", "registered": "2016-07-06T01:31:07 -02:00", "latitude": -60.997048, "longitude": -102.397885, "tags": ["do", "ad", "consequat", "irure", "tempor", "elit", "minim"], "friends": [{"id": 0, "name": "Walker Hernandez"}, {"id": 1, "name": "Maria Lane"}, {"id": 2, "name": "Mcknight Barron"}], "greeting": "Hello, Heath Nguyen! You have 4 unread messages.", "favoriteFruit": "apple"}, {"_id": "655b6625519a5b5e4b6742bf", "index": 4, "guid": "c5dc685f-6d0d-4173-b4cf-f5df29a1e8ef", "isActive": true, "balance": "$1,358.90", "picture": "http://placehold.it/32x32", "age": 33, "eyeColor": "brown", "name": "Deidre Duke", "gender": "female", "company": "OATFARM", "email": "deidreduke@oatfarm.com", "phone": "+1 (875) 587-3256", "address": "487 Schaefer Street, Wattsville, West Virginia, 4506", "about": "Laboris eu nulla esse magna sit eu deserunt non est aliqua exercitation commodo. Ad occaecat qui qui laborum dolore anim Lorem. Est qui occaecat irure enim deserunt enim aliqua ex deserunt incididunt esse. Quis in minim laboris proident non mollit. Magna ea do labore commodo. 
Et elit esse esse occaecat officia ipsum nisi.", "registered": "2021-09-12T04:17:08 -02:00", "latitude": 68.609781, "longitude": -87.509134, "tags": ["mollit", "cupidatat", "irure", "sit", "consequat", "anim", "fugiat"], "friends": [{"id": 0, "name": "Bean Paul"}, {"id": 1, "name": "Cochran Hubbard"}, {"id": 2, "name": "Rodgers Atkinson"}], "greeting": "Hello, Deidre Duke! You have 6 unread messages.", "favoriteFruit": "apple"}, {"_id": "655b6625a19b3f7e5f82f0ea", "index": 5, "guid": "75f3c264-baa1-47a0-b21c-4edac23d9935", "isActive": true, "balance": "$3,554.36", "picture": "http://placehold.it/32x32", "age": 26, "eyeColor": "blue", "name": "Lydia Holland", "gender": "female", "company": "ESCENTA", "email": "lydiaholland@escenta.com", "phone": "+1 (927) 482-3436", "address": "554 Rockaway Parkway, Kohatk, Montana, 6316", "about": "Consectetur ea est labore commodo laborum mollit pariatur non enim. Est dolore et non laboris tempor. Ea incididunt ut adipisicing cillum labore officia tempor eiusmod commodo. Cillum fugiat ex consectetur ut nostrud anim nostrud exercitation ut duis in ea. Eu et id fugiat est duis eiusmod ullamco quis officia minim sint ea nisi in.", "registered": "2018-03-13T01:48:56 -01:00", "latitude": -88.495799, "longitude": 71.840667, "tags": ["veniam", "minim", "consequat", "consequat", "incididunt", "consequat", "elit"], "friends": [{"id": 0, "name": "Debra Massey"}, {"id": 1, "name": "Weiss Savage"}, {"id": 2, "name": "Shannon Guerra"}], "greeting": "Hello, Lydia Holland! You have 5 unread messages.", "favoriteFruit": "banana"}]'
|
211
|
+
|
212
|
+
import tempfile
|
213
|
+
# Create a temporary file
|
214
|
+
temp_fd, temp_path = tempfile.mkstemp(suffix=".json")
|
215
|
+
try:
|
216
|
+
# Write content to the temporary file
|
217
|
+
with os.fdopen(temp_fd, 'w') as tmp:
|
218
|
+
tmp.write("{key:value}")
|
219
|
+
assert(from_file(temp_path, logging=True)) == ({'key': 'value'}, [{'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}, {'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}])
|
220
|
+
finally:
|
221
|
+
# Clean up - delete the temporary file
|
222
|
+
os.remove(temp_path)
|
@@ -42,7 +42,7 @@ def test_true_false_correct(benchmark):
|
|
42
42
|
mean_time = benchmark.stats.get("median")
|
43
43
|
|
44
44
|
# Define your time threshold in seconds
|
45
|
-
max_time =
|
45
|
+
max_time = 30 * (1 / 10 ** 6) # 30 microsecond
|
46
46
|
|
47
47
|
# Assert that the average time is below the threshold
|
48
48
|
assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
|
@@ -64,7 +64,7 @@ def test_false_true_correct(benchmark):
|
|
64
64
|
mean_time = benchmark.stats.get("median")
|
65
65
|
|
66
66
|
# Define your time threshold in seconds
|
67
|
-
max_time =
|
67
|
+
max_time = 14 / 10 ** 4 # 1.4 millisecond
|
68
68
|
|
69
69
|
# Assert that the average time is below the threshold
|
70
70
|
assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
|
@@ -75,7 +75,7 @@ def test_false_true_incorrect(benchmark):
|
|
75
75
|
mean_time = benchmark.stats.get("median")
|
76
76
|
|
77
77
|
# Define your time threshold in seconds
|
78
|
-
max_time =
|
78
|
+
max_time = 14 / 10 ** 4 # 1.4 millisecond
|
79
79
|
|
80
80
|
# Assert that the average time is below the threshold
|
81
81
|
assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
|
@@ -86,7 +86,7 @@ def test_false_false_correct(benchmark):
|
|
86
86
|
mean_time = benchmark.stats.get("median")
|
87
87
|
|
88
88
|
# Define your time threshold in seconds
|
89
|
-
max_time =
|
89
|
+
max_time = 60 / 10 ** 6 # 60 microsecond
|
90
90
|
|
91
91
|
# Assert that the average time is below the threshold
|
92
92
|
assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|