json-repair 0.19.1__tar.gz → 0.20.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: json_repair
3
- Version: 0.19.1
3
+ Version: 0.20.0
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
3
3
  build-backend = "setuptools.build_meta"
4
4
  [project]
5
5
  name = "json_repair"
6
- version = "0.19.1"
6
+ version = "0.20.0"
7
7
  license = {file = "LICENSE"}
8
8
  authors = [
9
9
  { name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },
@@ -22,6 +22,7 @@ If something is wrong (a missing parantheses or quotes for example) it will use
22
22
  All supported use cases are in the unit tests
23
23
  """
24
24
 
25
+ import os
25
26
  import json
26
27
  from typing import Any, Dict, List, Union, TextIO
27
28
 
@@ -31,7 +32,9 @@ class JSONParser:
31
32
  # The string to parse
32
33
  self.json_str = json_str
33
34
  # Alternatively, the file description with a json file in it
34
- self.json_fd = json_fd
35
+ if json_fd:
36
+ # This is a trick we do to treat the file wrapper as an array
37
+ self.json_str = StringFileWrapper(json_fd)
35
38
  # Index is our iterator that will keep track of which character we are looking at right now
36
39
  self.index = 0
37
40
  # This is used in the object member parsing to manage the special cases of missing quotes in key or value
@@ -246,7 +249,8 @@ class JSONParser:
246
249
  rstring_delimiter = "”"
247
250
  elif char.isalpha():
248
251
  # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
249
- if char.lower() in ["t", "f", "n"]:
252
+ # But remember, object keys are only of type string
253
+ if char.lower() in ["t", "f", "n"] and self.get_context() != "object_key":
250
254
  value = self.parse_boolean_or_null()
251
255
  if value != "":
252
256
  return value
@@ -263,7 +267,8 @@ class JSONParser:
263
267
  self.index += 1
264
268
  return self.parse_json()
265
269
  self.log(
266
- "While parsing a string, we found no starting quote, ignoring", "info"
270
+ "While parsing a string, we found no starting quote. Will add the quote back",
271
+ "info",
267
272
  )
268
273
  missing_quotes = True
269
274
 
@@ -330,32 +335,50 @@ class JSONParser:
330
335
  # Check if eventually there is a rstring delimiter, otherwise we bail
331
336
  i = 1
332
337
  next_c = self.get_char_at(i)
333
- while next_c and next_c != rstring_delimiter:
338
+ check_comma_in_object_value = True
339
+ while next_c and next_c not in [
340
+ rstring_delimiter,
341
+ lstring_delimiter,
342
+ ]:
343
+ # This is a bit of a weird workaround, essentially in object_value context we don't always break on commas
344
+ # This is because the routine after will make sure to correct any bad guess and this solves a corner case
345
+ if next_c.isalpha():
346
+ check_comma_in_object_value = False
334
347
  # If we are in an object context, let's check for the right delimiters
335
348
  if (
336
- next_c == lstring_delimiter
337
- or ("object_key" in self.context and next_c == ":")
338
- or ("object_value" in self.context and next_c in ["}", ","])
349
+ ("object_key" in self.context and next_c in [":", "}"])
350
+ or ("object_value" in self.context and next_c == "}")
339
351
  or ("array" in self.context and next_c in ["]", ","])
352
+ or (
353
+ check_comma_in_object_value
354
+ and self.get_context() == "object_value"
355
+ and next_c == ","
356
+ )
340
357
  ):
341
358
  break
342
359
  i += 1
343
360
  next_c = self.get_char_at(i)
344
361
  if next_c == rstring_delimiter:
345
- # But this might not be it! This could be just a missing comma
346
- # We need to check if we find a rstring_delimiter and a colon after
347
- i += 1
348
- next_c = self.get_char_at(i)
349
- while next_c and next_c != rstring_delimiter:
362
+ if self.get_context() == "object_value":
363
+ # But this might not be it! This could be just a missing comma
364
+ # We found a delimiter and we need to check if this is a key
365
+ # so find a rstring_delimiter and a colon after
350
366
  i += 1
351
367
  next_c = self.get_char_at(i)
352
- i += 1
353
- next_c = self.get_char_at(i)
354
- while next_c and next_c != ":":
355
- if next_c in [lstring_delimiter, rstring_delimiter, ","]:
356
- break
368
+ while next_c and next_c != rstring_delimiter:
369
+ i += 1
370
+ next_c = self.get_char_at(i)
357
371
  i += 1
358
372
  next_c = self.get_char_at(i)
373
+ while next_c and next_c != ":":
374
+ if next_c in [
375
+ lstring_delimiter,
376
+ rstring_delimiter,
377
+ ",",
378
+ ]:
379
+ break
380
+ i += 1
381
+ next_c = self.get_char_at(i)
359
382
  # Only if we fail to find a ':' then we know this is misplaced quote
360
383
  if next_c != ":":
361
384
  self.log(
@@ -451,36 +474,22 @@ class JSONParser:
451
474
  try:
452
475
  return self.json_str[self.index + count]
453
476
  except IndexError:
454
- if self.json_fd:
455
- self.json_fd.seek(self.index + count)
456
- char = self.json_fd.read(1)
457
- if char == "":
458
- return False
459
- return char
460
- else:
461
- return False
477
+ return False
462
478
 
463
479
  def skip_whitespaces_at(self) -> None:
464
480
  """
465
481
  This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
466
482
  """
467
- if self.json_fd:
468
- char = self.get_char_at()
469
- while char and char.isspace():
470
- self.index += 1
471
- char = self.get_char_at()
472
- else:
473
- # If this is not a file stream, we do this monster here to make this function much much faster
483
+ try:
484
+ char = self.json_str[self.index]
485
+ except IndexError:
486
+ return
487
+ while char.isspace():
488
+ self.index += 1
474
489
  try:
475
490
  char = self.json_str[self.index]
476
491
  except IndexError:
477
492
  return
478
- while char.isspace():
479
- self.index += 1
480
- try:
481
- char = self.json_str[self.index]
482
- except IndexError:
483
- return
484
493
 
485
494
  def set_context(self, value: str) -> None:
486
495
  # If a value is provided update the context variable and save in stack
@@ -502,22 +511,9 @@ class JSONParser:
502
511
  def log(self, text: str, level: str) -> None:
503
512
  if level == self.logger["log_level"]:
504
513
  context = ""
505
- if self.json_fd:
506
- self.json_fd.seek(self.index - self.logger["window"])
507
- context = self.json_fd.read(self.logger["window"] * 2)
508
- self.json_fd.seek(self.index)
509
- else:
510
- start = (
511
- self.index - self.logger["window"]
512
- if (self.index - self.logger["window"]) >= 0
513
- else 0
514
- )
515
- end = (
516
- self.index + self.logger["window"]
517
- if (self.index + self.logger["window"]) <= len(self.json_str)
518
- else len(self.json_str)
519
- )
520
- context = self.json_str[start:end]
514
+ start = max(self.index - self.logger["window"], 0)
515
+ end = min(self.index + self.logger["window"], len(self.json_str))
516
+ context = self.json_str[start:end]
521
517
  self.logger["log"].append(
522
518
  {
523
519
  "text": text,
@@ -593,3 +589,31 @@ def from_file(
593
589
  fd.close()
594
590
 
595
591
  return jsonobj
592
+
593
+
594
+ class StringFileWrapper:
595
+ # This is a trick to simplify the code above, transform the filedescriptor handling into an array handling
596
+ def __init__(self, fd: TextIO) -> None:
597
+ self.fd = fd
598
+ self.length = None
599
+
600
+ def __getitem__(self, index: int) -> Any:
601
+ if isinstance(index, slice):
602
+ self.fd.seek(index.start)
603
+ value = self.fd.read(index.stop - index.start)
604
+ self.fd.seek(index.start)
605
+ return value
606
+ else:
607
+ self.fd.seek(index)
608
+ return self.fd.read(1)
609
+
610
+ def __len__(self) -> int:
611
+ if not self.length:
612
+ current_position = self.fd.tell()
613
+ self.fd.seek(0, os.SEEK_END)
614
+ self.length = self.fd.tell()
615
+ self.fd.seek(current_position)
616
+ return self.length
617
+
618
+ def __setitem__(self):
619
+ raise Exception("This is read-only!")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: json_repair
3
- Version: 0.19.1
3
+ Version: 0.20.0
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -1,27 +1,61 @@
1
1
  from src.json_repair.json_repair import from_file, repair_json, loads
2
2
 
3
3
 
4
- def test_repair_json():
5
- # Test with valid JSON strings
4
+ def test_valid_json():
6
5
  assert repair_json("[]") == "[]"
7
- assert repair_json("[{]") == "[]"
8
- assert repair_json(" { } ") == "{}"
9
- assert repair_json("\"") == '""'
10
- assert repair_json("\n") == '""'
11
- assert repair_json(' {"key": true, "key2": false, "key3": null}') == '{"key": true, "key2": false, "key3": null}'
12
- assert repair_json('{"key": TRUE, "key2": FALSE, "key3": Null} ') == '{"key": true, "key2": false, "key3": null}'
13
- assert repair_json("{'key': 'string', 'key2': false, \"key3\": null, \"key4\": unquoted}") == '{"key": "string", "key2": false, "key3": null, "key4": "unquoted"}'
6
+ assert repair_json("[1, 2, 3, 4]") == "[1, 2, 3, 4]"
14
7
  assert (
15
8
  repair_json('{"name": "John", "age": 30, "city": "New York"}')
16
9
  == '{"name": "John", "age": 30, "city": "New York"}'
17
10
  )
18
- assert repair_json("[1, 2, 3, 4]") == "[1, 2, 3, 4]"
19
11
  assert (
20
12
  repair_json('{"employees":["John", "Anna", "Peter"]} ')
21
13
  == '{"employees": ["John", "Anna", "Peter"]}'
22
14
  )
15
+ assert repair_json('{"key": "value:value"}') == '{"key": "value:value"}'
16
+ assert (
17
+ repair_json('{"text": "The quick brown fox,"}')
18
+ == '{"text": "The quick brown fox,"}'
19
+ )
20
+ assert (
21
+ repair_json('{"text": "The quick brown fox won\'t jump"}')
22
+ == '{"text": "The quick brown fox won\'t jump"}'
23
+ )
24
+ assert repair_json('{"key": ""') == '{"key": ""}'
25
+ assert (
26
+ repair_json('{"key1": {"key2": [1, 2, 3]}}') == '{"key1": {"key2": [1, 2, 3]}}'
27
+ )
28
+ assert (
29
+ repair_json('{"key": 12345678901234567890}') == '{"key": 12345678901234567890}'
30
+ )
31
+ assert repair_json('{"key": "value\u263A"}') == '{"key": "value\\u263a"}'
32
+ assert repair_json('{"key": "value\\nvalue"}') == '{"key": "value\\nvalue"}'
33
+
34
+ def test_brackets_edge_cases():
35
+ assert repair_json("[{]") == "[]"
36
+ assert repair_json(" { } ") == "{}"
37
+ assert repair_json("[") == "[]"
38
+ assert repair_json("]") == '""'
39
+ assert repair_json("{") == "{}"
40
+ assert repair_json("}") == '""'
41
+ assert repair_json('{"') == '{}'
42
+ assert repair_json('["') == '[]'
43
+ assert repair_json('{foo: [}') == '{"foo": []}'
23
44
 
24
- # Test with invalid JSON strings
45
+ def test_general_edge_cases():
46
+ assert repair_json("\"") == '""'
47
+ assert repair_json("\n") == '""'
48
+ assert repair_json(" ") == '""'
49
+ assert repair_json("[[1\n\n]") == "[[1]]"
50
+ assert repair_json("string") == '""'
51
+ assert repair_json("stringbeforeobject {}") == '{}'
52
+
53
+ def test_mixed_data_types():
54
+ assert repair_json(' {"key": true, "key2": false, "key3": null}') == '{"key": true, "key2": false, "key3": null}'
55
+ assert repair_json('{"key": TRUE, "key2": FALSE, "key3": Null} ') == '{"key": true, "key2": false, "key3": null}'
56
+
57
+ def test_missing_and_mixed_quotes():
58
+ assert repair_json("{'key': 'string', 'key2': false, \"key3\": null, \"key4\": unquoted}") == '{"key": "string", "key2": false, "key3": null, "key4": "unquoted"}'
25
59
  assert (
26
60
  repair_json('{"name": "John", "age": 30, "city": "New York')
27
61
  == '{"name": "John", "age": 30, "city": "New York"}'
@@ -38,6 +72,13 @@ def test_repair_json():
38
72
  repair_json('{"name": John, "age": 30, "city": "New York"}')
39
73
  == '{"name": "John", "age": 30, "city": "New York"}'
40
74
  )
75
+ assert repair_json('{“slanted_delimiter”: "value"}') == '{"slanted_delimiter": "value"}'
76
+ assert (
77
+ repair_json('{"name": "John", "age": 30, "city": "New')
78
+ == '{"name": "John", "age": 30, "city": "New"}'
79
+ )
80
+
81
+ def test_array_edge_cases():
41
82
  assert repair_json("[1, 2, 3,") == "[1, 2, 3]"
42
83
  assert repair_json("[1, 2, 3, ...]") == "[1, 2, 3]"
43
84
  assert repair_json("[1, 2, ... , 3]") == "[1, 2, 3]"
@@ -46,51 +87,28 @@ def test_repair_json():
46
87
  repair_json('{"employees":["John", "Anna",')
47
88
  == '{"employees": ["John", "Anna"]}'
48
89
  )
49
-
50
- # Test with edge cases
51
- assert repair_json(" ") == '""'
52
- assert repair_json("[") == "[]"
53
- assert repair_json("]") == '""'
54
- assert repair_json("[[1\n\n]") == "[[1]]"
55
- assert repair_json("{") == "{}"
56
- assert repair_json("}") == '""'
57
- assert repair_json("string") == '""'
58
- assert repair_json("stringbeforeobject {}") == '{}'
59
- assert repair_json('{"') == '{}'
60
- assert repair_json('["') == '[]'
61
- assert repair_json("'\"'") == '"\\\""'
62
- assert repair_json("{\"key\": 'string\"\n\t\le'") == '{"key": "string\\"\\n\\tle"}'
63
- assert repair_json('{foo: [}') == '{"foo": []}'
64
- assert repair_json('''{ "a": "{ b": {} }" }''') == '{"a": "{ b"}'
65
- assert repair_json('{"key": "value:value"}') == '{"key": "value:value"}'
66
- assert repair_json('{“slanted_delimiter”: "value"}') == '{"slanted_delimiter": "value"}'
67
- assert (
68
- repair_json('{"name": "John", "age": 30, "city": "New')
69
- == '{"name": "John", "age": 30, "city": "New"}'
70
- )
71
90
  assert (
72
91
  repair_json('{"employees":["John", "Anna", "Peter')
73
92
  == '{"employees": ["John", "Anna", "Peter"]}'
74
93
  )
75
- assert (
76
- repair_json('{"employees":["John", "Anna", "Peter"]}')
77
- == '{"employees": ["John", "Anna", "Peter"]}'
78
- )
79
- assert (
80
- repair_json('{"text": "The quick brown fox,"}')
81
- == '{"text": "The quick brown fox,"}'
82
- )
83
- assert (
84
- repair_json('{"text": "The quick brown fox won\'t jump"}')
85
- == '{"text": "The quick brown fox won\'t jump"}'
86
- )
94
+ assert repair_json('{"key1": {"key2": [1, 2, 3') == '{"key1": {"key2": [1, 2, 3]}}'
95
+
96
+
97
+ def test_escaping():
98
+ assert repair_json("'\"'") == '"\\\""'
99
+ assert repair_json("{\"key\": 'string\"\n\t\le'") == '{"key": "string\\"\\n\\tle"}'
100
+ assert repair_json(r'{"real_content": "Some string: Some other string \t Some string <a href=\"https://domain.com\">Some link</a>"') == r'{"real_content": "Some string: Some other string \t Some string <a href=\"https://domain.com\">Some link</a>"}'
101
+ assert repair_json('{"key_1\n": "value"}') == '{"key_1": "value"}'
102
+ assert repair_json('{"key\t_": "value"}') == '{"key\\t_": "value"}'
103
+
104
+
105
+ def test_object_edge_cases():
87
106
  assert {
88
107
  repair_json('{"value_1": "value_2": "data"}') == '{"value_1": "value_2", "data": ""}'
89
108
  }
90
109
  assert {
91
110
  repair_json('{"value_1": true, COMMENT "value_2": "data"}') == '{"value_1": "value_2", "": "data"}'
92
111
  }
93
- # Test with garbage comments
94
112
  assert repair_json('{"value_1": true, SHOULD_NOT_EXIST "value_2": "data" AAAA }') == '{"value_1": true, "value_2": "data"}'
95
113
  assert {
96
114
  repair_json('{"" : true, "key2": "value2"}') == '{" ": true, "key2": "value_2"}'
@@ -98,35 +116,38 @@ def test_repair_json():
98
116
  assert {
99
117
  repair_json('{"": true, "key2": "value2"}') == '{"empty_placeholder": true, "key2": "value_2"}'
100
118
  }
101
- # Test a nasty corner case
102
- assert repair_json(' - { "test_key": ["test_value", "test_value2"] }') == '{"test_key": ["test_value", "test_value2"]}'
103
-
104
- #Test markdown stupidities from ChatGPT
105
- assert repair_json('{ "content": "[LINK]("https://google.com")" }') == '{"content": "[LINK](\\"https://google.com\\")"}'
106
- assert repair_json('{ "content": "[LINK](" }') == '{"content": "[LINK]("}'
107
- assert repair_json('{ "content": "[LINK](", "key": true }') == '{"content": "[LINK](", "key": true}'
108
- assert repair_json("""
109
- ```json
110
- { "key": "value" }
111
- ```""") == '{"key": "value"}'
112
- assert repair_json('````{ "key": "value" }```') == '{"key": "value"}'
113
- assert repair_json(r'{"real_content": "Some string: Some other string \t Some string <a href=\"https://domain.com\">Some link</a>"') == r'{"real_content": "Some string: Some other string \t Some string <a href=\"https://domain.com\">Some link</a>"}'
114
- assert repair_json('{"key_1\n": "value"}') == '{"key_1": "value"}'
115
- assert repair_json('{"key\t_": "value"}') == '{"key\\t_": "value"}'
116
119
  assert repair_json('{""answer"":[{""traits"":''Female aged 60+'',""answer1"":""5""}]}') == '{"answer": [{"traits": "Female aged 60+", "answer1": "5"}]}'
117
- assert repair_json('{"key":""') == '{"key": ""}'
118
120
  assert repair_json('{ "words": abcdef", "numbers": 12345", "words2": ghijkl" }') == '{"words": "abcdef", "numbers": 12345, "words2": "ghijkl"}'
121
+ assert repair_json('''{"number": 1,"reason": "According...""ans": "YES"}''') == '{"number": 1, "reason": "According...", "ans": "YES"}'
122
+ assert repair_json('''{ "a": "{ b": {} }" }''') == '{"a": "{ b"}'
123
+ assert repair_json("""{"b": "xxxxx" true}""") == '{"b": "xxxxx"}'
124
+ assert repair_json('{"key": "Lorem "ipsum" s,"}') == '{"key": "Lorem \\"ipsum\\" s,"}'
125
+
126
+ def test_number_edge_cases():
127
+ assert repair_json(' - { "test_key": ["test_value", "test_value2"] }') == '{"test_key": ["test_value", "test_value2"]}'
119
128
  assert repair_json('{"key": 1/3}') == '{"key": "1/3"}'
120
129
  assert repair_json('{"key": .25}') == '{"key": 0.25}'
121
- assert repair_json("""{ "a": "", "b": [ { "c": 1} ] \n}```""") == '{"a": "", "b": [{"c": 1}]}'
122
- assert repair_json("Based on the information extracted, here is the filled JSON output: ```json { 'a': 'b' } ```") == '{"a": "b"}'
123
- assert repair_json('''{"number": 1,"reason": "According...""ans": "YES"}''') == '{"number": 1, "reason": "According...", "ans": "YES"}'
124
- assert repair_json('{"key": 1/3, "foo": "bar"}') == '{"key": "1/3", "foo": "bar"}'
125
130
  assert repair_json('{"here": "now", "key": 1/3, "foo": "bar"}') == '{"here": "now", "key": "1/3", "foo": "bar"}'
126
131
  assert repair_json('{"key": 12345/67890}') == '{"key": "12345/67890"}'
127
132
  assert repair_json('[105,12') == '["105,12"]'
128
133
  assert repair_json('{"key", 105,12,') == '{"key": "105,12"}'
134
+ assert repair_json('{"key": 1/3, "foo": "bar"}') == '{"key": "1/3", "foo": "bar"}'
135
+ assert repair_json('{"key": 10-20}') == '{"key": "10-20"}'
136
+ assert repair_json('{"key": 1.1.1}') == '{"key": "1.1.1"}'
137
+
138
+ def test_markdown():
139
+ assert repair_json('{ "content": "[LINK]("https://google.com")" }') == '{"content": "[LINK](\\"https://google.com\\")"}'
140
+ assert repair_json('{ "content": "[LINK](" }') == '{"content": "[LINK]("}'
141
+ assert repair_json('{ "content": "[LINK](", "key": true }') == '{"content": "[LINK](", "key": true}'
129
142
 
143
+ def test_leading_trailing_characters():
144
+ assert repair_json('````{ "key": "value" }```') == '{"key": "value"}'
145
+ assert repair_json("""{ "a": "", "b": [ { "c": 1} ] \n}```""") == '{"a": "", "b": [{"c": 1}]}'
146
+ assert repair_json("Based on the information extracted, here is the filled JSON output: ```json { 'a': 'b' } ```") == '{"a": "b"}'
147
+ assert repair_json("""
148
+ ```json
149
+ { "key": "value" }
150
+ ```""") == '{"key": "value"}'
130
151
 
131
152
  def test_repair_json_with_objects():
132
153
  # Test with valid JSON strings
@@ -142,47 +163,6 @@ def test_repair_json_with_objects():
142
163
  assert repair_json('{"employees":["John", "Anna", "Peter"]} ', return_objects=True) == {
143
164
  "employees": ["John", "Anna", "Peter"]
144
165
  }
145
-
146
- # Test with invalid JSON strings
147
- assert repair_json('{"name": "John", "age": 30, "city": "New York', return_objects=True) == {
148
- "name": "John",
149
- "age": 30,
150
- "city": "New York",
151
- }
152
- assert repair_json('{"name": "John", "age": 30, city: "New York"}', return_objects=True) == {
153
- "name": "John",
154
- "age": 30,
155
- "city": "New York",
156
- }
157
- assert repair_json('{"name": "John", "age": 30, "city": New York}', return_objects=True) == {
158
- "name": "John",
159
- "age": 30,
160
- "city": "New York",
161
- }
162
- assert repair_json('{"employees":["John", "Anna",', return_objects=True) == {
163
- "employees": ["John", "Anna"]
164
- }
165
-
166
- # Test with edge cases
167
- assert repair_json(" ", return_objects=True) == ""
168
- assert repair_json("[", return_objects=True) == []
169
- assert repair_json("{", return_objects=True) == {}
170
- assert repair_json('{"key": "value:value"}', return_objects=True) == {"key": "value:value"}
171
- assert repair_json("{'key': 'string', 'key2': false, \"key3\": null, \"key4\": unquoted}", return_objects=True) == {"key": "string", "key2": False, "key3": None, "key4": "unquoted"}
172
- assert repair_json('{"name": "John", "age": 30, "city": "New', return_objects=True) == {
173
- "name": "John",
174
- "age": 30,
175
- "city": "New",
176
- }
177
- assert repair_json('{"employees":["John", "Anna", "Peter', return_objects=True) == {
178
- "employees": ["John", "Anna", "Peter"]
179
- }
180
-
181
- #Test with garbage comments
182
- assert repair_json('{"value_1": true, SHOULD_NOT_EXIST "value_2": "data" AAAA }', return_objects=True) == {'value_1': True, 'value_2': 'data'}
183
-
184
- #Test markdown stupidities from ChatGPT
185
- assert repair_json('{ "content": "[LINK]("https://google.com")" }', return_objects=True) == { "content": "[LINK](\"https://google.com\")"}
186
166
  assert repair_json('''
187
167
  {
188
168
  "resourceType": "Bundle",
@@ -216,73 +196,6 @@ def test_repair_json_with_objects():
216
196
  ]
217
197
  """, return_objects=True) == [{"foo": "Foo bar baz", "tag": "#foo-bar-baz"},{"foo": "foo bar \"foobar\" foo bar baz.", "tag": "#foo-bar-foobar" }]
218
198
 
219
-
220
- def test_repair_json_corner_cases_generate_by_gpt():
221
- # Test with nested JSON
222
- assert (
223
- repair_json('{"key1": {"key2": [1, 2, 3]}}') == '{"key1": {"key2": [1, 2, 3]}}'
224
- )
225
- assert repair_json('{"key1": {"key2": [1, 2, 3') == '{"key1": {"key2": [1, 2, 3]}}'
226
-
227
- # Test with empty keys
228
- assert repair_json('{"": "value"}') == '{"": "value"}'
229
-
230
- # Test with Unicode characters
231
- assert repair_json('{"key": "value\u263A"}') == '{"key": "value\\u263a"}'
232
-
233
- # Test with special characters
234
- assert repair_json('{"key": "value\\nvalue"}') == '{"key": "value\\nvalue"}'
235
-
236
- # Test with large numbers
237
- assert (
238
- repair_json('{"key": 12345678901234567890}') == '{"key": 12345678901234567890}'
239
- )
240
-
241
- # Test with whitespace
242
- assert repair_json(' { "key" : "value" } ') == '{"key": "value"}'
243
-
244
- # Test with null values
245
- assert repair_json('{"key": null}') == '{"key": null}'
246
-
247
- # Test with numeric-like values
248
- assert repair_json('{"key": 10-20}') == '{"key": "10-20"}'
249
- assert repair_json('{"key": 1.1.1}') == '{"key": "1.1.1"}'
250
-
251
-
252
- def test_repair_json_corner_cases_generate_by_gpt_with_objects():
253
- # Test with nested JSON
254
- assert repair_json('{"key1": {"key2": [1, 2, 3]}}', return_objects=True) == {
255
- "key1": {"key2": [1, 2, 3]}
256
- }
257
- assert repair_json('{"key1": {"key2": [1, 2, 3', return_objects=True) == {
258
- "key1": {"key2": [1, 2, 3]}
259
- }
260
-
261
- # Test with empty keys
262
- assert repair_json('{"": "value"}', return_objects=True) == {"": "value"}
263
-
264
- # Test with Unicode characters
265
- assert repair_json('{"key": "value\u263A"}', return_objects=True) == {"key": "value☺"}
266
-
267
- # Test with special characters
268
- assert repair_json('{"key": "value\\nvalue"}', return_objects=True) == {"key": "value\nvalue"}
269
-
270
- # Test with large numbers
271
- assert repair_json('{"key": 12345678901234567890}', return_objects=True) == {
272
- "key": 12345678901234567890
273
- }
274
-
275
- # Test with whitespace
276
- assert repair_json(' { "key" : "value" } ', return_objects=True) == {"key": "value"}
277
-
278
- # Test with null values
279
- assert repair_json('{"key": null}', return_objects=True) == {"key": None}
280
-
281
- # Test with numeric-like values
282
- assert repair_json('{"key": 10-20}', return_objects=True) == {"key": "10-20"}
283
- assert repair_json('{"key": 1.1.1}', return_objects=True) == {"key": "1.1.1"}
284
-
285
-
286
199
  def test_repair_json_skip_json_loads():
287
200
  assert repair_json('{"key": true, "key2": false, "key3": null}', skip_json_loads=True) == '{"key": true, "key2": false, "key3": null}'
288
201
  assert repair_json('{"key": true, "key2": false, "key3": null}', return_objects=True, skip_json_loads=True) == {"key": True, "key2": False, "key3": None}
@@ -291,9 +204,20 @@ def test_repair_json_skip_json_loads():
291
204
 
292
205
 
293
206
  def test_repair_json_from_file():
294
-
295
207
  import os.path
296
208
  import pathlib
297
209
  path = pathlib.Path(__file__).parent.resolve()
298
210
 
299
211
  assert(from_file(os.path.join(path,"invalid.json"))) == '[{"_id": "655b66256574f09bdae8abe8", "index": 0, "guid": "31082ae3-b0f3-4406-90f4-cc450bd4379d", "isActive": false, "balance": "$2,562.78", "picture": "http://placehold.it/32x32", "age": 32, "eyeColor": "brown", "name": "Glover Rivas", "gender": "male", "company": "EMPIRICA", "email": "gloverrivas@empirica.com", "phone": "+1 (842) 507-3063", "address": "536 Montague Terrace, Jenkinsville, Kentucky, 2235", "about": "Mollit consectetur excepteur voluptate tempor dolore ullamco enim irure ullamco non enim officia. Voluptate occaecat proident laboris ea Lorem cupidatat reprehenderit nisi nisi aliqua. Amet nulla ipsum deserunt excepteur amet ad aute aute ex. Et enim minim sit veniam est quis dolor nisi sunt quis eiusmod in. Amet eiusmod cillum sunt occaecat dolor laboris voluptate in eiusmod irure aliqua duis.", "registered": "2023-11-18T09:32:36 -01:00", "latitude": 36.26102, "longitude": -91.304608, "tags": ["non", "tempor", "do", "ullamco", "dolore", "sunt", "ipsum"], "friends": [{"id": 0, "name": "Cara Shepherd"}, {"id": 1, "name": "Mason Farley"}, {"id": 2, "name": "Harriet Cochran"}], "greeting": "Hello, Glover Rivas! You have 7 unread messages.", "favoriteFruit": "strawberry"}, {"_id": "655b662585364bc57278bb6f", "index": 1, "guid": "0dea7a3a-f812-4dde-b78d-7a9b58e5da05", "isActive": true, "balance": "$1,359.48", "picture": "http://placehold.it/32x32", "age": 38, "eyeColor": "brown", "name": "Brandi Moreno", "gender": "female", "company": "MARQET", "email": "brandimoreno@marqet.com", "phone": "+1 (850) 434-2077", "address": "537 Doone Court, Waiohinu, Michigan, 3215", "about": "Irure proident adipisicing do Lorem do incididunt in laborum in eiusmod eiusmod ad elit proident. Eiusmod dolor ex magna magna occaecat. Nulla deserunt velit ex exercitation et irure sunt. Cupidatat ut excepteur ea quis labore sint cupidatat incididunt amet eu consectetur cillum ipsum proident. Occaecat exercitation aute laborum dolor proident reprehenderit laborum in voluptate culpa. Exercitation nulla adipisicing culpa aute est deserunt ea nisi deserunt consequat occaecat ut et non. Incididunt ex exercitation dolor dolor anim cillum dolore.", "registered": "2015-09-03T11:47:15 -02:00", "latitude": -19.768953, "longitude": 8.948458, "tags": ["laboris", "occaecat", "laborum", "laborum", "ex", "cillum", "occaecat"], "friends": [{"id": 0, "name": "Erna Kelly"}, {"id": 1, "name": "Black Mays"}, {"id": 2, "name": "Davis Buck"}], "greeting": "Hello, Brandi Moreno! You have 1 unread messages.", "favoriteFruit": "apple"}, {"_id": "655b6625870da431bcf5e0c2", "index": 2, "guid": "b17f6e3f-c898-4334-abbf-05cf222f143b", "isActive": false, "balance": "$1,493.77", "picture": "http://placehold.it/32x32", "age": 20, "eyeColor": "brown", "name": "Moody Meadows", "gender": "male", "company": "OPTIQUE", "email": "moodymeadows@optique.com", "phone": "+1 (993) 566-3041", "address": "766 Osborn Street, Bath, Maine, 7666", "about": "Non commodo excepteur nostrud qui adipisicing aliquip dolor minim nulla culpa proident. In ad cupidatat ea mollit ex est do deserunt proident nostrud. Cillum id id eiusmod amet exercitation nostrud cillum sunt deserunt dolore deserunt eiusmod mollit. Ut ex tempor ad laboris voluptate labore id officia fugiat exercitation amet.", "registered": "2015-01-16T02:48:28 -01:00", "latitude": -25.847327, "longitude": 63.95991, "tags": ["aute", "commodo", "adipisicing", "nostrud", "duis", "mollit", "ut"], "friends": [{"id": 0, "name": "Lacey Cash"}, {"id": 1, "name": "Gabrielle Harmon"}, {"id": 2, "name": "Ellis Lambert"}], "greeting": "Hello, Moody Meadows! You have 4 unread messages.", "favoriteFruit": "strawberry"}, {"_id": "655b6625f3e1bf422220854e", "index": 3, "guid": "92229883-2bfd-4974-a08c-1b506b372e46", "isActive": false, "balance": "$2,215.34", "picture": "http://placehold.it/32x32", "age": 22, "eyeColor": "brown", "name": "Heath Nguyen", "gender": "male", "company": "BLEENDOT", "email": "heathnguyen@bleendot.com", "phone": "+1 (989) 512-2797", "address": "135 Milton Street, Graniteville, Nebraska, 276", "about": "Consequat aliquip irure Lorem cupidatat nulla magna ullamco nulla voluptate adipisicing anim consectetur tempor aliquip. Magna aliqua nulla eu tempor esse proident. Proident fugiat ad ex Lorem reprehenderit dolor aliquip labore labore aliquip. Deserunt aute enim ea minim officia anim culpa sint commodo. Cillum consectetur excepteur aliqua exercitation Lorem veniam voluptate.", "registered": "2016-07-06T01:31:07 -02:00", "latitude": -60.997048, "longitude": -102.397885, "tags": ["do", "ad", "consequat", "irure", "tempor", "elit", "minim"], "friends": [{"id": 0, "name": "Walker Hernandez"}, {"id": 1, "name": "Maria Lane"}, {"id": 2, "name": "Mcknight Barron"}], "greeting": "Hello, Heath Nguyen! You have 4 unread messages.", "favoriteFruit": "apple"}, {"_id": "655b6625519a5b5e4b6742bf", "index": 4, "guid": "c5dc685f-6d0d-4173-b4cf-f5df29a1e8ef", "isActive": true, "balance": "$1,358.90", "picture": "http://placehold.it/32x32", "age": 33, "eyeColor": "brown", "name": "Deidre Duke", "gender": "female", "company": "OATFARM", "email": "deidreduke@oatfarm.com", "phone": "+1 (875) 587-3256", "address": "487 Schaefer Street, Wattsville, West Virginia, 4506", "about": "Laboris eu nulla esse magna sit eu deserunt non est aliqua exercitation commodo. Ad occaecat qui qui laborum dolore anim Lorem. Est qui occaecat irure enim deserunt enim aliqua ex deserunt incididunt esse. Quis in minim laboris proident non mollit. Magna ea do labore commodo. Et elit esse esse occaecat officia ipsum nisi.", "registered": "2021-09-12T04:17:08 -02:00", "latitude": 68.609781, "longitude": -87.509134, "tags": ["mollit", "cupidatat", "irure", "sit", "consequat", "anim", "fugiat"], "friends": [{"id": 0, "name": "Bean Paul"}, {"id": 1, "name": "Cochran Hubbard"}, {"id": 2, "name": "Rodgers Atkinson"}], "greeting": "Hello, Deidre Duke! You have 6 unread messages.", "favoriteFruit": "apple"}, {"_id": "655b6625a19b3f7e5f82f0ea", "index": 5, "guid": "75f3c264-baa1-47a0-b21c-4edac23d9935", "isActive": true, "balance": "$3,554.36", "picture": "http://placehold.it/32x32", "age": 26, "eyeColor": "blue", "name": "Lydia Holland", "gender": "female", "company": "ESCENTA", "email": "lydiaholland@escenta.com", "phone": "+1 (927) 482-3436", "address": "554 Rockaway Parkway, Kohatk, Montana, 6316", "about": "Consectetur ea est labore commodo laborum mollit pariatur non enim. Est dolore et non laboris tempor. Ea incididunt ut adipisicing cillum labore officia tempor eiusmod commodo. Cillum fugiat ex consectetur ut nostrud anim nostrud exercitation ut duis in ea. Eu et id fugiat est duis eiusmod ullamco quis officia minim sint ea nisi in.", "registered": "2018-03-13T01:48:56 -01:00", "latitude": -88.495799, "longitude": 71.840667, "tags": ["veniam", "minim", "consequat", "consequat", "incididunt", "consequat", "elit"], "friends": [{"id": 0, "name": "Debra Massey"}, {"id": 1, "name": "Weiss Savage"}, {"id": 2, "name": "Shannon Guerra"}], "greeting": "Hello, Lydia Holland! You have 5 unread messages.", "favoriteFruit": "banana"}]'
212
+
213
+ import tempfile
214
+ # Create a temporary file
215
+ temp_fd, temp_path = tempfile.mkstemp(suffix=".json")
216
+ try:
217
+ # Write content to the temporary file
218
+ with os.fdopen(temp_fd, 'w') as tmp:
219
+ tmp.write("{key:value}")
220
+ assert(from_file(temp_path, logging=True)) == ({'key': 'value'}, [{'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}, {'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}])
221
+ finally:
222
+ # Clean up - delete the temporary file
223
+ os.remove(temp_path)
@@ -42,7 +42,7 @@ def test_true_false_correct(benchmark):
42
42
  mean_time = benchmark.stats.get("median")
43
43
 
44
44
  # Define your time threshold in seconds
45
- max_time = 20 * (1 / 10 ** 6) # 20 microsecond
45
+ max_time = 30 * (1 / 10 ** 6) # 30 microsecond
46
46
 
47
47
  # Assert that the average time is below the threshold
48
48
  assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
@@ -64,7 +64,7 @@ def test_false_true_correct(benchmark):
64
64
  mean_time = benchmark.stats.get("median")
65
65
 
66
66
  # Define your time threshold in seconds
67
- max_time = 13 / 10 ** 4 # 1.3 millisecond
67
+ max_time = 14 / 10 ** 4 # 1.4 millisecond
68
68
 
69
69
  # Assert that the average time is below the threshold
70
70
  assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
@@ -75,7 +75,7 @@ def test_false_true_incorrect(benchmark):
75
75
  mean_time = benchmark.stats.get("median")
76
76
 
77
77
  # Define your time threshold in seconds
78
- max_time = 13 / 10 ** 4 # 1.3 millisecond
78
+ max_time = 14 / 10 ** 4 # 1.4 millisecond
79
79
 
80
80
  # Assert that the average time is below the threshold
81
81
  assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
@@ -86,7 +86,7 @@ def test_false_false_correct(benchmark):
86
86
  mean_time = benchmark.stats.get("median")
87
87
 
88
88
  # Define your time threshold in seconds
89
- max_time = 50 / 10 ** 6 # 50 microsecond
89
+ max_time = 60 / 10 ** 6 # 60 microsecond
90
90
 
91
91
  # Assert that the average time is below the threshold
92
92
  assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
File without changes
File without changes
File without changes