json-repair 0.20.1__tar.gz → 0.22.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: json_repair
3
- Version: 0.20.1
3
+ Version: 0.22.0
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
3
3
  build-backend = "setuptools.build_meta"
4
4
  [project]
5
5
  name = "json_repair"
6
- version = "0.20.1"
6
+ version = "0.22.0"
7
7
  license = {file = "LICENSE"}
8
8
  authors = [
9
9
  { name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },
@@ -1,7 +1,7 @@
1
1
  """
2
2
  This module will parse the JSON file following the BNF definition:
3
3
 
4
- <json> ::= <primitive> | <container>
4
+ <json> ::= <container>
5
5
 
6
6
  <primitive> ::= <number> | <string> | <boolean>
7
7
  ; Where:
@@ -24,11 +24,55 @@ All supported use cases are in the unit tests
24
24
 
25
25
  import os
26
26
  import json
27
- from typing import Any, Dict, List, Union, TextIO
27
+ from typing import Any, Dict, List, Optional, Union, TextIO, Tuple
28
+
29
+
30
+ class StringFileWrapper:
31
+ # This is a trick to simplify the code, transform the filedescriptor handling into a string handling
32
+ def __init__(self, fd: TextIO) -> None:
33
+ self.fd = fd
34
+ self.length: int = 0
35
+
36
+ def __getitem__(self, index: int) -> str:
37
+ if isinstance(index, slice):
38
+ self.fd.seek(index.start)
39
+ value = self.fd.read(index.stop - index.start)
40
+ self.fd.seek(index.start)
41
+ return value
42
+ else:
43
+ self.fd.seek(index)
44
+ return self.fd.read(1)
45
+
46
+ def __len__(self) -> int:
47
+ if self.length < 1:
48
+ current_position = self.fd.tell()
49
+ self.fd.seek(0, os.SEEK_END)
50
+ self.length = self.fd.tell()
51
+ self.fd.seek(current_position)
52
+ return self.length
53
+
54
+ def __setitem__(self) -> None:
55
+ raise Exception("This is read-only!")
56
+
57
+
58
+ class LoggerConfig:
59
+ # This is a type class to simplify the declaration
60
+ def __init__(self, log_level: Optional[str]):
61
+ self.log: List[Dict[str, str]] = []
62
+ self.window: int = 10
63
+ self.log_level: str = log_level if log_level else "none"
64
+
65
+
66
+ JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
28
67
 
29
68
 
30
69
  class JSONParser:
31
- def __init__(self, json_str: str, json_fd: TextIO, logging: bool = False) -> None:
70
+ def __init__(
71
+ self,
72
+ json_str: Union[str, StringFileWrapper],
73
+ json_fd: Optional[TextIO],
74
+ logging: Optional[bool],
75
+ ) -> None:
32
76
  # The string to parse
33
77
  self.json_str = json_str
34
78
  # Alternatively, the file description with a json file in it
@@ -36,26 +80,26 @@ class JSONParser:
36
80
  # This is a trick we do to treat the file wrapper as an array
37
81
  self.json_str = StringFileWrapper(json_fd)
38
82
  # Index is our iterator that will keep track of which character we are looking at right now
39
- self.index = 0
83
+ self.index: int = 0
40
84
  # This is used in the object member parsing to manage the special cases of missing quotes in key or value
41
- self.context = []
85
+ self.context: list[str] = []
42
86
  # Use this to log the activity, but only if logging is active
43
- self.logger = {
44
- "log": [],
45
- "window": 10,
46
- "log_level": "info" if logging else "none",
47
- }
48
-
49
- def parse(self) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
50
- if self.logger["log_level"] == "none":
87
+ self.logger = LoggerConfig(log_level="info" if logging else None)
88
+
89
+ def parse(
90
+ self,
91
+ ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
92
+ if self.logger.log_level == "none":
51
93
  return self.parse_json()
52
94
  else:
53
- return self.parse_json(), self.logger["log"]
95
+ return self.parse_json(), self.logger.log
54
96
 
55
97
  def parse_json(
56
98
  self,
57
- ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
99
+ ) -> JSONReturnType:
58
100
  char = self.get_char_at()
101
+ # This parser will ignore any basic element (string or number) that is not inside an array or object
102
+ is_in_context = len(self.context) > 0
59
103
  # False means that we are at the end of the string provided, is the base case for recursion
60
104
  if char is False:
61
105
  return ""
@@ -78,10 +122,10 @@ class JSONParser:
78
122
  )
79
123
  return ""
80
124
  # <string> starts with a quote
81
- elif char in ['"', "'", "“"] or char.isalpha():
125
+ elif is_in_context and (char in ['"', "'", "“"] or char.isalpha()):
82
126
  return self.parse_string()
83
127
  # <number> starts with [0-9] or minus
84
- elif char.isdigit() or char == "-" or char == ".":
128
+ elif is_in_context and (char.isdigit() or char == "-" or char == "."):
85
129
  return self.parse_number()
86
130
  # If everything else fails, we just ignore and move on
87
131
  else:
@@ -225,7 +269,7 @@ class JSONParser:
225
269
  self.reset_context()
226
270
  return arr
227
271
 
228
- def parse_string(self) -> str:
272
+ def parse_string(self) -> Union[str, JSONReturnType]:
229
273
  # <string> is a string of valid characters enclosed in quotes
230
274
  # i.e. { name: "John" }
231
275
  # Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
@@ -324,7 +368,7 @@ class JSONParser:
324
368
  string_acc = string_acc[:-1]
325
369
  if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
326
370
  escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
327
- string_acc += escape_seqs.get(char, char)
371
+ string_acc += escape_seqs.get(char, char) or char
328
372
  self.index += 1
329
373
  char = self.get_char_at()
330
374
  # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
@@ -362,7 +406,29 @@ class JSONParser:
362
406
  break
363
407
  i += 1
364
408
  next_c = self.get_char_at(i)
365
- if next_c == rstring_delimiter:
409
+ # If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
410
+ if next_c == "," and self.get_context() == "object_value":
411
+ i += 1
412
+ next_c = self.get_char_at(i)
413
+ while next_c and next_c != rstring_delimiter:
414
+ i += 1
415
+ next_c = self.get_char_at(i)
416
+ # Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
417
+ i += 1
418
+ next_c = self.get_char_at(i)
419
+ while next_c and next_c.isspace():
420
+ i += 1
421
+ next_c = self.get_char_at(i)
422
+ if next_c == "}":
423
+ # OK this is valid then
424
+ self.log(
425
+ "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
426
+ "info",
427
+ )
428
+ string_acc += char
429
+ self.index += 1
430
+ char = self.get_char_at()
431
+ elif next_c == rstring_delimiter:
366
432
  if self.get_context() == "object_value":
367
433
  # But this might not be it! This could be just a missing comma
368
434
  # We found a delimiter and we need to check if this is a key
@@ -418,7 +484,7 @@ class JSONParser:
418
484
 
419
485
  return string_acc.rstrip()
420
486
 
421
- def parse_number(self) -> Union[float, int, str]:
487
+ def parse_number(self) -> Union[float, int, str, JSONReturnType]:
422
488
  # <number> is a valid real number expressed in one of a number of given formats
423
489
  number_str = ""
424
490
  number_chars = set("0123456789-.eE/,")
@@ -451,8 +517,7 @@ class JSONParser:
451
517
  def parse_boolean_or_null(self) -> Union[bool, str, None]:
452
518
  # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
453
519
  starting_index = self.index
454
- value = ""
455
- char = self.get_char_at().lower()
520
+ char = (self.get_char_at() or "").lower()
456
521
  if char == "t":
457
522
  value = ("true", True)
458
523
  elif char == "f":
@@ -460,12 +525,12 @@ class JSONParser:
460
525
  elif char == "n":
461
526
  value = ("null", None)
462
527
 
463
- if len(value):
528
+ if value:
464
529
  i = 0
465
530
  while char and i < len(value[0]) and char == value[0][i]:
466
531
  i += 1
467
532
  self.index += 1
468
- char = self.get_char_at().lower()
533
+ char = (self.get_char_at() or "").lower()
469
534
  if i == len(value[0]):
470
535
  return value[1]
471
536
 
@@ -513,12 +578,12 @@ class JSONParser:
513
578
  return ""
514
579
 
515
580
  def log(self, text: str, level: str) -> None:
516
- if level == self.logger["log_level"]:
581
+ if level == self.logger.log_level:
517
582
  context = ""
518
- start = max(self.index - self.logger["window"], 0)
519
- end = min(self.index + self.logger["window"], len(self.json_str))
583
+ start = max(self.index - self.logger.window, 0)
584
+ end = min(self.index + self.logger.window, len(self.json_str))
520
585
  context = self.json_str[start:end]
521
- self.logger["log"].append(
586
+ self.logger.log.append(
522
587
  {
523
588
  "text": text,
524
589
  "context": context,
@@ -528,11 +593,11 @@ class JSONParser:
528
593
 
529
594
  def repair_json(
530
595
  json_str: str = "",
531
- return_objects: bool = False,
532
- skip_json_loads: bool = False,
533
- logging: bool = False,
534
- json_fd: TextIO = None,
535
- ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
596
+ return_objects: Optional[bool] = False,
597
+ skip_json_loads: Optional[bool] = False,
598
+ logging: Optional[bool] = False,
599
+ json_fd: Optional[TextIO] = None,
600
+ ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
536
601
  """
537
602
  Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
538
603
  It will return the fixed string by default.
@@ -559,7 +624,7 @@ def repair_json(
559
624
 
560
625
  def loads(
561
626
  json_str: str, skip_json_loads: bool = False, logging: bool = False
562
- ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
627
+ ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
563
628
  """
564
629
  This function works like `json.loads()` except that it will fix your JSON in the process.
565
630
  It is a wrapper around the `repair_json()` function with `return_objects=True`.
@@ -574,7 +639,7 @@ def loads(
574
639
 
575
640
  def load(
576
641
  fd: TextIO, skip_json_loads: bool = False, logging: bool = False
577
- ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
642
+ ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
578
643
  """
579
644
  This function works like `json.load()` except that it will fix your JSON in the process.
580
645
  It is a wrapper around the `repair_json()` function with `json_fd=fd` and `return_objects=True`.
@@ -584,7 +649,7 @@ def load(
584
649
 
585
650
  def from_file(
586
651
  filename: str, skip_json_loads: bool = False, logging: bool = False
587
- ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
652
+ ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
588
653
  """
589
654
  This function is a wrapper around `load()` so you can pass the filename as string
590
655
  """
@@ -593,31 +658,3 @@ def from_file(
593
658
  fd.close()
594
659
 
595
660
  return jsonobj
596
-
597
-
598
- class StringFileWrapper:
599
- # This is a trick to simplify the code above, transform the filedescriptor handling into an array handling
600
- def __init__(self, fd: TextIO) -> None:
601
- self.fd = fd
602
- self.length = None
603
-
604
- def __getitem__(self, index: int) -> Any:
605
- if isinstance(index, slice):
606
- self.fd.seek(index.start)
607
- value = self.fd.read(index.stop - index.start)
608
- self.fd.seek(index.start)
609
- return value
610
- else:
611
- self.fd.seek(index)
612
- return self.fd.read(1)
613
-
614
- def __len__(self) -> int:
615
- if not self.length:
616
- current_position = self.fd.tell()
617
- self.fd.seek(0, os.SEEK_END)
618
- self.length = self.fd.tell()
619
- self.fd.seek(current_position)
620
- return self.length
621
-
622
- def __setitem__(self):
623
- raise Exception("This is read-only!")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: json_repair
3
- Version: 0.20.1
3
+ Version: 0.22.0
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -1,9 +1,26 @@
1
1
  from src.json_repair.json_repair import from_file, repair_json, loads
2
2
 
3
+ def test_basic_types_valid():
4
+ assert repair_json("True", return_objects=True) == ""
5
+ assert repair_json("False", return_objects=True) == ""
6
+ assert repair_json("Null", return_objects=True) == ""
7
+ assert repair_json("1", return_objects=True) == 1
8
+ assert repair_json("[]", return_objects=True) == []
9
+ assert repair_json("[1, 2, 3, 4]", return_objects=True) == [1, 2, 3, 4]
10
+ assert repair_json("{}", return_objects=True) == {}
11
+ assert repair_json('{ "key": "value", "key2": 1, "key3": True }', return_objects=True) == { "key": "value", "key2": 1, "key3": True }
12
+
13
+ def test_basic_types_invalid():
14
+ assert repair_json("true", return_objects=True) == True
15
+ assert repair_json("false", return_objects=True) == False
16
+ assert repair_json("null", return_objects=True) == None
17
+ assert repair_json("1.2", return_objects=True) == 1.2
18
+ assert repair_json("[", return_objects=True) == []
19
+ assert repair_json("[1, 2, 3, 4", return_objects=True) == [1, 2, 3, 4]
20
+ assert repair_json("{", return_objects=True) == {}
21
+ assert repair_json('{ "key": value, "key2": 1 "key3": null }', return_objects=True) == { "key": "value", "key2": 1, "key3": None }
3
22
 
4
23
  def test_valid_json():
5
- assert repair_json("[]") == "[]"
6
- assert repair_json("[1, 2, 3, 4]") == "[1, 2, 3, 4]"
7
24
  assert (
8
25
  repair_json('{"name": "John", "age": 30, "city": "New York"}')
9
26
  == '{"name": "John", "age": 30, "city": "New York"}'
@@ -77,6 +94,7 @@ def test_missing_and_mixed_quotes():
77
94
  repair_json('{"name": "John", "age": 30, "city": "New')
78
95
  == '{"name": "John", "age": 30, "city": "New"}'
79
96
  )
97
+ assert repair_json('[{"key": "value", COMMENT "notes": "lorem "ipsum", sic."}]') == '[{"key": "value", "notes": "lorem \\"ipsum\\", sic."}]'
80
98
 
81
99
  def test_array_edge_cases():
82
100
  assert repair_json("[1, 2, 3,") == "[1, 2, 3]"
@@ -96,7 +114,7 @@ def test_array_edge_cases():
96
114
 
97
115
 
98
116
  def test_escaping():
99
- assert repair_json("'\"'") == '"\\\""'
117
+ assert repair_json("'\"'") == '""'
100
118
  assert repair_json("{\"key\": 'string\"\n\t\le'") == '{"key": "string\\"\\n\\tle"}'
101
119
  assert repair_json(r'{"real_content": "Some string: Some other string \t Some string <a href=\"https://domain.com\">Some link</a>"') == r'{"real_content": "Some string: Some other string \t Some string <a href=\"https://domain.com\">Some link</a>"}'
102
120
  assert repair_json('{"key_1\n": "value"}') == '{"key_1": "value"}'
@@ -146,6 +164,7 @@ def test_leading_trailing_characters():
146
164
  assert repair_json("""{ "a": "", "b": [ { "c": 1} ] \n}```""") == '{"a": "", "b": [{"c": 1}]}'
147
165
  assert repair_json("Based on the information extracted, here is the filled JSON output: ```json { 'a': 'b' } ```") == '{"a": "b"}'
148
166
  assert repair_json("""
167
+ The next 64 elements are:
149
168
  ```json
150
169
  { "key": "value" }
151
170
  ```""") == '{"key": "value"}'
@@ -19,7 +19,7 @@ def test_true_true_correct(benchmark):
19
19
  mean_time = benchmark.stats.get("median")
20
20
 
21
21
  # Define your time threshold in seconds
22
- max_time = 13 / 10 ** 4 # 1.3 millisecond
22
+ max_time = 14 / 10 ** 4 # 1.4 millisecond
23
23
 
24
24
  # Assert that the average time is below the threshold
25
25
  assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
@@ -31,7 +31,7 @@ def test_true_true_incorrect(benchmark):
31
31
  mean_time = benchmark.stats.get("median")
32
32
 
33
33
  # Define your time threshold in seconds
34
- max_time = 13 / 10 ** 4 # 1.3 millisecond
34
+ max_time = 14 / 10 ** 4 # 1.4 millisecond
35
35
 
36
36
  # Assert that the average time is below the threshold
37
37
  assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
@@ -53,7 +53,7 @@ def test_true_false_incorrect(benchmark):
53
53
  mean_time = benchmark.stats.get("median")
54
54
 
55
55
  # Define your time threshold in seconds
56
- max_time = 13 / 10 ** 4 # 1.3 millisecond
56
+ max_time = 14 / 10 ** 4 # 1.4 millisecond
57
57
 
58
58
  # Assert that the average time is below the threshold
59
59
  assert mean_time < max_time, f"Benchmark exceeded threshold: {mean_time:.3f}s > {max_time:.3f}s"
File without changes
File without changes
File without changes