json-repair 0.47.4__py3-none-any.whl → 0.47.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,7 +27,8 @@ import json
27
27
  import sys
28
28
  from typing import Literal, TextIO, overload
29
29
 
30
- from .json_parser import JSONParser, JSONReturnType
30
+ from .constants import JSONReturnType
31
+ from .json_parser import JSONParser
31
32
 
32
33
 
33
34
  @overload
@@ -0,0 +1,50 @@
1
+ from .constants import STRING_DELIMITERS, JSONReturnType
2
+ from .json_context import ContextValues
3
+
4
+
5
def parse_array(self) -> list[JSONReturnType]:
    """Collect the elements of a JSON array into a list.

    Grammar: <array> ::= '[' [ <json> *(', ' <json>) ] ']'

    The ARRAY context is pushed for the duration of the parse and popped
    before returning. Elements are read until a "]" or "}" is seen or the
    input runs out; afterwards the index is advanced by one position.
    NOTE(review): the opening "[" appears to be consumed by the caller
    before this function runs -- confirm against the call sites.

    Returns:
        The parsed elements. Values that compare equal to "" are dropped,
        and so is a stray "..." placeholder.
    """
    items = []
    self.context.set(ContextValues.ARRAY)
    current = self.get_char_at()
    # Keep going until a closing bracket/brace shows up or the input is exhausted
    while current and current not in ("]", "}"):
        self.skip_whitespaces_at()
        if current in STRING_DELIMITERS:
            # LLMs sometimes forget to open an object, which makes its first key
            # look like a string element. Peek past the closing delimiter: a ":"
            # there means this is really an object, otherwise it is a string.
            closing_at = self.skip_to_character(current, 1)
            after_string = self.skip_whitespaces_at(idx=closing_at + 1, move_main_index=False)
            if self.get_char_at(after_string) == ":":
                element = self.parse_object()
            else:
                element = self.parse_string()
        else:
            element = self.parse_json()

        if element == "":
            # Nothing valid was parsed here, so step forward by one
            self.index += 1
        elif element == "..." and self.get_char_at(-1) == ".":
            self.log("While parsing an array, found a stray '...'; ignoring it")
        else:
            items.append(element)

        # Swallow whitespace and commas between a value and the next one / the closing ]
        current = self.get_char_at()
        while current and current != "]" and (current.isspace() or current == ","):
            self.index += 1
            current = self.get_char_at()

    # LLM generated json often misses the final "]"
    if current and current != "]":
        self.log("While parsing an array we missed the closing ], ignoring it")

    self.index += 1
    self.context.reset()
    return items
@@ -0,0 +1,24 @@
1
+ def parse_boolean_or_null(self) -> bool | str | None:
2
+ # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
3
+ starting_index = self.index
4
+ char = (self.get_char_at() or "").lower()
5
+ value: tuple[str, bool | None] | None = None
6
+ if char == "t":
7
+ value = ("true", True)
8
+ elif char == "f":
9
+ value = ("false", False)
10
+ elif char == "n":
11
+ value = ("null", None)
12
+
13
+ if value:
14
+ i = 0
15
+ while char and i < len(value[0]) and char == value[0][i]:
16
+ i += 1
17
+ self.index += 1
18
+ char = (self.get_char_at() or "").lower()
19
+ if i == len(value[0]):
20
+ return value[1]
21
+
22
+ # If nothing works reset the index before returning
23
+ self.index = starting_index
24
+ return ""
@@ -0,0 +1,65 @@
1
+ from .json_context import ContextValues
2
+
3
+
4
def parse_comment(self) -> str:
    """
    Skip a code-like comment located at the current index:

    - "# comment": A line comment that continues until a newline.
    - "// comment": A line comment that continues until a newline.
    - "/* comment */": A block comment that continues until the closing delimiter "*/".

    Line comments additionally stop (without consuming the character) at "]", "}" or ":"
    when the context stack contains ARRAY, OBJECT_VALUE or OBJECT_KEY respectively, so that
    a comment cannot swallow the delimiter that closes the enclosing structure.

    A lone "/" that does not start a comment is skipped. An unterminated block comment
    consumes the rest of the input and is logged.

    Returns:
        "" when a parsing context is active, so the comment does not interfere with the
        surrounding JSON element. When the context is empty, parsing continues and the
        result of ``self.parse_json()`` is returned instead (which may not be a string,
        despite the annotation).
    """
    char = self.get_char_at()
    termination_characters = ["\n", "\r"]
    if ContextValues.ARRAY in self.context.context:
        termination_characters.append("]")
    if ContextValues.OBJECT_VALUE in self.context.context:
        termination_characters.append("}")
    if ContextValues.OBJECT_KEY in self.context.context:
        termination_characters.append(":")

    next_char = self.get_char_at(1) if char == "/" else None

    if char == "#" or (char == "/" and next_char == "/"):
        # Line comment: consume the opener, then everything up to a terminator
        opener = "#" if char == "#" else "//"
        pieces = [opener]
        self.index += len(opener)
        char = self.get_char_at()
        while char and char not in termination_characters:
            pieces.append(char)
            self.index += 1
            char = self.get_char_at()
        comment = "".join(pieces)
        self.log(f"Found line comment: {comment}, ignoring")
    elif char == "/" and next_char == "*":
        # Block comment: consume up to and including the closing "*/"
        pieces = ["/*"]
        self.index += 2  # Skip '/*'
        # Only characters read after the opener may form the terminator; otherwise
        # the "*" of "/*" would pair with a following "/" and "/*/" would wrongly
        # be treated as a complete comment.
        previous = ""
        while True:
            char = self.get_char_at()
            if not char:
                self.log("Reached end-of-string while parsing block comment; unclosed block comment.")
                break
            pieces.append(char)
            self.index += 1
            if previous == "*" and char == "/":
                break
            previous = char
        comment = "".join(pieces)
        self.log(f"Found block comment: {comment}, ignoring")
    elif char == "/":
        # Skip standalone '/' characters that are not part of a comment
        # to avoid getting stuck in an infinite loop
        self.index += 1

    if self.context.empty:
        return self.parse_json()
    return ""
@@ -0,0 +1,32 @@
1
+ from .constants import JSONReturnType
2
+ from .json_context import ContextValues
3
+
4
# Characters accepted while scanning a number-like token
NUMBER_CHARS: set[str] = set("0123456789-.eE/,")


def parse_number(self) -> float | int | str | JSONReturnType:
    """Parse a number-like token starting at the current index.

    Characters from NUMBER_CHARS are consumed greedily; inside an array
    context a "," ends the token instead of being part of it.

    Returns:
        - the result of ``self.parse_string()`` (after rewinding) when the
          token is immediately followed by a letter, i.e. it was not a
          number after all;
        - the raw text when it contains a "," or cannot be converted;
        - a float when the text contains ".", "e" or "E";
        - an int otherwise.
    """
    in_array = self.context.current == ContextValues.ARRAY
    digits = ""
    current = self.get_char_at()
    while current and current in NUMBER_CHARS:
        if in_array and current == ",":
            # In an array the comma separates elements
            break
        digits += current
        self.index += 1
        current = self.get_char_at()

    if digits and digits[-1] in "-eE/,":
        # A number/currency cannot end with one of these: give the character back
        digits = digits[:-1]
        self.index -= 1
    elif (self.get_char_at() or "").isalpha():
        # A letter follows, so this was a string: rewind and parse it as such
        self.index -= len(digits)
        return self.parse_string()

    try:
        if "," in digits:
            return str(digits)
        if any(marker in digits for marker in ".eE"):
            return float(digits)
        return int(digits)
    except ValueError:
        return digits
@@ -0,0 +1,110 @@
1
+ from .constants import JSONReturnType
2
+ from .json_context import ContextValues
3
+
4
+
5
def parse_object(self) -> dict[str, JSONReturnType]:
    """Parse the members of a JSON object into a dict.

    Grammar: <object> ::= '{' [ <member> *(', ' <member>) ] '}'
             <member> ::= <string> ': ' <json>

    Members are read until a "}" is found or the input ends; the index is
    then advanced by one position. While a key is being read the OBJECT_KEY
    context is set, and OBJECT_VALUE while its value is being read.
    NOTE(review): the opening "{" appears to be consumed by the caller
    before this function runs -- confirm against the call sites.

    Returns:
        The parsed members. A key whose value is missing (a stray "," or
        "}" right after the ":") is stored with "" as its value.
    """
    # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
    obj: dict[str, JSONReturnType] = {}
    # Stop when you either find the closing brace or you have iterated over the entire string
    while (self.get_char_at() or "}") != "}":
        # This is what we expect to find:
        # <member> ::= <string> ': ' <json>

        # Skip filler whitespaces
        self.skip_whitespaces_at()

        # Sometimes LLMs do weird things, if we find a ":" this early we log it, skip it and move on
        if (self.get_char_at() or "") == ":":
            self.log(
                "While parsing an object we found a : before a key, ignoring",
            )
            self.index += 1

        # We are now searching for the string key
        # Context is used in the string parser to manage the lack of quotes
        self.context.set(ContextValues.OBJECT_KEY)

        # Save this index in case we find a duplicate key and need to roll back
        rollback_index = self.index

        # <member> starts with a <string>
        key = ""
        while self.get_char_at():
            # The rollback index needs to be updated here in case the key is empty
            rollback_index = self.index
            if self.get_char_at() == "[" and key == "":
                # Is this an array?
                # Need to check if the previous parsed value contained in obj is an array and in that case parse and merge the two
                prev_key = list(obj.keys())[-1] if obj else None
                if prev_key and isinstance(obj[prev_key], list):
                    # If the previous key's value is an array, parse the new array and merge
                    self.index += 1
                    new_array = self.parse_array()
                    if isinstance(new_array, list):
                        # Merge and flatten the arrays: a single nested list is unwrapped before extending
                        prev_value = obj[prev_key]
                        if isinstance(prev_value, list):
                            prev_value.extend(
                                new_array[0] if len(new_array) == 1 and isinstance(new_array[0], list) else new_array
                            )
                        self.skip_whitespaces_at()
                        if self.get_char_at() == ",":
                            self.index += 1
                        self.skip_whitespaces_at()
                        # Go back to looking for a key
                        continue
            key = str(self.parse_string())
            if key == "":
                self.skip_whitespaces_at()
            if key != "" or (key == "" and self.get_char_at() in [":", "}"]):
                # If the string is empty but there is an object divider, we are done here
                break
        if ContextValues.ARRAY in self.context.context and key in obj:
            self.log(
                "While parsing an object we found a duplicate key, closing the object here and rolling back the index",
            )
            self.index = rollback_index - 1
            # Insert an opening curly brace at rollback_index (i.e. right before the duplicate key)
            # so that what follows can be parsed as a new object
            self.json_str = self.json_str[: self.index + 1] + "{" + self.json_str[self.index + 1 :]
            # NOTE(review): the OBJECT_KEY context set above is not reset on this path -- confirm this is intended
            break

        # Skip filler whitespaces
        self.skip_whitespaces_at()

        # We reached the end here; the loop condition terminates the loop on the next check
        # NOTE(review): the OBJECT_KEY context set above is not reset on this path -- confirm this is intended
        if (self.get_char_at() or "}") == "}":
            continue

        self.skip_whitespaces_at()

        # An extreme case of missing ":" after a key
        if (self.get_char_at() or "") != ":":
            self.log(
                "While parsing an object we missed a : after a key",
            )

        # Advance by one (over the ":" when it is present) and switch from key context to value context
        self.index += 1
        self.context.reset()
        self.context.set(ContextValues.OBJECT_VALUE)
        # The value can be any valid json
        self.skip_whitespaces_at()
        # Corner case, a lone comma (or closing brace) where the value should be: keep "" as the value
        value: JSONReturnType = ""
        if (self.get_char_at() or "") in [",", "}"]:
            self.log(
                "While parsing an object value we found a stray , ignoring it",
            )
        else:
            value = self.parse_json()

        # Reset context since our job is done
        self.context.reset()
        obj[key] = value

        # Step over a member separator or a leftover quote character
        if (self.get_char_at() or "") in [",", "'", '"']:
            self.index += 1

        # Remove trailing spaces
        self.skip_whitespaces_at()

    # Advance by one, past the closing "}" when it is present
    self.index += 1
    return obj