json-repair 0.52.5__tar.gz → 0.53.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. {json_repair-0.52.5/src/json_repair.egg-info → json_repair-0.53.1}/PKG-INFO +1 -1
  2. {json_repair-0.52.5 → json_repair-0.53.1}/pyproject.toml +3 -1
  3. {json_repair-0.52.5 → json_repair-0.53.1}/src/json_repair/__init__.py +1 -1
  4. {json_repair-0.52.5 → json_repair-0.53.1}/src/json_repair/json_parser.py +36 -34
  5. {json_repair-0.52.5 → json_repair-0.53.1}/src/json_repair/json_repair.py +6 -6
  6. {json_repair-0.52.5 → json_repair-0.53.1}/src/json_repair/parse_array.py +8 -8
  7. {json_repair-0.52.5 → json_repair-0.53.1}/src/json_repair/parse_comment.py +2 -2
  8. {json_repair-0.52.5 → json_repair-0.53.1}/src/json_repair/parse_number.py +3 -2
  9. {json_repair-0.52.5 → json_repair-0.53.1}/src/json_repair/parse_object.py +19 -19
  10. {json_repair-0.52.5 → json_repair-0.53.1}/src/json_repair/parse_string.py +55 -55
  11. json_repair-0.53.1/src/json_repair/parse_string_helpers/parse_boolean_or_null.py +28 -0
  12. json_repair-0.53.1/src/json_repair/parse_string_helpers/parse_json_llm_block.py +19 -0
  13. {json_repair-0.52.5 → json_repair-0.53.1/src/json_repair.egg-info}/PKG-INFO +1 -1
  14. {json_repair-0.52.5 → json_repair-0.53.1}/src/json_repair.egg-info/SOURCES.txt +6 -6
  15. {json_repair-0.52.5 → json_repair-0.53.1}/tests/test_parse_string.py +21 -0
  16. {json_repair-0.52.5 → json_repair-0.53.1}/tests/test_repair_json_cli.py +5 -5
  17. json_repair-0.52.5/src/json_repair/parse_boolean_or_null.py +0 -30
  18. json_repair-0.52.5/tests/test_parse_boolean_or_null.py +0 -12
  19. {json_repair-0.52.5 → json_repair-0.53.1}/LICENSE +0 -0
  20. {json_repair-0.52.5 → json_repair-0.53.1}/README.md +0 -0
  21. {json_repair-0.52.5 → json_repair-0.53.1}/setup.cfg +0 -0
  22. {json_repair-0.52.5 → json_repair-0.53.1}/src/json_repair/__main__.py +0 -0
  23. {json_repair-0.52.5 → json_repair-0.53.1}/src/json_repair/py.typed +0 -0
  24. {json_repair-0.52.5/src/json_repair → json_repair-0.53.1/src/json_repair/utils}/constants.py +0 -0
  25. {json_repair-0.52.5/src/json_repair → json_repair-0.53.1/src/json_repair/utils}/json_context.py +0 -0
  26. {json_repair-0.52.5/src/json_repair → json_repair-0.53.1/src/json_repair/utils}/object_comparer.py +0 -0
  27. {json_repair-0.52.5/src/json_repair → json_repair-0.53.1/src/json_repair/utils}/string_file_wrapper.py +0 -0
  28. {json_repair-0.52.5 → json_repair-0.53.1}/src/json_repair.egg-info/dependency_links.txt +0 -0
  29. {json_repair-0.52.5 → json_repair-0.53.1}/src/json_repair.egg-info/entry_points.txt +0 -0
  30. {json_repair-0.52.5 → json_repair-0.53.1}/src/json_repair.egg-info/top_level.txt +0 -0
  31. {json_repair-0.52.5 → json_repair-0.53.1}/tests/test_json_repair.py +0 -0
  32. {json_repair-0.52.5 → json_repair-0.53.1}/tests/test_parse_array.py +0 -0
  33. {json_repair-0.52.5 → json_repair-0.53.1}/tests/test_parse_comment.py +0 -0
  34. {json_repair-0.52.5 → json_repair-0.53.1}/tests/test_parse_number.py +0 -0
  35. {json_repair-0.52.5 → json_repair-0.53.1}/tests/test_parse_object.py +0 -0
  36. {json_repair-0.52.5 → json_repair-0.53.1}/tests/test_performance.py +0 -0
  37. {json_repair-0.52.5 → json_repair-0.53.1}/tests/test_repair_json_from_file.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: json_repair
3
- Version: 0.52.5
3
+ Version: 0.53.1
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License-Expression: MIT
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
3
3
  build-backend = "setuptools.build_meta"
4
4
  [project]
5
5
  name = "json_repair"
6
- version = "0.52.5"
6
+ version = "0.53.1"
7
7
  license = "MIT"
8
8
  license-files = ["LICENSE"]
9
9
  authors = [
@@ -117,3 +117,5 @@ line-ending = "auto"
117
117
  [tool.ruff.lint.per-file-ignores]
118
118
  # Explicit re-exports is fine in __init__.py, still a code smell elsewhere.
119
119
  "__init__.py" = ["PLC0414"]
120
+ [tool.mypy]
121
+ strict = true
@@ -1,4 +1,4 @@
1
- from .constants import JSONReturnType
2
1
  from .json_repair import from_file, load, loads, repair_json
2
+ from .utils.constants import JSONReturnType
3
3
 
4
4
  __all__ = ["from_file", "load", "loads", "repair_json", "JSONReturnType"]
@@ -1,36 +1,32 @@
1
- from typing import Literal, TextIO
1
+ from typing import TextIO
2
2
 
3
- from .constants import STRING_DELIMITERS, JSONReturnType
4
- from .json_context import JsonContext
5
- from .object_comparer import ObjectComparer
6
3
  from .parse_array import parse_array as _parse_array
7
- from .parse_boolean_or_null import parse_boolean_or_null as _parse_boolean_or_null
8
4
  from .parse_comment import parse_comment as _parse_comment
9
5
  from .parse_number import parse_number as _parse_number
10
6
  from .parse_object import parse_object as _parse_object
11
7
  from .parse_string import parse_string as _parse_string
12
- from .string_file_wrapper import StringFileWrapper
8
+ from .utils.constants import STRING_DELIMITERS, JSONReturnType
9
+ from .utils.json_context import JsonContext
10
+ from .utils.object_comparer import ObjectComparer
11
+ from .utils.string_file_wrapper import StringFileWrapper
13
12
 
14
13
 
15
14
  class JSONParser:
16
15
  # Split the parse methods into separate files because this one was like 3000 lines
17
- def parse_array(self, *args, **kwargs):
18
- return _parse_array(self, *args, **kwargs)
16
+ def parse_array(self) -> list[JSONReturnType]:
17
+ return _parse_array(self)
19
18
 
20
- def parse_boolean_or_null(self, *args, **kwargs):
21
- return _parse_boolean_or_null(self, *args, **kwargs)
19
+ def parse_comment(self) -> JSONReturnType:
20
+ return _parse_comment(self)
22
21
 
23
- def parse_comment(self, *args, **kwargs):
24
- return _parse_comment(self, *args, **kwargs)
22
+ def parse_number(self) -> JSONReturnType:
23
+ return _parse_number(self)
25
24
 
26
- def parse_number(self, *args, **kwargs):
27
- return _parse_number(self, *args, **kwargs)
25
+ def parse_object(self) -> JSONReturnType:
26
+ return _parse_object(self)
28
27
 
29
- def parse_object(self, *args, **kwargs):
30
- return _parse_object(self, *args, **kwargs)
31
-
32
- def parse_string(self, *args, **kwargs):
33
- return _parse_string(self, *args, **kwargs)
28
+ def parse_string(self) -> JSONReturnType:
29
+ return _parse_string(self)
34
30
 
35
31
  def __init__(
36
32
  self,
@@ -107,8 +103,8 @@ class JSONParser:
107
103
  ) -> JSONReturnType:
108
104
  while True:
109
105
  char = self.get_char_at()
110
- # False means that we are at the end of the string provided
111
- if char is False:
106
+ # None means that we are at the end of the string provided
107
+ if char is None:
112
108
  return ""
113
109
  # <object> starts with '{'
114
110
  elif char == "{":
@@ -130,30 +126,36 @@ class JSONParser:
130
126
  else:
131
127
  self.index += 1
132
128
 
133
- def get_char_at(self, count: int = 0) -> str | Literal[False]:
129
+ def get_char_at(self, count: int = 0) -> str | None:
134
130
  # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
135
131
  try:
136
132
  return self.json_str[self.index + count]
137
133
  except IndexError:
138
- return False
134
+ return None
139
135
 
140
- def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
136
+ def skip_whitespaces(self) -> None:
141
137
  """
142
- This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
138
+ This function quickly iterates on whitespaces, moving the self.index forward
143
139
  """
144
140
  try:
145
- char = self.json_str[self.index + idx]
146
- except IndexError:
147
- return idx
148
- while char.isspace():
149
- if move_main_index:
141
+ char = self.json_str[self.index]
142
+ while char.isspace():
150
143
  self.index += 1
151
- else:
144
+ char = self.json_str[self.index]
145
+ except IndexError:
146
+ pass
147
+
148
+ def scroll_whitespaces(self, idx: int = 0) -> int:
149
+ """
150
+ This function quickly iterates on whitespaces. Doesn't move the self.index and returns the offset from self.index
151
+ """
152
+ try:
153
+ char = self.json_str[self.index + idx]
154
+ while char.isspace():
152
155
  idx += 1
153
- try:
154
156
  char = self.json_str[self.index + idx]
155
- except IndexError:
156
- return idx
157
+ except IndexError:
158
+ pass
157
159
  return idx
158
160
 
159
161
  def skip_to_character(self, character: str | list[str], idx: int = 0) -> int:
@@ -25,10 +25,10 @@ All supported use cases are in the unit tests
25
25
  import argparse
26
26
  import json
27
27
  import sys
28
- from typing import Literal, TextIO, overload
28
+ from typing import Any, Literal, TextIO, overload
29
29
 
30
- from .constants import JSONReturnType
31
30
  from .json_parser import JSONParser
31
+ from .utils.constants import JSONReturnType
32
32
 
33
33
 
34
34
  @overload
@@ -40,7 +40,7 @@ def repair_json(
40
40
  json_fd: TextIO | None = None,
41
41
  chunk_length: int = 0,
42
42
  stream_stable: bool = False,
43
- **json_dumps_args,
43
+ **json_dumps_args: Any,
44
44
  ) -> str: ...
45
45
 
46
46
 
@@ -53,7 +53,7 @@ def repair_json(
53
53
  json_fd: TextIO | None = None,
54
54
  chunk_length: int = 0,
55
55
  stream_stable: bool = False,
56
- **json_dumps_args,
56
+ **json_dumps_args: Any,
57
57
  ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]: ...
58
58
 
59
59
 
@@ -65,8 +65,8 @@ def repair_json(
65
65
  json_fd: TextIO | None = None,
66
66
  chunk_length: int = 0,
67
67
  stream_stable: bool = False,
68
- **json_dumps_args,
69
- ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | tuple[JSONReturnType, list]:
68
+ **json_dumps_args: Any,
69
+ ) -> JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]]:
70
70
  """
71
71
  Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
72
72
 
@@ -1,8 +1,8 @@
1
1
  from typing import TYPE_CHECKING
2
2
 
3
- from .constants import STRING_DELIMITERS, JSONReturnType
4
- from .json_context import ContextValues
5
- from .object_comparer import ObjectComparer
3
+ from .utils.constants import STRING_DELIMITERS, JSONReturnType
4
+ from .utils.json_context import ContextValues
5
+ from .utils.object_comparer import ObjectComparer
6
6
 
7
7
  if TYPE_CHECKING:
8
8
  from .json_parser import JSONParser
@@ -15,7 +15,7 @@ def parse_array(self: "JSONParser") -> list[JSONReturnType]:
15
15
  # Stop when you either find the closing parentheses or you have iterated over the entire string
16
16
  char = self.get_char_at()
17
17
  while char and char not in ["]", "}"]:
18
- self.skip_whitespaces_at()
18
+ self.skip_whitespaces()
19
19
  value: JSONReturnType = ""
20
20
  if char in STRING_DELIMITERS:
21
21
  # Sometimes it can happen that LLMs forget to start an object and then you think it's a string in an array
@@ -23,13 +23,13 @@ def parse_array(self: "JSONParser") -> list[JSONReturnType]:
23
23
  # And either parse the string or parse the object
24
24
  i = 1
25
25
  i = self.skip_to_character(char, i)
26
- i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
26
+ i = self.scroll_whitespaces(idx=i + 1)
27
27
  value = self.parse_object() if self.get_char_at(i) == ":" else self.parse_string()
28
28
  else:
29
29
  value = self.parse_json()
30
30
 
31
- # It is possible that parse_json() returns nothing valid, so we increase by 1
32
- if ObjectComparer.is_strictly_empty(value):
31
+ # It is possible that parse_json() returns nothing valid, so we increase by 1, unless we find an array separator
32
+ if ObjectComparer.is_strictly_empty(value) and self.get_char_at() not in ["]", ","]:
33
33
  self.index += 1
34
34
  elif value == "..." and self.get_char_at(-1) == ".":
35
35
  self.log(
@@ -45,7 +45,7 @@ def parse_array(self: "JSONParser") -> list[JSONReturnType]:
45
45
  char = self.get_char_at()
46
46
 
47
47
  # Especially at the end of an LLM generated json you might miss the last "]"
48
- if char and char != "]":
48
+ if char != "]":
49
49
  self.log(
50
50
  "While parsing an array we missed the closing ], ignoring it",
51
51
  )
@@ -1,7 +1,7 @@
1
1
  from typing import TYPE_CHECKING
2
2
 
3
- from .constants import JSONReturnType
4
- from .json_context import ContextValues
3
+ from .utils.constants import JSONReturnType
4
+ from .utils.json_context import ContextValues
5
5
 
6
6
  if TYPE_CHECKING:
7
7
  from .json_parser import JSONParser
@@ -1,6 +1,7 @@
1
1
  from typing import TYPE_CHECKING
2
2
 
3
- from .json_context import ContextValues
3
+ from .utils.constants import JSONReturnType
4
+ from .utils.json_context import ContextValues
4
5
 
5
6
  NUMBER_CHARS: set[str] = set("0123456789-.eE/,")
6
7
 
@@ -9,7 +10,7 @@ if TYPE_CHECKING:
9
10
  from .json_parser import JSONParser
10
11
 
11
12
 
12
- def parse_number(self: "JSONParser") -> float | int | str | bool | None:
13
+ def parse_number(self: "JSONParser") -> JSONReturnType:
13
14
  # <number> is a valid real number expressed in one of a number of given formats
14
15
  number_str = ""
15
16
  char = self.get_char_at()
@@ -1,13 +1,13 @@
1
1
  from typing import TYPE_CHECKING
2
2
 
3
- from .constants import STRING_DELIMITERS, JSONReturnType
4
- from .json_context import ContextValues
3
+ from .utils.constants import STRING_DELIMITERS, JSONReturnType
4
+ from .utils.json_context import ContextValues
5
5
 
6
6
  if TYPE_CHECKING:
7
7
  from .json_parser import JSONParser
8
8
 
9
9
 
10
- def parse_object(self: "JSONParser") -> dict[str, JSONReturnType]:
10
+ def parse_object(self: "JSONParser") -> JSONReturnType:
11
11
  # <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
12
12
  obj: dict[str, JSONReturnType] = {}
13
13
  start_index = self.index
@@ -17,10 +17,10 @@ def parse_object(self: "JSONParser") -> dict[str, JSONReturnType]:
17
17
  # <member> ::= <string> ': ' <json>
18
18
 
19
19
  # Skip filler whitespaces
20
- self.skip_whitespaces_at()
20
+ self.skip_whitespaces()
21
21
 
22
22
  # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
23
- if (self.get_char_at() or "") == ":":
23
+ if self.get_char_at() == ":":
24
24
  self.log(
25
25
  "While parsing an object we found a : before a key, ignoring",
26
26
  )
@@ -53,14 +53,14 @@ def parse_object(self: "JSONParser") -> dict[str, JSONReturnType]:
53
53
  prev_value.extend(
54
54
  new_array[0] if len(new_array) == 1 and isinstance(new_array[0], list) else new_array
55
55
  )
56
- self.skip_whitespaces_at()
56
+ self.skip_whitespaces()
57
57
  if self.get_char_at() == ",":
58
58
  self.index += 1
59
- self.skip_whitespaces_at()
59
+ self.skip_whitespaces()
60
60
  continue
61
61
  key = str(self.parse_string())
62
62
  if key == "":
63
- self.skip_whitespaces_at()
63
+ self.skip_whitespaces()
64
64
  if key != "" or (key == "" and self.get_char_at() in [":", "}"]):
65
65
  # If the string is empty but there is a object divider, we are done here
66
66
  break
@@ -74,16 +74,16 @@ def parse_object(self: "JSONParser") -> dict[str, JSONReturnType]:
74
74
  break
75
75
 
76
76
  # Skip filler whitespaces
77
- self.skip_whitespaces_at()
77
+ self.skip_whitespaces()
78
78
 
79
79
  # We reached the end here
80
80
  if (self.get_char_at() or "}") == "}":
81
81
  continue
82
82
 
83
- self.skip_whitespaces_at()
83
+ self.skip_whitespaces()
84
84
 
85
85
  # An extreme case of missing ":" after a key
86
- if (self.get_char_at() or "") != ":":
86
+ if self.get_char_at() != ":":
87
87
  self.log(
88
88
  "While parsing an object we missed a : after a key",
89
89
  )
@@ -92,10 +92,10 @@ def parse_object(self: "JSONParser") -> dict[str, JSONReturnType]:
92
92
  self.context.reset()
93
93
  self.context.set(ContextValues.OBJECT_VALUE)
94
94
  # The value can be any valid json
95
- self.skip_whitespaces_at()
95
+ self.skip_whitespaces()
96
96
  # Corner case, a lone comma
97
97
  value: JSONReturnType = ""
98
- if (self.get_char_at() or "") in [",", "}"]:
98
+ if self.get_char_at() in [",", "}"]:
99
99
  self.log(
100
100
  "While parsing an object value we found a stray , ignoring it",
101
101
  )
@@ -106,11 +106,11 @@ def parse_object(self: "JSONParser") -> dict[str, JSONReturnType]:
106
106
  self.context.reset()
107
107
  obj[key] = value
108
108
 
109
- if (self.get_char_at() or "") in [",", "'", '"']:
109
+ if self.get_char_at() in [",", "'", '"']:
110
110
  self.index += 1
111
111
 
112
112
  # Remove trailing spaces
113
- self.skip_whitespaces_at()
113
+ self.skip_whitespaces()
114
114
 
115
115
  self.index += 1
116
116
 
@@ -126,12 +126,12 @@ def parse_object(self: "JSONParser") -> dict[str, JSONReturnType]:
126
126
  if not self.context.empty:
127
127
  return obj
128
128
 
129
- self.skip_whitespaces_at()
130
- if (self.get_char_at() or "") != ",":
129
+ self.skip_whitespaces()
130
+ if self.get_char_at() != ",":
131
131
  return obj
132
132
  self.index += 1
133
- self.skip_whitespaces_at()
134
- if (self.get_char_at() or "") not in STRING_DELIMITERS:
133
+ self.skip_whitespaces()
134
+ if self.get_char_at() not in STRING_DELIMITERS:
135
135
  return obj
136
136
  self.log(
137
137
  "Found a comma and string delimiter after object closing brace, checking for additional key-value pairs",
@@ -1,13 +1,22 @@
1
1
  from typing import TYPE_CHECKING
2
2
 
3
- from .constants import STRING_DELIMITERS, JSONReturnType
4
- from .json_context import ContextValues
3
+ from .parse_string_helpers.parse_boolean_or_null import parse_boolean_or_null
4
+ from .parse_string_helpers.parse_json_llm_block import parse_json_llm_block
5
+ from .utils.constants import STRING_DELIMITERS, JSONReturnType
6
+ from .utils.json_context import ContextValues
5
7
 
6
8
  if TYPE_CHECKING:
7
9
  from .json_parser import JSONParser
8
10
 
9
11
 
10
12
  def parse_string(self: "JSONParser") -> JSONReturnType:
13
+ # Utility function to append a character to the accumulator and update the index
14
+ def _append_literal_char(acc: str, current_char: str | None) -> tuple[str, str | None]:
15
+ acc += str(current_char)
16
+ self.index += 1
17
+ char = self.get_char_at()
18
+ return acc, char
19
+
11
20
  # <string> is a string of valid characters enclosed in quotes
12
21
  # i.e. { name: "John" }
13
22
  # Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
@@ -39,7 +48,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
39
48
  # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
40
49
  # But remember, object keys are only of type string
41
50
  if char.lower() in ["t", "f", "n"] and self.context.current != ContextValues.OBJECT_KEY:
42
- value = self.parse_boolean_or_null()
51
+ value = parse_boolean_or_null(self)
43
52
  if value != "":
44
53
  return value
45
54
  self.log(
@@ -49,12 +58,21 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
49
58
 
50
59
  if not missing_quotes:
51
60
  self.index += 1
52
-
61
+ if self.get_char_at() == "`":
62
+ ret_val = parse_json_llm_block(self)
63
+ # If we found a valid JSON block, return it, otherwise continue parsing the string
64
+ if ret_val is not False:
65
+ return ret_val
66
+ self.log(
67
+ "While parsing a string, we found code fences but they did not enclose valid JSON, continuing parsing the string",
68
+ )
53
69
  # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
54
- if self.get_char_at() in STRING_DELIMITERS and self.get_char_at() == lstring_delimiter:
70
+ if self.get_char_at() == lstring_delimiter:
55
71
  # If it's an empty key, this was easy
56
- if (self.context.current == ContextValues.OBJECT_KEY and self.get_char_at(1) == ":") or (
57
- self.context.current == ContextValues.OBJECT_VALUE and self.get_char_at(1) in [",", "}"]
72
+ if (
73
+ (self.context.current == ContextValues.OBJECT_KEY and self.get_char_at(1) == ":")
74
+ or (self.context.current == ContextValues.OBJECT_VALUE and self.get_char_at(1) in [",", "}"])
75
+ or (self.context.current == ContextValues.ARRAY and self.get_char_at(1) in [",", "]"])
58
76
  ):
59
77
  self.index += 1
60
78
  return ""
@@ -69,7 +87,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
69
87
  next_c = self.get_char_at(i)
70
88
  # Now check that the next character is also a delimiter to ensure that we have "".....""
71
89
  # In that case we ignore this rstring delimiter
72
- if next_c and (self.get_char_at(i + 1) or "") == rstring_delimiter:
90
+ if self.get_char_at(i + 1) == rstring_delimiter:
73
91
  self.log(
74
92
  "While parsing a string, we found a valid starting doubled quote",
75
93
  )
@@ -77,7 +95,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
77
95
  self.index += 1
78
96
  else:
79
97
  # Ok this is not a doubled quote, check if this is an empty string or not
80
- i = self.skip_whitespaces_at(idx=1, move_main_index=False)
98
+ i = self.scroll_whitespaces(idx=1)
81
99
  next_c = self.get_char_at(i)
82
100
  if next_c in STRING_DELIMITERS + ["{", "["]:
83
101
  # something fishy is going on here
@@ -127,7 +145,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
127
145
  ):
128
146
  rstring_delimiter_missing = True
129
147
  # check if this is a case in which the closing comma is NOT missing instead
130
- self.skip_whitespaces_at()
148
+ self.skip_whitespaces()
131
149
  if self.get_char_at(1) == "\\":
132
150
  # Ok this is a quoted string, skip
133
151
  rstring_delimiter_missing = False
@@ -137,7 +155,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
137
155
  i += 1
138
156
  # found a delimiter, now we need to check that is followed strictly by a comma or brace
139
157
  # or the string ended
140
- i = self.skip_whitespaces_at(idx=i, move_main_index=False)
158
+ i = self.scroll_whitespaces(idx=i)
141
159
  next_c = self.get_char_at(i)
142
160
  if not next_c or next_c in [",", "}"]:
143
161
  rstring_delimiter_missing = False
@@ -152,7 +170,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
152
170
  else:
153
171
  # But again, this could just be something a bit stupid like "lorem, "ipsum" sic"
154
172
  # Check if we find a : afterwards (skipping space)
155
- i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
173
+ i = self.scroll_whitespaces(idx=i + 1)
156
174
  next_c = self.get_char_at(i)
157
175
  if next_c and next_c != ":":
158
176
  rstring_delimiter_missing = False
@@ -167,7 +185,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
167
185
  break
168
186
  else:
169
187
  # skip any whitespace first
170
- i = self.skip_whitespaces_at(idx=1, move_main_index=False)
188
+ i = self.scroll_whitespaces(idx=1)
171
189
  # We couldn't find any rstring_delimeter before the end of the string
172
190
  # check if this is the last string of an object and therefore we can keep going
173
191
  # make an exception if this is the last char before the closing brace
@@ -204,19 +222,15 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
204
222
  if self.context.current == ContextValues.OBJECT_VALUE and char == "}":
205
223
  # We found the end of an object while parsing a value
206
224
  # Check if the object is really over, to avoid doubling the closing brace
207
- i = self.skip_whitespaces_at(idx=1, move_main_index=False)
225
+ i = self.scroll_whitespaces(idx=1)
208
226
  next_c = self.get_char_at(i)
209
- if next_c and next_c == "`":
227
+ if next_c == "`" and self.get_char_at(i + 1) == "`" and self.get_char_at(i + 2) == "`":
210
228
  # This could be a special case in which the LLM added code fences after the object
211
229
  # So we need to check if there are another two ` after this one`
212
- next_c = self.get_char_at(i + 1)
213
- if next_c and next_c == "`":
214
- next_c = self.get_char_at(i + 2)
215
- if next_c and next_c == "`":
216
- self.log(
217
- "While parsing a string in object value context, we found a } that closes the object before code fences, stopping here",
218
- )
219
- break
230
+ self.log(
231
+ "While parsing a string in object value context, we found a } that closes the object before code fences, stopping here",
232
+ )
233
+ break
220
234
  if not next_c:
221
235
  self.log(
222
236
  "While parsing a string in object value context, we found a } that closes the object, stopping here",
@@ -274,12 +288,11 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
274
288
  # found a second delimiter
275
289
  i += 1
276
290
  # Skip spaces
277
- i = self.skip_whitespaces_at(idx=i, move_main_index=False)
278
- next_c = self.get_char_at(i)
279
- if next_c and next_c in [",", "}"]:
291
+ i = self.scroll_whitespaces(idx=i)
292
+ if self.get_char_at(i) in [",", "}"]:
280
293
  # Ok then this is a missing right quote
281
294
  self.log(
282
- "While parsing a string missing the right delimiter in object key context, we found a :, stopping here",
295
+ "While parsing a string missing the right delimiter in object key context, we found a , or } stopping here",
283
296
  )
284
297
  break
285
298
  else:
@@ -308,9 +321,8 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
308
321
  # We found a quote, now let's make sure there's a ":" following
309
322
  i += 1
310
323
  # found a delimiter, now we need to check that is followed strictly by a comma or brace
311
- i = self.skip_whitespaces_at(idx=i, move_main_index=False)
312
- next_c = self.get_char_at(i)
313
- if next_c and next_c == ":":
324
+ i = self.scroll_whitespaces(idx=i)
325
+ if self.get_char_at(i) == ":":
314
326
  # Reset the cursor
315
327
  self.index -= 1
316
328
  char = self.get_char_at()
@@ -320,9 +332,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
320
332
  break
321
333
  elif unmatched_delimiter:
322
334
  unmatched_delimiter = False
323
- string_acc += str(char)
324
- self.index += 1
325
- char = self.get_char_at()
335
+ string_acc, char = _append_literal_char(string_acc, char)
326
336
  else:
327
337
  # Check if eventually there is a rstring delimiter, otherwise we bail
328
338
  i = 1
@@ -357,22 +367,20 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
357
367
  next_c = self.get_char_at(i)
358
368
  # Ok now I found a delimiter, let's skip whitespaces and see if next we find a } or a ,
359
369
  i += 1
360
- i = self.skip_whitespaces_at(idx=i, move_main_index=False)
370
+ i = self.scroll_whitespaces(idx=i)
361
371
  next_c = self.get_char_at(i)
362
372
  if next_c in ["}", ","]:
363
373
  self.log(
364
- "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
374
+ "While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
365
375
  )
366
- string_acc += str(char)
367
- self.index += 1
368
- char = self.get_char_at()
376
+ string_acc, char = _append_literal_char(string_acc, char)
369
377
  continue
370
378
  elif next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\":
371
379
  # Check if self.index:self.index+i is only whitespaces, break if that's the case
372
380
  if all(str(self.get_char_at(j)).isspace() for j in range(1, i) if self.get_char_at(j)):
373
381
  break
374
382
  if self.context.current == ContextValues.OBJECT_VALUE:
375
- i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
383
+ i = self.scroll_whitespaces(idx=i + 1)
376
384
  if self.get_char_at(i) == ",":
377
385
  # So we found a comma, this could be a case of a single quote like "va"lue",
378
386
  # Search if it's followed by another key, starting with the first delimeter
@@ -380,15 +388,13 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
380
388
  i += 1
381
389
  i = self.skip_to_character(character=rstring_delimiter, idx=i + 1)
382
390
  i += 1
383
- i = self.skip_whitespaces_at(idx=i, move_main_index=False)
391
+ i = self.scroll_whitespaces(idx=i)
384
392
  next_c = self.get_char_at(i)
385
393
  if next_c == ":":
386
394
  self.log(
387
- "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
395
+ "While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
388
396
  )
389
- string_acc += str(char)
390
- self.index += 1
391
- char = self.get_char_at()
397
+ string_acc, char = _append_literal_char(string_acc, char)
392
398
  continue
393
399
  # We found a delimiter and we need to check if this is a key
394
400
  # so find a rstring_delimiter and a colon after
@@ -405,12 +411,10 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
405
411
  # Only if we fail to find a ':' then we know this is misplaced quote
406
412
  if next_c != ":":
407
413
  self.log(
408
- "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
414
+ "While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
409
415
  )
410
416
  unmatched_delimiter = not unmatched_delimiter
411
- string_acc += str(char)
412
- self.index += 1
413
- char = self.get_char_at()
417
+ string_acc, char = _append_literal_char(string_acc, char)
414
418
  elif self.context.current == ContextValues.ARRAY:
415
419
  # So here we can have a few valid cases:
416
420
  # ["bla bla bla "puppy" bla bla bla "kitty" bla bla"]
@@ -434,9 +438,7 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
434
438
  "While parsing a string in Array context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
435
439
  )
436
440
  unmatched_delimiter = not unmatched_delimiter
437
- string_acc += str(char)
438
- self.index += 1
439
- char = self.get_char_at()
441
+ string_acc, char = _append_literal_char(string_acc, char)
440
442
  else:
441
443
  break
442
444
  elif self.context.current == ContextValues.OBJECT_KEY:
@@ -444,14 +446,12 @@ def parse_string(self: "JSONParser") -> JSONReturnType:
444
446
  self.log(
445
447
  "While parsing a string in Object Key context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
446
448
  )
447
- string_acc += str(char)
448
- self.index += 1
449
- char = self.get_char_at()
449
+ string_acc, char = _append_literal_char(string_acc, char)
450
450
  if char and missing_quotes and self.context.current == ContextValues.OBJECT_KEY and char.isspace():
451
451
  self.log(
452
452
  "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
453
453
  )
454
- self.skip_whitespaces_at()
454
+ self.skip_whitespaces()
455
455
  if self.get_char_at() not in [":", ","]:
456
456
  return ""
457
457
 
@@ -0,0 +1,28 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ if TYPE_CHECKING:
4
+ from ..json_parser import JSONParser # noqa: TID252
5
+
6
+
7
+ def parse_boolean_or_null(parser: "JSONParser") -> bool | str | None:
8
+ # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
9
+ char = (parser.get_char_at() or "").lower()
10
+ value_map: dict[str, tuple[str, bool | None]] = {
11
+ "t": ("true", True),
12
+ "f": ("false", False),
13
+ "n": ("null", None),
14
+ }
15
+ value: tuple[str, bool | None] = value_map[char]
16
+
17
+ i = 0
18
+ starting_index = parser.index
19
+ while char and i < len(value[0]) and char == value[0][i]:
20
+ i += 1
21
+ parser.index += 1
22
+ char = (parser.get_char_at() or "").lower()
23
+ if i == len(value[0]):
24
+ return value[1]
25
+
26
+ # If nothing works reset the index before returning
27
+ parser.index = starting_index
28
+ return ""
@@ -0,0 +1,19 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ from ..utils.constants import JSONReturnType # noqa: TID252
4
+
5
+ if TYPE_CHECKING:
6
+ from ..json_parser import JSONParser # noqa: TID252
7
+
8
+
9
def parse_json_llm_block(parser: "JSONParser") -> JSONReturnType:
    """
    Parse the JSON payload of a ```json ... ``` fenced block at the cursor.

    The block is accepted only when the text at ``parser.index`` starts with
    the opening fence and the first backtick located after that fence begins
    a closing triple-backtick fence. On acceptance the cursor is moved past
    the opening fence and the result of ``parser.parse_json()`` is returned.
    Otherwise ``False`` is returned and this function does not advance
    ``parser.index`` itself.
    """
    opening_fence = "```json"
    closing_fence = "```"

    start = parser.index
    if parser.json_str[start : start + len(opening_fence)] != opening_fence:
        return False

    # The returned value is used as an offset relative to parser.index
    offset = parser.skip_to_character("`", idx=len(opening_fence))
    fence_at = parser.index + offset
    if parser.json_str[fence_at : fence_at + len(closing_fence)] != closing_fence:
        return False

    parser.index += len(opening_fence)  # Move past ```json
    return parser.parse_json()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: json_repair
3
- Version: 0.52.5
3
+ Version: 0.53.1
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License-Expression: MIT
@@ -3,27 +3,27 @@ README.md
3
3
  pyproject.toml
4
4
  src/json_repair/__init__.py
5
5
  src/json_repair/__main__.py
6
- src/json_repair/constants.py
7
- src/json_repair/json_context.py
8
6
  src/json_repair/json_parser.py
9
7
  src/json_repair/json_repair.py
10
- src/json_repair/object_comparer.py
11
8
  src/json_repair/parse_array.py
12
- src/json_repair/parse_boolean_or_null.py
13
9
  src/json_repair/parse_comment.py
14
10
  src/json_repair/parse_number.py
15
11
  src/json_repair/parse_object.py
16
12
  src/json_repair/parse_string.py
17
13
  src/json_repair/py.typed
18
- src/json_repair/string_file_wrapper.py
19
14
  src/json_repair.egg-info/PKG-INFO
20
15
  src/json_repair.egg-info/SOURCES.txt
21
16
  src/json_repair.egg-info/dependency_links.txt
22
17
  src/json_repair.egg-info/entry_points.txt
23
18
  src/json_repair.egg-info/top_level.txt
19
+ src/json_repair/parse_string_helpers/parse_boolean_or_null.py
20
+ src/json_repair/parse_string_helpers/parse_json_llm_block.py
21
+ src/json_repair/utils/constants.py
22
+ src/json_repair/utils/json_context.py
23
+ src/json_repair/utils/object_comparer.py
24
+ src/json_repair/utils/string_file_wrapper.py
24
25
  tests/test_json_repair.py
25
26
  tests/test_parse_array.py
26
- tests/test_parse_boolean_or_null.py
27
27
  tests/test_parse_comment.py
28
28
  tests/test_parse_number.py
29
29
  tests/test_parse_object.py
@@ -100,3 +100,24 @@ def test_leading_trailing_characters():
100
100
  ```""")
101
101
  == '{"key": "value"}'
102
102
  )
103
+
104
+
105
def test_string_json_llm_block():
    """Check repair of string values that contain backticks or a ```json fence."""
    # (input, expected repaired output) pairs, checked in order
    cases = [
        ('{"key": "``"', '{"key": "``"}'),
        ('{"key": "```json"', '{"key": "```json"}'),
        (
            '{"key": "```json {"key": [{"key1": 1},{"key2": 2}]}```"}',
            '{"key": {"key": [{"key1": 1}, {"key2": 2}]}}',
        ),
        ('{"response": "```json{}"', '{"response": "```json{}"}'),
    ]
    for broken, expected in cases:
        assert repair_json(broken) == expected
113
+
114
+
115
def test_parse_boolean_or_null():
    """Check handling of true/false/null literals at top level and inside objects."""
    # Capitalized literals at top level repair to an empty string
    for literal in ("True", "False", "Null"):
        assert repair_json(literal, return_objects=True) == ""

    # Lowercase literals at top level
    assert repair_json("true", return_objects=True)
    assert not repair_json("false", return_objects=True)
    assert repair_json("null", return_objects=True) is None

    # Inside an object, any letter case is normalized to the JSON literals
    expected = '{"key": true, "key2": false, "key3": null}'
    assert repair_json(' {"key": true, "key2": false, "key3": null}') == expected
    assert repair_json('{"key": TRUE, "key2": FALSE, "key3": Null} ') == expected
@@ -11,23 +11,23 @@ from src.json_repair.json_repair import cli
11
11
  def test_cli(capsys):
12
12
  # Create a temporary file
13
13
  temp_fd, temp_path = tempfile.mkstemp(suffix=".json")
14
+ _, tempout_path = tempfile.mkstemp(suffix=".json")
14
15
  try:
15
16
  # Write content to the temporary file
16
17
  with os.fdopen(temp_fd, "w") as tmp:
17
18
  tmp.write("{key:value")
18
- cli(inline_args=[temp_path, "--indent", 0, "--ensure_ascii"])
19
+ cli(inline_args=[temp_path, "--indent", "0", "--ensure_ascii"])
19
20
  captured = capsys.readouterr()
20
21
  assert captured.out == '{\n"key": "value"\n}\n'
21
22
 
22
23
  # Test the output option
23
- tempout_fd, tempout_path = tempfile.mkstemp(suffix=".json")
24
- cli(inline_args=[temp_path, "--indent", 0, "-o", tempout_path])
24
+ cli(inline_args=[temp_path, "--indent", "0", "-o", tempout_path])
25
25
  with open(tempout_path) as tmp:
26
26
  out = tmp.read()
27
27
  assert out == '{\n"key": "value"\n}'
28
28
 
29
29
  # Test the inline option
30
- cli(inline_args=[temp_path, "--indent", 0, "-i"])
30
+ cli(inline_args=[temp_path, "--indent", "0", "-i"])
31
31
  with open(temp_path) as tmp:
32
32
  out = tmp.read()
33
33
  assert out == '{\n"key": "value"\n}'
@@ -43,7 +43,7 @@ def test_cli(capsys):
43
43
  expected_output = '{\n"key": "value"\n}\n'
44
44
  # Patch sys.stdin so that cli() reads from it instead of a file.
45
45
  with patch("sys.stdin", new=io.StringIO(test_input)):
46
- cli(inline_args=["--indent", 0])
46
+ cli(inline_args=["--indent", "0"])
47
47
  captured = capsys.readouterr()
48
48
  assert captured.out == expected_output
49
49
 
@@ -1,30 +0,0 @@
1
- from typing import TYPE_CHECKING
2
-
3
- if TYPE_CHECKING:
4
- from .json_parser import JSONParser
5
-
6
-
7
- def parse_boolean_or_null(self: "JSONParser") -> bool | str | None:
8
- # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
9
- starting_index = self.index
10
- char = (self.get_char_at() or "").lower()
11
- value: tuple[str, bool | None] | None = None
12
- if char == "t":
13
- value = ("true", True)
14
- elif char == "f":
15
- value = ("false", False)
16
- elif char == "n":
17
- value = ("null", None)
18
-
19
- if value:
20
- i = 0
21
- while char and i < len(value[0]) and char == value[0][i]:
22
- i += 1
23
- self.index += 1
24
- char = (self.get_char_at() or "").lower()
25
- if i == len(value[0]):
26
- return value[1]
27
-
28
- # If nothing works reset the index before returning
29
- self.index = starting_index
30
- return ""
@@ -1,12 +0,0 @@
1
- from src.json_repair.json_repair import repair_json
2
-
3
-
4
- def test_parse_boolean_or_null():
5
- assert repair_json("True", return_objects=True) == ""
6
- assert repair_json("False", return_objects=True) == ""
7
- assert repair_json("Null", return_objects=True) == ""
8
- assert repair_json("true", return_objects=True)
9
- assert not repair_json("false", return_objects=True)
10
- assert repair_json("null", return_objects=True) is None
11
- assert repair_json(' {"key": true, "key2": false, "key3": null}') == '{"key": true, "key2": false, "key3": null}'
12
- assert repair_json('{"key": TRUE, "key2": FALSE, "key3": Null} ') == '{"key": true, "key2": false, "key3": null}'
File without changes
File without changes
File without changes