json-repair 0.3.0__py3-none-any.whl → 0.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -52,13 +52,17 @@ class JSONParser:
52
52
  # <array> starts with '['
53
53
  elif char == "[":
54
54
  return self.parse_array()
55
+ # there can be an edge case in which a key has no value and sits at the end of an object
56
+ # like "key": }. We return an empty string here to close the object properly
57
+ elif char == "}" and self.context == "object_value":
58
+ return ""
55
59
  # <string> starts with '"'
56
60
  elif char == '"':
57
61
  return self.parse_string()
58
62
  # <number> starts with [0-9] or minus
59
63
  elif char.isdigit() or char == "-":
60
64
  return self.parse_number()
61
- # <boolean> could (T)rue or (F)alse or (N)ull
65
+ # <boolean> could be (T)rue or (F)alse or (N)ull
62
66
  elif char == "t" or char == "f" or char == "n":
63
67
  return self.parse_boolean_or_null()
64
68
  # This might be a <string> that is missing the starting '"'
@@ -67,6 +71,7 @@ class JSONParser:
67
71
  # Ignore whitespaces outside of strings
68
72
  elif char.isspace():
69
73
  self.index += 1
74
+ self.skip_whitespaces_at()
70
75
  return self.parse_json()
71
76
  # If everything else fails, then we give up and return an exception
72
77
  else:
@@ -87,7 +92,7 @@ class JSONParser:
87
92
  # <member> ::= <string> ': ' <json>
88
93
 
89
94
  # Skip filler whitespaces
90
- self.trim()
95
+ self.skip_whitespaces_at()
91
96
 
92
97
  # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
93
98
  if self.get_char_at() == ":":
@@ -100,6 +105,7 @@ class JSONParser:
100
105
  self.context = "object_key"
101
106
 
102
107
  # <member> starts with a <string>
108
+ self.skip_whitespaces_at()
103
109
  key = self.parse_string()
104
110
  while key == "":
105
111
  key = self.parse_string()
@@ -124,7 +130,7 @@ class JSONParser:
124
130
  self.index += 1
125
131
 
126
132
  # Remove trailing spaces
127
- self.trim()
133
+ self.skip_whitespaces_at()
128
134
 
129
135
  # Especially at the end of an LLM generated json you might miss the last "}"
130
136
  if self.get_char_at() and self.get_char_at() != "}":
@@ -172,8 +178,6 @@ class JSONParser:
172
178
  # Flag to manage corner cases related to missing starting quote
173
179
  fixed_quotes = False
174
180
  # i.e. { name: "John" }
175
- # Remove any trailing space
176
- self.trim()
177
181
  if self.get_char_at() != '"':
178
182
  self.insert_char_at('"')
179
183
  fixed_quotes = True
@@ -211,14 +215,10 @@ class JSONParser:
211
215
  and self.get_char_at().isspace()
212
216
  ):
213
217
  # skip whitespaces
214
- self.trim()
218
+ self.skip_whitespaces_at()
215
219
  # This string is invalid if there's no valid termination afterwards
216
220
 
217
- if (
218
- self.get_char_at() != ":"
219
- or self.get_char_at() != ","
220
- or self.get_char_at() != "}"
221
- ):
221
+ if self.get_char_at() not in [":", ","]:
222
222
  return ""
223
223
 
224
224
  end = self.index
@@ -232,11 +232,13 @@ class JSONParser:
232
232
 
233
233
  def parse_number(self) -> Union[float, int]:
234
234
  # <number> is a valid real number expressed in one of a number of given formats
235
- number_pattern = r"-?\d+(\.\d+)?([eE][+-]?\d+)?"
236
- match = re.match(number_pattern, self.json_str[self.index :])
237
- if match:
238
- number_str = match.group()
239
- self.index += len(number_str)
235
+ number_str = ""
236
+ char = self.get_char_at()
237
+ while char and (char.isdigit() or char in "-.eE"):
238
+ number_str += char
239
+ self.index += 1
240
+ char = self.get_char_at()
241
+ if number_str:
240
242
  if "." in number_str or "e" in number_str or "E" in number_str:
241
243
  return float(number_str)
242
244
  else:
@@ -264,23 +266,34 @@ class JSONParser:
264
266
  self.json_str = self.json_str[: self.index] + char + self.json_str[self.index :]
265
267
  self.index += 1
266
268
 
267
- def get_char_at(self, idx=0) -> Union[str, bool]:
269
+ def get_char_at(self) -> Union[str, bool]:
268
270
  # Why not use something simpler? Because we might be out of bounds and doing this check all the time is annoying
269
- idx = self.index + idx
270
- return self.json_str[idx] if idx < len(self.json_str) else False
271
+ try:
272
+ return self.json_str[self.index]
273
+ except IndexError:
274
+ return False
271
275
 
272
- def remove_char_at(self, idx=0) -> None:
273
- idx += 1
274
- self.json_str = self.json_str[: self.index] + self.json_str[self.index + idx :]
276
+ def remove_char_at(self) -> None:
277
+ self.json_str = self.json_str[: self.index] + self.json_str[self.index + 1 :]
275
278
 
276
- def trim(self) -> None:
279
+ def skip_whitespaces_at(self) -> None:
277
280
  # Remove trailing spaces
278
- while self.get_char_at() and self.get_char_at().isspace():
281
+ # I'd rather not do this BUT this method is called so many times that it makes sense to expand get_char_at
282
+ # At least this is what the profiler said and I believe in our lord and savior the profiler
283
+ try:
284
+ char = self.json_str[self.index]
285
+ except IndexError:
286
+ return
287
+ while char and char.isspace():
279
288
  self.index += 1
289
+ try:
290
+ char = self.json_str[self.index]
291
+ except IndexError:
292
+ return
280
293
 
281
294
 
282
295
  def repair_json(
283
- json_str: str, return_objects: bool = False
296
+ json_str: str, return_objects: bool = False, skip_json_loads: bool = False
284
297
  ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
285
298
  """
286
299
  Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
@@ -290,11 +303,14 @@ def repair_json(
290
303
  json_str = re.sub(r"^\s+", "", json_str)
291
304
  json_str = re.sub(r"\s+$", "", json_str)
292
305
  json_str = re.sub(r"/\*.*?\*/", "", json_str)
293
- try:
294
- parsed_json = json.loads(json_str)
295
- except Exception:
296
- parser = JSONParser(json_str)
306
+ parser = JSONParser(json_str)
307
+ if skip_json_loads:
297
308
  parsed_json = parser.parse()
309
+ else:
310
+ try:
311
+ parsed_json = json.loads(json_str)
312
+ except json.JSONDecodeError:
313
+ parsed_json = parser.parse()
298
314
  # It's useful to return the actual object instead of the json string, it allows this lib to be a replacement of the json library
299
315
  if return_objects:
300
316
  return parsed_json
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: json-repair
3
- Version: 0.3.0
3
+ Version: 0.4.1
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -59,7 +59,7 @@ You can look how I used it by checking out this demo: https://huggingface.co/spa
59
59
  # Not even this library could fix this JSON
60
60
 
61
61
  You can use this library to completely replace `json.loads()`:
62
-
62
+
63
63
  import json_repair
64
64
  try:
65
65
  decoded_object = json_repair.loads(json_string)
@@ -74,6 +74,21 @@ or just
74
74
  except Exception:
75
75
  # Manage Exception
76
76
 
77
+ ## Performance
78
+ If you find this library too slow because it is using `json.loads()`, you can skip that step by passing `skip_json_loads=True` to `repair_json`, like this:
79
+
80
+ from json_repair import repair_json
81
+ try:
82
+ good_json_string = repair_json(bad_json_string, skip_json_loads=True)
83
+ except Exception:
84
+ # Not even this library could fix this JSON
85
+
86
+ I made a choice of not using any fast json library to avoid having any external dependency, so that anybody can use it regardless of their stack.
87
+
88
+ Some rules of thumb to use:
89
+ - Setting `return_objects=True` will always be faster because the parser already returns an object and doesn't have to serialize that object to JSON
90
+ - `skip_json_loads` is faster only if you know for certain that the string is not valid JSON
91
+
77
92
  # How it works
78
93
  This module will parse the JSON file following the BNF definition:
79
94
 
@@ -0,0 +1,7 @@
1
+ json_repair/__init__.py,sha256=p9mZnte8Bg18NcxqgJ7vopH2gQv_XbZ0dRnk686QuRE,92
2
+ json_repair/json_repair.py,sha256=Qxy8eQpkm9e1qVUDvhYOuID7flLHIzAsr5cB1NWb3Y4,12974
3
+ json_repair-0.4.1.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
4
+ json_repair-0.4.1.dist-info/METADATA,sha256=MXc1lBLV9wr-DNTqqkzuH8W-VYiDNY9UVsvsHJGqMw4,6155
5
+ json_repair-0.4.1.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
6
+ json_repair-0.4.1.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
7
+ json_repair-0.4.1.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- json_repair/__init__.py,sha256=p9mZnte8Bg18NcxqgJ7vopH2gQv_XbZ0dRnk686QuRE,92
2
- json_repair/json_repair.py,sha256=QdgEnpE4bDqy9nCR9Gw61MB6igfI4K0YFOrBI_dh5mc,12224
3
- json_repair-0.3.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
4
- json_repair-0.3.0.dist-info/METADATA,sha256=EECNyro1Jtknlrtf3vqhl3StFJ_8drVdpTOodvZl7Us,5386
5
- json_repair-0.3.0.dist-info/WHEEL,sha256=Xo9-1PvkuimrydujYJAjF7pCkriuXBpUPEjma1nZyJ0,92
6
- json_repair-0.3.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
7
- json_repair-0.3.0.dist-info/RECORD,,