json-repair 0.13.1__py3-none-any.whl → 0.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,17 +27,25 @@ from typing import Any, Dict, List, Union, TextIO
27
27
 
28
28
 
29
29
  class JSONParser:
30
- def __init__(self, json_str: str) -> None:
30
+ def __init__(self, json_str: str, logging: bool = False) -> None:
31
31
  # The string to parse
32
32
  self.json_str = json_str
33
33
  # Index is our iterator that will keep track of which character we are looking at right now
34
34
  self.index = 0
35
35
  # This is used in the object member parsing to manage the special cases of missing quotes in key or value
36
- self.context = ""
37
- self.context_stack = []
36
+ self.context = []
37
+ # Use this to log the activity, but only if logging is active
38
+ self.logger = {
39
+ "log": [],
40
+ "window": 10,
41
+ "log_level": "info" if logging else "none",
42
+ }
38
43
 
39
44
  def parse(self) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
40
- return self.parse_json()
45
+ if self.logger["log_level"] == "none":
46
+ return self.parse_json()
47
+ else:
48
+ return self.parse_json(), self.logger["log"]
41
49
 
42
50
  def parse_json(
43
51
  self,
@@ -56,7 +64,11 @@ class JSONParser:
56
64
  return self.parse_array()
57
65
  # there can be an edge case in which a key is empty and at the end of an object
58
66
  # like "key": }. We return an empty string here to close the object properly
59
- elif char == "}" and self.context == "object_value":
67
+ elif char == "}" and self.get_context() == "object_value":
68
+ self.log(
69
+ "At the end of an object we found a key with missing value, skipping",
70
+ "info",
71
+ )
60
72
  return ""
61
73
  # <string> starts with '"'
62
74
  elif char == '"':
@@ -92,13 +104,17 @@ class JSONParser:
92
104
 
93
105
  # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
94
106
  if (self.get_char_at() or "") == ":":
107
+ self.log(
108
+ "While parsing an object we found a : before a key, replacing with ,",
109
+ "info",
110
+ )
95
111
  self.remove_char_at()
96
112
  self.insert_char_at(",")
97
113
  self.index += 1
98
114
 
99
115
  # We are now searching for the string key
100
116
  # Context is used in the string parser to manage the lack of quotes
101
- self.update_context("object_key")
117
+ self.set_context("object_key")
102
118
 
103
119
  self.skip_whitespaces_at()
104
120
 
@@ -110,6 +126,10 @@ class JSONParser:
110
126
  # This can happen sometimes like { "": "value" }
111
127
  if key == "" and self.get_char_at() == ":":
112
128
  key = "empty_placeholder"
129
+ self.log(
130
+ "While parsing an object we found an empty key, replacing with empty_placeholder",
131
+ "info",
132
+ )
113
133
  break
114
134
 
115
135
  # We reached the end here
@@ -118,15 +138,19 @@ class JSONParser:
118
138
 
119
139
  # An extreme case of missing ":" after a key
120
140
  if (self.get_char_at() or "") != ":":
141
+ self.log(
142
+ "While parsing an object we missed a : after a key, adding it back",
143
+ "info",
144
+ )
121
145
  self.insert_char_at(":")
122
146
  self.index += 1
123
- self.update_context("")
124
- self.update_context("object_value")
147
+ self.reset_context()
148
+ self.set_context("object_value")
125
149
  # The value can be any valid json
126
150
  value = self.parse_json()
127
151
 
128
152
  # Reset context since our job is done
129
- self.update_context("")
153
+ self.reset_context()
130
154
  obj[key] = value
131
155
 
132
156
  if (self.get_char_at() or "") in [",", "'", '"']:
@@ -137,6 +161,10 @@ class JSONParser:
137
161
 
138
162
  # Especially at the end of an LLM generated json you might miss the last "}"
139
163
  if (self.get_char_at() or "}") != "}":
164
+ self.log(
165
+ "While parsing an object, we couldn't find the closing }, adding it back",
166
+ "info",
167
+ )
140
168
  self.insert_char_at("}")
141
169
  self.index += 1
142
170
  return obj
@@ -160,15 +188,22 @@ class JSONParser:
160
188
  self.index += 1
161
189
  char = self.get_char_at()
162
190
  # If this is the right value of an object and we are closing the object, it means the array is over
163
- if self.context == "object_value" and char == "}":
191
+ if self.get_context() == "object_value" and char == "}":
164
192
  break
165
193
 
166
194
  # Especially at the end of an LLM generated json you might miss the last "]"
167
195
  char = self.get_char_at()
168
196
  if char and char != "]":
197
+ self.log(
198
+ "While parsing an array we missed the closing ], adding it back", "info"
199
+ )
169
200
  # Sometimes when you fix a missing "]" you'll have a trailing "," there that makes the JSON invalid
170
201
  if char == ",":
171
202
  # Remove trailing "," before adding the "]"
203
+ self.log(
204
+ "While parsing an array, remove a trailing , before adding ]",
205
+ "info",
206
+ )
172
207
  self.remove_char_at()
173
208
  self.insert_char_at("]")
174
209
  self.index -= 1
@@ -183,18 +218,23 @@ class JSONParser:
183
218
 
184
219
  # Flag to manage corner cases related to missing starting quote
185
220
  fixed_quotes = False
186
- double_delimiter = False
187
221
  lstring_delimiter = rstring_delimiter = '"'
188
222
  if isinstance(string_quotes, list):
189
223
  lstring_delimiter = string_quotes[0]
190
224
  rstring_delimiter = string_quotes[1]
191
225
  elif isinstance(string_quotes, str):
192
226
  lstring_delimiter = rstring_delimiter = string_quotes
227
+ # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
193
228
  if self.get_char_at(1) == lstring_delimiter:
194
- double_delimiter = True
229
+ self.log(
230
+ "While parsing a string, we found a doubled quote, ignoring it", "info"
231
+ )
195
232
  self.index += 1
196
233
  char = self.get_char_at()
197
234
  if char != lstring_delimiter:
235
+ self.log(
236
+ "While parsing a string, we found no starting quote, adding it", "info"
237
+ )
198
238
  self.insert_char_at(lstring_delimiter)
199
239
  fixed_quotes = True
200
240
  else:
@@ -210,12 +250,13 @@ class JSONParser:
210
250
  # * It iterated over the entire sequence
211
251
  # * If we are fixing missing quotes in an object, when it finds the special terminators
212
252
  char = self.get_char_at()
213
- fix_broken_markdown_link = False
214
253
  while char and char != rstring_delimiter:
215
254
  if fixed_quotes:
216
- if self.context == "object_key" and (char == ":" or char.isspace()):
255
+ if self.get_context() == "object_key" and (
256
+ char == ":" or char.isspace()
257
+ ):
217
258
  break
218
- elif self.context == "object_value" and char in [",", "}"]:
259
+ elif self.get_context() == "object_value" and char in [",", "}"]:
219
260
  break
220
261
  self.index += 1
221
262
  char = self.get_char_at()
@@ -227,21 +268,46 @@ class JSONParser:
227
268
  else:
228
269
  self.remove_char_at(-1)
229
270
  self.index -= 1
230
- # ChatGPT sometimes forget to quote links in markdown like: { "content": "[LINK]("https://google.com")" }
271
+ # ChatGPT sometimes forgets to quote stuff in html tags or markdown, so we do this whole thing here
231
272
  if (
232
273
  char == rstring_delimiter
233
- # Next character is not a comma
234
- and self.get_char_at(1) != ","
235
- and (
236
- fix_broken_markdown_link
237
- or (self.get_char_at(-2) == "]" and self.get_char_at(-1)) == "("
238
- )
274
+ # Next character is not a delimiter
275
+ and self.get_char_at(1) not in [",", ":", "]", "}"]
239
276
  ):
240
- fix_broken_markdown_link = not fix_broken_markdown_link
241
- self.index += 1
242
- char = self.get_char_at()
243
-
244
- if char and fixed_quotes and self.context == "object_key" and char.isspace():
277
+ # Special case here, in case of double quotes one after another
278
+ if self.get_char_at(1) == rstring_delimiter:
279
+ self.log(
280
+ "While parsing a string, we found a doubled quote, ignoring it",
281
+ "info",
282
+ )
283
+ # self destruct this character
284
+ self.remove_char_at()
285
+ else:
286
+ # Check if eventually there is a rstring delimiter, otherwise we bail
287
+ i = 2
288
+ next_c = self.get_char_at(i)
289
+ while next_c and next_c != rstring_delimiter:
290
+ i += 1
291
+ next_c = self.get_char_at(i)
292
+ # In that case we ignore this rstring delimiter
293
+ if next_c:
294
+ self.log(
295
+ "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
296
+ "info",
297
+ )
298
+ self.index += 1
299
+ char = self.get_char_at()
300
+
301
+ if (
302
+ char
303
+ and fixed_quotes
304
+ and self.get_context() == "object_key"
305
+ and char.isspace()
306
+ ):
307
+ self.log(
308
+ "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
309
+ "info",
310
+ )
245
311
  self.skip_whitespaces_at()
246
312
  if self.get_char_at() not in [":", ","]:
247
313
  return ""
@@ -250,13 +316,15 @@ class JSONParser:
250
316
 
251
317
  # A fallout of the previous special case in the while loop, we need to update the index only if we had a closing quote
252
318
  if char != rstring_delimiter:
319
+ self.log(
320
+ "While parsing a string, we missed the closing quote, adding it back",
321
+ "info",
322
+ )
253
323
  self.insert_char_at(rstring_delimiter)
254
324
  else:
255
325
  self.index += 1
256
- if double_delimiter and self.get_char_at() == rstring_delimiter:
257
- self.index += 1
258
326
 
259
- return self.json_str[start:end]
327
+ return self.json_str[start:end].rstrip()
260
328
 
261
329
  def parse_number(self) -> Union[float, int, str]:
262
330
  # <number> is a valid real number expressed in one of a number of given formats
@@ -325,30 +393,52 @@ class JSONParser:
325
393
  except IndexError:
326
394
  return
327
395
 
328
- def update_context(self, value: str) -> None:
396
+ def set_context(self, value: str) -> None:
329
397
  # If a value is provided update the context variable and save in stack
330
398
  if value:
331
- if self.context:
332
- self.context_stack.append(self.context)
333
- self.context = value
334
- # Otherwise pop and update the context, or empty if the stack is empty
335
- else:
336
- try:
337
- self.context = self.context_stack.pop()
338
- except Exception:
339
- self.context = ""
399
+ self.context.append(value)
400
+
401
+ def reset_context(self) -> None:
402
+ try:
403
+ self.context.pop()
404
+ except Exception:
405
+ return
406
+
407
+ def get_context(self) -> str:
408
+ try:
409
+ return self.context[0]
410
+ except Exception:
411
+ return ""
412
+
413
+ def log(self, text: str, level: str) -> None:
414
+ if level == self.logger["log_level"]:
415
+ self.logger["log"].append(
416
+ {
417
+ "text": text,
418
+ "context": self.json_str[
419
+ self.index
420
+ - self.logger["window"] : self.index
421
+ + self.logger["window"]
422
+ ],
423
+ }
424
+ )
340
425
 
341
426
 
342
427
  def repair_json(
343
- json_str: str, return_objects: bool = False, skip_json_loads: bool = False
428
+ json_str: str,
429
+ return_objects: bool = False,
430
+ skip_json_loads: bool = False,
431
+ logging: bool = False,
344
432
  ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
345
433
  """
346
434
  Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
347
435
  It will return the fixed string by default.
348
436
  When `return_objects=True` is passed, it will return the decoded data structure instead.
437
+ When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
438
+ When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
349
439
  """
350
440
  json_str = json_str.strip().lstrip("```json")
351
- parser = JSONParser(json_str)
441
+ parser = JSONParser(json_str, logging)
352
442
  if skip_json_loads:
353
443
  parsed_json = parser.parse()
354
444
  else:
@@ -357,7 +447,7 @@ def repair_json(
357
447
  except json.JSONDecodeError:
358
448
  parsed_json = parser.parse()
359
449
  # It's useful to return the actual object instead of the json string, it allows this lib to be a replacement of the json library
360
- if return_objects:
450
+ if return_objects or logging:
361
451
  return parsed_json
362
452
  return json.dumps(parsed_json)
363
453
 
@@ -384,3 +474,6 @@ def from_file(
384
474
  fd.close()
385
475
 
386
476
  return jsonobj
477
+
478
+
479
+ print(repair_json('{ "key": "value", "key2": }', logging=True))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: json_repair
3
- Version: 0.13.1
3
+ Version: 0.15.0
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -150,7 +150,10 @@ You will need owner access to this repository
150
150
  - Run `python -m build`
151
151
  - Create a new release in Github, making sure to tag all the issues solved and contributors. Create the new tag, same as the one in the build configuration
152
152
  - Once the release is created, a new Github Actions workflow will start to publish on Pypi, make sure it didn't fail
153
-
153
+ ---
154
+ # Repair JSON in other programming languages
155
+ - Typescript: https://github.com/josdejong/jsonrepair
156
+ - Go: https://github.com/RealAlexandreAI/json-repair
154
157
  ---
155
158
  # Bonus Content
156
159
  If you need some good Custom Instructions (System Message) to improve your chatbot responses try https://gist.github.com/mangiucugna/7ec015c4266df11be8aa510be0110fe4
@@ -0,0 +1,7 @@
1
+ json_repair/__init__.py,sha256=AlNie5y6BZBioGi5fzTAUvum_y0U5aL5aNsuQ_68LQc,175
2
+ json_repair/json_repair.py,sha256=ctuP4AaBrsWBzhF2Al-gX_itHcTG15cqU4Z56KYxNfA,19119
3
+ json_repair-0.15.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
4
+ json_repair-0.15.0.dist-info/METADATA,sha256=bnhSr8AectNHH-ljyaqwIC5BqPkVf2uYwLHuNuUYpyQ,7355
5
+ json_repair-0.15.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
6
+ json_repair-0.15.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
7
+ json_repair-0.15.0.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- json_repair/__init__.py,sha256=AlNie5y6BZBioGi5fzTAUvum_y0U5aL5aNsuQ_68LQc,175
2
- json_repair/json_repair.py,sha256=DB220fZ1BCf--9CeP6AzL2FCk9tpE1Eh3WxgFo33P88,15460
3
- json_repair-0.13.1.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
4
- json_repair-0.13.1.dist-info/METADATA,sha256=gcfdAU5nhvlHlE3wMvGcwHRQUzgPT2p08cgDg7iyLvw,7200
5
- json_repair-0.13.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
6
- json_repair-0.13.1.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
7
- json_repair-0.13.1.dist-info/RECORD,,