json-repair 0.14.0__py3-none-any.whl → 0.15.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -27,17 +27,25 @@ from typing import Any, Dict, List, Union, TextIO
27
27
 
28
28
 
29
29
  class JSONParser:
30
- def __init__(self, json_str: str) -> None:
30
+ def __init__(self, json_str: str, logging: bool = False) -> None:
31
31
  # The string to parse
32
32
  self.json_str = json_str
33
33
  # Index is our iterator that will keep track of which character we are looking at right now
34
34
  self.index = 0
35
35
  # This is used in the object member parsing to manage the special cases of missing quotes in key or value
36
- self.context = ""
37
- self.context_stack = []
36
+ self.context = []
37
+ # Use this to log the activity, but only if logging is active
38
+ self.logger = {
39
+ "log": [],
40
+ "window": 10,
41
+ "log_level": "info" if logging else "none",
42
+ }
38
43
 
39
44
  def parse(self) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
40
- return self.parse_json()
45
+ if self.logger["log_level"] == "none":
46
+ return self.parse_json()
47
+ else:
48
+ return self.parse_json(), self.logger["log"]
41
49
 
42
50
  def parse_json(
43
51
  self,
@@ -56,7 +64,11 @@ class JSONParser:
56
64
  return self.parse_array()
57
65
  # there can be an edge case in which a key is empty and at the end of an object
58
66
  # like "key": }. We return an empty string here to close the object properly
59
- elif char == "}" and self.context == "object_value":
67
+ elif char == "}" and self.get_context() == "object_value":
68
+ self.log(
69
+ "At the end of an object we found a key with missing value, skipping",
70
+ "info",
71
+ )
60
72
  return ""
61
73
  # <string> starts with '"'
62
74
  elif char == '"':
@@ -92,13 +104,17 @@ class JSONParser:
92
104
 
93
105
  # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
94
106
  if (self.get_char_at() or "") == ":":
107
+ self.log(
108
+ "While parsing an object we found a : before a key, replacing with ,",
109
+ "info",
110
+ )
95
111
  self.remove_char_at()
96
112
  self.insert_char_at(",")
97
113
  self.index += 1
98
114
 
99
115
  # We are now searching for the string key
100
116
  # Context is used in the string parser to manage the lack of quotes
101
- self.update_context("object_key")
117
+ self.set_context("object_key")
102
118
 
103
119
  self.skip_whitespaces_at()
104
120
 
@@ -110,6 +126,10 @@ class JSONParser:
110
126
  # This can happen sometimes like { "": "value" }
111
127
  if key == "" and self.get_char_at() == ":":
112
128
  key = "empty_placeholder"
129
+ self.log(
130
+ "While parsing an object we found an empty key, replacing with empty_placeholder",
131
+ "info",
132
+ )
113
133
  break
114
134
 
115
135
  # We reached the end here
@@ -118,15 +138,19 @@ class JSONParser:
118
138
 
119
139
  # An extreme case of missing ":" after a key
120
140
  if (self.get_char_at() or "") != ":":
141
+ self.log(
142
+ "While parsing an object we missed a : after a key, adding it back",
143
+ "info",
144
+ )
121
145
  self.insert_char_at(":")
122
146
  self.index += 1
123
- self.update_context("")
124
- self.update_context("object_value")
147
+ self.reset_context()
148
+ self.set_context("object_value")
125
149
  # The value can be any valid json
126
150
  value = self.parse_json()
127
151
 
128
152
  # Reset context since our job is done
129
- self.update_context("")
153
+ self.reset_context()
130
154
  obj[key] = value
131
155
 
132
156
  if (self.get_char_at() or "") in [",", "'", '"']:
@@ -137,6 +161,10 @@ class JSONParser:
137
161
 
138
162
  # Especially at the end of an LLM generated json you might miss the last "}"
139
163
  if (self.get_char_at() or "}") != "}":
164
+ self.log(
165
+ "While parsing an object, we couldn't find the closing }, adding it back",
166
+ "info",
167
+ )
140
168
  self.insert_char_at("}")
141
169
  self.index += 1
142
170
  return obj
@@ -160,15 +188,22 @@ class JSONParser:
160
188
  self.index += 1
161
189
  char = self.get_char_at()
162
190
  # If this is the right value of an object and we are closing the object, it means the array is over
163
- if self.context == "object_value" and char == "}":
191
+ if self.get_context() == "object_value" and char == "}":
164
192
  break
165
193
 
166
194
  # Especially at the end of an LLM generated json you might miss the last "]"
167
195
  char = self.get_char_at()
168
196
  if char and char != "]":
197
+ self.log(
198
+ "While parsing an array we missed the closing ], adding it back", "info"
199
+ )
169
200
  # Sometimes when you fix a missing "]" you'll have a trailing "," there that makes the JSON invalid
170
201
  if char == ",":
171
202
  # Remove trailing "," before adding the "]"
203
+ self.log(
204
+ "While parsing an array, remove a trailing , before adding ]",
205
+ "info",
206
+ )
172
207
  self.remove_char_at()
173
208
  self.insert_char_at("]")
174
209
  self.index -= 1
@@ -191,9 +226,15 @@ class JSONParser:
191
226
  lstring_delimiter = rstring_delimiter = string_quotes
192
227
  # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
193
228
  if self.get_char_at(1) == lstring_delimiter:
229
+ self.log(
230
+ "While parsing a string, we found a doubled quote, ignoring it", "info"
231
+ )
194
232
  self.index += 1
195
233
  char = self.get_char_at()
196
234
  if char != lstring_delimiter:
235
+ self.log(
236
+ "While parsing a string, we found no starting quote, adding it", "info"
237
+ )
197
238
  self.insert_char_at(lstring_delimiter)
198
239
  fixed_quotes = True
199
240
  else:
@@ -211,9 +252,11 @@ class JSONParser:
211
252
  char = self.get_char_at()
212
253
  while char and char != rstring_delimiter:
213
254
  if fixed_quotes:
214
- if self.context == "object_key" and (char == ":" or char.isspace()):
255
+ if self.get_context() == "object_key" and (
256
+ char == ":" or char.isspace()
257
+ ):
215
258
  break
216
- elif self.context == "object_value" and char in [",", "}"]:
259
+ elif self.get_context() == "object_value" and char in [",", "}"]:
217
260
  break
218
261
  self.index += 1
219
262
  char = self.get_char_at()
@@ -233,6 +276,10 @@ class JSONParser:
233
276
  ):
234
277
  # Special case here, in case of double quotes one after another
235
278
  if self.get_char_at(1) == rstring_delimiter:
279
+ self.log(
280
+ "While parsing a string, we found a doubled quote, ignoring it",
281
+ "info",
282
+ )
236
283
  # self destruct this character
237
284
  self.remove_char_at()
238
285
  else:
@@ -244,10 +291,23 @@ class JSONParser:
244
291
  next_c = self.get_char_at(i)
245
292
  # In that case we ignore this rstring delimiter
246
293
  if next_c:
294
+ self.log(
295
+ "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
296
+ "info",
297
+ )
247
298
  self.index += 1
248
299
  char = self.get_char_at()
249
300
 
250
- if char and fixed_quotes and self.context == "object_key" and char.isspace():
301
+ if (
302
+ char
303
+ and fixed_quotes
304
+ and self.get_context() == "object_key"
305
+ and char.isspace()
306
+ ):
307
+ self.log(
308
+ "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
309
+ "info",
310
+ )
251
311
  self.skip_whitespaces_at()
252
312
  if self.get_char_at() not in [":", ","]:
253
313
  return ""
@@ -256,6 +316,10 @@ class JSONParser:
256
316
 
257
317
  # A fallout of the previous special case in the while loop, we need to update the index only if we had a closing quote
258
318
  if char != rstring_delimiter:
319
+ self.log(
320
+ "While parsing a string, we missed the closing quote, adding it back",
321
+ "info",
322
+ )
259
323
  self.insert_char_at(rstring_delimiter)
260
324
  else:
261
325
  self.index += 1
@@ -329,30 +393,52 @@ class JSONParser:
329
393
  except IndexError:
330
394
  return
331
395
 
332
- def update_context(self, value: str) -> None:
396
+ def set_context(self, value: str) -> None:
333
397
  # If a value is provided update the context variable and save in stack
334
398
  if value:
335
- if self.context:
336
- self.context_stack.append(self.context)
337
- self.context = value
338
- # Otherwise pop and update the context, or empty if the stack is empty
339
- else:
340
- try:
341
- self.context = self.context_stack.pop()
342
- except Exception:
343
- self.context = ""
399
+ self.context.append(value)
400
+
401
+ def reset_context(self) -> None:
402
+ try:
403
+ self.context.pop()
404
+ except Exception:
405
+ return
406
+
407
+ def get_context(self) -> str:
408
+ try:
409
+ return self.context[0]
410
+ except Exception:
411
+ return ""
412
+
413
+ def log(self, text: str, level: str) -> None:
414
+ if level == self.logger["log_level"]:
415
+ self.logger["log"].append(
416
+ {
417
+ "text": text,
418
+ "context": self.json_str[
419
+ self.index
420
+ - self.logger["window"] : self.index
421
+ + self.logger["window"]
422
+ ],
423
+ }
424
+ )
344
425
 
345
426
 
346
427
  def repair_json(
347
- json_str: str, return_objects: bool = False, skip_json_loads: bool = False
428
+ json_str: str,
429
+ return_objects: bool = False,
430
+ skip_json_loads: bool = False,
431
+ logging: bool = False,
348
432
  ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
349
433
  """
350
434
  Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
351
435
  It will return the fixed string by default.
352
436
  When `return_objects=True` is passed, it will return the decoded data structure instead.
437
+ When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
438
+ When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
353
439
  """
354
440
  json_str = json_str.strip().lstrip("```json")
355
- parser = JSONParser(json_str)
441
+ parser = JSONParser(json_str, logging)
356
442
  if skip_json_loads:
357
443
  parsed_json = parser.parse()
358
444
  else:
@@ -361,30 +447,32 @@ def repair_json(
361
447
  except json.JSONDecodeError:
362
448
  parsed_json = parser.parse()
363
449
  # It's useful to return the actual object instead of the json string, it allows this lib to be a replacement of the json library
364
- if return_objects:
450
+ if return_objects or logging:
365
451
  return parsed_json
366
452
  return json.dumps(parsed_json)
367
453
 
368
454
 
369
455
  def loads(
370
- json_str: str,
456
+ json_str: str, skip_json_loads: bool = False, logging: bool = False
371
457
  ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
372
458
  """
373
459
  This function works like `json.loads()` except that it will fix your JSON in the process.
374
460
  It is a wrapper around the `repair_json()` function with `return_objects=True`.
375
461
  """
376
- return repair_json(json_str, True)
462
+ return repair_json(json_str, True, skip_json_loads, logging)
377
463
 
378
464
 
379
- def load(fp: TextIO) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
380
- return loads(fp.read())
465
+ def load(
466
+ fp: TextIO, skip_json_loads: bool = False, logging: bool = False
467
+ ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
468
+ return loads(fp.read(), skip_json_loads, logging)
381
469
 
382
470
 
383
471
  def from_file(
384
- filename: str,
472
+ filename: str, skip_json_loads: bool = False, logging: bool = False
385
473
  ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
386
474
  fd = open(filename)
387
- jsonobj = load(fd)
475
+ jsonobj = load(fd, skip_json_loads, logging)
388
476
  fd.close()
389
477
 
390
478
  return jsonobj
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: json_repair
3
- Version: 0.14.0
3
+ Version: 0.15.1
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -0,0 +1,7 @@
1
+ json_repair/__init__.py,sha256=AlNie5y6BZBioGi5fzTAUvum_y0U5aL5aNsuQ_68LQc,175
2
+ json_repair/json_repair.py,sha256=oGD3DDU_Gni7HA25Mlnx4b5_b7c8F_kRSNrxAR3f9sk,19297
3
+ json_repair-0.15.1.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
4
+ json_repair-0.15.1.dist-info/METADATA,sha256=Kpv-aRkvY2N4YOLS9z61lKtD5IeoN0qg74Z0-y7mHtk,7355
5
+ json_repair-0.15.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
6
+ json_repair-0.15.1.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
7
+ json_repair-0.15.1.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- json_repair/__init__.py,sha256=AlNie5y6BZBioGi5fzTAUvum_y0U5aL5aNsuQ_68LQc,175
2
- json_repair/json_repair.py,sha256=8B5HfWoLlUUtRYq1cnbajOxWiMSD9nxNW2cRFPjFfVE,15817
3
- json_repair-0.14.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
4
- json_repair-0.14.0.dist-info/METADATA,sha256=82KldmuVFLXbNy6SXar9MsulkcBUM1K8RX13pNysHQU,7355
5
- json_repair-0.14.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
6
- json_repair-0.14.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
7
- json_repair-0.14.0.dist-info/RECORD,,