json-repair 0.16.2__tar.gz → 0.17.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: json_repair
3
- Version: 0.16.2
3
+ Version: 0.17.0
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
3
3
  build-backend = "setuptools.build_meta"
4
4
  [project]
5
5
  name = "json_repair"
6
- version = "0.16.2"
6
+ version = "0.17.0"
7
7
  license = {file = "LICENSE"}
8
8
  authors = [
9
9
  { name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },
@@ -1,4 +1,4 @@
1
1
  from .json_repair import repair_json as repair_json
2
2
  from .json_repair import loads as loads
3
- from .json_repair import loads as load
4
- from .json_repair import loads as from_file
3
+ from .json_repair import load as load
4
+ from .json_repair import from_file as from_file
@@ -11,7 +11,7 @@ This module will parse the JSON file following the BNF definition:
11
11
 
12
12
  <container> ::= <object> | <array>
13
13
  <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
14
- <object> ::= '{' [ <string> *(', ' <member>) ] '}' ; A sequence of 'members'
14
+ <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
15
15
  <member> ::= <string> ': ' <json> ; A pair consisting of a name, and a JSON value
16
16
 
17
17
  If something is wrong (a missing parenthesis or quotes for example) it will use a few simple heuristics to fix the JSON string:
@@ -27,9 +27,11 @@ from typing import Any, Dict, List, Union, TextIO
27
27
 
28
28
 
29
29
  class JSONParser:
30
- def __init__(self, json_str: str, logging: bool = False) -> None:
30
+ def __init__(self, json_str: str, json_fd: TextIO, logging: bool = False) -> None:
31
31
  # The string to parse
32
32
  self.json_str = json_str
33
+ # Alternatively, the file descriptor with a json file in it
34
+ self.json_fd = json_fd
33
35
  # Index is our iterator that will keep track of which character we are looking at right now
34
36
  self.index = 0
35
37
  # This is used in the object member parsing to manage the special cases of missing quotes in key or value
@@ -56,48 +58,28 @@ class JSONParser:
56
58
  return ""
57
59
  # <object> starts with '{'
58
60
  # but an object key must be a string
59
- elif self.get_context() != "object_key" and char == "{":
61
+ elif char == "{":
60
62
  self.index += 1
61
63
  return self.parse_object()
62
64
  # <array> starts with '['
63
65
  # but an object key must be a string
64
- elif self.get_context() != "object_key" and char == "[":
66
+ elif char == "[":
65
67
  self.index += 1
66
68
  return self.parse_array()
67
69
  # there can be an edge case in which a key is empty and at the end of an object
68
70
  # like "key": }. We return an empty string here to close the object properly
69
- elif self.get_context() != "object_key" and char == "}":
71
+ elif char == "}":
70
72
  self.log(
71
73
  "At the end of an object we found a key with missing value, skipping",
72
74
  "info",
73
75
  )
74
76
  return ""
75
- # <string> starts with '"'
76
- elif char == '"':
77
+ # <string> starts with a quote
78
+ elif char in ['"', "'", "“"] or char.isalpha():
77
79
  return self.parse_string()
78
- elif char == "'":
79
- return self.parse_string(string_quotes="'")
80
- elif char == "“":
81
- return self.parse_string(string_quotes=["“", "”"])
82
80
  # <number> starts with [0-9] or minus
83
- elif (
84
- self.get_context() != ""
85
- and self.get_context() != "object_key"
86
- and char.isdigit()
87
- or char == "-"
88
- or char == "."
89
- ):
81
+ elif char.isdigit() or char == "-" or char == ".":
90
82
  return self.parse_number()
91
- # <boolean> could be (T)rue or (F)alse or (N)ull
92
- elif (
93
- self.get_context() != ""
94
- and self.get_context() != "object_key"
95
- and char.lower() in ["t", "f", "n"]
96
- ):
97
- return self.parse_boolean_or_null()
98
- # This might be a <string> that is missing the starting '"'
99
- elif self.get_context() != "" and char.isalpha():
100
- return self.parse_string()
101
83
  # If everything else fails, we just ignore and move on
102
84
  else:
103
85
  self.index += 1
@@ -117,11 +99,9 @@ class JSONParser:
117
99
  # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
118
100
  if (self.get_char_at() or "") == ":":
119
101
  self.log(
120
- "While parsing an object we found a : before a key, replacing with ,",
102
+ "While parsing an object we found a : before a key, ignoring",
121
103
  "info",
122
104
  )
123
- self.remove_char_at()
124
- self.insert_char_at(",")
125
105
  self.index += 1
126
106
 
127
107
  # We are now searching for the string key
@@ -133,7 +113,7 @@ class JSONParser:
133
113
  # <member> starts with a <string>
134
114
  key = ""
135
115
  while key == "" and self.get_char_at():
136
- key = self.parse_json()
116
+ key = self.parse_string()
137
117
 
138
118
  # This can happen sometimes like { "": "value" }
139
119
  if key == "" and self.get_char_at() == ":":
@@ -153,10 +133,10 @@ class JSONParser:
153
133
  # An extreme case of missing ":" after a key
154
134
  if (self.get_char_at() or "") != ":":
155
135
  self.log(
156
- "While parsing an object we missed a : after a key, adding it back",
136
+ "While parsing an object we missed a : after a key",
157
137
  "info",
158
138
  )
159
- self.insert_char_at(":")
139
+
160
140
  self.index += 1
161
141
  self.reset_context()
162
142
  self.set_context("object_value")
@@ -176,10 +156,10 @@ class JSONParser:
176
156
  # Especially at the end of an LLM generated json you might miss the last "}"
177
157
  if (self.get_char_at() or "}") != "}":
178
158
  self.log(
179
- "While parsing an object, we couldn't find the closing }, adding it back",
159
+ "While parsing an object, we couldn't find the closing }, ignoring",
180
160
  "info",
181
161
  )
182
- self.insert_char_at("}")
162
+
183
163
  self.index += 1
184
164
  return obj
185
165
 
@@ -205,6 +185,10 @@ class JSONParser:
205
185
  char = self.get_char_at()
206
186
  # If this is the right value of an object and we are closing the object, it means the array is over
207
187
  if self.get_context() == "object_value" and char == "}":
188
+ self.log(
189
+ "While parsing an array inside an object, we got to the end without finding a ]. Stopped parsing",
190
+ "info",
191
+ )
208
192
  break
209
193
 
210
194
  # Especially at the end of an LLM generated json you might miss the last "]"
@@ -217,35 +201,68 @@ class JSONParser:
217
201
  if char == ",":
218
202
  # Remove trailing "," before adding the "]"
219
203
  self.log(
220
- "While parsing an array, remove a trailing , before adding ]",
204
+ "While parsing an array, found a trailing , before adding ]",
221
205
  "info",
222
206
  )
223
- self.remove_char_at()
224
- self.insert_char_at("]")
207
+
225
208
  self.index -= 1
226
209
 
227
210
  self.index += 1
228
211
  self.reset_context()
229
212
  return arr
230
213
 
231
- def parse_string(self, string_quotes=False) -> str:
214
+ def parse_string(self) -> str:
232
215
  # <string> is a string of valid characters enclosed in quotes
233
216
  # i.e. { name: "John" }
234
217
  # Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
235
218
 
236
219
  # Flag to manage corner cases related to missing starting quote
237
- fixed_quotes = False
220
+ missing_quotes = False
238
221
  doubled_quotes = False
239
222
  lstring_delimiter = rstring_delimiter = '"'
240
- if isinstance(string_quotes, list):
241
- lstring_delimiter = string_quotes[0]
242
- rstring_delimiter = string_quotes[1]
243
- elif isinstance(string_quotes, str):
244
- lstring_delimiter = rstring_delimiter = string_quotes
223
+
224
+ char = self.get_char_at()
225
+ # A valid string can only start with a valid quote or, in our case, with a literal
226
+ while char and char not in ['"', "'", "“"] and not char.isalpha():
227
+ self.index += 1
228
+ char = self.get_char_at()
229
+
230
+ # Ensuring we use the right delimiter
231
+ if char == "'":
232
+ lstring_delimiter = rstring_delimiter = "'"
233
+ elif char == "“":
234
+ lstring_delimiter = "“"
235
+ rstring_delimiter = "”"
236
+ elif char.isalpha():
237
+ # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
238
+ if char.lower() in ["t", "f", "n"]:
239
+ value = self.parse_boolean_or_null()
240
+ if value != "":
241
+ return value
242
+ self.log(
243
+ "While parsing a string, we found a literal instead of a quote",
244
+ "info",
245
+ )
246
+ if self.get_context() == "":
247
+ # A string literal in the wild isn't a valid json and not something we can fix
248
+ self.log(
249
+ "While parsing a string, we found a literal outside of context, ignoring it",
250
+ "info",
251
+ )
252
+ self.index += 1
253
+ return self.parse_json()
254
+ self.log(
255
+ "While parsing a string, we found no starting quote, ignoring", "info"
256
+ )
257
+ missing_quotes = True
258
+
259
+ if not missing_quotes:
260
+ self.index += 1
261
+
245
262
  # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
246
- if self.get_char_at(1) == lstring_delimiter:
263
+ if self.get_char_at() == lstring_delimiter:
247
264
  # This is a valid exception only if it's closed by a double delimiter again
248
- i = 2
265
+ i = 1
249
266
  next_c = self.get_char_at(i)
250
267
  while next_c and next_c != rstring_delimiter:
251
268
  i += 1
@@ -259,18 +276,9 @@ class JSONParser:
259
276
  )
260
277
  doubled_quotes = True
261
278
  self.index += 1
262
- char = self.get_char_at()
263
- if char != lstring_delimiter:
264
- self.log(
265
- "While parsing a string, we found no starting quote, adding it", "info"
266
- )
267
- self.insert_char_at(lstring_delimiter)
268
- fixed_quotes = True
269
- else:
270
- self.index += 1
271
279
 
272
- # Start position of the string (to use later in the return value)
273
- start = self.index
280
+ # Initialize our return value
281
+ string_acc = ""
274
282
 
275
283
  # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
276
284
  # In that case we need to use the ":|,|}" characters as terminators of the string
@@ -280,22 +288,25 @@ class JSONParser:
280
288
  # * If we are fixing missing quotes in an object, when it finds the special terminators
281
289
  char = self.get_char_at()
282
290
  while char and char != rstring_delimiter:
283
- if fixed_quotes:
291
+ if missing_quotes:
284
292
  if self.get_context() == "object_key" and (
285
293
  char == ":" or char.isspace()
286
294
  ):
287
295
  break
288
296
  elif self.get_context() == "object_value" and char in [",", "}"]:
289
297
  break
298
+ string_acc += char
290
299
  self.index += 1
291
300
  char = self.get_char_at()
292
301
  # If the string contains an escaped character we should respect that or remove the escape
293
302
  if self.get_char_at(-1) == "\\":
294
303
  if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
304
+ string_acc += char
295
305
  self.index += 1
296
306
  char = self.get_char_at()
297
307
  else:
298
- self.remove_char_at(-1)
308
+ # Remove this character from the final output
309
+ string_acc = string_acc[:-2] + string_acc[-1:]
299
310
  self.index -= 1
300
311
  # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
301
312
  if char == rstring_delimiter:
@@ -305,8 +316,6 @@ class JSONParser:
305
316
  "While parsing a string, we found a doubled quote, ignoring it",
306
317
  "info",
307
318
  )
308
- # self destruct this character
309
- self.remove_char_at()
310
319
  else:
311
320
  # Check if eventually there is a rstring delimiter, otherwise we bail
312
321
  i = 1
@@ -343,12 +352,13 @@ class JSONParser:
343
352
  "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
344
353
  "info",
345
354
  )
355
+ string_acc += char
346
356
  self.index += 1
347
357
  char = self.get_char_at()
348
358
 
349
359
  if (
350
360
  char
351
- and fixed_quotes
361
+ and missing_quotes
352
362
  and self.get_context() == "object_key"
353
363
  and char.isspace()
354
364
  ):
@@ -360,19 +370,16 @@ class JSONParser:
360
370
  if self.get_char_at() not in [":", ","]:
361
371
  return ""
362
372
 
363
- end = self.index
364
-
365
373
  # A fallout of the previous special case in the while loop, we need to update the index only if we had a closing quote
366
374
  if char != rstring_delimiter:
367
375
  self.log(
368
- "While parsing a string, we missed the closing quote, adding it back",
376
+ "While parsing a string, we missed the closing quote, ignoring",
369
377
  "info",
370
378
  )
371
- self.insert_char_at(rstring_delimiter)
372
379
  else:
373
380
  self.index += 1
374
381
 
375
- return self.json_str[start:end].rstrip()
382
+ return string_acc.rstrip()
376
383
 
377
384
  def parse_number(self) -> Union[float, int, str]:
378
385
  # <number> is a valid real number expressed in one of a number of given formats
@@ -395,51 +402,57 @@ class JSONParser:
395
402
  except ValueError:
396
403
  return number_str
397
404
  else:
398
- # This is a string then
399
- return self.parse_string()
405
+ # If nothing works, let's skip and keep parsing
406
+ return self.parse_json()
400
407
 
401
408
  def parse_boolean_or_null(self) -> Union[bool, str, None]:
402
409
  # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
403
- boolean_map = {"true": (True, 4), "false": (False, 5), "null": (None, 4)}
404
- for key, (value, length) in boolean_map.items():
405
- if self.json_str.lower().startswith(key, self.index):
406
- self.index += length
407
- return value
410
+ starting_index = self.index
411
+ value = ""
412
+ char = self.get_char_at().lower()
413
+ if char == "t":
414
+ value = ("true", True)
415
+ elif char == "f":
416
+ value = ("false", False)
417
+ elif char == "n":
418
+ value = ("null", None)
419
+
420
+ if len(value):
421
+ i = 0
422
+ while char and i < len(value[0]) and char == value[0][i]:
423
+ i += 1
424
+ self.index += 1
425
+ char = self.get_char_at().lower()
426
+ if i == len(value[0]):
427
+ return value[1]
408
428
 
409
- # This is a string then
410
- return self.parse_string()
411
-
412
- def insert_char_at(self, char: str) -> None:
413
- self.json_str = self.json_str[: self.index] + char + self.json_str[self.index :]
414
- self.index += 1
429
+ # If nothing works reset the index before returning
430
+ self.index = starting_index
431
+ return ""
415
432
 
416
433
  def get_char_at(self, count: int = 0) -> Union[str, bool]:
417
- # Why not use something simpler? Because we might be out of bounds and doing this check all the time is annoying
418
- try:
419
- return self.json_str[self.index + count]
420
- except IndexError:
421
- return False
422
-
423
- def remove_char_at(self, count: int = 0) -> None:
424
- self.json_str = (
425
- self.json_str[: self.index + count]
426
- + self.json_str[self.index + count + 1 :]
427
- )
434
+ if self.json_fd:
435
+ self.json_fd.seek(self.index + count)
436
+ char = self.json_fd.read(1)
437
+ if char == "":
438
+ return False
439
+ return char
440
+ else:
441
+ # Why not use something simpler? Because we might be out of bounds and doing this check all the time is annoying
442
+ try:
443
+ return self.json_str[self.index + count]
444
+ except IndexError:
445
+ return False
428
446
 
429
447
  def skip_whitespaces_at(self) -> None:
430
- # Remove trailing spaces
431
- # I'd rather not do this BUT this method is called so many times that it makes sense to expand get_char_at
432
- # At least this is what the profiler said and I believe in our lord and savior the profiler
433
- try:
434
- char = self.json_str[self.index]
435
- except IndexError:
436
- return
448
+ """
449
+ This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
450
+ """
451
+
452
+ char = self.get_char_at()
437
453
  while char and char.isspace():
438
454
  self.index += 1
439
- try:
440
- char = self.json_str[self.index]
441
- except IndexError:
442
- return
455
+ char = self.get_char_at()
443
456
 
444
457
  def set_context(self, value: str) -> None:
445
458
  # If a value is provided update the context variable and save in stack
@@ -460,23 +473,31 @@ class JSONParser:
460
473
 
461
474
  def log(self, text: str, level: str) -> None:
462
475
  if level == self.logger["log_level"]:
476
+ context = ""
477
+ if self.json_fd:
478
+ self.json_fd.seek(self.index - self.logger["window"])
479
+ context = self.json_fd.read(self.logger["window"] * 2)
480
+ self.json_fd.seek(self.index)
481
+ else:
482
+ context = self.json_str[
483
+ self.index
484
+ - self.logger["window"] : self.index
485
+ + self.logger["window"]
486
+ ]
463
487
  self.logger["log"].append(
464
488
  {
465
489
  "text": text,
466
- "context": self.json_str[
467
- self.index
468
- - self.logger["window"] : self.index
469
- + self.logger["window"]
470
- ],
490
+ "context": context,
471
491
  }
472
492
  )
473
493
 
474
494
 
475
495
  def repair_json(
476
- json_str: str,
496
+ json_str: str = "",
477
497
  return_objects: bool = False,
478
498
  skip_json_loads: bool = False,
479
499
  logging: bool = False,
500
+ json_fd: TextIO = None,
480
501
  ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
481
502
  """
482
503
  Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
@@ -485,13 +506,15 @@ def repair_json(
485
506
  When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
486
507
  When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
487
508
  """
488
- json_str = json_str.strip().lstrip("```json")
489
- parser = JSONParser(json_str, logging)
509
+ parser = JSONParser(json_str, json_fd, logging)
490
510
  if skip_json_loads:
491
511
  parsed_json = parser.parse()
492
512
  else:
493
513
  try:
494
- parsed_json = json.loads(json_str)
514
+ if json_fd:
515
+ parsed_json = json.load(json_fd)
516
+ else:
517
+ parsed_json = json.loads(json_str)
495
518
  except json.JSONDecodeError:
496
519
  parsed_json = parser.parse()
497
520
  # It's useful to return the actual object instead of the json string, it allows this lib to be a replacement of the json library
@@ -507,18 +530,30 @@ def loads(
507
530
  This function works like `json.loads()` except that it will fix your JSON in the process.
508
531
  It is a wrapper around the `repair_json()` function with `return_objects=True`.
509
532
  """
510
- return repair_json(json_str, True, skip_json_loads, logging)
533
+ return repair_json(
534
+ json_str=json_str,
535
+ return_objects=True,
536
+ skip_json_loads=skip_json_loads,
537
+ logging=logging,
538
+ )
511
539
 
512
540
 
513
541
  def load(
514
- fp: TextIO, skip_json_loads: bool = False, logging: bool = False
542
+ fd: TextIO, skip_json_loads: bool = False, logging: bool = False
515
543
  ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
516
- return loads(fp.read(), skip_json_loads, logging)
544
+ """
545
+ This function works like `json.load()` except that it will fix your JSON in the process.
546
+ It is a wrapper around the `repair_json()` function with `json_fd=fd` and `return_objects=True`.
547
+ """
548
+ return repair_json(json_fd=fd, skip_json_loads=skip_json_loads, logging=logging)
517
549
 
518
550
 
519
551
  def from_file(
520
552
  filename: str, skip_json_loads: bool = False, logging: bool = False
521
553
  ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
554
+ """
555
+ This function is a wrapper around `load()` so you can pass the filename as string
556
+ """
522
557
  fd = open(filename)
523
558
  jsonobj = load(fd, skip_json_loads, logging)
524
559
  fd.close()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: json_repair
3
- Version: 0.16.2
3
+ Version: 0.17.0
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License