json-repair 0.16.3__py3-none-any.whl → 0.17.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,7 +11,7 @@ This module will parse the JSON file following the BNF definition:
11
11
 
12
12
  <container> ::= <object> | <array>
13
13
  <array> ::= '[' [ <json> *(', ' <json>) ] ']' ; A sequence of JSON values separated by commas
14
- <object> ::= '{' [ <string> *(', ' <member>) ] '}' ; A sequence of 'members'
14
+ <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
15
15
  <member> ::= <string> ': ' <json> ; A pair consisting of a name, and a JSON value
16
16
 
17
17
  If something is wrong (missing parentheses or quotes, for example) it will use a few simple heuristics to fix the JSON string:
@@ -27,9 +27,11 @@ from typing import Any, Dict, List, Union, TextIO
27
27
 
28
28
 
29
29
  class JSONParser:
30
- def __init__(self, json_str: str, logging: bool = False) -> None:
30
+ def __init__(self, json_str: str, json_fd: TextIO, logging: bool = False) -> None:
31
31
  # The string to parse
32
32
  self.json_str = json_str
33
+ # Alternatively, the file descriptor with a json file in it
34
+ self.json_fd = json_fd
33
35
  # Index is our iterator that will keep track of which character we are looking at right now
34
36
  self.index = 0
35
37
  # This is used in the object member parsing to manage the special cases of missing quotes in key or value
@@ -56,48 +58,28 @@ class JSONParser:
56
58
  return ""
57
59
  # <object> starts with '{'
58
60
  # but an object key must be a string
59
- elif self.get_context() != "object_key" and char == "{":
61
+ elif char == "{":
60
62
  self.index += 1
61
63
  return self.parse_object()
62
64
  # <array> starts with '['
63
65
  # but an object key must be a string
64
- elif self.get_context() != "object_key" and char == "[":
66
+ elif char == "[":
65
67
  self.index += 1
66
68
  return self.parse_array()
67
69
  # there can be an edge case in which a key has no value and is at the end of an object
68
70
  # like "key": }. We return an empty string here to close the object properly
69
- elif self.get_context() != "object_key" and char == "}":
71
+ elif char == "}":
70
72
  self.log(
71
73
  "At the end of an object we found a key with missing value, skipping",
72
74
  "info",
73
75
  )
74
76
  return ""
75
- # <string> starts with '"'
76
- elif char == '"':
77
+ # <string> starts with a quote
78
+ elif char in ['"', "'", "“"] or char.isalpha():
77
79
  return self.parse_string()
78
- elif char == "'":
79
- return self.parse_string(string_quotes="'")
80
- elif char == "“":
81
- return self.parse_string(string_quotes=["“", "”"])
82
80
  # <number> starts with [0-9] or minus
83
- elif (
84
- self.get_context() != ""
85
- and self.get_context() != "object_key"
86
- and char.isdigit()
87
- or char == "-"
88
- or char == "."
89
- ):
81
+ elif char.isdigit() or char == "-" or char == ".":
90
82
  return self.parse_number()
91
- # <boolean> could be (T)rue or (F)alse or (N)ull
92
- elif (
93
- self.get_context() != ""
94
- and self.get_context() != "object_key"
95
- and char.lower() in ["t", "f", "n"]
96
- ):
97
- return self.parse_boolean_or_null()
98
- # This might be a <string> that is missing the starting '"'
99
- elif self.get_context() != "" and char.isalpha():
100
- return self.parse_string()
101
83
  # If everything else fails, we just ignore and move on
102
84
  else:
103
85
  self.index += 1
@@ -117,11 +99,9 @@ class JSONParser:
117
99
  # Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
118
100
  if (self.get_char_at() or "") == ":":
119
101
  self.log(
120
- "While parsing an object we found a : before a key, replacing with ,",
102
+ "While parsing an object we found a : before a key, ignoring",
121
103
  "info",
122
104
  )
123
- self.remove_char_at()
124
- self.insert_char_at(",")
125
105
  self.index += 1
126
106
 
127
107
  # We are now searching for the string key
@@ -133,7 +113,7 @@ class JSONParser:
133
113
  # <member> starts with a <string>
134
114
  key = ""
135
115
  while key == "" and self.get_char_at():
136
- key = self.parse_json()
116
+ key = self.parse_string()
137
117
 
138
118
  # This can happen sometimes like { "": "value" }
139
119
  if key == "" and self.get_char_at() == ":":
@@ -153,10 +133,10 @@ class JSONParser:
153
133
  # An extreme case of missing ":" after a key
154
134
  if (self.get_char_at() or "") != ":":
155
135
  self.log(
156
- "While parsing an object we missed a : after a key, adding it back",
136
+ "While parsing an object we missed a : after a key",
157
137
  "info",
158
138
  )
159
- self.insert_char_at(":")
139
+
160
140
  self.index += 1
161
141
  self.reset_context()
162
142
  self.set_context("object_value")
@@ -176,10 +156,10 @@ class JSONParser:
176
156
  # Especially at the end of an LLM generated json you might miss the last "}"
177
157
  if (self.get_char_at() or "}") != "}":
178
158
  self.log(
179
- "While parsing an object, we couldn't find the closing }, adding it back",
159
+ "While parsing an object, we couldn't find the closing }, ignoring",
180
160
  "info",
181
161
  )
182
- self.insert_char_at("}")
162
+
183
163
  self.index += 1
184
164
  return obj
185
165
 
@@ -205,6 +185,10 @@ class JSONParser:
205
185
  char = self.get_char_at()
206
186
  # If this is the right value of an object and we are closing the object, it means the array is over
207
187
  if self.get_context() == "object_value" and char == "}":
188
+ self.log(
189
+ "While parsing an array inside an object, we got to the end without finding a ]. Stopped parsing",
190
+ "info",
191
+ )
208
192
  break
209
193
 
210
194
  # Especially at the end of an LLM generated json you might miss the last "]"
@@ -217,35 +201,72 @@ class JSONParser:
217
201
  if char == ",":
218
202
  # Remove trailing "," before adding the "]"
219
203
  self.log(
220
- "While parsing an array, remove a trailing , before adding ]",
204
+ "While parsing an array, found a trailing , before adding ]",
221
205
  "info",
222
206
  )
223
- self.remove_char_at()
224
- self.insert_char_at("]")
207
+
225
208
  self.index -= 1
226
209
 
227
210
  self.index += 1
228
211
  self.reset_context()
229
212
  return arr
230
213
 
231
- def parse_string(self, string_quotes=False) -> str:
214
+ def parse_string(self) -> str:
232
215
  # <string> is a string of valid characters enclosed in quotes
233
216
  # i.e. { name: "John" }
234
217
  # Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
235
218
 
236
219
  # Flag to manage corner cases related to missing starting quote
237
- fixed_quotes = False
220
+ missing_quotes = False
238
221
  doubled_quotes = False
239
222
  lstring_delimiter = rstring_delimiter = '"'
240
- if isinstance(string_quotes, list):
241
- lstring_delimiter = string_quotes[0]
242
- rstring_delimiter = string_quotes[1]
243
- elif isinstance(string_quotes, str):
244
- lstring_delimiter = rstring_delimiter = string_quotes
223
+
224
+ char = self.get_char_at()
225
+ # A valid string can only start with a valid quote or, in our case, with a literal
226
+ while char and char not in ['"', "'", "“"] and not char.isalpha():
227
+ self.index += 1
228
+ char = self.get_char_at()
229
+
230
+ if not char:
231
+ # This is an empty string
232
+ return ""
233
+
234
+ # Ensuring we use the right delimiter
235
+ if char == "'":
236
+ lstring_delimiter = rstring_delimiter = "'"
237
+ elif char == "“":
238
+ lstring_delimiter = "“"
239
+ rstring_delimiter = "”"
240
+ elif char.isalpha():
241
+ # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
242
+ if char.lower() in ["t", "f", "n"]:
243
+ value = self.parse_boolean_or_null()
244
+ if value != "":
245
+ return value
246
+ self.log(
247
+ "While parsing a string, we found a literal instead of a quote",
248
+ "info",
249
+ )
250
+ if self.get_context() == "":
251
+ # A string literal in the wild isn't a valid json and not something we can fix
252
+ self.log(
253
+ "While parsing a string, we found a literal outside of context, ignoring it",
254
+ "info",
255
+ )
256
+ self.index += 1
257
+ return self.parse_json()
258
+ self.log(
259
+ "While parsing a string, we found no starting quote, ignoring", "info"
260
+ )
261
+ missing_quotes = True
262
+
263
+ if not missing_quotes:
264
+ self.index += 1
265
+
245
266
  # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
246
- if self.get_char_at(1) == lstring_delimiter:
267
+ if self.get_char_at() == lstring_delimiter:
247
268
  # This is a valid exception only if it's closed by a double delimiter again
248
- i = 2
269
+ i = 1
249
270
  next_c = self.get_char_at(i)
250
271
  while next_c and next_c != rstring_delimiter:
251
272
  i += 1
@@ -259,18 +280,9 @@ class JSONParser:
259
280
  )
260
281
  doubled_quotes = True
261
282
  self.index += 1
262
- char = self.get_char_at()
263
- if char != lstring_delimiter:
264
- self.log(
265
- "While parsing a string, we found no starting quote, adding it", "info"
266
- )
267
- self.insert_char_at(lstring_delimiter)
268
- fixed_quotes = True
269
- else:
270
- self.index += 1
271
283
 
272
- # Start position of the string (to use later in the return value)
273
- start = self.index
284
+ # Initialize our return value
285
+ string_acc = ""
274
286
 
275
287
  # Here things get a bit hairy because a string missing the final quote can also be a key or a value in an object
276
288
  # In that case we need to use the ":|,|}" characters as terminators of the string
@@ -280,22 +292,25 @@ class JSONParser:
280
292
  # * If we are fixing missing quotes in an object, when it finds the special terminators
281
293
  char = self.get_char_at()
282
294
  while char and char != rstring_delimiter:
283
- if fixed_quotes:
295
+ if missing_quotes:
284
296
  if self.get_context() == "object_key" and (
285
297
  char == ":" or char.isspace()
286
298
  ):
287
299
  break
288
300
  elif self.get_context() == "object_value" and char in [",", "}"]:
289
301
  break
302
+ string_acc += char
290
303
  self.index += 1
291
304
  char = self.get_char_at()
292
305
  # If the string contains an escaped character we should respect that or remove the escape
293
306
  if self.get_char_at(-1) == "\\":
294
307
  if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
308
+ string_acc += char
295
309
  self.index += 1
296
310
  char = self.get_char_at()
297
311
  else:
298
- self.remove_char_at(-1)
312
+ # Remove this character from the final output
313
+ string_acc = string_acc[:-2] + string_acc[-1:]
299
314
  self.index -= 1
300
315
  # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
301
316
  if char == rstring_delimiter:
@@ -305,8 +320,6 @@ class JSONParser:
305
320
  "While parsing a string, we found a doubled quote, ignoring it",
306
321
  "info",
307
322
  )
308
- # self destruct this character
309
- self.remove_char_at()
310
323
  else:
311
324
  # Check if eventually there is a rstring delimiter, otherwise we bail
312
325
  i = 1
@@ -343,12 +356,13 @@ class JSONParser:
343
356
  "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
344
357
  "info",
345
358
  )
359
+ string_acc += char
346
360
  self.index += 1
347
361
  char = self.get_char_at()
348
362
 
349
363
  if (
350
364
  char
351
- and fixed_quotes
365
+ and missing_quotes
352
366
  and self.get_context() == "object_key"
353
367
  and char.isspace()
354
368
  ):
@@ -360,19 +374,16 @@ class JSONParser:
360
374
  if self.get_char_at() not in [":", ","]:
361
375
  return ""
362
376
 
363
- end = self.index
364
-
365
377
  # A fallout of the previous special case in the while loop, we need to update the index only if we had a closing quote
366
378
  if char != rstring_delimiter:
367
379
  self.log(
368
- "While parsing a string, we missed the closing quote, adding it back",
380
+ "While parsing a string, we missed the closing quote, ignoring",
369
381
  "info",
370
382
  )
371
- self.insert_char_at(rstring_delimiter)
372
383
  else:
373
384
  self.index += 1
374
385
 
375
- return self.json_str[start:end].rstrip()
386
+ return string_acc.rstrip()
376
387
 
377
388
  def parse_number(self) -> Union[float, int, str]:
378
389
  # <number> is a valid real number expressed in one of a number of given formats
@@ -395,51 +406,69 @@ class JSONParser:
395
406
  except ValueError:
396
407
  return number_str
397
408
  else:
398
- # This is a string then
399
- return self.parse_string()
409
+ # If nothing works, let's skip and keep parsing
410
+ return self.parse_json()
400
411
 
401
412
  def parse_boolean_or_null(self) -> Union[bool, str, None]:
402
413
  # <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
403
- boolean_map = {"true": (True, 4), "false": (False, 5), "null": (None, 4)}
404
- for key, (value, length) in boolean_map.items():
405
- if self.json_str.lower().startswith(key, self.index):
406
- self.index += length
407
- return value
408
-
409
- # This is a string then
410
- return self.parse_string()
414
+ starting_index = self.index
415
+ value = ""
416
+ char = self.get_char_at().lower()
417
+ if char == "t":
418
+ value = ("true", True)
419
+ elif char == "f":
420
+ value = ("false", False)
421
+ elif char == "n":
422
+ value = ("null", None)
423
+
424
+ if len(value):
425
+ i = 0
426
+ while char and i < len(value[0]) and char == value[0][i]:
427
+ i += 1
428
+ self.index += 1
429
+ char = self.get_char_at().lower()
430
+ if i == len(value[0]):
431
+ return value[1]
411
432
 
412
- def insert_char_at(self, char: str) -> None:
413
- self.json_str = self.json_str[: self.index] + char + self.json_str[self.index :]
414
- self.index += 1
433
+ # If nothing works reset the index before returning
434
+ self.index = starting_index
435
+ return ""
415
436
 
416
437
  def get_char_at(self, count: int = 0) -> Union[str, bool]:
417
- # Why not use something simpler? Because we might be out of bounds and doing this check all the time is annoying
438
+ # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
418
439
  try:
419
440
  return self.json_str[self.index + count]
420
441
  except IndexError:
421
- return False
422
-
423
- def remove_char_at(self, count: int = 0) -> None:
424
- self.json_str = (
425
- self.json_str[: self.index + count]
426
- + self.json_str[self.index + count + 1 :]
427
- )
442
+ if self.json_fd:
443
+ self.json_fd.seek(self.index + count)
444
+ char = self.json_fd.read(1)
445
+ if char == "":
446
+ return False
447
+ return char
448
+ else:
449
+ return False
428
450
 
429
451
  def skip_whitespaces_at(self) -> None:
430
- # Remove trailing spaces
431
- # I'd rather not do this BUT this method is called so many times that it makes sense to expand get_char_at
432
- # At least this is what the profiler said and I believe in our lord and savior the profiler
433
- try:
434
- char = self.json_str[self.index]
435
- except IndexError:
436
- return
437
- while char and char.isspace():
438
- self.index += 1
452
+ """
453
+ This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
454
+ """
455
+ if self.json_fd:
456
+ char = self.get_char_at()
457
+ while char and char.isspace():
458
+ self.index += 1
459
+ char = self.get_char_at()
460
+ else:
461
+ # If this is not a file stream, we do this monster here to make this function much much faster
439
462
  try:
440
463
  char = self.json_str[self.index]
441
464
  except IndexError:
442
465
  return
466
+ while char.isspace():
467
+ self.index += 1
468
+ try:
469
+ char = self.json_str[self.index]
470
+ except IndexError:
471
+ return
443
472
 
444
473
  def set_context(self, value: str) -> None:
445
474
  # If a value is provided update the context variable and save in stack
@@ -460,23 +489,31 @@ class JSONParser:
460
489
 
461
490
  def log(self, text: str, level: str) -> None:
462
491
  if level == self.logger["log_level"]:
492
+ context = ""
493
+ if self.json_fd:
494
+ self.json_fd.seek(self.index - self.logger["window"])
495
+ context = self.json_fd.read(self.logger["window"] * 2)
496
+ self.json_fd.seek(self.index)
497
+ else:
498
+ context = self.json_str[
499
+ self.index
500
+ - self.logger["window"] : self.index
501
+ + self.logger["window"]
502
+ ]
463
503
  self.logger["log"].append(
464
504
  {
465
505
  "text": text,
466
- "context": self.json_str[
467
- self.index
468
- - self.logger["window"] : self.index
469
- + self.logger["window"]
470
- ],
506
+ "context": context,
471
507
  }
472
508
  )
473
509
 
474
510
 
475
511
  def repair_json(
476
- json_str: str,
512
+ json_str: str = "",
477
513
  return_objects: bool = False,
478
514
  skip_json_loads: bool = False,
479
515
  logging: bool = False,
516
+ json_fd: TextIO = None,
480
517
  ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
481
518
  """
482
519
  Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
@@ -485,13 +522,15 @@ def repair_json(
485
522
  When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
486
523
  When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
487
524
  """
488
- json_str = json_str.strip().lstrip("```json")
489
- parser = JSONParser(json_str, logging)
525
+ parser = JSONParser(json_str, json_fd, logging)
490
526
  if skip_json_loads:
491
527
  parsed_json = parser.parse()
492
528
  else:
493
529
  try:
494
- parsed_json = json.loads(json_str)
530
+ if json_fd:
531
+ parsed_json = json.load(json_fd)
532
+ else:
533
+ parsed_json = json.loads(json_str)
495
534
  except json.JSONDecodeError:
496
535
  parsed_json = parser.parse()
497
536
  # It's useful to return the actual object instead of the json string, it allows this lib to be a replacement of the json library
@@ -507,18 +546,30 @@ def loads(
507
546
  This function works like `json.loads()` except that it will fix your JSON in the process.
508
547
  It is a wrapper around the `repair_json()` function with `return_objects=True`.
509
548
  """
510
- return repair_json(json_str, True, skip_json_loads, logging)
549
+ return repair_json(
550
+ json_str=json_str,
551
+ return_objects=True,
552
+ skip_json_loads=skip_json_loads,
553
+ logging=logging,
554
+ )
511
555
 
512
556
 
513
557
  def load(
514
- fp: TextIO, skip_json_loads: bool = False, logging: bool = False
558
+ fd: TextIO, skip_json_loads: bool = False, logging: bool = False
515
559
  ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
516
- return loads(fp.read(), skip_json_loads, logging)
560
+ """
561
+ This function works like `json.load()` except that it will fix your JSON in the process.
562
+ It is a wrapper around the `repair_json()` function with `json_fd=fd` and `return_objects=True`.
563
+ """
564
+ return repair_json(json_fd=fd, skip_json_loads=skip_json_loads, logging=logging)
517
565
 
518
566
 
519
567
  def from_file(
520
568
  filename: str, skip_json_loads: bool = False, logging: bool = False
521
569
  ) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
570
+ """
571
+ This function is a wrapper around `load()` so you can pass the filename as string
572
+ """
522
573
  fd = open(filename)
523
574
  jsonobj = load(fd, skip_json_loads, logging)
524
575
  fd.close()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: json_repair
3
- Version: 0.16.3
3
+ Version: 0.17.1
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -0,0 +1,7 @@
1
+ json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
2
+ json_repair/json_repair.py,sha256=STzwcsoAV8jB1hXQXKs9vYMhemV22vCiH14jyVG4v4A,23311
3
+ json_repair-0.17.1.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
4
+ json_repair-0.17.1.dist-info/METADATA,sha256=LdjjpdQsJ1WuyQ28Z36cvDfMJE91lO4iHV2NhQ_RqNc,7355
5
+ json_repair-0.17.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
6
+ json_repair-0.17.1.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
7
+ json_repair-0.17.1.dist-info/RECORD,,
@@ -1,7 +0,0 @@
1
- json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
2
- json_repair/json_repair.py,sha256=Z1BiZlCBWDGiZeARAMcQ-PYRJE5PHFeTDGLLTEVg4fs,21822
3
- json_repair-0.16.3.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
4
- json_repair-0.16.3.dist-info/METADATA,sha256=0qkkPNvtdSpJk7c8fwnRMhStZKjFs6MoUVwYJwUxv7M,7355
5
- json_repair-0.16.3.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
6
- json_repair-0.16.3.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
7
- json_repair-0.16.3.dist-info/RECORD,,