json-repair 0.46.1__py3-none-any.whl → 0.47.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
json_repair/__init__.py CHANGED
@@ -1,4 +1,3 @@
- from .json_repair import from_file as from_file
- from .json_repair import load as load
- from .json_repair import loads as loads
- from .json_repair import repair_json as repair_json
+ from .json_repair import from_file, load, loads, repair_json
+
+ __all__ = ["from_file", "load", "loads", "repair_json"]
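The re-export cleanup above does not change the public surface; the same four helpers are imported in one statement and now declared in __all__. For orientation, a minimal usage sketch of those four functions (the input strings are illustrative, not taken from the package's tests):

    from json_repair import repair_json, loads, load, from_file

    broken = '{"name": "Alice", "age": 30,}'  # note the trailing comma
    print(repair_json(broken))  # returns the repaired JSON document as a string
    print(loads(broken))        # returns the parsed Python object instead
    # load(fd) does the same for an open file object; from_file(path) opens the file itself.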
json_repair/json_parser.py CHANGED
@@ -105,14 +105,10 @@ class JSONParser:
  )
  return ""
  # <string> starts with a quote
- elif not self.context.empty and (
- char in self.STRING_DELIMITERS or char.isalpha()
- ):
+ elif not self.context.empty and (char in self.STRING_DELIMITERS or char.isalpha()):
  return self.parse_string()
  # <number> starts with [0-9] or minus
- elif not self.context.empty and (
- char.isdigit() or char == "-" or char == "."
- ):
+ elif not self.context.empty and (char.isdigit() or char == "-" or char == "."):
  return self.parse_number()
  elif char in ["#", "/"]:
  return self.parse_comment()
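This hunk only reflows two conditions onto single lines, but it is the dispatch that routes unquoted words to parse_string() and bare digits, minus signs, or dots to parse_number(). A small illustrative call (expected output inferred from that dispatch, not a recorded test from this release):

    from json_repair import repair_json

    # "success" is unquoted and ".5" has no leading zero; both are repaired on parse.
    print(repair_json('{"status": success, "score": .5}'))
    # expected output: {"status": "success", "score": 0.5}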
@@ -164,8 +160,7 @@ class JSONParser:
  if isinstance(prev_value, list):
  prev_value.extend(
  new_array[0]
- if len(new_array) == 1
- and isinstance(new_array[0], list)
+ if len(new_array) == 1 and isinstance(new_array[0], list)
  else new_array
  )
  self.skip_whitespaces_at()
@@ -185,11 +180,7 @@ class JSONParser:
  )
  self.index = rollback_index - 1
  # add an opening curly brace to make this work
- self.json_str = (
- self.json_str[: self.index + 1]
- + "{"
- + self.json_str[self.index + 1 :]
- )
+ self.json_str = self.json_str[: self.index + 1] + "{" + self.json_str[self.index + 1 :]
  break

  # Skip filler whitespaces
@@ -242,10 +233,7 @@ class JSONParser:
  i = 1
  i = self.skip_to_character(char, i)
  i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
- if self.get_char_at(i) == ":":
- value = self.parse_object()
- else:
- value = self.parse_string()
+ value = self.parse_object() if self.get_char_at(i) == ":" else self.parse_string()
  else:
  value = self.parse_json()

@@ -307,10 +295,7 @@ class JSONParser:
  elif char.isalnum():
  # This could be a <boolean> and not a string. Because (T)rue or (F)alse or (N)ull are valid
  # But remember, object keys are only of type string
- if (
- char.lower() in ["t", "f", "n"]
- and self.context.current != ContextValues.OBJECT_KEY
- ):
+ if char.lower() in ["t", "f", "n"] and self.context.current != ContextValues.OBJECT_KEY:
  value = self.parse_boolean_or_null()
  if value != "":
  return value
@@ -323,15 +308,9 @@ class JSONParser:
  self.index += 1

  # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
- if (
- self.get_char_at() in self.STRING_DELIMITERS
- and self.get_char_at() == lstring_delimiter
- ):
+ if self.get_char_at() in self.STRING_DELIMITERS and self.get_char_at() == lstring_delimiter:
  # If it's an empty key, this was easy
- if (
- self.context.current == ContextValues.OBJECT_KEY
- and self.get_char_at(1) == ":"
- ):
+ if self.context.current == ContextValues.OBJECT_KEY and self.get_char_at(1) == ":":
  self.index += 1
  return ""
  if self.get_char_at(1) == lstring_delimiter:
@@ -380,23 +359,20 @@ class JSONParser:
  char = self.get_char_at()
  unmatched_delimiter = False
  while char and char != rstring_delimiter:
- if (
- missing_quotes
- and self.context.current == ContextValues.OBJECT_KEY
- and (char == ":" or char.isspace())
- ):
+ if missing_quotes and self.context.current == ContextValues.OBJECT_KEY and (char == ":" or char.isspace()):
  self.log(
  "While parsing a string missing the left delimiter in object key context, we found a :, stopping here",
  )
  break
  if (
- (missing_quotes or not self.stream_stable)
+ not self.stream_stable
  and self.context.current == ContextValues.OBJECT_VALUE
  and char
  in [
  ",",
  "}",
  ]
+ and string_acc[-1] != rstring_delimiter
  ):
  rstring_delimiter_missing = True
  # check if this is a case in which the closing comma is NOT missing instead
@@ -421,9 +397,7 @@ class JSONParser:
  else:
  # But again, this could just be something a bit stupid like "lorem, "ipsum" sic"
  # Check if we find a : afterwards (skipping space)
- i = self.skip_whitespaces_at(
- idx=i + 1, move_main_index=False
- )
+ i = self.skip_whitespaces_at(idx=i + 1, move_main_index=False)
  next_c = self.get_char_at(i)
  if next_c and next_c != ":":
  rstring_delimiter_missing = False
@@ -461,9 +435,10 @@ class JSONParser:
  )
  break
  if (
- (missing_quotes or not self.stream_stable)
+ not self.stream_stable
  and char == "]"
  and ContextValues.ARRAY in self.context.context
+ and string_acc[-1] != rstring_delimiter
  ):
  # We found the end of an array and we are in array context
  # So let's check if we find a rstring_delimiter forward otherwise end early
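Both stream_stable hunks tighten the same heuristic: with stream_stable disabled, a string value whose closing quote never arrives is terminated at the next "," or "}" (or "]" in array context), and the new string_acc[-1] != rstring_delimiter guard avoids re-triggering when the accumulator already ends with the delimiter. A hedged illustration of that repair path (output inferred from the heuristic, not verified against the released wheel):

    from json_repair import repair_json

    # The closing quote after Bob is missing; the parser closes the string at the brace.
    print(repair_json('{"name": "Bob}'))
    # expected output: {"name": "Bob"}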
@@ -483,15 +458,30 @@ class JSONParser:
  if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
  string_acc = string_acc[:-1]
  escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
- string_acc += escape_seqs.get(char, char) or char
+ string_acc += escape_seqs.get(char, char)
  self.index += 1
  char = self.get_char_at()
+ while char and string_acc[-1] == "\\" and char in [rstring_delimiter, "\\"]:
+ # this is a bit of a special case, if I don't do this it will close the loop or create a train of \\
+ # I don't love it though
+ string_acc = string_acc[:-1]
+ string_acc += char
+ self.index += 1
+ char = self.get_char_at()
+ continue
+ elif char in ["u", "x"]:
+ # If we find a unicode escape sequence, normalize it
+ num_chars = 4 if char == "u" else 2
+ next_chars = self.json_str[self.index + 1 : self.index + 1 + num_chars]
+ if len(next_chars) == num_chars and all(c in "0123456789abcdefABCDEF" for c in next_chars):
+ self.log("Found a unicode escape sequence, normalizing it")
+ string_acc = string_acc[:-1]
+ string_acc += chr(int(next_chars, 16))
+ self.index += 1 + num_chars
+ char = self.get_char_at()
+ continue
  # If we are in object key context and we find a colon, it could be a missing right quote
- if (
- char == ":"
- and not missing_quotes
- and self.context.current == ContextValues.OBJECT_KEY
- ):
+ if char == ":" and not missing_quotes and self.context.current == ContextValues.OBJECT_KEY:
  # Ok now we need to check if this is followed by a value like "..."
  i = self.skip_to_character(character=lstring_delimiter, idx=1)
  next_c = self.get_char_at(i)
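The new elif branch is the substantive change in this hunk: a backslash followed by u (four hex digits) or x (two hex digits) is now collapsed into the character it encodes via chr(int(next_chars, 16)). A minimal sketch of the effect (expected output inferred from the new branch, not from the package's test suite):

    from json_repair import repair_json

    # \u0041 is "A" and \x42 is "B"; \xNN is not a valid JSON escape, so the repair path handles it.
    print(repair_json('{"initials": "\\u0041\\x42"}'))
    # expected output: {"initials": "AB"}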
@@ -519,17 +509,12 @@ class JSONParser:
  )
  break
  # ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
- if char == rstring_delimiter:
+ if char == rstring_delimiter and string_acc[-1] != "\\":
  # Special case here, in case of double quotes one after another
  if doubled_quotes and self.get_char_at(1) == rstring_delimiter:
- self.log(
- "While parsing a string, we found a doubled quote, ignoring it"
- )
+ self.log("While parsing a string, we found a doubled quote, ignoring it")
  self.index += 1
- elif (
- missing_quotes
- and self.context.current == ContextValues.OBJECT_VALUE
- ):
+ elif missing_quotes and self.context.current == ContextValues.OBJECT_VALUE:
  # In case of missing starting quote I need to check if the delimeter is the end or the beginning of a key
  i = 1
  next_c = self.get_char_at(i)
@@ -573,18 +558,9 @@ class JSONParser:
  check_comma_in_object_value = False
  # If we are in an object context, let's check for the right delimiters
  if (
- (
- ContextValues.OBJECT_KEY in self.context.context
- and next_c in [":", "}"]
- )
- or (
- ContextValues.OBJECT_VALUE in self.context.context
- and next_c == "}"
- )
- or (
- ContextValues.ARRAY in self.context.context
- and next_c in ["]", ","]
- )
+ (ContextValues.OBJECT_KEY in self.context.context and next_c in [":", "}"])
+ or (ContextValues.OBJECT_VALUE in self.context.context and next_c == "}")
+ or (ContextValues.ARRAY in self.context.context and next_c in ["]", ","])
  or (
  check_comma_in_object_value
  and self.context.current == ContextValues.OBJECT_VALUE
@@ -595,10 +571,7 @@ class JSONParser:
  i += 1
  next_c = self.get_char_at(i)
  # If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
- if (
- next_c == ","
- and self.context.current == ContextValues.OBJECT_VALUE
- ):
+ if next_c == "," and self.context.current == ContextValues.OBJECT_VALUE:
  i += 1
  i = self.skip_to_character(character=rstring_delimiter, idx=i)
  next_c = self.get_char_at(i)
@@ -606,29 +579,20 @@ class JSONParser:
  )
  i += 1
  i = self.skip_whitespaces_at(idx=i, move_main_index=False)
- elif (
- next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\"
- ):
+ elif next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\":
  # Check if self.index:self.index+i is only whitespaces, break if that's the case
- if all(
- str(self.get_char_at(j)).isspace()
- for j in range(1, i)
- if self.get_char_at(j)
- ):
+ if all(str(self.get_char_at(j)).isspace() for j in range(1, i) if self.get_char_at(j)):
  break
  if self.context.current == ContextValues.OBJECT_VALUE:
  # But this might not be it! This could be just a missing comma
  # We found a delimiter and we need to check if this is a key
  # so find a rstring_delimiter and a colon after
- i = self.skip_to_character(
- character=rstring_delimiter, idx=i + 1
- )
+ i = self.skip_to_character(character=rstring_delimiter, idx=i + 1)
  i += 1
  next_c = self.get_char_at(i)
  while next_c and next_c != ":":
  if next_c in [",", "]", "}"] or (
- next_c == rstring_delimiter
- and self.get_char_at(i - 1) != "\\"
+ next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\"
  ):
  break
  i += 1
@@ -661,12 +625,7 @@ class JSONParser:
  string_acc += str(char)
  self.index += 1
  char = self.get_char_at()
- if (
- char
- and missing_quotes
- and self.context.current == ContextValues.OBJECT_KEY
- and char.isspace()
- ):
+ if char and missing_quotes and self.context.current == ContextValues.OBJECT_KEY and char.isspace():
  self.log(
  "While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
  )
@@ -686,9 +645,7 @@ class JSONParser:
  else:
  self.index += 1

- if not self.stream_stable and (
- missing_quotes or (string_acc and string_acc[-1] == "\n")
- ):
+ if not self.stream_stable and (missing_quotes or (string_acc and string_acc[-1] == "\n")):
  # Clean the whitespaces for some corner cases
  string_acc = string_acc.rstrip()

@@ -796,9 +753,7 @@ class JSONParser:
  while True:
  char = self.get_char_at()
  if not char:
- self.log(
- "Reached end-of-string while parsing block comment; unclosed block comment."
- )
+ self.log("Reached end-of-string while parsing block comment; unclosed block comment.")
  break
  comment += char
  self.index += 1
json_repair/json_repair.py CHANGED
@@ -236,10 +236,7 @@ def cli(inline_args: list[str] | None = None) -> int:
  help="Number of spaces for indentation (Default 2)",
  )

- if inline_args is None: # pragma: no cover
- args = parser.parse_args()
- else:
- args = parser.parse_args(inline_args)
+ args = parser.parse_args() if inline_args is None else parser.parse_args(inline_args)

  # Inline mode requires a filename, so error out if none was provided.
  if args.inline and not args.filename: # pragma: no cover
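The collapsed branch keeps the same contract: inline_args=None falls back to sys.argv (the path excluded from coverage), while passing a list bypasses it, which is convenient for driving the CLI from tests or scripts. A sketch of that second path ("broken.json" is a hypothetical file name; the positional filename and the --indent flag are the ones visible in this hunk):

    from json_repair.json_repair import cli

    # Returns an int exit code, per the signature in the hunk header.
    exit_code = cli(inline_args=["broken.json", "--indent", "2"])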
json_repair/object_comparer.py CHANGED
@@ -30,10 +30,7 @@ class ObjectComparer: # pragma: no cover
  elif isinstance(obj1, list):
  if len(obj1) != len(obj2):
  return False
- for i in range(len(obj1)):
- if not ObjectComparer.is_same_object(obj1[i], obj2[i]):
- return False
- return True
+ return all(ObjectComparer.is_same_object(obj1[i], obj2[i]) for i in range(len(obj1)))

  # For atoms: types already match, so just return True
  return True
json_repair/string_file_wrapper.py CHANGED
@@ -4,7 +4,7 @@ from typing import TextIO

  class StringFileWrapper:
  # This is a trick to simplify the code, transform the filedescriptor handling into a string handling
- def __init__(self, fd: TextIO, CHUNK_LENGTH: int) -> None:
+ def __init__(self, fd: TextIO, chunk_length: int) -> None:
  """
  Initialize the StringFileWrapper with a file descriptor and chunk length.

@@ -23,10 +23,10 @@ class StringFileWrapper:
  # Buffers are 1MB strings that are read from the file
  # and kept in memory to keep reads low
  self.buffers: dict[int, str] = {}
- # CHUNK_LENGTH is in bytes
- if not CHUNK_LENGTH or CHUNK_LENGTH < 2:
- CHUNK_LENGTH = 1_000_000
- self.buffer_length = CHUNK_LENGTH
+ # chunk_length is in bytes
+ if not chunk_length or chunk_length < 2:
+ chunk_length = 1_000_000
+ self.buffer_length = chunk_length

  def get_buffer(self, index: int) -> str:
  """
@@ -65,19 +65,11 @@ class StringFileWrapper:
  buffer_index = index.start // self.buffer_length
  buffer_end = index.stop // self.buffer_length
  if buffer_index == buffer_end:
- return self.get_buffer(buffer_index)[
- index.start % self.buffer_length : index.stop % self.buffer_length
- ]
+ return self.get_buffer(buffer_index)[index.start % self.buffer_length : index.stop % self.buffer_length]
  else:
- start_slice = self.get_buffer(buffer_index)[
- index.start % self.buffer_length :
- ]
- end_slice = self.get_buffer(buffer_end)[
- : index.stop % self.buffer_length
- ]
- middle_slices = [
- self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)
- ]
+ start_slice = self.get_buffer(buffer_index)[index.start % self.buffer_length :]
+ end_slice = self.get_buffer(buffer_end)[: index.stop % self.buffer_length]
+ middle_slices = [self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)]
  return start_slice + "".join(middle_slices) + end_slice
  else:
  buffer_index = index // self.buffer_length
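Apart from the rename to snake_case, behaviour is unchanged: a falsy or sub-2 chunk_length still falls back to 1 MB buffers, and slicing still stitches adjacent buffers together. A heavily hedged sketch of how this wrapper is typically reached from the public API (the chunk_length keyword on load is assumed here, not shown by this diff):

    import json_repair

    with open("big_broken.json") as fd:  # hypothetical file
        # chunk_length would be forwarded to StringFileWrapper; 0 falls back to the 1 MB default.
        data = json_repair.load(fd, chunk_length=0)  # assumed keyword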
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: json_repair
- Version: 0.46.1
+ Version: 0.47.0
  Summary: A package to repair broken json strings
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
  License: MIT License
@@ -0,0 +1,14 @@
+ json_repair/__init__.py,sha256=6FDD6dEVM5Pb5o4Zodgw4ex30Hzy-YvNRy0vts9SQ4I,118
+ json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
+ json_repair/json_context.py,sha256=WsMOjqpGSr6aaDONcrk8UFtTurzWon2Qq9AoBBYseoI,934
+ json_repair/json_parser.py,sha256=YBi07AfBGoZ54locsc6j1Y7WfdretFzmt0wXDEWwRo8,40321
+ json_repair/json_repair.py,sha256=pyH5fCkS1lyNPVjkqXerQ91lBz3eTHDPgV1QtnvJm-Y,11243
+ json_repair/object_comparer.py,sha256=LlIF0MisRglzC-CiG5AxAEDCBWBHeJd-6uXYx0uRmCk,1175
+ json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ json_repair/string_file_wrapper.py,sha256=tGkWBEUPE-CZPf4uSM5NE9oSDTpskX0myJiXsl-gbds,4333
+ json_repair-0.47.0.dist-info/licenses/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
+ json_repair-0.47.0.dist-info/METADATA,sha256=HDyogQyOe0FUVMSnZ-_wm9HlOzXWkRyp4zjQPgZCfMU,12208
+ json_repair-0.47.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ json_repair-0.47.0.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
+ json_repair-0.47.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
+ json_repair-0.47.0.dist-info/RECORD,,
@@ -1,14 +0,0 @@
- json_repair/__init__.py,sha256=c4L2kZrHvWEKfj_ODU2naliNuvU6FlFVxtF0hbLe6s8,178
- json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
- json_repair/json_context.py,sha256=WsMOjqpGSr6aaDONcrk8UFtTurzWon2Qq9AoBBYseoI,934
- json_repair/json_parser.py,sha256=7IPu-tin9jLX_y1F9tn3UVpqILARhZYFaTTvq9xrLnU,40451
- json_repair/json_repair.py,sha256=9wxf0vVNfr_RNQI1rbVPvxQ9feEwwvgnvkiYXwGEBX8,11292
- json_repair/object_comparer.py,sha256=5-LK-s_2MAHddTxqXSzSkaIFvPXKGLh6swC1gyN74Lk,1245
- json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- json_repair/string_file_wrapper.py,sha256=uwW4B1s9Cf-iF3ANsCz-RPu2ddCqDETrt8bdojh8ufA,4485
- json_repair-0.46.1.dist-info/licenses/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
- json_repair-0.46.1.dist-info/METADATA,sha256=y-p_aOKtX4eu7p-JNj6IO3s8svB06IityZRnRKEN_xE,12208
- json_repair-0.46.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- json_repair-0.46.1.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
- json_repair-0.46.1.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
- json_repair-0.46.1.dist-info/RECORD,,