json-repair 0.39.0__py3-none-any.whl → 0.40.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,6 +9,7 @@ JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
9
9
  class JSONParser:
10
10
  # Constants
11
11
  STRING_DELIMITERS = ['"', "'", "“", "”"]
12
+ NUMBER_CHARS = set("0123456789-.eE/,")
12
13
 
13
14
  def __init__(
14
15
  self,
@@ -129,8 +130,6 @@ class JSONParser:
129
130
  # Context is used in the string parser to manage the lack of quotes
130
131
  self.context.set(ContextValues.OBJECT_KEY)
131
132
 
132
- self.skip_whitespaces_at()
133
-
134
133
  # Save this index in case we need find a duplicate key
135
134
  rollback_index = self.index
136
135
 
@@ -219,18 +218,13 @@ class JSONParser:
219
218
  char = self.get_char_at()
220
219
 
221
220
  # Especially at the end of an LLM generated json you might miss the last "]"
222
- char = self.get_char_at()
223
221
  if char and char != "]":
224
222
  self.log(
225
- "While parsing an array we missed the closing ], adding it back",
226
- )
227
- self.index -= 1
228
- # Add the missing closing bracket
229
- self.json_str = (
230
- self.json_str[: self.index + 1] + "]" + self.json_str[self.index + 1 :]
223
+ "While parsing an array we missed the closing ], ignoring it",
231
224
  )
232
225
 
233
226
  self.index += 1
227
+
234
228
  self.context.reset()
235
229
  return arr
236
230
 
@@ -275,15 +269,11 @@ class JSONParser:
275
269
  self.log(
276
270
  "While parsing a string, we found a literal instead of a quote",
277
271
  )
278
- self.log(
279
- "While parsing a string, we found no starting quote. Will add the quote back",
280
- )
281
272
  missing_quotes = True
282
273
 
283
274
  if not missing_quotes:
284
275
  self.index += 1
285
276
 
286
- self.skip_whitespaces_at()
287
277
  # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
288
278
  if self.get_char_at() in self.STRING_DELIMITERS:
289
279
  # If the next character is the same type of quote, then we manage it as double quotes
@@ -583,6 +573,13 @@ class JSONParser:
583
573
  elif (
584
574
  next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\"
585
575
  ):
576
+ # Check if self.index:self.index+i is only whitespaces, break if that's the case
577
+ if all(
578
+ str(self.get_char_at(j)).isspace()
579
+ for j in range(1, i)
580
+ if self.get_char_at(j)
581
+ ):
582
+ break
586
583
  if self.context.current == ContextValues.OBJECT_VALUE:
587
584
  # But this might not be it! This could be just a missing comma
588
585
  # We found a delimiter and we need to check if this is a key
@@ -610,27 +607,24 @@ class JSONParser:
610
607
  self.index += 1
611
608
  char = self.get_char_at()
612
609
  elif self.context.current == ContextValues.ARRAY:
613
- # In array context this could be something like "lorem "ipsum" sic"
614
- # So let's check if we find a rstring_delimiter forward otherwise end early
615
- i = self.skip_to_character(rstring_delimiter, idx=i + 1)
616
- next_c = self.get_char_at(i)
617
- if next_c and next_c == rstring_delimiter:
618
- # Ok now if I find a comma or a closing ], that can be have also an optional rstring_delimiter before them
619
- # We can consider this a misplaced quote
620
- i += 1
621
- i = self.skip_whitespaces_at(
622
- idx=i, move_main_index=False
623
- )
624
- next_c = self.get_char_at(i)
625
- if next_c and next_c in [",", "]"]:
626
- self.log(
627
- "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
628
- )
629
- unmatched_delimiter = not unmatched_delimiter
630
- string_acc += str(char)
631
- self.index += 1
632
- char = self.get_char_at()
633
-
610
+ # If we got up to here it means that this is a situation like this:
611
+ # ["bla bla bla "puppy" bla bla bla "kitty" bla bla"]
612
+ # So we need to ignore this quote
613
+ self.log(
614
+ "While parsing a string in Array context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
615
+ )
616
+ unmatched_delimiter = not unmatched_delimiter
617
+ string_acc += str(char)
618
+ self.index += 1
619
+ char = self.get_char_at()
620
+ elif self.context.current == ContextValues.OBJECT_KEY:
621
+ # In this case we just ignore this and move on
622
+ self.log(
623
+ "While parsing a string in Object Key context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
624
+ )
625
+ string_acc += str(char)
626
+ self.index += 1
627
+ char = self.get_char_at()
634
628
  if (
635
629
  char
636
630
  and missing_quotes
@@ -663,10 +657,9 @@ class JSONParser:
663
657
  def parse_number(self) -> Union[float, int, str, JSONReturnType]:
664
658
  # <number> is a valid real number expressed in one of a number of given formats
665
659
  number_str = ""
666
- number_chars = set("0123456789-.eE/,")
667
660
  char = self.get_char_at()
668
661
  is_array = self.context.current == ContextValues.ARRAY
669
- while char and char in number_chars and (char != "," or not is_array):
662
+ while char and char in self.NUMBER_CHARS and (not is_array or char != ","):
670
663
  number_str += char
671
664
  self.index += 1
672
665
  char = self.get_char_at()
@@ -712,51 +705,6 @@ class JSONParser:
712
705
  self.index = starting_index
713
706
  return ""
714
707
 
715
- def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
716
- # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
717
- try:
718
- return self.json_str[self.index + count]
719
- except IndexError:
720
- return False
721
-
722
- def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
723
- """
724
- This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
725
- """
726
- try:
727
- char = self.json_str[self.index + idx]
728
- except IndexError:
729
- return idx
730
- while char.isspace():
731
- if move_main_index:
732
- self.index += 1
733
- else:
734
- idx += 1
735
- try:
736
- char = self.json_str[self.index + idx]
737
- except IndexError:
738
- return idx
739
- return idx
740
-
741
- def skip_to_character(self, character: str, idx: int = 0) -> int:
742
- """
743
- This function quickly iterates to find a character, syntactic sugar to make the code more concise
744
- """
745
- try:
746
- char = self.json_str[self.index + idx]
747
- except IndexError:
748
- return idx
749
- while char != character:
750
- idx += 1
751
- try:
752
- char = self.json_str[self.index + idx]
753
- except IndexError:
754
- return idx
755
- if self.index + idx > 0 and self.json_str[self.index + idx - 1] == "\\":
756
- # Ah this is an escaped character, try again
757
- return self.skip_to_character(character=character, idx=idx + 1)
758
- return idx
759
-
760
708
  def parse_comment(self) -> str:
761
709
  """
762
710
  Parse code-like comments:
@@ -827,6 +775,51 @@ class JSONParser:
827
775
  self.index += 1
828
776
  return ""
829
777
 
778
+ def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
779
+ # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
780
+ try:
781
+ return self.json_str[self.index + count]
782
+ except IndexError:
783
+ return False
784
+
785
+ def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
786
+ """
787
+ This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
788
+ """
789
+ try:
790
+ char = self.json_str[self.index + idx]
791
+ except IndexError:
792
+ return idx
793
+ while char.isspace():
794
+ if move_main_index:
795
+ self.index += 1
796
+ else:
797
+ idx += 1
798
+ try:
799
+ char = self.json_str[self.index + idx]
800
+ except IndexError:
801
+ return idx
802
+ return idx
803
+
804
+ def skip_to_character(self, character: str, idx: int = 0) -> int:
805
+ """
806
+ This function quickly iterates to find a character, syntactic sugar to make the code more concise
807
+ """
808
+ try:
809
+ char = self.json_str[self.index + idx]
810
+ except IndexError:
811
+ return idx
812
+ while char != character:
813
+ idx += 1
814
+ try:
815
+ char = self.json_str[self.index + idx]
816
+ except IndexError:
817
+ return idx
818
+ if self.index + idx > 0 and self.json_str[self.index + idx - 1] == "\\":
819
+ # Ah this is an escaped character, try again
820
+ return self.skip_to_character(character=character, idx=idx + 1)
821
+ return idx
822
+
830
823
  def _log(self, text: str) -> None:
831
824
  window: int = 10
832
825
  start: int = max(self.index - window, 0)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: json_repair
3
- Version: 0.39.0
3
+ Version: 0.40.0
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -196,12 +196,12 @@ pipx install json-repair
196
196
  to know all options available:
197
197
  ```
198
198
  $ json_repair -h
199
- usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] filename
199
+ usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] [filename]
200
200
 
201
201
  Repair and parse JSON files.
202
202
 
203
203
  positional arguments:
204
- filename The JSON file to repair
204
+ filename The JSON file to repair (if omitted, reads from stdin)
205
205
 
206
206
  options:
207
207
  -h, --help show this help message and exit
@@ -226,13 +226,13 @@ In this example, any version that starts with `0.` will be acceptable, allowing
226
226
  # How to cite
227
227
  If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
228
228
 
229
- @software{Baccianella_JSON_Repair_-_2024,
230
- author = {Baccianella, Stefano},
231
- month = aug,
232
- title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
233
- url = {https://github.com/mangiucugna/json_repair},
234
- version = {0.28.3},
235
- year = {2024}
229
+ @software{Baccianella_JSON_Repair_-_2025,
230
+ author = "Stefano {Baccianella}",
231
+ month = "feb",
232
+ title = "JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs",
233
+ url = "https://github.com/mangiucugna/json_repair",
234
+ version = "0.39.1",
235
+ year = 2025
236
236
  }
237
237
 
238
238
  Thank you for citing my work and please send me a link to the paper if you can!
@@ -1,13 +1,13 @@
1
1
  json_repair/__init__.py,sha256=c4L2kZrHvWEKfj_ODU2naliNuvU6FlFVxtF0hbLe6s8,178
2
2
  json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
3
3
  json_repair/json_context.py,sha256=mm6dOyrPJ1sDskTORZSXCW7W9-5veMlUKqXQ3Hw3EG4,971
4
- json_repair/json_parser.py,sha256=BQsH8CRy59C2176bMwVerfqbHDXfLoEC1v5frmCiv7M,39020
4
+ json_repair/json_parser.py,sha256=aw-iCtblc9iL24w5zljHbbblK7Ao6G49MPoj513D2KE,38750
5
5
  json_repair/json_repair.py,sha256=k-5HRRlCqrxNmJi0u1KE3IUeL4HXqi1XZ7oAL-NFDLo,10314
6
6
  json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
7
7
  json_repair/string_file_wrapper.py,sha256=koZmdq2-Z5K7XF1bDqX6dEbNaVMJYcMTjq-aGe6NQvA,4526
8
- json_repair-0.39.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
9
- json_repair-0.39.0.dist-info/METADATA,sha256=cArvqcMBL9FVCwnJGtsaeF7lXWjOFWG3_1OueGjOiRs,11794
10
- json_repair-0.39.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
11
- json_repair-0.39.0.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
12
- json_repair-0.39.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
13
- json_repair-0.39.0.dist-info/RECORD,,
8
+ json_repair-0.40.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
9
+ json_repair-0.40.0.dist-info/METADATA,sha256=i43pAASjiIvd0XJ3CMO1nqaV14JNE2MjPx0U8lMJVYc,11838
10
+ json_repair-0.40.0.dist-info/WHEEL,sha256=beeZ86-EfXScwlR_HKu4SllMC9wUEj_8Z_4FJ3egI2w,91
11
+ json_repair-0.40.0.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
12
+ json_repair-0.40.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
13
+ json_repair-0.40.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (76.1.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5