json-repair 0.28.3__py3-none-any.whl → 0.29.0__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
+ from .json_repair import cli
2
+
3
+ if __name__ == "__main__":
4
+ cli()
@@ -22,26 +22,62 @@ If something is wrong (a missing parentheses or quotes for example) it will use
22
22
  All supported use cases are in the unit tests
23
23
  """
24
24
 
25
+ import argparse
25
26
  import os
27
+ import sys
26
28
  import json
27
29
  from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
28
30
 
29
31
 
30
32
  class StringFileWrapper:
31
33
  # This is a trick to simplify the code, transform the filedescriptor handling into a string handling
32
- def __init__(self, fd: TextIO) -> None:
34
+ def __init__(self, fd: TextIO, CHUNK_LENGTH: int) -> None:
33
35
  self.fd = fd
34
36
  self.length: int = 0
37
+ # Buffers are 1MB strings that are read from the file
38
+ # and kept in memory to keep reads low
39
+ self.buffers: dict[int, str] = {}
40
+ # CHUNK_LENGTH is in characters: fd is a TextIO, whose read() counts characters, not bytes
41
+ if not CHUNK_LENGTH or CHUNK_LENGTH < 2:
42
+ CHUNK_LENGTH = 1_000_000
43
+ self.buffer_length = CHUNK_LENGTH
44
+
45
+ def get_buffer(self, index: int) -> str:
46
+ if self.buffers.get(index) is None:
47
+ self.fd.seek(index * self.buffer_length)
48
+ self.buffers[index] = self.fd.read(self.buffer_length)
49
+ # Save memory by keeping max 2MB buffer chunks and min 2 chunks
50
+ if len(self.buffers) > max(2, 2_000_000 / self.buffer_length):
51
+ oldest_key = next(iter(self.buffers))
52
+ if oldest_key != index:
53
+ self.buffers.pop(oldest_key)
54
+ return self.buffers[index]
35
55
 
36
56
  def __getitem__(self, index: Union[int, slice]) -> str:
57
+ # The buffers are addressed like rows of RAM:
58
+ # self.buffers[index]: the row in the array of length 1MB, index is `i` // CHUNK_LENGTH (integer division)
59
+ # self.buffers[index][j]: the column within that row, j is `i` % CHUNK_LENGTH (the remainder)
37
60
  if isinstance(index, slice):
38
- self.fd.seek(index.start)
39
- value = self.fd.read(index.stop - index.start)
40
- self.fd.seek(index.start)
41
- return value
61
+ buffer_index = index.start // self.buffer_length
62
+ buffer_end = index.stop // self.buffer_length
63
+ if buffer_index == buffer_end:
64
+ return self.get_buffer(buffer_index)[
65
+ index.start % self.buffer_length : index.stop % self.buffer_length
66
+ ]
67
+ else:
68
+ start_slice = self.get_buffer(buffer_index)[
69
+ index.start % self.buffer_length :
70
+ ]
71
+ end_slice = self.get_buffer(buffer_end)[
72
+ : index.stop % self.buffer_length
73
+ ]
74
+ middle_slices = [
75
+ self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)
76
+ ]
77
+ return start_slice + "".join(middle_slices) + end_slice
42
78
  else:
43
- self.fd.seek(index)
44
- return self.fd.read(1)
79
+ buffer_index = index // self.buffer_length
80
+ return self.get_buffer(buffer_index)[index % self.buffer_length]
45
81
 
46
82
  def __len__(self) -> int:
47
83
  if self.length < 1:
@@ -69,13 +105,14 @@ class JSONParser:
69
105
  json_str: Union[str, StringFileWrapper],
70
106
  json_fd: Optional[TextIO],
71
107
  logging: Optional[bool],
108
+ json_fd_chunk_length: int = 0,
72
109
  ) -> None:
73
110
  # The string to parse
74
111
  self.json_str = json_str
75
112
  # Alternatively, the file description with a json file in it
76
113
  if json_fd:
77
114
  # This is a trick we do to treat the file wrapper as an array
78
- self.json_str = StringFileWrapper(json_fd)
115
+ self.json_str = StringFileWrapper(json_fd, json_fd_chunk_length)
79
116
  # Index is our iterator that will keep track of which character we are looking at right now
80
117
  self.index: int = 0
81
118
  # This is used in the object member parsing to manage the special cases of missing quotes in key or value
@@ -639,6 +676,7 @@ def repair_json(
639
676
  logging: bool = False,
640
677
  json_fd: Optional[TextIO] = None,
641
678
  ensure_ascii: bool = True,
679
+ chunk_length: int = 0,
642
680
  ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
643
681
  """
644
682
  Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
@@ -647,7 +685,7 @@ def repair_json(
647
685
  When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
648
686
  When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
649
687
  """
650
- parser = JSONParser(json_str, json_fd, logging)
688
+ parser = JSONParser(json_str, json_fd, logging, chunk_length)
651
689
  if skip_json_loads:
652
690
  parsed_json = parser.parse()
653
691
  else:
@@ -683,7 +721,10 @@ def loads(
683
721
 
684
722
 
685
723
  def load(
686
- fd: TextIO, skip_json_loads: bool = False, logging: bool = False
724
+ fd: TextIO,
725
+ skip_json_loads: bool = False,
726
+ logging: bool = False,
727
+ chunk_length: int = 0,
687
728
  ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
688
729
  """
689
730
  This function works like `json.load()` except that it will fix your JSON in the process.
@@ -691,6 +732,7 @@ def load(
691
732
  """
692
733
  return repair_json(
693
734
  json_fd=fd,
735
+ chunk_length=chunk_length,
694
736
  return_objects=True,
695
737
  skip_json_loads=skip_json_loads,
696
738
  logging=logging,
@@ -701,12 +743,62 @@ def from_file(
701
743
  filename: str,
702
744
  skip_json_loads: bool = False,
703
745
  logging: bool = False,
746
+ chunk_length: int = 0,
704
747
  ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
705
748
  """
706
749
  This function is a wrapper around `load()` so you can pass the filename as string
707
750
  """
708
751
  fd = open(filename)
709
- jsonobj = load(fd, skip_json_loads, logging)
752
+ jsonobj = load(
753
+ fd=fd,
754
+ skip_json_loads=skip_json_loads,
755
+ logging=logging,
756
+ chunk_length=chunk_length,
757
+ )
710
758
  fd.close()
711
759
 
712
760
  return jsonobj
761
+
762
+
763
+ def cli(): # pragma: no cover
764
+ parser = argparse.ArgumentParser(description="Repair and parse JSON files.")
765
+ parser.add_argument("filename", help="The JSON file to repair")
766
+ parser.add_argument(
767
+ "-i",
768
+ "--inline",
769
+ action="store_true",
770
+ help="Replace the file inline instead of returning the output to stdout",
771
+ )
772
+ parser.add_argument(
773
+ "--ensure_ascii",
774
+ action="store_true",
775
+ help="Pass the ensure_ascii parameter to json.dumps()",
776
+ )
777
+ parser.add_argument(
778
+ "--indent",
779
+ type=int,
780
+ default=2,
781
+ help="Number of spaces for indentation (Default 2)",
782
+ )
783
+
784
+ args = parser.parse_args()
785
+
786
+ ensure_ascii = False
787
+ if args.ensure_ascii:
788
+ ensure_ascii = True
789
+ try:
790
+ result = from_file(args.filename)
791
+
792
+ if args.inline:
793
+ fd = open(args.filename, mode="w")
794
+ json.dump(result, fd, indent=args.indent, ensure_ascii=ensure_ascii)
795
+ fd.close()
796
+ else:
797
+ print(json.dumps(result, indent=args.indent, ensure_ascii=ensure_ascii))
798
+ except Exception as e:
799
+ print(f"Error: {str(e)}", file=sys.stderr)
800
+ sys.exit(1)
801
+
802
+
803
+ if __name__ == "__main__": # pragma: no cover
804
+ cli()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: json_repair
3
- Version: 0.28.3
3
+ Version: 0.29.0
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -45,6 +45,21 @@ This simple package can be used to fix an invalid json string. To know all cases
45
45
 
46
46
  Inspired by https://github.com/josdejong/jsonrepair
47
47
 
48
+ ---
49
+ # How to cite
50
+ If you are using this library in your academic work (as I know many folks are), please find the BibTeX here:
51
+
52
+ @software{Baccianella_JSON_Repair_-_2024,
53
+ author = {Baccianella, Stefano},
54
+ month = aug,
55
+ title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
56
+ url = {https://github.com/mangiucugna/json_repair},
57
+ version = {0.28.3},
58
+ year = {2024}
59
+ }
60
+
61
+ Thank you for citing my work and please send me a link to the paper if you can!
62
+
48
63
  ---
49
64
  # Offer me a beer
50
65
  If you find this library useful, you can help me by donating toward my monthly beer budget here: https://github.com/sponsors/mangiucugna
@@ -82,6 +97,18 @@ or just
82
97
 
83
98
  decoded_object = json_repair.repair_json(json_string, return_objects=True)
84
99
 
100
+ ### Avoid this antipattern
101
+ Some users of this library adopt the following pattern:
102
+
103
+ obj = {}
104
+ try:
105
+ obj = json.loads(string)
106
+ except json.JSONDecodeError as e:
107
+ obj = json_repair.loads(string)
108
+ ...
109
+
110
+ This is wasteful because `json_repair` already verifies for you whether the JSON is valid. If you still want to do that, add `skip_json_loads=True` to the call as explained in the section below.
111
+
85
112
  ### Read json from a file or file descriptor
86
113
 
87
114
  JSON repair also provides a drop-in replacement for `json.load()`:
@@ -122,6 +149,32 @@ Some rules of thumb to use:
122
149
  - Setting `return_objects=True` will always be faster because the parser returns an object already and it doesn't have to serialize that object to JSON
123
150
  - `skip_json_loads` is faster only if you 100% know that the string is not a valid JSON
124
151
  - If you are having issues with escaping pass the string as **raw** string like: `r"string with escaping\""`
152
+
153
+ ### Use json_repair from CLI
154
+
155
+ Install the library for command-line with:
156
+ ```
157
+ pipx install json-repair
158
+ ```
159
+ then run
160
+ ```
161
+ $ json_repair -h
162
+
163
+ usage: json_repair [-h] [-i] [--ensure_ascii] [--indent INDENT] filename
164
+
165
+ Repair and parse JSON files.
166
+
167
+ positional arguments:
168
+ filename The JSON file to repair
169
+
170
+ options:
171
+ -h, --help show this help message and exit
172
+ -i, --inline Replace the file inline instead of returning the output to stdout
173
+ --ensure_ascii Pass the ensure_ascii parameter to json.dumps()
174
+ --indent INDENT Number of spaces for indentation (Default 2)
175
+ ```
176
+ to learn how to use it
177
+
125
178
  ## Adding to requirements
126
179
  **Please pin this library only on the major version!**
127
180
 
@@ -0,0 +1,10 @@
1
+ json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
2
+ json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
3
+ json_repair/json_repair.py,sha256=hltJ3Qa4qFbUD3mVKkYvFWksnCcIZqx8zamKfBpjeNs,33538
4
+ json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ json_repair-0.29.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
6
+ json_repair-0.29.0.dist-info/METADATA,sha256=yh0EJo-I1u0R6X-Gq9ETz0WbgmuGIhzR7Icw9W4Kee0,9630
7
+ json_repair-0.29.0.dist-info/WHEEL,sha256=uCRv0ZEik_232NlR4YDw4Pv3Ajt5bKvMH13NUU7hFuI,91
8
+ json_repair-0.29.0.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
9
+ json_repair-0.29.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
10
+ json_repair-0.29.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (72.2.0)
2
+ Generator: setuptools (74.1.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ json_repair = json_repair.__main__:cli
@@ -1,8 +0,0 @@
1
- json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
2
- json_repair/json_repair.py,sha256=QShXijcgNG3ejW_rBbmk0RMjJE1KlGvYBhXcQnMZcHo,30129
3
- json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- json_repair-0.28.3.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
5
- json_repair-0.28.3.dist-info/METADATA,sha256=ZIm82pnDJX68089RoprQJq-HrL2LF1LVr4xDTh_6VJI,8043
6
- json_repair-0.28.3.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
7
- json_repair-0.28.3.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
8
- json_repair-0.28.3.dist-info/RECORD,,