json-repair 0.28.3__py3-none-any.whl → 0.29.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ from .json_repair import cli
2
+
3
+ if __name__ == "__main__":
4
+ cli()
@@ -22,26 +22,62 @@ If something is wrong (a missing parentheses or quotes for example) it will use
22
22
  All supported use cases are in the unit tests
23
23
  """
24
24
 
25
+ import argparse
25
26
  import os
27
+ import sys
26
28
  import json
27
29
  from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
28
30
 
29
31
 
30
32
  class StringFileWrapper:
31
33
  # This is a trick to simplify the code: it turns file descriptor handling into string handling
32
- def __init__(self, fd: TextIO) -> None:
34
+ def __init__(self, fd: TextIO, CHUNK_LENGTH: int) -> None:
33
35
  self.fd = fd
34
36
  self.length: int = 0
37
+ # Buffers are CHUNK_LENGTH-sized strings (1MB by default) that are read from the file
38
+ # and kept in memory to keep the number of reads low
39
+ self.buffers: dict[int, str] = {}
40
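+ # CHUNK_LENGTH is the size passed to fd.read(); on a text stream that counts characters, not bytes (NOTE: it is also used as the seek() stride, which may not line up for multi-byte encodings - verify)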
41
+ if not CHUNK_LENGTH or CHUNK_LENGTH < 2:
42
+ CHUNK_LENGTH = 1_000_000
43
+ self.buffer_length = CHUNK_LENGTH
44
+
45
+ def get_buffer(self, index: int) -> str:
46
+ if self.buffers.get(index) is None:
47
+ self.fd.seek(index * self.buffer_length)
48
+ self.buffers[index] = self.fd.read(self.buffer_length)
49
+ # Save memory by keeping at most about 2MB worth of buffered chunks, but never fewer than 2 chunks
50
+ if len(self.buffers) > max(2, 2_000_000 / self.buffer_length):
51
+ oldest_key = next(iter(self.buffers))
52
+ if oldest_key != index:
53
+ self.buffers.pop(oldest_key)
54
+ return self.buffers[index]
35
55
 
36
56
  def __getitem__(self, index: Union[int, slice]) -> str:
57
+ # The buffers are addressed like paged memory: for an absolute position `i` in the file,
58
+ # self.buffers[index]: the chunk (row) that holds `i`, where index is `i` // buffer_length
59
+ # self.buffers[index][j]: the character (column) inside that chunk, where j is `i` % buffer_length
37
60
  if isinstance(index, slice):
38
- self.fd.seek(index.start)
39
- value = self.fd.read(index.stop - index.start)
40
- self.fd.seek(index.start)
41
- return value
61
+ buffer_index = index.start // self.buffer_length
62
+ buffer_end = index.stop // self.buffer_length
63
+ if buffer_index == buffer_end:
64
+ return self.get_buffer(buffer_index)[
65
+ index.start % self.buffer_length : index.stop % self.buffer_length
66
+ ]
67
+ else:
68
+ start_slice = self.get_buffer(buffer_index)[
69
+ index.start % self.buffer_length :
70
+ ]
71
+ end_slice = self.get_buffer(buffer_end)[
72
+ : index.stop % self.buffer_length
73
+ ]
74
+ middle_slices = [
75
+ self.get_buffer(i) for i in range(buffer_index + 1, buffer_end)
76
+ ]
77
+ return start_slice + "".join(middle_slices) + end_slice
42
78
  else:
43
- self.fd.seek(index)
44
- return self.fd.read(1)
79
+ buffer_index = index // self.buffer_length
80
+ return self.get_buffer(buffer_index)[index % self.buffer_length]
45
81
 
46
82
  def __len__(self) -> int:
47
83
  if self.length < 1:
@@ -69,13 +105,14 @@ class JSONParser:
69
105
  json_str: Union[str, StringFileWrapper],
70
106
  json_fd: Optional[TextIO],
71
107
  logging: Optional[bool],
108
+ json_fd_chunk_length: int = 0,
72
109
  ) -> None:
73
110
  # The string to parse
74
111
  self.json_str = json_str
75
112
  # Alternatively, the file descriptor with a json file in it
76
113
  if json_fd:
77
114
  # This is a trick we do to treat the file wrapper as an array
78
- self.json_str = StringFileWrapper(json_fd)
115
+ self.json_str = StringFileWrapper(json_fd, json_fd_chunk_length)
79
116
  # Index is our iterator that will keep track of which character we are looking at right now
80
117
  self.index: int = 0
81
118
  # This is used in the object member parsing to manage the special cases of missing quotes in key or value
@@ -639,6 +676,7 @@ def repair_json(
639
676
  logging: bool = False,
640
677
  json_fd: Optional[TextIO] = None,
641
678
  ensure_ascii: bool = True,
679
+ chunk_length: int = 0,
642
680
  ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
643
681
  """
644
682
  Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
@@ -647,7 +685,7 @@ def repair_json(
647
685
  When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
648
686
  When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
649
687
  """
650
- parser = JSONParser(json_str, json_fd, logging)
688
+ parser = JSONParser(json_str, json_fd, logging, chunk_length)
651
689
  if skip_json_loads:
652
690
  parsed_json = parser.parse()
653
691
  else:
@@ -683,7 +721,10 @@ def loads(
683
721
 
684
722
 
685
723
  def load(
686
- fd: TextIO, skip_json_loads: bool = False, logging: bool = False
724
+ fd: TextIO,
725
+ skip_json_loads: bool = False,
726
+ logging: bool = False,
727
+ chunk_length: int = 0,
687
728
  ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
688
729
  """
689
730
  This function works like `json.load()` except that it will fix your JSON in the process.
@@ -691,6 +732,7 @@ def load(
691
732
  """
692
733
  return repair_json(
693
734
  json_fd=fd,
735
+ chunk_length=chunk_length,
694
736
  return_objects=True,
695
737
  skip_json_loads=skip_json_loads,
696
738
  logging=logging,
@@ -701,12 +743,62 @@ def from_file(
701
743
  filename: str,
702
744
  skip_json_loads: bool = False,
703
745
  logging: bool = False,
746
+ chunk_length: int = 0,
704
747
  ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
705
748
  """
706
749
  This function is a wrapper around `load()` so you can pass the filename as a string
707
750
  """
708
751
  fd = open(filename)
709
- jsonobj = load(fd, skip_json_loads, logging)
752
+ jsonobj = load(
753
+ fd=fd,
754
+ skip_json_loads=skip_json_loads,
755
+ logging=logging,
756
+ chunk_length=chunk_length,
757
+ )
710
758
  fd.close()
711
759
 
712
760
  return jsonobj
761
+
762
+
763
+ def cli(): # pragma: no cover
764
+ parser = argparse.ArgumentParser(description="Repair and parse JSON files.")
765
+ parser.add_argument("filename", help="The JSON file to repair")
766
+ parser.add_argument(
767
+ "-i",
768
+ "--inline",
769
+ action="store_true",
770
+ help="Replace the file inline instead of returning the output to stdout",
771
+ )
772
+ parser.add_argument(
773
+ "--ensure_ascii",
774
+ action="store_true",
775
+ help="Pass the ensure_ascii parameter to json.dumps()",
776
+ )
777
+ parser.add_argument(
778
+ "--indent",
779
+ type=int,
780
+ default=2,
781
+ help="Number of spaces for indentation (Default 2)",
782
+ )
783
+
784
+ args = parser.parse_args()
785
+
786
+ ensure_ascii = False
787
+ if args.ensure_ascii:
788
+ ensure_ascii = True
789
+ try:
790
+ result = from_file(args.filename)
791
+
792
+ if args.inline:
793
+ fd = open(args.filename, mode="w")
794
+ json.dump(result, fd, indent=args.indent, ensure_ascii=ensure_ascii)
795
+ fd.close()
796
+ else:
797
+ print(json.dumps(result, indent=args.indent, ensure_ascii=ensure_ascii))
798
+ except Exception as e:
799
+ print(f"Error: {str(e)}", file=sys.stderr)
800
+ sys.exit(1)
801
+
802
+
803
+ if __name__ == "__main__": # pragma: no cover
804
+ cli()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: json_repair
3
- Version: 0.28.3
3
+ Version: 0.29.0
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -45,6 +45,21 @@ This simple package can be used to fix an invalid json string. To know all cases
45
45
 
46
46
  Inspired by https://github.com/josdejong/jsonrepair
47
47
 
48
+ ---
49
+ # How to cite
50
+ If you are using this library in your academic work (as I know many folks are), please find the BibTeX here:
51
+
52
+ @software{Baccianella_JSON_Repair_-_2024,
53
+ author = {Baccianella, Stefano},
54
+ month = aug,
55
+ title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
56
+ url = {https://github.com/mangiucugna/json_repair},
57
+ version = {0.28.3},
58
+ year = {2024}
59
+ }
60
+
61
+ Thank you for citing my work and please send me a link to the paper if you can!
62
+
48
63
  ---
49
64
  # Offer me a beer
50
65
  If you find this library useful, you can help me by donating toward my monthly beer budget here: https://github.com/sponsors/mangiucugna
@@ -82,6 +97,18 @@ or just
82
97
 
83
98
  decoded_object = json_repair.repair_json(json_string, return_objects=True)
84
99
 
100
+ ### Avoid this antipattern
101
+ Some users of this library adopt the following pattern:
102
+
103
+ obj = {}
104
+ try:
105
+ obj = json.loads(string)
106
+ except json.JSONDecodeError as e:
107
+ obj = json_repair.loads(string)
108
+ ...
109
+
110
+ This is wasteful because `json_repair` will already verify for you if the JSON is valid. If you still want to do that, then add `skip_json_loads=True` to the call as explained in the section below.
111
+
85
112
  ### Read json from a file or file descriptor
86
113
 
87
114
  JSON repair also provides a drop-in replacement for `json.load()`:
@@ -122,6 +149,32 @@ Some rules of thumb to use:
122
149
  - Setting `return_objects=True` will always be faster because the parser returns an object already and it doesn't have to serialize that object to JSON
123
150
  - `skip_json_loads` is faster only if you 100% know that the string is not a valid JSON
124
151
  - If you are having issues with escaping pass the string as **raw** string like: `r"string with escaping\""`
152
+
153
+ ### Use json_repair from CLI
154
+
155
+ Install the library for command-line use with:
156
+ ```
157
+ pipx install json-repair
158
+ ```
159
+ then run
160
+ ```
161
+ $ json_repair -h
162
+
163
+ usage: json_repair [-h] [-i] [--ensure_ascii] [--indent INDENT] filename
164
+
165
+ Repair and parse JSON files.
166
+
167
+ positional arguments:
168
+ filename The JSON file to repair
169
+
170
+ options:
171
+ -h, --help show this help message and exit
172
+ -i, --inline Replace the file inline instead of returning the output to stdout
173
+ --ensure_ascii Pass the ensure_ascii parameter to json.dumps()
174
+ --indent INDENT Number of spaces for indentation (Default 2)
175
+ ```
176
+ to learn how to use it
177
+
125
178
  ## Adding to requirements
126
179
  **Please pin this library only on the major version!**
127
180
 
@@ -0,0 +1,10 @@
1
+ json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
2
+ json_repair/__main__.py,sha256=EsJb-y89uZEvGQQg1GdIDWzfDwfOMvVekKEtdguQXCM,67
3
+ json_repair/json_repair.py,sha256=hltJ3Qa4qFbUD3mVKkYvFWksnCcIZqx8zamKfBpjeNs,33538
4
+ json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
5
+ json_repair-0.29.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
6
+ json_repair-0.29.0.dist-info/METADATA,sha256=yh0EJo-I1u0R6X-Gq9ETz0WbgmuGIhzR7Icw9W4Kee0,9630
7
+ json_repair-0.29.0.dist-info/WHEEL,sha256=uCRv0ZEik_232NlR4YDw4Pv3Ajt5bKvMH13NUU7hFuI,91
8
+ json_repair-0.29.0.dist-info/entry_points.txt,sha256=SNfge3zPSP-ASqriYU9r3NAPaXdseYr7ciPMKdV2uSw,57
9
+ json_repair-0.29.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
10
+ json_repair-0.29.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (72.2.0)
2
+ Generator: setuptools (74.1.1)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ json_repair = json_repair.__main__:cli
@@ -1,8 +0,0 @@
1
- json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
2
- json_repair/json_repair.py,sha256=QShXijcgNG3ejW_rBbmk0RMjJE1KlGvYBhXcQnMZcHo,30129
3
- json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- json_repair-0.28.3.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
5
- json_repair-0.28.3.dist-info/METADATA,sha256=ZIm82pnDJX68089RoprQJq-HrL2LF1LVr4xDTh_6VJI,8043
6
- json_repair-0.28.3.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
7
- json_repair-0.28.3.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
8
- json_repair-0.28.3.dist-info/RECORD,,