json-repair 0.28.2__py3-none-any.whl → 0.28.4__py3-none-any.whl

Sign up to get free protection for your applications and to get access to all the features.
@@ -29,19 +29,52 @@ from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
29
29
 
30
30
  class StringFileWrapper:
31
31
  # This is a trick to simplify the code, transform the filedescriptor handling into a string handling
32
- def __init__(self, fd: TextIO) -> None:
32
+ def __init__(self, fd: TextIO, CHUNK_LENGTH: int) -> None:
33
33
  self.fd = fd
34
34
  self.length: int = 0
35
-
36
- def __getitem__(self, index: int | slice) -> str:
35
+ # Buffers are 1MB strings that are read from the file
36
+ # and kept in memory to keep reads low
37
+ self.buffers: dict[int, str] = {}
38
+ # CHUNK_LENGTH is in bytes
39
+ if not CHUNK_LENGTH or CHUNK_LENGTH < 2:
40
+ CHUNK_LENGTH = 1_000_000
41
+ self.buffer_length = CHUNK_LENGTH
42
+
43
+ def fill_buffer(self, index: int) -> None:
44
+ if self.buffers.get(index) is None:
45
+ self.fd.seek(index * self.buffer_length)
46
+ self.buffers[index] = self.fd.read(self.buffer_length)
47
+ # Save memory by keeping max 2MB buffer chunks and min 2 chunks
48
+ if len(self.buffers) > max(2, 2_000_000 / self.buffer_length):
49
+ oldest_key = next(iter(self.buffers))
50
+ self.buffers.pop(oldest_key)
51
+
52
+ def __getitem__(self, index: Union[int, slice]) -> str:
53
+ # The buffers are addressed like rows of RAM:
54
+ # self.buffers[index]: the row (one chunk of up to CHUNK_LENGTH), where index is `i` // CHUNK_LENGTH
55
+ # self.buffers[index][j]: the column within that row, where j is `i` % CHUNK_LENGTH
37
56
  if isinstance(index, slice):
38
- self.fd.seek(index.start)
39
- value = self.fd.read(index.stop - index.start)
40
- self.fd.seek(index.start)
41
- return value
57
+ buffer_index = index.start // self.buffer_length
58
+ buffer_end = index.stop // self.buffer_length
59
+ for i in range(buffer_index, buffer_end + 1):
60
+ self.fill_buffer(i)
61
+ if buffer_index == buffer_end:
62
+ return self.buffers[buffer_index][
63
+ index.start % self.buffer_length : index.stop % self.buffer_length
64
+ ]
65
+ else:
66
+ start_slice = self.buffers[buffer_index][
67
+ index.start % self.buffer_length :
68
+ ]
69
+ end_slice = self.buffers[buffer_end][: index.stop % self.buffer_length]
70
+ middle_slices = [
71
+ self.buffers[i] for i in range(buffer_index + 1, buffer_end)
72
+ ]
73
+ return start_slice + "".join(middle_slices) + end_slice
42
74
  else:
43
- self.fd.seek(index)
44
- return self.fd.read(1)
75
+ buffer_index = index // self.buffer_length
76
+ self.fill_buffer(buffer_index)
77
+ return self.buffers[buffer_index][index % self.buffer_length]
45
78
 
46
79
  def __len__(self) -> int:
47
80
  if self.length < 1:
@@ -69,13 +102,14 @@ class JSONParser:
69
102
  json_str: Union[str, StringFileWrapper],
70
103
  json_fd: Optional[TextIO],
71
104
  logging: Optional[bool],
105
+ json_fd_chunk_length: int = 0,
72
106
  ) -> None:
73
107
  # The string to parse
74
108
  self.json_str = json_str
75
109
  # Alternatively, the file descriptor with a json file in it
76
110
  if json_fd:
77
111
  # This is a trick we do to treat the file wrapper as an array
78
- self.json_str = StringFileWrapper(json_fd)
112
+ self.json_str = StringFileWrapper(json_fd, json_fd_chunk_length)
79
113
  # Index is our iterator that will keep track of which character we are looking at right now
80
114
  self.index: int = 0
81
115
  # This is used in the object member parsing to manage the special cases of missing quotes in key or value
@@ -639,6 +673,7 @@ def repair_json(
639
673
  logging: bool = False,
640
674
  json_fd: Optional[TextIO] = None,
641
675
  ensure_ascii: bool = True,
676
+ chunk_length: int = 0,
642
677
  ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
643
678
  """
644
679
  Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
@@ -647,7 +682,7 @@ def repair_json(
647
682
  When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
648
683
  When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
649
684
  """
650
- parser = JSONParser(json_str, json_fd, logging)
685
+ parser = JSONParser(json_str, json_fd, logging, chunk_length)
651
686
  if skip_json_loads:
652
687
  parsed_json = parser.parse()
653
688
  else:
@@ -683,7 +718,10 @@ def loads(
683
718
 
684
719
 
685
720
  def load(
686
- fd: TextIO, skip_json_loads: bool = False, logging: bool = False
721
+ fd: TextIO,
722
+ skip_json_loads: bool = False,
723
+ logging: bool = False,
724
+ chunk_length: int = 0,
687
725
  ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
688
726
  """
689
727
  This function works like `json.load()` except that it will fix your JSON in the process.
@@ -691,6 +729,7 @@ def load(
691
729
  """
692
730
  return repair_json(
693
731
  json_fd=fd,
732
+ chunk_length=chunk_length,
694
733
  return_objects=True,
695
734
  skip_json_loads=skip_json_loads,
696
735
  logging=logging,
@@ -701,12 +740,18 @@ def from_file(
701
740
  filename: str,
702
741
  skip_json_loads: bool = False,
703
742
  logging: bool = False,
743
+ chunk_length: int = 0,
704
744
  ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
705
745
  """
706
746
  This function is a wrapper around `load()` so you can pass the filename as string
707
747
  """
708
748
  fd = open(filename)
709
- jsonobj = load(fd, skip_json_loads, logging)
749
+ jsonobj = load(
750
+ fd=fd,
751
+ skip_json_loads=skip_json_loads,
752
+ logging=logging,
753
+ chunk_length=chunk_length,
754
+ )
710
755
  fd.close()
711
756
 
712
757
  return jsonobj
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: json_repair
3
- Version: 0.28.2
3
+ Version: 0.28.4
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -45,6 +45,21 @@ This simple package can be used to fix an invalid json string. To know all cases
45
45
 
46
46
  Inspired by https://github.com/josdejong/jsonrepair
47
47
 
48
+ ---
49
+ # How to cite
50
+ If you are using this library in your academic work (as I know many folks are) please find the BibTeX here:
51
+
52
+ @software{Baccianella_JSON_Repair_-_2024,
53
+ author = {Baccianella, Stefano},
54
+ month = aug,
55
+ title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
56
+ url = {https://github.com/mangiucugna/json_repair},
57
+ version = {0.28.3},
58
+ year = {2024}
59
+ }
60
+
61
+ Thank you for citing my work and please send me a link to the paper if you can!
62
+
48
63
  ---
49
64
  # Offer me a beer
50
65
  If you find this library useful, you can help me by donating toward my monthly beer budget here: https://github.com/sponsors/mangiucugna
@@ -82,6 +97,18 @@ or just
82
97
 
83
98
  decoded_object = json_repair.repair_json(json_string, return_objects=True)
84
99
 
100
+ ### Avoid this antipattern
101
+ Some users of this library adopt the following pattern:
102
+
103
+ obj = {}
104
+ try:
105
+ obj = json.loads(string)
106
+ except json.JSONDecodeError as e:
107
+ obj = json_repair.loads(string)
108
+ ...
109
+
110
+ This is wasteful because `json_repair` already verifies for you whether the JSON is valid. If you still want to keep this pattern, add `skip_json_loads=True` to the call, as explained in the section below.
111
+
85
112
  ### Read json from a file or file descriptor
86
113
 
87
114
  JSON repair provides also a drop-in replacement for `json.load()`:
@@ -122,6 +149,7 @@ Some rules of thumb to use:
122
149
  - Setting `return_objects=True` will always be faster because the parser returns an object already and it doesn't have to serialize that object to JSON
123
150
  - `skip_json_loads` is faster only if you 100% know that the string is not a valid JSON
124
151
  - If you are having issues with escaping pass the string as **raw** string like: `r"string with escaping\""`
152
+
125
153
  ## Adding to requirements
126
154
  **Please pin this library only on the major version!**
127
155
 
@@ -0,0 +1,8 @@
1
+ json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
2
+ json_repair/json_repair.py,sha256=odtRiFJ-u8mbdw_3Djx4jADxGoBeQvot3536D6Y6K0c,32266
3
+ json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ json_repair-0.28.4.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
5
+ json_repair-0.28.4.dist-info/METADATA,sha256=2JB2TM0mrFC7OejTtgFrpyr2qQOw8xwX0KmLTSzephk,9019
6
+ json_repair-0.28.4.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
7
+ json_repair-0.28.4.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
8
+ json_repair-0.28.4.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (72.2.0)
2
+ Generator: setuptools (74.0.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,8 +0,0 @@
1
- json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
2
- json_repair/json_repair.py,sha256=v43na-l2g34pwTZH5FDljI_r5ArIaZfCeHW_LbB8puw,30123
3
- json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- json_repair-0.28.2.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
5
- json_repair-0.28.2.dist-info/METADATA,sha256=llPJ1A8UePeGKbPSkC6-b2kE2somdgSMiq-wuPANGZ8,8043
6
- json_repair-0.28.2.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
7
- json_repair-0.28.2.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
8
- json_repair-0.28.2.dist-info/RECORD,,