json-repair 0.28.2__py3-none-any.whl → 0.28.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,19 +29,52 @@ from typing import Any, Dict, List, Optional, Union, TextIO, Tuple, Literal
29
29
 
30
30
  class StringFileWrapper:
31
31
  # This is a trick to simplify the code, transform the filedescriptor handling into a string handling
32
- def __init__(self, fd: TextIO) -> None:
32
+ def __init__(self, fd: TextIO, CHUNK_LENGTH: int) -> None:
33
33
  self.fd = fd
34
34
  self.length: int = 0
35
-
36
- def __getitem__(self, index: int | slice) -> str:
35
+ # Buffers are 1MB strings that are read from the file
36
+ # and kept in memory to keep reads low
37
+ self.buffers: dict[int, str] = {}
38
+ # CHUNK_LENGTH is in bytes
39
+ if not CHUNK_LENGTH or CHUNK_LENGTH < 2:
40
+ CHUNK_LENGTH = 1_000_000
41
+ self.buffer_length = CHUNK_LENGTH
42
+
43
+ def fill_buffer(self, index: int) -> None:
44
+ if self.buffers.get(index) is None:
45
+ self.fd.seek(index * self.buffer_length)
46
+ self.buffers[index] = self.fd.read(self.buffer_length)
47
+ # Save memory: keep at most about 2MB of buffered chunks, but always allow at least 2 chunks
48
+ if len(self.buffers) > max(2, 2_000_000 / self.buffer_length):
49
+ oldest_key = next(iter(self.buffers))
50
+ self.buffers.pop(oldest_key)
51
+
52
+ def __getitem__(self, index: Union[int, slice]) -> str:
53
+ # The buffers are addressed like RAM, by row and column:
54
+ # self.buffers[index]: the row, a chunk of length CHUNK_LENGTH (1MB by default); index is `i` // CHUNK_LENGTH
55
+ # self.buffers[index][j]: the column within that row; j is `i` % CHUNK_LENGTH
37
56
  if isinstance(index, slice):
38
- self.fd.seek(index.start)
39
- value = self.fd.read(index.stop - index.start)
40
- self.fd.seek(index.start)
41
- return value
57
+ buffer_index = index.start // self.buffer_length
58
+ buffer_end = index.stop // self.buffer_length
59
+ for i in range(buffer_index, buffer_end + 1):
60
+ self.fill_buffer(i)
61
+ if buffer_index == buffer_end:
62
+ return self.buffers[buffer_index][
63
+ index.start % self.buffer_length : index.stop % self.buffer_length
64
+ ]
65
+ else:
66
+ start_slice = self.buffers[buffer_index][
67
+ index.start % self.buffer_length :
68
+ ]
69
+ end_slice = self.buffers[buffer_end][: index.stop % self.buffer_length]
70
+ middle_slices = [
71
+ self.buffers[i] for i in range(buffer_index + 1, buffer_end)
72
+ ]
73
+ return start_slice + "".join(middle_slices) + end_slice
42
74
  else:
43
- self.fd.seek(index)
44
- return self.fd.read(1)
75
+ buffer_index = index // self.buffer_length
76
+ self.fill_buffer(buffer_index)
77
+ return self.buffers[buffer_index][index % self.buffer_length]
45
78
 
46
79
  def __len__(self) -> int:
47
80
  if self.length < 1:
@@ -69,13 +102,14 @@ class JSONParser:
69
102
  json_str: Union[str, StringFileWrapper],
70
103
  json_fd: Optional[TextIO],
71
104
  logging: Optional[bool],
105
+ json_fd_chunk_length: int = 0,
72
106
  ) -> None:
73
107
  # The string to parse
74
108
  self.json_str = json_str
75
109
  # Alternatively, the file descriptor with a json file in it
76
110
  if json_fd:
77
111
  # This is a trick we do to treat the file wrapper as an array
78
- self.json_str = StringFileWrapper(json_fd)
112
+ self.json_str = StringFileWrapper(json_fd, json_fd_chunk_length)
79
113
  # Index is our iterator that will keep track of which character we are looking at right now
80
114
  self.index: int = 0
81
115
  # This is used in the object member parsing to manage the special cases of missing quotes in key or value
@@ -639,6 +673,7 @@ def repair_json(
639
673
  logging: bool = False,
640
674
  json_fd: Optional[TextIO] = None,
641
675
  ensure_ascii: bool = True,
676
+ chunk_length: int = 0,
642
677
  ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
643
678
  """
644
679
  Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
@@ -647,7 +682,7 @@ def repair_json(
647
682
  When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
648
683
  When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
649
684
  """
650
- parser = JSONParser(json_str, json_fd, logging)
685
+ parser = JSONParser(json_str, json_fd, logging, chunk_length)
651
686
  if skip_json_loads:
652
687
  parsed_json = parser.parse()
653
688
  else:
@@ -683,7 +718,10 @@ def loads(
683
718
 
684
719
 
685
720
  def load(
686
- fd: TextIO, skip_json_loads: bool = False, logging: bool = False
721
+ fd: TextIO,
722
+ skip_json_loads: bool = False,
723
+ logging: bool = False,
724
+ chunk_length: int = 0,
687
725
  ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
688
726
  """
689
727
  This function works like `json.load()` except that it will fix your JSON in the process.
@@ -691,6 +729,7 @@ def load(
691
729
  """
692
730
  return repair_json(
693
731
  json_fd=fd,
732
+ chunk_length=chunk_length,
694
733
  return_objects=True,
695
734
  skip_json_loads=skip_json_loads,
696
735
  logging=logging,
@@ -701,12 +740,18 @@ def from_file(
701
740
  filename: str,
702
741
  skip_json_loads: bool = False,
703
742
  logging: bool = False,
743
+ chunk_length: int = 0,
704
744
  ) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
705
745
  """
706
746
  This function is a wrapper around `load()` so you can pass the filename as string
707
747
  """
708
748
  fd = open(filename)
709
- jsonobj = load(fd, skip_json_loads, logging)
749
+ jsonobj = load(
750
+ fd=fd,
751
+ skip_json_loads=skip_json_loads,
752
+ logging=logging,
753
+ chunk_length=chunk_length,
754
+ )
710
755
  fd.close()
711
756
 
712
757
  return jsonobj
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: json_repair
3
- Version: 0.28.2
3
+ Version: 0.28.4
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -45,6 +45,21 @@ This simple package can be used to fix an invalid json string. To know all cases
45
45
 
46
46
  Inspired by https://github.com/josdejong/jsonrepair
47
47
 
48
+ ---
49
+ # How to cite
50
+ If you are using this library in your academic work (as I know many folks are) please find the BibTeX here:
51
+
52
+ @software{Baccianella_JSON_Repair_-_2024,
53
+ author = {Baccianella, Stefano},
54
+ month = aug,
55
+ title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
56
+ url = {https://github.com/mangiucugna/json_repair},
57
+ version = {0.28.3},
58
+ year = {2024}
59
+ }
60
+
61
+ Thank you for citing my work and please send me a link to the paper if you can!
62
+
48
63
  ---
49
64
  # Offer me a beer
50
65
  If you find this library useful, you can help me by donating toward my monthly beer budget here: https://github.com/sponsors/mangiucugna
@@ -82,6 +97,18 @@ or just
82
97
 
83
98
  decoded_object = json_repair.repair_json(json_string, return_objects=True)
84
99
 
100
+ ### Avoid this antipattern
101
+ Some users of this library adopt the following pattern:
102
+
103
+ obj = {}
104
+ try:
105
+ obj = json.loads(string)
106
+ except json.JSONDecodeError as e:
107
+ obj = json_repair.loads(string)
108
+ ...
109
+
110
+ This is wasteful because `json_repair` will already verify for you if the JSON is valid. If you still want to do that, then add `skip_json_loads=True` to the call as explained in the section below.
111
+
85
112
  ### Read json from a file or file descriptor
86
113
 
87
114
  JSON repair also provides a drop-in replacement for `json.load()`:
@@ -122,6 +149,7 @@ Some rules of thumb to use:
122
149
  - Setting `return_objects=True` will always be faster because the parser returns an object already and it doesn't have to serialize that object to JSON
123
150
  - `skip_json_loads` is faster only if you 100% know that the string is not a valid JSON
124
151
  - If you are having issues with escaping, pass the string as a **raw** string like: `r"string with escaping\""`
152
+
125
153
  ## Adding to requirements
126
154
  **Please pin this library only on the major version!**
127
155
 
@@ -0,0 +1,8 @@
1
+ json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
2
+ json_repair/json_repair.py,sha256=odtRiFJ-u8mbdw_3Djx4jADxGoBeQvot3536D6Y6K0c,32266
3
+ json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
+ json_repair-0.28.4.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
5
+ json_repair-0.28.4.dist-info/METADATA,sha256=2JB2TM0mrFC7OejTtgFrpyr2qQOw8xwX0KmLTSzephk,9019
6
+ json_repair-0.28.4.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
7
+ json_repair-0.28.4.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
8
+ json_repair-0.28.4.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (72.2.0)
2
+ Generator: setuptools (74.0.0)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,8 +0,0 @@
1
- json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
2
- json_repair/json_repair.py,sha256=v43na-l2g34pwTZH5FDljI_r5ArIaZfCeHW_LbB8puw,30123
3
- json_repair/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
4
- json_repair-0.28.2.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
5
- json_repair-0.28.2.dist-info/METADATA,sha256=llPJ1A8UePeGKbPSkC6-b2kE2somdgSMiq-wuPANGZ8,8043
6
- json_repair-0.28.2.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
7
- json_repair-0.28.2.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
8
- json_repair-0.28.2.dist-info/RECORD,,