json-repair 0.39.0__tar.gz → 0.40.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (20)
  1. {json_repair-0.39.0/src/json_repair.egg-info → json_repair-0.40.0}/PKG-INFO +10 -10
  2. {json_repair-0.39.0 → json_repair-0.40.0}/README.md +9 -9
  3. {json_repair-0.39.0 → json_repair-0.40.0}/pyproject.toml +1 -1
  4. {json_repair-0.39.0 → json_repair-0.40.0}/src/json_repair/json_parser.py +74 -81
  5. {json_repair-0.39.0 → json_repair-0.40.0/src/json_repair.egg-info}/PKG-INFO +10 -10
  6. {json_repair-0.39.0 → json_repair-0.40.0}/tests/test_json_repair.py +6 -3
  7. {json_repair-0.39.0 → json_repair-0.40.0}/LICENSE +0 -0
  8. {json_repair-0.39.0 → json_repair-0.40.0}/setup.cfg +0 -0
  9. {json_repair-0.39.0 → json_repair-0.40.0}/src/json_repair/__init__.py +0 -0
  10. {json_repair-0.39.0 → json_repair-0.40.0}/src/json_repair/__main__.py +0 -0
  11. {json_repair-0.39.0 → json_repair-0.40.0}/src/json_repair/json_context.py +0 -0
  12. {json_repair-0.39.0 → json_repair-0.40.0}/src/json_repair/json_repair.py +0 -0
  13. {json_repair-0.39.0 → json_repair-0.40.0}/src/json_repair/py.typed +0 -0
  14. {json_repair-0.39.0 → json_repair-0.40.0}/src/json_repair/string_file_wrapper.py +0 -0
  15. {json_repair-0.39.0 → json_repair-0.40.0}/src/json_repair.egg-info/SOURCES.txt +0 -0
  16. {json_repair-0.39.0 → json_repair-0.40.0}/src/json_repair.egg-info/dependency_links.txt +0 -0
  17. {json_repair-0.39.0 → json_repair-0.40.0}/src/json_repair.egg-info/entry_points.txt +0 -0
  18. {json_repair-0.39.0 → json_repair-0.40.0}/src/json_repair.egg-info/top_level.txt +0 -0
  19. {json_repair-0.39.0 → json_repair-0.40.0}/tests/test_coverage.py +0 -0
  20. {json_repair-0.39.0 → json_repair-0.40.0}/tests/test_performance.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: json_repair
3
- Version: 0.39.0
3
+ Version: 0.40.0
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -196,12 +196,12 @@ pipx install json-repair
196
196
  to know all options available:
197
197
  ```
198
198
  $ json_repair -h
199
- usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] filename
199
+ usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] [filename]
200
200
 
201
201
  Repair and parse JSON files.
202
202
 
203
203
  positional arguments:
204
- filename The JSON file to repair
204
+ filename The JSON file to repair (if omitted, reads from stdin)
205
205
 
206
206
  options:
207
207
  -h, --help show this help message and exit
@@ -226,13 +226,13 @@ In this example, any version that starts with `0.` will be acceptable, allowing
226
226
  # How to cite
227
227
  If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
228
228
 
229
- @software{Baccianella_JSON_Repair_-_2024,
230
- author = {Baccianella, Stefano},
231
- month = aug,
232
- title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
233
- url = {https://github.com/mangiucugna/json_repair},
234
- version = {0.28.3},
235
- year = {2024}
229
+ @software{Baccianella_JSON_Repair_-_2025,
230
+ author = "Stefano {Baccianella}",
231
+ month = "feb",
232
+ title = "JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs",
233
+ url = "https://github.com/mangiucugna/json_repair",
234
+ version = "0.39.1",
235
+ year = 2025
236
236
  }
237
237
 
238
238
  Thank you for citing my work and please send me a link to the paper if you can!
@@ -158,12 +158,12 @@ pipx install json-repair
158
158
  to know all options available:
159
159
  ```
160
160
  $ json_repair -h
161
- usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] filename
161
+ usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] [filename]
162
162
 
163
163
  Repair and parse JSON files.
164
164
 
165
165
  positional arguments:
166
- filename The JSON file to repair
166
+ filename The JSON file to repair (if omitted, reads from stdin)
167
167
 
168
168
  options:
169
169
  -h, --help show this help message and exit
@@ -188,13 +188,13 @@ In this example, any version that starts with `0.` will be acceptable, allowing
188
188
  # How to cite
189
189
  If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
190
190
 
191
- @software{Baccianella_JSON_Repair_-_2024,
192
- author = {Baccianella, Stefano},
193
- month = aug,
194
- title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
195
- url = {https://github.com/mangiucugna/json_repair},
196
- version = {0.28.3},
197
- year = {2024}
191
+ @software{Baccianella_JSON_Repair_-_2025,
192
+ author = "Stefano {Baccianella}",
193
+ month = "feb",
194
+ title = "JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs",
195
+ url = "https://github.com/mangiucugna/json_repair",
196
+ version = "0.39.1",
197
+ year = 2025
198
198
  }
199
199
 
200
200
  Thank you for citing my work and please send me a link to the paper if you can!
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
3
3
  build-backend = "setuptools.build_meta"
4
4
  [project]
5
5
  name = "json_repair"
6
- version = "0.39.0"
6
+ version = "0.40.0"
7
7
  license = {file = "LICENSE"}
8
8
  authors = [
9
9
  { name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },
@@ -9,6 +9,7 @@ JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
9
9
  class JSONParser:
10
10
  # Constants
11
11
  STRING_DELIMITERS = ['"', "'", "“", "”"]
12
+ NUMBER_CHARS = set("0123456789-.eE/,")
12
13
 
13
14
  def __init__(
14
15
  self,
@@ -129,8 +130,6 @@ class JSONParser:
129
130
  # Context is used in the string parser to manage the lack of quotes
130
131
  self.context.set(ContextValues.OBJECT_KEY)
131
132
 
132
- self.skip_whitespaces_at()
133
-
134
133
  # Save this index in case we need find a duplicate key
135
134
  rollback_index = self.index
136
135
 
@@ -219,18 +218,13 @@ class JSONParser:
219
218
  char = self.get_char_at()
220
219
 
221
220
  # Especially at the end of an LLM generated json you might miss the last "]"
222
- char = self.get_char_at()
223
221
  if char and char != "]":
224
222
  self.log(
225
- "While parsing an array we missed the closing ], adding it back",
226
- )
227
- self.index -= 1
228
- # Add the missing closing bracket
229
- self.json_str = (
230
- self.json_str[: self.index + 1] + "]" + self.json_str[self.index + 1 :]
223
+ "While parsing an array we missed the closing ], ignoring it",
231
224
  )
232
225
 
233
226
  self.index += 1
227
+
234
228
  self.context.reset()
235
229
  return arr
236
230
 
@@ -275,15 +269,11 @@ class JSONParser:
275
269
  self.log(
276
270
  "While parsing a string, we found a literal instead of a quote",
277
271
  )
278
- self.log(
279
- "While parsing a string, we found no starting quote. Will add the quote back",
280
- )
281
272
  missing_quotes = True
282
273
 
283
274
  if not missing_quotes:
284
275
  self.index += 1
285
276
 
286
- self.skip_whitespaces_at()
287
277
  # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
288
278
  if self.get_char_at() in self.STRING_DELIMITERS:
289
279
  # If the next character is the same type of quote, then we manage it as double quotes
@@ -583,6 +573,13 @@ class JSONParser:
583
573
  elif (
584
574
  next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\"
585
575
  ):
576
+ # Check if self.index:self.index+i is only whitespaces, break if that's the case
577
+ if all(
578
+ str(self.get_char_at(j)).isspace()
579
+ for j in range(1, i)
580
+ if self.get_char_at(j)
581
+ ):
582
+ break
586
583
  if self.context.current == ContextValues.OBJECT_VALUE:
587
584
  # But this might not be it! This could be just a missing comma
588
585
  # We found a delimiter and we need to check if this is a key
@@ -610,27 +607,24 @@ class JSONParser:
610
607
  self.index += 1
611
608
  char = self.get_char_at()
612
609
  elif self.context.current == ContextValues.ARRAY:
613
- # In array context this could be something like "lorem "ipsum" sic"
614
- # So let's check if we find a rstring_delimiter forward otherwise end early
615
- i = self.skip_to_character(rstring_delimiter, idx=i + 1)
616
- next_c = self.get_char_at(i)
617
- if next_c and next_c == rstring_delimiter:
618
- # Ok now if I find a comma or a closing ], that can be have also an optional rstring_delimiter before them
619
- # We can consider this a misplaced quote
620
- i += 1
621
- i = self.skip_whitespaces_at(
622
- idx=i, move_main_index=False
623
- )
624
- next_c = self.get_char_at(i)
625
- if next_c and next_c in [",", "]"]:
626
- self.log(
627
- "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
628
- )
629
- unmatched_delimiter = not unmatched_delimiter
630
- string_acc += str(char)
631
- self.index += 1
632
- char = self.get_char_at()
633
-
610
+ # If we got up to here it means that this is a situation like this:
611
+ # ["bla bla bla "puppy" bla bla bla "kitty" bla bla"]
612
+ # So we need to ignore this quote
613
+ self.log(
614
+ "While parsing a string in Array context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
615
+ )
616
+ unmatched_delimiter = not unmatched_delimiter
617
+ string_acc += str(char)
618
+ self.index += 1
619
+ char = self.get_char_at()
620
+ elif self.context.current == ContextValues.OBJECT_KEY:
621
+ # In this case we just ignore this and move on
622
+ self.log(
623
+ "While parsing a string in Object Key context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
624
+ )
625
+ string_acc += str(char)
626
+ self.index += 1
627
+ char = self.get_char_at()
634
628
  if (
635
629
  char
636
630
  and missing_quotes
@@ -663,10 +657,9 @@ class JSONParser:
663
657
  def parse_number(self) -> Union[float, int, str, JSONReturnType]:
664
658
  # <number> is a valid real number expressed in one of a number of given formats
665
659
  number_str = ""
666
- number_chars = set("0123456789-.eE/,")
667
660
  char = self.get_char_at()
668
661
  is_array = self.context.current == ContextValues.ARRAY
669
- while char and char in number_chars and (char != "," or not is_array):
662
+ while char and char in self.NUMBER_CHARS and (not is_array or char != ","):
670
663
  number_str += char
671
664
  self.index += 1
672
665
  char = self.get_char_at()
@@ -712,51 +705,6 @@ class JSONParser:
712
705
  self.index = starting_index
713
706
  return ""
714
707
 
715
- def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
716
- # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
717
- try:
718
- return self.json_str[self.index + count]
719
- except IndexError:
720
- return False
721
-
722
- def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
723
- """
724
- This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
725
- """
726
- try:
727
- char = self.json_str[self.index + idx]
728
- except IndexError:
729
- return idx
730
- while char.isspace():
731
- if move_main_index:
732
- self.index += 1
733
- else:
734
- idx += 1
735
- try:
736
- char = self.json_str[self.index + idx]
737
- except IndexError:
738
- return idx
739
- return idx
740
-
741
- def skip_to_character(self, character: str, idx: int = 0) -> int:
742
- """
743
- This function quickly iterates to find a character, syntactic sugar to make the code more concise
744
- """
745
- try:
746
- char = self.json_str[self.index + idx]
747
- except IndexError:
748
- return idx
749
- while char != character:
750
- idx += 1
751
- try:
752
- char = self.json_str[self.index + idx]
753
- except IndexError:
754
- return idx
755
- if self.index + idx > 0 and self.json_str[self.index + idx - 1] == "\\":
756
- # Ah this is an escaped character, try again
757
- return self.skip_to_character(character=character, idx=idx + 1)
758
- return idx
759
-
760
708
  def parse_comment(self) -> str:
761
709
  """
762
710
  Parse code-like comments:
@@ -827,6 +775,51 @@ class JSONParser:
827
775
  self.index += 1
828
776
  return ""
829
777
 
778
+ def get_char_at(self, count: int = 0) -> Union[str, Literal[False]]:
779
+ # Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
780
+ try:
781
+ return self.json_str[self.index + count]
782
+ except IndexError:
783
+ return False
784
+
785
+ def skip_whitespaces_at(self, idx: int = 0, move_main_index=True) -> int:
786
+ """
787
+ This function quickly iterates on whitespaces, syntactic sugar to make the code more concise
788
+ """
789
+ try:
790
+ char = self.json_str[self.index + idx]
791
+ except IndexError:
792
+ return idx
793
+ while char.isspace():
794
+ if move_main_index:
795
+ self.index += 1
796
+ else:
797
+ idx += 1
798
+ try:
799
+ char = self.json_str[self.index + idx]
800
+ except IndexError:
801
+ return idx
802
+ return idx
803
+
804
+ def skip_to_character(self, character: str, idx: int = 0) -> int:
805
+ """
806
+ This function quickly iterates to find a character, syntactic sugar to make the code more concise
807
+ """
808
+ try:
809
+ char = self.json_str[self.index + idx]
810
+ except IndexError:
811
+ return idx
812
+ while char != character:
813
+ idx += 1
814
+ try:
815
+ char = self.json_str[self.index + idx]
816
+ except IndexError:
817
+ return idx
818
+ if self.index + idx > 0 and self.json_str[self.index + idx - 1] == "\\":
819
+ # Ah this is an escaped character, try again
820
+ return self.skip_to_character(character=character, idx=idx + 1)
821
+ return idx
822
+
830
823
  def _log(self, text: str) -> None:
831
824
  window: int = 10
832
825
  start: int = max(self.index - window, 0)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: json_repair
3
- Version: 0.39.0
3
+ Version: 0.40.0
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -196,12 +196,12 @@ pipx install json-repair
196
196
  to know all options available:
197
197
  ```
198
198
  $ json_repair -h
199
- usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] filename
199
+ usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] [filename]
200
200
 
201
201
  Repair and parse JSON files.
202
202
 
203
203
  positional arguments:
204
- filename The JSON file to repair
204
+ filename The JSON file to repair (if omitted, reads from stdin)
205
205
 
206
206
  options:
207
207
  -h, --help show this help message and exit
@@ -226,13 +226,13 @@ In this example, any version that starts with `0.` will be acceptable, allowing
226
226
  # How to cite
227
227
  If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
228
228
 
229
- @software{Baccianella_JSON_Repair_-_2024,
230
- author = {Baccianella, Stefano},
231
- month = aug,
232
- title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
233
- url = {https://github.com/mangiucugna/json_repair},
234
- version = {0.28.3},
235
- year = {2024}
229
+ @software{Baccianella_JSON_Repair_-_2025,
230
+ author = "Stefano {Baccianella}",
231
+ month = "feb",
232
+ title = "JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs",
233
+ url = "https://github.com/mangiucugna/json_repair",
234
+ version = "0.39.1",
235
+ year = 2025
236
236
  }
237
237
 
238
238
  Thank you for citing my work and please send me a link to the paper if you can!
@@ -126,6 +126,9 @@ def test_array_edge_cases():
126
126
  assert repair_json('["lorem "ipsum" sic"]') == '["lorem \\"ipsum\\" sic"]'
127
127
  assert repair_json('{"key1": ["value1", "value2"}, "key2": ["value3", "value4"]}') == '{"key1": ["value1", "value2"], "key2": ["value3", "value4"]}'
128
128
  assert repair_json('[ "value", /* comment */ "value2" ]') == '["value", "value2"]'
129
+ assert repair_json('{"key": ["value" "value1" "value2"]}') == '{"key": ["value", "value1", "value2"]}'
130
+ assert repair_json('{"key": ["lorem "ipsum" dolor "sit" amet, "consectetur" ", "lorem "ipsum" dolor", "lorem"]}') == '{"key": ["lorem \\"ipsum\\" dolor \\"sit\\" amet, \\"consectetur\\" ", "lorem \\"ipsum\\" dolor", "lorem"]}'
131
+ assert repair_json('{"k"e"y": "value"}') == '{"k\\"e\\"y": "value"}'
129
132
 
130
133
  def test_escaping():
131
134
  assert repair_json("'\"'") == '""'
@@ -150,7 +153,7 @@ def test_object_edge_cases():
150
153
  assert repair_json('{"lorem": ipsum, sic, datum.",}') == '{"lorem": "ipsum, sic, datum."}'
151
154
  assert repair_json('{"lorem": sic tamet. "ipsum": sic tamet, quick brown fox. "sic": ipsum}') == '{"lorem": "sic tamet.", "ipsum": "sic tamet", "sic": "ipsum"}'
152
155
  assert repair_json('{"lorem_ipsum": "sic tamet, quick brown fox. }') == '{"lorem_ipsum": "sic tamet, quick brown fox."}'
153
- assert repair_json('{"key":value, " key2":"value2" }') == '{"key": "value", "key2": "value2"}'
156
+ assert repair_json('{"key":value, " key2":"value2" }') == '{"key": "value", " key2": "value2"}'
154
157
  assert repair_json('{"key":value "key2":"value2" }') == '{"key": "value", "key2": "value2"}'
155
158
  assert repair_json("{'text': 'words{words in brackets}more words'}") == '{"text": "words{words in brackets}more words"}'
156
159
  assert repair_json('{text:words{words in brackets}}') == '{"text": "words{words in brackets}"}'
@@ -265,8 +268,8 @@ def test_repair_json_from_file():
265
268
  # Write content to the temporary file
266
269
  with os.fdopen(temp_fd, 'w') as tmp:
267
270
  tmp.write("{key:value}")
268
- assert from_file(filename=temp_path, logging=True) == ({'key': 'value'}, [{'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object key context, we found a :, stopping here',}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}, {'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn\'t determine that a right delimiter was present. Stopping here'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}])
269
- assert from_file(filename=temp_path, logging=True, chunk_length=2) == ({'key': 'value'}, [{'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object key context, we found a :, stopping here',}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}, {'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn\'t determine that a right delimiter was present. Stopping here'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}])
271
+ assert from_file(filename=temp_path, logging=True) == ({'key': 'value'}, [{'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object key context, we found a :, stopping here',}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}, {'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn\'t determine that a right delimiter was present. Stopping here'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}])
272
+ assert from_file(filename=temp_path, logging=True, chunk_length=2) == ({'key': 'value'}, [{'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object key context, we found a :, stopping here',}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}, {'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn\'t determine that a right delimiter was present. Stopping here'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}])
270
273
  finally:
271
274
  # Clean up - delete the temporary file
272
275
  os.remove(temp_path)
File without changes
File without changes