json-repair 0.38.0__tar.gz → 0.39.1__tar.gz

Sign up to get free protection for your applications and to get access to all the features.
Files changed (20)
  1. {json_repair-0.38.0/src/json_repair.egg-info → json_repair-0.39.1}/PKG-INFO +7 -7
  2. {json_repair-0.38.0 → json_repair-0.39.1}/README.md +6 -6
  3. {json_repair-0.38.0 → json_repair-0.39.1}/pyproject.toml +1 -1
  4. {json_repair-0.38.0 → json_repair-0.39.1}/src/json_repair/json_parser.py +21 -35
  5. {json_repair-0.38.0 → json_repair-0.39.1}/src/json_repair/json_repair.py +20 -7
  6. {json_repair-0.38.0 → json_repair-0.39.1/src/json_repair.egg-info}/PKG-INFO +7 -7
  7. {json_repair-0.38.0 → json_repair-0.39.1}/tests/test_json_repair.py +17 -4
  8. {json_repair-0.38.0 → json_repair-0.39.1}/LICENSE +0 -0
  9. {json_repair-0.38.0 → json_repair-0.39.1}/setup.cfg +0 -0
  10. {json_repair-0.38.0 → json_repair-0.39.1}/src/json_repair/__init__.py +0 -0
  11. {json_repair-0.38.0 → json_repair-0.39.1}/src/json_repair/__main__.py +0 -0
  12. {json_repair-0.38.0 → json_repair-0.39.1}/src/json_repair/json_context.py +0 -0
  13. {json_repair-0.38.0 → json_repair-0.39.1}/src/json_repair/py.typed +0 -0
  14. {json_repair-0.38.0 → json_repair-0.39.1}/src/json_repair/string_file_wrapper.py +0 -0
  15. {json_repair-0.38.0 → json_repair-0.39.1}/src/json_repair.egg-info/SOURCES.txt +0 -0
  16. {json_repair-0.38.0 → json_repair-0.39.1}/src/json_repair.egg-info/dependency_links.txt +0 -0
  17. {json_repair-0.38.0 → json_repair-0.39.1}/src/json_repair.egg-info/entry_points.txt +0 -0
  18. {json_repair-0.38.0 → json_repair-0.39.1}/src/json_repair.egg-info/top_level.txt +0 -0
  19. {json_repair-0.38.0 → json_repair-0.39.1}/tests/test_coverage.py +0 -0
  20. {json_repair-0.38.0 → json_repair-0.39.1}/tests/test_performance.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: json_repair
3
- Version: 0.38.0
3
+ Version: 0.39.1
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -196,12 +196,12 @@ pipx install json-repair
196
196
  to know all options available:
197
197
  ```
198
198
  $ json_repair -h
199
- usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] filename
199
+ usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] [filename]
200
200
 
201
201
  Repair and parse JSON files.
202
202
 
203
203
  positional arguments:
204
- filename The JSON file to repair
204
+ filename The JSON file to repair (if omitted, reads from stdin)
205
205
 
206
206
  options:
207
207
  -h, --help show this help message and exit
@@ -226,13 +226,13 @@ In this example, any version that starts with `0.` will be acceptable, allowing
226
226
  # How to cite
227
227
  If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
228
228
 
229
- @software{Baccianella_JSON_Repair_-_2024,
229
+ @software{Baccianella_JSON_Repair_-_2025,
230
230
  author = {Baccianella, Stefano},
231
- month = aug,
231
+ month = feb,
232
232
  title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
233
233
  url = {https://github.com/mangiucugna/json_repair},
234
- version = {0.28.3},
235
- year = {2024}
234
+ version = {0.39.0},
235
+ year = {2025}
236
236
  }
237
237
 
238
238
  Thank you for citing my work and please send me a link to the paper if you can!
@@ -158,12 +158,12 @@ pipx install json-repair
158
158
  to know all options available:
159
159
  ```
160
160
  $ json_repair -h
161
- usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] filename
161
+ usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] [filename]
162
162
 
163
163
  Repair and parse JSON files.
164
164
 
165
165
  positional arguments:
166
- filename The JSON file to repair
166
+ filename The JSON file to repair (if omitted, reads from stdin)
167
167
 
168
168
  options:
169
169
  -h, --help show this help message and exit
@@ -188,13 +188,13 @@ In this example, any version that starts with `0.` will be acceptable, allowing
188
188
  # How to cite
189
189
  If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
190
190
 
191
- @software{Baccianella_JSON_Repair_-_2024,
191
+ @software{Baccianella_JSON_Repair_-_2025,
192
192
  author = {Baccianella, Stefano},
193
- month = aug,
193
+ month = feb,
194
194
  title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
195
195
  url = {https://github.com/mangiucugna/json_repair},
196
- version = {0.28.3},
197
- year = {2024}
196
+ version = {0.39.0},
197
+ year = {2025}
198
198
  }
199
199
 
200
200
  Thank you for citing my work and please send me a link to the paper if you can!
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
3
3
  build-backend = "setuptools.build_meta"
4
4
  [project]
5
5
  name = "json_repair"
6
- version = "0.38.0"
6
+ version = "0.39.1"
7
7
  license = {file = "LICENSE"}
8
8
  authors = [
9
9
  { name="Stefano Baccianella", email="4247706+mangiucugna@users.noreply.github.com" },
@@ -9,6 +9,7 @@ JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
9
9
  class JSONParser:
10
10
  # Constants
11
11
  STRING_DELIMITERS = ['"', "'", "“", "”"]
12
+ NUMBER_CHARS = set("0123456789-.eE/,")
12
13
 
13
14
  def __init__(
14
15
  self,
@@ -129,8 +130,6 @@ class JSONParser:
129
130
  # Context is used in the string parser to manage the lack of quotes
130
131
  self.context.set(ContextValues.OBJECT_KEY)
131
132
 
132
- self.skip_whitespaces_at()
133
-
134
133
  # Save this index in case we need find a duplicate key
135
134
  rollback_index = self.index
136
135
 
@@ -219,18 +218,13 @@ class JSONParser:
219
218
  char = self.get_char_at()
220
219
 
221
220
  # Especially at the end of an LLM generated json you might miss the last "]"
222
- char = self.get_char_at()
223
221
  if char and char != "]":
224
222
  self.log(
225
- "While parsing an array we missed the closing ], adding it back",
226
- )
227
- self.index -= 1
228
- # Add the missing closing bracket
229
- self.json_str = (
230
- self.json_str[: self.index + 1] + "]" + self.json_str[self.index + 1 :]
223
+ "While parsing an array we missed the closing ], ignoring it",
231
224
  )
232
225
 
233
226
  self.index += 1
227
+
234
228
  self.context.reset()
235
229
  return arr
236
230
 
@@ -275,15 +269,11 @@ class JSONParser:
275
269
  self.log(
276
270
  "While parsing a string, we found a literal instead of a quote",
277
271
  )
278
- self.log(
279
- "While parsing a string, we found no starting quote. Will add the quote back",
280
- )
281
272
  missing_quotes = True
282
273
 
283
274
  if not missing_quotes:
284
275
  self.index += 1
285
276
 
286
- self.skip_whitespaces_at()
287
277
  # There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
288
278
  if self.get_char_at() in self.STRING_DELIMITERS:
289
279
  # If the next character is the same type of quote, then we manage it as double quotes
@@ -583,6 +573,13 @@ class JSONParser:
583
573
  elif (
584
574
  next_c == rstring_delimiter and self.get_char_at(i - 1) != "\\"
585
575
  ):
576
+ # Check if self.index:self.index+i is only whitespaces, break if that's the case
577
+ if all(
578
+ str(self.get_char_at(j)).isspace()
579
+ for j in range(1, i)
580
+ if self.get_char_at(j)
581
+ ):
582
+ break
586
583
  if self.context.current == ContextValues.OBJECT_VALUE:
587
584
  # But this might not be it! This could be just a missing comma
588
585
  # We found a delimiter and we need to check if this is a key
@@ -610,26 +607,16 @@ class JSONParser:
610
607
  self.index += 1
611
608
  char = self.get_char_at()
612
609
  elif self.context.current == ContextValues.ARRAY:
613
- # In array context this could be something like "lorem "ipsum" sic"
614
- # So let's check if we find a rstring_delimiter forward otherwise end early
615
- i = self.skip_to_character(rstring_delimiter, idx=i + 1)
616
- next_c = self.get_char_at(i)
617
- if next_c and next_c == rstring_delimiter:
618
- # Ok now if I find a comma or a closing ], that can be have also an optional rstring_delimiter before them
619
- # We can consider this a misplaced quote
620
- i += 1
621
- i = self.skip_whitespaces_at(
622
- idx=i, move_main_index=False
623
- )
624
- next_c = self.get_char_at(i)
625
- if next_c and next_c in [",", "]"]:
626
- self.log(
627
- "While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
628
- )
629
- unmatched_delimiter = not unmatched_delimiter
630
- string_acc += str(char)
631
- self.index += 1
632
- char = self.get_char_at()
610
+ # If we got up to here it means that this is a situation like this:
611
+ # ["bla bla bla "puppy" bla bla bla "kitty" bla bla"]
612
+ # So we need to ignore this quote
613
+ self.log(
614
+ "While parsing a string in Array context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it",
615
+ )
616
+ unmatched_delimiter = not unmatched_delimiter
617
+ string_acc += str(char)
618
+ self.index += 1
619
+ char = self.get_char_at()
633
620
 
634
621
  if (
635
622
  char
@@ -663,10 +650,9 @@ class JSONParser:
663
650
  def parse_number(self) -> Union[float, int, str, JSONReturnType]:
664
651
  # <number> is a valid real number expressed in one of a number of given formats
665
652
  number_str = ""
666
- number_chars = set("0123456789-.eE/,")
667
653
  char = self.get_char_at()
668
654
  is_array = self.context.current == ContextValues.ARRAY
669
- while char and char in number_chars and (char != "," or not is_array):
655
+ while char and char in self.NUMBER_CHARS and (not is_array or char != ","):
670
656
  number_str += char
671
657
  self.index += 1
672
658
  char = self.get_char_at()
@@ -160,7 +160,7 @@ def cli(inline_args: Optional[List[str]] = None) -> int:
160
160
 
161
161
  Args:
162
162
  inline_args (Optional[List[str]]): List of command-line arguments for testing purposes. Defaults to None.
163
- - filename (str): The JSON file to repair
163
+ - filename (str): The JSON file to repair. If omitted, the JSON is read from stdin.
164
164
  - -i, --inline (bool): Replace the file inline instead of returning the output to stdout.
165
165
  - -o, --output TARGET (str): If specified, the output will be written to TARGET filename instead of stdout.
166
166
  - --ensure_ascii (bool): Pass ensure_ascii=True to json.dumps(). Will pass False otherwise.
@@ -174,9 +174,15 @@ def cli(inline_args: Optional[List[str]] = None) -> int:
174
174
 
175
175
  Example:
176
176
  >>> cli(['example.json', '--indent', '4'])
177
+ >>> cat json.txt | json_repair
177
178
  """
178
179
  parser = argparse.ArgumentParser(description="Repair and parse JSON files.")
179
- parser.add_argument("filename", help="The JSON file to repair")
180
+ # Make the filename argument optional; if omitted, we will read from stdin.
181
+ parser.add_argument(
182
+ "filename",
183
+ nargs="?",
184
+ help="The JSON file to repair (if omitted, reads from stdin)",
185
+ )
180
186
  parser.add_argument(
181
187
  "-i",
182
188
  "--inline",
@@ -204,9 +210,12 @@ def cli(inline_args: Optional[List[str]] = None) -> int:
204
210
  if inline_args is None: # pragma: no cover
205
211
  args = parser.parse_args()
206
212
  else:
207
- args = parser.parse_args(
208
- inline_args
209
- ) # This is needed so this function is testable
213
+ args = parser.parse_args(inline_args)
214
+
215
+ # Inline mode requires a filename, so error out if none was provided.
216
+ if args.inline and not args.filename: # pragma: no cover
217
+ print("Error: Inline mode requires a filename", file=sys.stderr)
218
+ sys.exit(1)
210
219
 
211
220
  if args.inline and args.output: # pragma: no cover
212
221
  print("Error: You cannot pass both --inline and --output", file=sys.stderr)
@@ -217,8 +226,12 @@ def cli(inline_args: Optional[List[str]] = None) -> int:
217
226
  ensure_ascii = True
218
227
 
219
228
  try:
220
- result = from_file(args.filename)
221
-
229
+ # Use from_file if a filename is provided; otherwise read from stdin.
230
+ if args.filename:
231
+ result = from_file(args.filename)
232
+ else:
233
+ data = sys.stdin.read()
234
+ result = loads(data)
222
235
  if args.inline or args.output:
223
236
  with open(args.output or args.filename, mode="w") as fd:
224
237
  json.dump(result, fd, indent=args.indent, ensure_ascii=ensure_ascii)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: json_repair
3
- Version: 0.38.0
3
+ Version: 0.39.1
4
4
  Summary: A package to repair broken json strings
5
5
  Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
6
6
  License: MIT License
@@ -196,12 +196,12 @@ pipx install json-repair
196
196
  to know all options available:
197
197
  ```
198
198
  $ json_repair -h
199
- usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] filename
199
+ usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT] [filename]
200
200
 
201
201
  Repair and parse JSON files.
202
202
 
203
203
  positional arguments:
204
- filename The JSON file to repair
204
+ filename The JSON file to repair (if omitted, reads from stdin)
205
205
 
206
206
  options:
207
207
  -h, --help show this help message and exit
@@ -226,13 +226,13 @@ In this example, any version that starts with `0.` will be acceptable, allowing
226
226
  # How to cite
227
227
  If you are using this library in your academic work (as I know many folks are) please find the BibTex here:
228
228
 
229
- @software{Baccianella_JSON_Repair_-_2024,
229
+ @software{Baccianella_JSON_Repair_-_2025,
230
230
  author = {Baccianella, Stefano},
231
- month = aug,
231
+ month = feb,
232
232
  title = {{JSON Repair - A python module to repair invalid JSON, commonly used to parse the output of LLMs}},
233
233
  url = {https://github.com/mangiucugna/json_repair},
234
- version = {0.28.3},
235
- year = {2024}
234
+ version = {0.39.0},
235
+ year = {2025}
236
236
  }
237
237
 
238
238
  Thank you for citing my work and please send me a link to the paper if you can!
@@ -3,6 +3,7 @@ from unittest.mock import patch
3
3
  import os.path
4
4
  import pathlib
5
5
  import tempfile
6
+ import io
6
7
 
7
8
  def test_basic_types_valid():
8
9
  assert repair_json("True", return_objects=True) == ""
@@ -125,6 +126,8 @@ def test_array_edge_cases():
125
126
  assert repair_json('["lorem "ipsum" sic"]') == '["lorem \\"ipsum\\" sic"]'
126
127
  assert repair_json('{"key1": ["value1", "value2"}, "key2": ["value3", "value4"]}') == '{"key1": ["value1", "value2"], "key2": ["value3", "value4"]}'
127
128
  assert repair_json('[ "value", /* comment */ "value2" ]') == '["value", "value2"]'
129
+ assert repair_json('{"key": ["value" "value1" "value2"]}') == '{"key": ["value", "value1", "value2"]}'
130
+ assert repair_json('{"key": ["lorem "ipsum" dolor "sit" amet, "consectetur" ", "lorem "ipsum" dolor", "lorem"]}') == '{"key": ["lorem \\"ipsum\\" dolor \\"sit\\" amet, \\"consectetur\\" ", "lorem \\"ipsum\\" dolor", "lorem"]}'
128
131
 
129
132
  def test_escaping():
130
133
  assert repair_json("'\"'") == '""'
@@ -149,7 +152,7 @@ def test_object_edge_cases():
149
152
  assert repair_json('{"lorem": ipsum, sic, datum.",}') == '{"lorem": "ipsum, sic, datum."}'
150
153
  assert repair_json('{"lorem": sic tamet. "ipsum": sic tamet, quick brown fox. "sic": ipsum}') == '{"lorem": "sic tamet.", "ipsum": "sic tamet", "sic": "ipsum"}'
151
154
  assert repair_json('{"lorem_ipsum": "sic tamet, quick brown fox. }') == '{"lorem_ipsum": "sic tamet, quick brown fox."}'
152
- assert repair_json('{"key":value, " key2":"value2" }') == '{"key": "value", "key2": "value2"}'
155
+ assert repair_json('{"key":value, " key2":"value2" }') == '{"key": "value", " key2": "value2"}'
153
156
  assert repair_json('{"key":value "key2":"value2" }') == '{"key": "value", "key2": "value2"}'
154
157
  assert repair_json("{'text': 'words{words in brackets}more words'}") == '{"text": "words{words in brackets}more words"}'
155
158
  assert repair_json('{text:words{words in brackets}}') == '{"text": "words{words in brackets}"}'
@@ -264,8 +267,8 @@ def test_repair_json_from_file():
264
267
  # Write content to the temporary file
265
268
  with os.fdopen(temp_fd, 'w') as tmp:
266
269
  tmp.write("{key:value}")
267
- assert from_file(filename=temp_path, logging=True) == ({'key': 'value'}, [{'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object key context, we found a :, stopping here',}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}, {'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn\'t determine that a right delimiter was present. Stopping here'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}])
268
- assert from_file(filename=temp_path, logging=True, chunk_length=2) == ({'key': 'value'}, [{'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object key context, we found a :, stopping here',}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}, {'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'text': 'While parsing a string, we found no starting quote. Will add the quote back', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn\'t determine that a right delimiter was present. Stopping here'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}])
270
+ assert from_file(filename=temp_path, logging=True) == ({'key': 'value'}, [{'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object key context, we found a :, stopping here',}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}, {'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn\'t determine that a right delimiter was present. Stopping here'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}])
271
+ assert from_file(filename=temp_path, logging=True, chunk_length=2) == ({'key': 'value'}, [{'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object key context, we found a :, stopping here',}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}, {'text': 'While parsing a string, we found a literal instead of a quote', 'context': '{key:value}'}, {'context': '{key:value}', 'text': 'While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn\'t determine that a right delimiter was present. Stopping here'}, {'text': 'While parsing a string, we missed the closing quote, ignoring', 'context': '{key:value}'}])
269
272
  finally:
270
273
  # Clean up - delete the temporary file
271
274
  os.remove(temp_path)
@@ -316,4 +319,14 @@ def test_cli(capsys):
316
319
  finally:
317
320
  # Clean up - delete the temporary file
318
321
  os.remove(temp_path)
319
- os.remove(tempout_path)
322
+ os.remove(tempout_path)
323
+
324
+ # Prepare a JSON string that needs to be repaired.
325
+ test_input = "{key:value"
326
+ # Expected output when running cli with --indent 0.
327
+ expected_output = '{\n"key": "value"\n}\n'
328
+ # Patch sys.stdin so that cli() reads from it instead of a file.
329
+ with patch('sys.stdin', new=io.StringIO(test_input)):
330
+ cli(inline_args=['--indent', 0])
331
+ captured = capsys.readouterr()
332
+ assert captured.out == expected_output
File without changes
File without changes