json-repair 0.13.1__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/json_repair.py +136 -43
- {json_repair-0.13.1.dist-info → json_repair-0.15.0.dist-info}/METADATA +5 -2
- json_repair-0.15.0.dist-info/RECORD +7 -0
- json_repair-0.13.1.dist-info/RECORD +0 -7
- {json_repair-0.13.1.dist-info → json_repair-0.15.0.dist-info}/LICENSE +0 -0
- {json_repair-0.13.1.dist-info → json_repair-0.15.0.dist-info}/WHEEL +0 -0
- {json_repair-0.13.1.dist-info → json_repair-0.15.0.dist-info}/top_level.txt +0 -0
json_repair/json_repair.py
CHANGED
@@ -27,17 +27,25 @@ from typing import Any, Dict, List, Union, TextIO
|
|
27
27
|
|
28
28
|
|
29
29
|
class JSONParser:
|
30
|
-
def __init__(self, json_str: str) -> None:
|
30
|
+
def __init__(self, json_str: str, logging: bool = False) -> None:
|
31
31
|
# The string to parse
|
32
32
|
self.json_str = json_str
|
33
33
|
# Index is our iterator that will keep track of which character we are looking at right now
|
34
34
|
self.index = 0
|
35
35
|
# This is used in the object member parsing to manage the special cases of missing quotes in key or value
|
36
|
-
self.context =
|
37
|
-
|
36
|
+
self.context = []
|
37
|
+
# Use this to log the activity, but only if logging is active
|
38
|
+
self.logger = {
|
39
|
+
"log": [],
|
40
|
+
"window": 10,
|
41
|
+
"log_level": "info" if logging else "none",
|
42
|
+
}
|
38
43
|
|
39
44
|
def parse(self) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
|
40
|
-
|
45
|
+
if self.logger["log_level"] == "none":
|
46
|
+
return self.parse_json()
|
47
|
+
else:
|
48
|
+
return self.parse_json(), self.logger["log"]
|
41
49
|
|
42
50
|
def parse_json(
|
43
51
|
self,
|
@@ -56,7 +64,11 @@ class JSONParser:
|
|
56
64
|
return self.parse_array()
|
57
65
|
# there can be an edge case in which a key is empty and at the end of an object
|
58
66
|
# like "key": }. We return an empty string here to close the object properly
|
59
|
-
elif char == "}" and self.
|
67
|
+
elif char == "}" and self.get_context() == "object_value":
|
68
|
+
self.log(
|
69
|
+
"At the end of an object we found a key with missing value, skipping",
|
70
|
+
"info",
|
71
|
+
)
|
60
72
|
return ""
|
61
73
|
# <string> starts with '"'
|
62
74
|
elif char == '"':
|
@@ -92,13 +104,17 @@ class JSONParser:
|
|
92
104
|
|
93
105
|
# Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
|
94
106
|
if (self.get_char_at() or "") == ":":
|
107
|
+
self.log(
|
108
|
+
"While parsing an object we found a : before a key, replacing with ,",
|
109
|
+
"info",
|
110
|
+
)
|
95
111
|
self.remove_char_at()
|
96
112
|
self.insert_char_at(",")
|
97
113
|
self.index += 1
|
98
114
|
|
99
115
|
# We are now searching for they string key
|
100
116
|
# Context is used in the string parser to manage the lack of quotes
|
101
|
-
self.
|
117
|
+
self.set_context("object_key")
|
102
118
|
|
103
119
|
self.skip_whitespaces_at()
|
104
120
|
|
@@ -110,6 +126,10 @@ class JSONParser:
|
|
110
126
|
# This can happen sometimes like { "": "value" }
|
111
127
|
if key == "" and self.get_char_at() == ":":
|
112
128
|
key = "empty_placeholder"
|
129
|
+
self.log(
|
130
|
+
"While parsing an object we found an empty key, replacing with empty_placeholder",
|
131
|
+
"info",
|
132
|
+
)
|
113
133
|
break
|
114
134
|
|
115
135
|
# We reached the end here
|
@@ -118,15 +138,19 @@ class JSONParser:
|
|
118
138
|
|
119
139
|
# An extreme case of missing ":" after a key
|
120
140
|
if (self.get_char_at() or "") != ":":
|
141
|
+
self.log(
|
142
|
+
"While parsing an object we missed a : after a key, adding it back",
|
143
|
+
"info",
|
144
|
+
)
|
121
145
|
self.insert_char_at(":")
|
122
146
|
self.index += 1
|
123
|
-
self.
|
124
|
-
self.
|
147
|
+
self.reset_context()
|
148
|
+
self.set_context("object_value")
|
125
149
|
# The value can be any valid json
|
126
150
|
value = self.parse_json()
|
127
151
|
|
128
152
|
# Reset context since our job is done
|
129
|
-
self.
|
153
|
+
self.reset_context()
|
130
154
|
obj[key] = value
|
131
155
|
|
132
156
|
if (self.get_char_at() or "") in [",", "'", '"']:
|
@@ -137,6 +161,10 @@ class JSONParser:
|
|
137
161
|
|
138
162
|
# Especially at the end of an LLM generated json you might miss the last "}"
|
139
163
|
if (self.get_char_at() or "}") != "}":
|
164
|
+
self.log(
|
165
|
+
"While parsing an object, we couldn't find the closing }, adding it back",
|
166
|
+
"info",
|
167
|
+
)
|
140
168
|
self.insert_char_at("}")
|
141
169
|
self.index += 1
|
142
170
|
return obj
|
@@ -160,15 +188,22 @@ class JSONParser:
|
|
160
188
|
self.index += 1
|
161
189
|
char = self.get_char_at()
|
162
190
|
# If this is the right value of an object and we are closing the object, it means the array is over
|
163
|
-
if self.
|
191
|
+
if self.get_context() == "object_value" and char == "}":
|
164
192
|
break
|
165
193
|
|
166
194
|
# Especially at the end of an LLM generated json you might miss the last "]"
|
167
195
|
char = self.get_char_at()
|
168
196
|
if char and char != "]":
|
197
|
+
self.log(
|
198
|
+
"While parsing an array we missed the closing ], adding it back", "info"
|
199
|
+
)
|
169
200
|
# Sometimes when you fix a missing "]" you'll have a trailing "," there that makes the JSON invalid
|
170
201
|
if char == ",":
|
171
202
|
# Remove trailing "," before adding the "]"
|
203
|
+
self.log(
|
204
|
+
"While parsing an array, remove a trailing , before adding ]",
|
205
|
+
"info",
|
206
|
+
)
|
172
207
|
self.remove_char_at()
|
173
208
|
self.insert_char_at("]")
|
174
209
|
self.index -= 1
|
@@ -183,18 +218,23 @@ class JSONParser:
|
|
183
218
|
|
184
219
|
# Flag to manage corner cases related to missing starting quote
|
185
220
|
fixed_quotes = False
|
186
|
-
double_delimiter = False
|
187
221
|
lstring_delimiter = rstring_delimiter = '"'
|
188
222
|
if isinstance(string_quotes, list):
|
189
223
|
lstring_delimiter = string_quotes[0]
|
190
224
|
rstring_delimiter = string_quotes[1]
|
191
225
|
elif isinstance(string_quotes, str):
|
192
226
|
lstring_delimiter = rstring_delimiter = string_quotes
|
227
|
+
# There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
|
193
228
|
if self.get_char_at(1) == lstring_delimiter:
|
194
|
-
|
229
|
+
self.log(
|
230
|
+
"While parsing a string, we found a doubled quote, ignoring it", "info"
|
231
|
+
)
|
195
232
|
self.index += 1
|
196
233
|
char = self.get_char_at()
|
197
234
|
if char != lstring_delimiter:
|
235
|
+
self.log(
|
236
|
+
"While parsing a string, we found no starting quote, adding it", "info"
|
237
|
+
)
|
198
238
|
self.insert_char_at(lstring_delimiter)
|
199
239
|
fixed_quotes = True
|
200
240
|
else:
|
@@ -210,12 +250,13 @@ class JSONParser:
|
|
210
250
|
# * It iterated over the entire sequence
|
211
251
|
# * If we are fixing missing quotes in an object, when it finds the special terminators
|
212
252
|
char = self.get_char_at()
|
213
|
-
fix_broken_markdown_link = False
|
214
253
|
while char and char != rstring_delimiter:
|
215
254
|
if fixed_quotes:
|
216
|
-
if self.
|
255
|
+
if self.get_context() == "object_key" and (
|
256
|
+
char == ":" or char.isspace()
|
257
|
+
):
|
217
258
|
break
|
218
|
-
elif self.
|
259
|
+
elif self.get_context() == "object_value" and char in [",", "}"]:
|
219
260
|
break
|
220
261
|
self.index += 1
|
221
262
|
char = self.get_char_at()
|
@@ -227,21 +268,46 @@ class JSONParser:
|
|
227
268
|
else:
|
228
269
|
self.remove_char_at(-1)
|
229
270
|
self.index -= 1
|
230
|
-
# ChatGPT sometimes forget to quote
|
271
|
+
# ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
|
231
272
|
if (
|
232
273
|
char == rstring_delimiter
|
233
|
-
# Next character is not a
|
234
|
-
and self.get_char_at(1)
|
235
|
-
and (
|
236
|
-
fix_broken_markdown_link
|
237
|
-
or (self.get_char_at(-2) == "]" and self.get_char_at(-1)) == "("
|
238
|
-
)
|
274
|
+
# Next character is not a delimiter
|
275
|
+
and self.get_char_at(1) not in [",", ":", "]", "}"]
|
239
276
|
):
|
240
|
-
|
241
|
-
self.
|
242
|
-
|
243
|
-
|
244
|
-
|
277
|
+
# Special case here, in case of double quotes one after another
|
278
|
+
if self.get_char_at(1) == rstring_delimiter:
|
279
|
+
self.log(
|
280
|
+
"While parsing a string, we found a doubled quote, ignoring it",
|
281
|
+
"info",
|
282
|
+
)
|
283
|
+
# self destruct this character
|
284
|
+
self.remove_char_at()
|
285
|
+
else:
|
286
|
+
# Check if eventually there is a rstring delimiter, otherwise we bail
|
287
|
+
i = 2
|
288
|
+
next_c = self.get_char_at(i)
|
289
|
+
while next_c and next_c != rstring_delimiter:
|
290
|
+
i += 1
|
291
|
+
next_c = self.get_char_at(i)
|
292
|
+
# In that case we ignore this rstring delimiter
|
293
|
+
if next_c:
|
294
|
+
self.log(
|
295
|
+
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
|
296
|
+
"info",
|
297
|
+
)
|
298
|
+
self.index += 1
|
299
|
+
char = self.get_char_at()
|
300
|
+
|
301
|
+
if (
|
302
|
+
char
|
303
|
+
and fixed_quotes
|
304
|
+
and self.get_context() == "object_key"
|
305
|
+
and char.isspace()
|
306
|
+
):
|
307
|
+
self.log(
|
308
|
+
"While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
|
309
|
+
"info",
|
310
|
+
)
|
245
311
|
self.skip_whitespaces_at()
|
246
312
|
if self.get_char_at() not in [":", ","]:
|
247
313
|
return ""
|
@@ -250,13 +316,15 @@ class JSONParser:
|
|
250
316
|
|
251
317
|
# A fallout of the previous special case in the while loop, we need to update the index only if we had a closing quote
|
252
318
|
if char != rstring_delimiter:
|
319
|
+
self.log(
|
320
|
+
"While parsing a string, we missed the closing quote, adding it back",
|
321
|
+
"info",
|
322
|
+
)
|
253
323
|
self.insert_char_at(rstring_delimiter)
|
254
324
|
else:
|
255
325
|
self.index += 1
|
256
|
-
if double_delimiter and self.get_char_at() == rstring_delimiter:
|
257
|
-
self.index += 1
|
258
326
|
|
259
|
-
return self.json_str[start:end]
|
327
|
+
return self.json_str[start:end].rstrip()
|
260
328
|
|
261
329
|
def parse_number(self) -> Union[float, int, str]:
|
262
330
|
# <number> is a valid real number expressed in one of a number of given formats
|
@@ -325,30 +393,52 @@ class JSONParser:
|
|
325
393
|
except IndexError:
|
326
394
|
return
|
327
395
|
|
328
|
-
def
|
396
|
+
def set_context(self, value: str) -> None:
|
329
397
|
# If a value is provided update the context variable and save in stack
|
330
398
|
if value:
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
399
|
+
self.context.append(value)
|
400
|
+
|
401
|
+
def reset_context(self) -> None:
|
402
|
+
try:
|
403
|
+
self.context.pop()
|
404
|
+
except Exception:
|
405
|
+
return
|
406
|
+
|
407
|
+
def get_context(self) -> str:
|
408
|
+
try:
|
409
|
+
return self.context[0]
|
410
|
+
except Exception:
|
411
|
+
return ""
|
412
|
+
|
413
|
+
def log(self, text: str, level: str) -> None:
|
414
|
+
if level == self.logger["log_level"]:
|
415
|
+
self.logger["log"].append(
|
416
|
+
{
|
417
|
+
"text": text,
|
418
|
+
"context": self.json_str[
|
419
|
+
self.index
|
420
|
+
- self.logger["window"] : self.index
|
421
|
+
+ self.logger["window"]
|
422
|
+
],
|
423
|
+
}
|
424
|
+
)
|
340
425
|
|
341
426
|
|
342
427
|
def repair_json(
|
343
|
-
json_str: str,
|
428
|
+
json_str: str,
|
429
|
+
return_objects: bool = False,
|
430
|
+
skip_json_loads: bool = False,
|
431
|
+
logging: bool = False,
|
344
432
|
) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
|
345
433
|
"""
|
346
434
|
Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
|
347
435
|
It will return the fixed string by default.
|
348
436
|
When `return_objects=True` is passed, it will return the decoded data structure instead.
|
437
|
+
When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
|
438
|
+
When `logging=True` is passed, it will return an tuple with the repaired json and a log of all repair actions
|
349
439
|
"""
|
350
440
|
json_str = json_str.strip().lstrip("```json")
|
351
|
-
parser = JSONParser(json_str)
|
441
|
+
parser = JSONParser(json_str, logging)
|
352
442
|
if skip_json_loads:
|
353
443
|
parsed_json = parser.parse()
|
354
444
|
else:
|
@@ -357,7 +447,7 @@ def repair_json(
|
|
357
447
|
except json.JSONDecodeError:
|
358
448
|
parsed_json = parser.parse()
|
359
449
|
# It's useful to return the actual object instead of the json string, it allows this lib to be a replacement of the json library
|
360
|
-
if return_objects:
|
450
|
+
if return_objects or logging:
|
361
451
|
return parsed_json
|
362
452
|
return json.dumps(parsed_json)
|
363
453
|
|
@@ -384,3 +474,6 @@ def from_file(
|
|
384
474
|
fd.close()
|
385
475
|
|
386
476
|
return jsonobj
|
477
|
+
|
478
|
+
|
479
|
+
print(repair_json('{ "key": "value", "key2": }', logging=True))
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: json_repair
|
3
|
-
Version: 0.
|
3
|
+
Version: 0.15.0
|
4
4
|
Summary: A package to repair broken json strings
|
5
5
|
Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
|
6
6
|
License: MIT License
|
@@ -150,7 +150,10 @@ You will need owner access to this repository
|
|
150
150
|
- Run `python -m build`
|
151
151
|
- Create a new release in Github, making sure to tag all the issues solved and contributors. Create the new tag, same as the one in the build configuration
|
152
152
|
- Once the release is created, a new Github Actions workflow will start to publish on Pypi, make sure it didn't fail
|
153
|
-
|
153
|
+
---
|
154
|
+
# Repair JSON in other programming languages
|
155
|
+
- Typescript: https://github.com/josdejong/jsonrepair
|
156
|
+
- Go: https://github.com/RealAlexandreAI/json-repair
|
154
157
|
---
|
155
158
|
# Bonus Content
|
156
159
|
If you need some good Custom Instructions (System Message) to improve your chatbot responses try https://gist.github.com/mangiucugna/7ec015c4266df11be8aa510be0110fe4
|
@@ -0,0 +1,7 @@
|
|
1
|
+
json_repair/__init__.py,sha256=AlNie5y6BZBioGi5fzTAUvum_y0U5aL5aNsuQ_68LQc,175
|
2
|
+
json_repair/json_repair.py,sha256=ctuP4AaBrsWBzhF2Al-gX_itHcTG15cqU4Z56KYxNfA,19119
|
3
|
+
json_repair-0.15.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
4
|
+
json_repair-0.15.0.dist-info/METADATA,sha256=bnhSr8AectNHH-ljyaqwIC5BqPkVf2uYwLHuNuUYpyQ,7355
|
5
|
+
json_repair-0.15.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
6
|
+
json_repair-0.15.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
7
|
+
json_repair-0.15.0.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
json_repair/__init__.py,sha256=AlNie5y6BZBioGi5fzTAUvum_y0U5aL5aNsuQ_68LQc,175
|
2
|
-
json_repair/json_repair.py,sha256=DB220fZ1BCf--9CeP6AzL2FCk9tpE1Eh3WxgFo33P88,15460
|
3
|
-
json_repair-0.13.1.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
4
|
-
json_repair-0.13.1.dist-info/METADATA,sha256=gcfdAU5nhvlHlE3wMvGcwHRQUzgPT2p08cgDg7iyLvw,7200
|
5
|
-
json_repair-0.13.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
6
|
-
json_repair-0.13.1.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
7
|
-
json_repair-0.13.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|