json-repair 0.14.0__py3-none-any.whl → 0.15.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/json_repair.py +115 -26
- {json_repair-0.14.0.dist-info → json_repair-0.15.0.dist-info}/METADATA +1 -1
- json_repair-0.15.0.dist-info/RECORD +7 -0
- json_repair-0.14.0.dist-info/RECORD +0 -7
- {json_repair-0.14.0.dist-info → json_repair-0.15.0.dist-info}/LICENSE +0 -0
- {json_repair-0.14.0.dist-info → json_repair-0.15.0.dist-info}/WHEEL +0 -0
- {json_repair-0.14.0.dist-info → json_repair-0.15.0.dist-info}/top_level.txt +0 -0
json_repair/json_repair.py
CHANGED
@@ -27,17 +27,25 @@ from typing import Any, Dict, List, Union, TextIO
|
|
27
27
|
|
28
28
|
|
29
29
|
class JSONParser:
|
30
|
-
def __init__(self, json_str: str) -> None:
|
30
|
+
def __init__(self, json_str: str, logging: bool = False) -> None:
|
31
31
|
# The string to parse
|
32
32
|
self.json_str = json_str
|
33
33
|
# Index is our iterator that will keep track of which character we are looking at right now
|
34
34
|
self.index = 0
|
35
35
|
# This is used in the object member parsing to manage the special cases of missing quotes in key or value
|
36
|
-
self.context =
|
37
|
-
|
36
|
+
self.context = []
|
37
|
+
# Use this to log the activity, but only if logging is active
|
38
|
+
self.logger = {
|
39
|
+
"log": [],
|
40
|
+
"window": 10,
|
41
|
+
"log_level": "info" if logging else "none",
|
42
|
+
}
|
38
43
|
|
39
44
|
def parse(self) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
|
40
|
-
|
45
|
+
if self.logger["log_level"] == "none":
|
46
|
+
return self.parse_json()
|
47
|
+
else:
|
48
|
+
return self.parse_json(), self.logger["log"]
|
41
49
|
|
42
50
|
def parse_json(
|
43
51
|
self,
|
@@ -56,7 +64,11 @@ class JSONParser:
|
|
56
64
|
return self.parse_array()
|
57
65
|
# there can be an edge case in which a key is empty and at the end of an object
|
58
66
|
# like "key": }. We return an empty string here to close the object properly
|
59
|
-
elif char == "}" and self.
|
67
|
+
elif char == "}" and self.get_context() == "object_value":
|
68
|
+
self.log(
|
69
|
+
"At the end of an object we found a key with missing value, skipping",
|
70
|
+
"info",
|
71
|
+
)
|
60
72
|
return ""
|
61
73
|
# <string> starts with '"'
|
62
74
|
elif char == '"':
|
@@ -92,13 +104,17 @@ class JSONParser:
|
|
92
104
|
|
93
105
|
# Sometimes LLMs do weird things, if we find a ":" so early, we'll change it to "," and move on
|
94
106
|
if (self.get_char_at() or "") == ":":
|
107
|
+
self.log(
|
108
|
+
"While parsing an object we found a : before a key, replacing with ,",
|
109
|
+
"info",
|
110
|
+
)
|
95
111
|
self.remove_char_at()
|
96
112
|
self.insert_char_at(",")
|
97
113
|
self.index += 1
|
98
114
|
|
99
115
|
# We are now searching for the string key
|
100
116
|
# Context is used in the string parser to manage the lack of quotes
|
101
|
-
self.
|
117
|
+
self.set_context("object_key")
|
102
118
|
|
103
119
|
self.skip_whitespaces_at()
|
104
120
|
|
@@ -110,6 +126,10 @@ class JSONParser:
|
|
110
126
|
# This can happen sometimes like { "": "value" }
|
111
127
|
if key == "" and self.get_char_at() == ":":
|
112
128
|
key = "empty_placeholder"
|
129
|
+
self.log(
|
130
|
+
"While parsing an object we found an empty key, replacing with empty_placeholder",
|
131
|
+
"info",
|
132
|
+
)
|
113
133
|
break
|
114
134
|
|
115
135
|
# We reached the end here
|
@@ -118,15 +138,19 @@ class JSONParser:
|
|
118
138
|
|
119
139
|
# An extreme case of missing ":" after a key
|
120
140
|
if (self.get_char_at() or "") != ":":
|
141
|
+
self.log(
|
142
|
+
"While parsing an object we missed a : after a key, adding it back",
|
143
|
+
"info",
|
144
|
+
)
|
121
145
|
self.insert_char_at(":")
|
122
146
|
self.index += 1
|
123
|
-
self.
|
124
|
-
self.
|
147
|
+
self.reset_context()
|
148
|
+
self.set_context("object_value")
|
125
149
|
# The value can be any valid json
|
126
150
|
value = self.parse_json()
|
127
151
|
|
128
152
|
# Reset context since our job is done
|
129
|
-
self.
|
153
|
+
self.reset_context()
|
130
154
|
obj[key] = value
|
131
155
|
|
132
156
|
if (self.get_char_at() or "") in [",", "'", '"']:
|
@@ -137,6 +161,10 @@ class JSONParser:
|
|
137
161
|
|
138
162
|
# Especially at the end of an LLM generated json you might miss the last "}"
|
139
163
|
if (self.get_char_at() or "}") != "}":
|
164
|
+
self.log(
|
165
|
+
"While parsing an object, we couldn't find the closing }, adding it back",
|
166
|
+
"info",
|
167
|
+
)
|
140
168
|
self.insert_char_at("}")
|
141
169
|
self.index += 1
|
142
170
|
return obj
|
@@ -160,15 +188,22 @@ class JSONParser:
|
|
160
188
|
self.index += 1
|
161
189
|
char = self.get_char_at()
|
162
190
|
# If this is the right value of an object and we are closing the object, it means the array is over
|
163
|
-
if self.
|
191
|
+
if self.get_context() == "object_value" and char == "}":
|
164
192
|
break
|
165
193
|
|
166
194
|
# Especially at the end of an LLM generated json you might miss the last "]"
|
167
195
|
char = self.get_char_at()
|
168
196
|
if char and char != "]":
|
197
|
+
self.log(
|
198
|
+
"While parsing an array we missed the closing ], adding it back", "info"
|
199
|
+
)
|
169
200
|
# Sometimes when you fix a missing "]" you'll have a trailing "," there that makes the JSON invalid
|
170
201
|
if char == ",":
|
171
202
|
# Remove trailing "," before adding the "]"
|
203
|
+
self.log(
|
204
|
+
"While parsing an array, remove a trailing , before adding ]",
|
205
|
+
"info",
|
206
|
+
)
|
172
207
|
self.remove_char_at()
|
173
208
|
self.insert_char_at("]")
|
174
209
|
self.index -= 1
|
@@ -191,9 +226,15 @@ class JSONParser:
|
|
191
226
|
lstring_delimiter = rstring_delimiter = string_quotes
|
192
227
|
# There is sometimes a weird case of doubled quotes, we manage this also later in the while loop
|
193
228
|
if self.get_char_at(1) == lstring_delimiter:
|
229
|
+
self.log(
|
230
|
+
"While parsing a string, we found a doubled quote, ignoring it", "info"
|
231
|
+
)
|
194
232
|
self.index += 1
|
195
233
|
char = self.get_char_at()
|
196
234
|
if char != lstring_delimiter:
|
235
|
+
self.log(
|
236
|
+
"While parsing a string, we found no starting quote, adding it", "info"
|
237
|
+
)
|
197
238
|
self.insert_char_at(lstring_delimiter)
|
198
239
|
fixed_quotes = True
|
199
240
|
else:
|
@@ -211,9 +252,11 @@ class JSONParser:
|
|
211
252
|
char = self.get_char_at()
|
212
253
|
while char and char != rstring_delimiter:
|
213
254
|
if fixed_quotes:
|
214
|
-
if self.
|
255
|
+
if self.get_context() == "object_key" and (
|
256
|
+
char == ":" or char.isspace()
|
257
|
+
):
|
215
258
|
break
|
216
|
-
elif self.
|
259
|
+
elif self.get_context() == "object_value" and char in [",", "}"]:
|
217
260
|
break
|
218
261
|
self.index += 1
|
219
262
|
char = self.get_char_at()
|
@@ -233,6 +276,10 @@ class JSONParser:
|
|
233
276
|
):
|
234
277
|
# Special case here, in case of double quotes one after another
|
235
278
|
if self.get_char_at(1) == rstring_delimiter:
|
279
|
+
self.log(
|
280
|
+
"While parsing a string, we found a doubled quote, ignoring it",
|
281
|
+
"info",
|
282
|
+
)
|
236
283
|
# self destruct this character
|
237
284
|
self.remove_char_at()
|
238
285
|
else:
|
@@ -244,10 +291,23 @@ class JSONParser:
|
|
244
291
|
next_c = self.get_char_at(i)
|
245
292
|
# In that case we ignore this rstring delimiter
|
246
293
|
if next_c:
|
294
|
+
self.log(
|
295
|
+
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here, ignoring it",
|
296
|
+
"info",
|
297
|
+
)
|
247
298
|
self.index += 1
|
248
299
|
char = self.get_char_at()
|
249
300
|
|
250
|
-
if
|
301
|
+
if (
|
302
|
+
char
|
303
|
+
and fixed_quotes
|
304
|
+
and self.get_context() == "object_key"
|
305
|
+
and char.isspace()
|
306
|
+
):
|
307
|
+
self.log(
|
308
|
+
"While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value",
|
309
|
+
"info",
|
310
|
+
)
|
251
311
|
self.skip_whitespaces_at()
|
252
312
|
if self.get_char_at() not in [":", ","]:
|
253
313
|
return ""
|
@@ -256,6 +316,10 @@ class JSONParser:
|
|
256
316
|
|
257
317
|
# A fallout of the previous special case in the while loop, we need to update the index only if we had a closing quote
|
258
318
|
if char != rstring_delimiter:
|
319
|
+
self.log(
|
320
|
+
"While parsing a string, we missed the closing quote, adding it back",
|
321
|
+
"info",
|
322
|
+
)
|
259
323
|
self.insert_char_at(rstring_delimiter)
|
260
324
|
else:
|
261
325
|
self.index += 1
|
@@ -329,30 +393,52 @@ class JSONParser:
|
|
329
393
|
except IndexError:
|
330
394
|
return
|
331
395
|
|
332
|
-
def
|
396
|
+
def set_context(self, value: str) -> None:
|
333
397
|
# If a value is provided update the context variable and save in stack
|
334
398
|
if value:
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
|
340
|
-
|
341
|
-
|
342
|
-
|
343
|
-
|
399
|
+
self.context.append(value)
|
400
|
+
|
401
|
+
def reset_context(self) -> None:
|
402
|
+
try:
|
403
|
+
self.context.pop()
|
404
|
+
except Exception:
|
405
|
+
return
|
406
|
+
|
407
|
+
def get_context(self) -> str:
|
408
|
+
try:
|
409
|
+
return self.context[0]
|
410
|
+
except Exception:
|
411
|
+
return ""
|
412
|
+
|
413
|
+
def log(self, text: str, level: str) -> None:
|
414
|
+
if level == self.logger["log_level"]:
|
415
|
+
self.logger["log"].append(
|
416
|
+
{
|
417
|
+
"text": text,
|
418
|
+
"context": self.json_str[
|
419
|
+
self.index
|
420
|
+
- self.logger["window"] : self.index
|
421
|
+
+ self.logger["window"]
|
422
|
+
],
|
423
|
+
}
|
424
|
+
)
|
344
425
|
|
345
426
|
|
346
427
|
def repair_json(
|
347
|
-
json_str: str,
|
428
|
+
json_str: str,
|
429
|
+
return_objects: bool = False,
|
430
|
+
skip_json_loads: bool = False,
|
431
|
+
logging: bool = False,
|
348
432
|
) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
|
349
433
|
"""
|
350
434
|
Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
|
351
435
|
It will return the fixed string by default.
|
352
436
|
When `return_objects=True` is passed, it will return the decoded data structure instead.
|
437
|
+
When `skip_json_loads=True` is passed, it will not call the built-in json.loads() function
|
438
|
+
When `logging=True` is passed, it will return a tuple with the repaired json and a log of all repair actions
|
353
439
|
"""
|
354
440
|
json_str = json_str.strip().lstrip("```json")
|
355
|
-
parser = JSONParser(json_str)
|
441
|
+
parser = JSONParser(json_str, logging)
|
356
442
|
if skip_json_loads:
|
357
443
|
parsed_json = parser.parse()
|
358
444
|
else:
|
@@ -361,7 +447,7 @@ def repair_json(
|
|
361
447
|
except json.JSONDecodeError:
|
362
448
|
parsed_json = parser.parse()
|
363
449
|
# It's useful to return the actual object instead of the json string, it allows this lib to be a replacement of the json library
|
364
|
-
if return_objects:
|
450
|
+
if return_objects or logging:
|
365
451
|
return parsed_json
|
366
452
|
return json.dumps(parsed_json)
|
367
453
|
|
@@ -388,3 +474,6 @@ def from_file(
|
|
388
474
|
fd.close()
|
389
475
|
|
390
476
|
return jsonobj
|
477
|
+
|
478
|
+
|
479
|
+
print(repair_json('{ "key": "value", "key2": }', logging=True))
|
@@ -0,0 +1,7 @@
|
|
1
|
+
json_repair/__init__.py,sha256=AlNie5y6BZBioGi5fzTAUvum_y0U5aL5aNsuQ_68LQc,175
|
2
|
+
json_repair/json_repair.py,sha256=ctuP4AaBrsWBzhF2Al-gX_itHcTG15cqU4Z56KYxNfA,19119
|
3
|
+
json_repair-0.15.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
4
|
+
json_repair-0.15.0.dist-info/METADATA,sha256=bnhSr8AectNHH-ljyaqwIC5BqPkVf2uYwLHuNuUYpyQ,7355
|
5
|
+
json_repair-0.15.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
6
|
+
json_repair-0.15.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
7
|
+
json_repair-0.15.0.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
json_repair/__init__.py,sha256=AlNie5y6BZBioGi5fzTAUvum_y0U5aL5aNsuQ_68LQc,175
|
2
|
-
json_repair/json_repair.py,sha256=8B5HfWoLlUUtRYq1cnbajOxWiMSD9nxNW2cRFPjFfVE,15817
|
3
|
-
json_repair-0.14.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
4
|
-
json_repair-0.14.0.dist-info/METADATA,sha256=82KldmuVFLXbNy6SXar9MsulkcBUM1K8RX13pNysHQU,7355
|
5
|
-
json_repair-0.14.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
6
|
-
json_repair-0.14.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
7
|
-
json_repair-0.14.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|