json-repair 0.20.0__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/json_repair.py +102 -63
- {json_repair-0.20.0.dist-info → json_repair-0.21.0.dist-info}/METADATA +1 -1
- json_repair-0.21.0.dist-info/RECORD +7 -0
- json_repair-0.20.0.dist-info/RECORD +0 -7
- {json_repair-0.20.0.dist-info → json_repair-0.21.0.dist-info}/LICENSE +0 -0
- {json_repair-0.20.0.dist-info → json_repair-0.21.0.dist-info}/WHEEL +0 -0
- {json_repair-0.20.0.dist-info → json_repair-0.21.0.dist-info}/top_level.txt +0 -0
json_repair/json_repair.py
CHANGED
@@ -24,11 +24,55 @@ All supported use cases are in the unit tests
|
|
24
24
|
|
25
25
|
import os
|
26
26
|
import json
|
27
|
-
from typing import Any, Dict, List, Union, TextIO
|
27
|
+
from typing import Any, Dict, List, Optional, Union, TextIO, Tuple
|
28
|
+
|
29
|
+
|
30
|
+
class StringFileWrapper:
|
31
|
+
# This is a trick to simplify the code, transform the filedescriptor handling into a string handling
|
32
|
+
def __init__(self, fd: TextIO) -> None:
|
33
|
+
self.fd = fd
|
34
|
+
self.length: int = 0
|
35
|
+
|
36
|
+
def __getitem__(self, index: int) -> str:
|
37
|
+
if isinstance(index, slice):
|
38
|
+
self.fd.seek(index.start)
|
39
|
+
value = self.fd.read(index.stop - index.start)
|
40
|
+
self.fd.seek(index.start)
|
41
|
+
return value
|
42
|
+
else:
|
43
|
+
self.fd.seek(index)
|
44
|
+
return self.fd.read(1)
|
45
|
+
|
46
|
+
def __len__(self) -> int:
|
47
|
+
if self.length < 1:
|
48
|
+
current_position = self.fd.tell()
|
49
|
+
self.fd.seek(0, os.SEEK_END)
|
50
|
+
self.length = self.fd.tell()
|
51
|
+
self.fd.seek(current_position)
|
52
|
+
return self.length
|
53
|
+
|
54
|
+
def __setitem__(self) -> None:
|
55
|
+
raise Exception("This is read-only!")
|
56
|
+
|
57
|
+
|
58
|
+
class LoggerConfig:
|
59
|
+
# This is a type class to simplify the declaration
|
60
|
+
def __init__(self, log_level: Optional[str]):
|
61
|
+
self.log: List[Dict[str, str]] = []
|
62
|
+
self.window: int = 10
|
63
|
+
self.log_level: str = log_level if log_level else "none"
|
64
|
+
|
65
|
+
|
66
|
+
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
|
28
67
|
|
29
68
|
|
30
69
|
class JSONParser:
|
31
|
-
def __init__(
|
70
|
+
def __init__(
|
71
|
+
self,
|
72
|
+
json_str: Union[str, StringFileWrapper],
|
73
|
+
json_fd: Optional[TextIO],
|
74
|
+
logging: Optional[bool],
|
75
|
+
) -> None:
|
32
76
|
# The string to parse
|
33
77
|
self.json_str = json_str
|
34
78
|
# Alternatively, the file description with a json file in it
|
@@ -36,25 +80,23 @@ class JSONParser:
|
|
36
80
|
# This is a trick we do to treat the file wrapper as an array
|
37
81
|
self.json_str = StringFileWrapper(json_fd)
|
38
82
|
# Index is our iterator that will keep track of which character we are looking at right now
|
39
|
-
self.index = 0
|
83
|
+
self.index: int = 0
|
40
84
|
# This is used in the object member parsing to manage the special cases of missing quotes in key or value
|
41
|
-
self.context = []
|
85
|
+
self.context: list[str] = []
|
42
86
|
# Use this to log the activity, but only if logging is active
|
43
|
-
self.logger =
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
def parse(self) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
|
50
|
-
if self.logger["log_level"] == "none":
|
87
|
+
self.logger = LoggerConfig(log_level="info" if logging else None)
|
88
|
+
|
89
|
+
def parse(
|
90
|
+
self,
|
91
|
+
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
92
|
+
if self.logger.log_level == "none":
|
51
93
|
return self.parse_json()
|
52
94
|
else:
|
53
|
-
return self.parse_json(), self.logger
|
95
|
+
return self.parse_json(), self.logger.log
|
54
96
|
|
55
97
|
def parse_json(
|
56
98
|
self,
|
57
|
-
) ->
|
99
|
+
) -> JSONReturnType:
|
58
100
|
char = self.get_char_at()
|
59
101
|
# False means that we are at the end of the string provided, is the base case for recursion
|
60
102
|
if char is False:
|
@@ -131,10 +173,14 @@ class JSONParser:
|
|
131
173
|
# Sometimes the string search might not move the index at all, that might lead us to an infinite loop
|
132
174
|
self.index += 1
|
133
175
|
|
176
|
+
self.skip_whitespaces_at()
|
177
|
+
|
134
178
|
# We reached the end here
|
135
179
|
if (self.get_char_at() or "}") == "}":
|
136
180
|
continue
|
137
181
|
|
182
|
+
self.skip_whitespaces_at()
|
183
|
+
|
138
184
|
# An extreme case of missing ":" after a key
|
139
185
|
if (self.get_char_at() or "") != ":":
|
140
186
|
self.log(
|
@@ -178,7 +224,7 @@ class JSONParser:
|
|
178
224
|
value = self.parse_json()
|
179
225
|
|
180
226
|
# It is possible that parse_json() returns nothing valid, so we stop
|
181
|
-
if
|
227
|
+
if value == "":
|
182
228
|
break
|
183
229
|
|
184
230
|
if value == "..." and self.get_char_at(-1) == ".":
|
@@ -221,7 +267,7 @@ class JSONParser:
|
|
221
267
|
self.reset_context()
|
222
268
|
return arr
|
223
269
|
|
224
|
-
def parse_string(self) -> str:
|
270
|
+
def parse_string(self) -> Union[str, JSONReturnType]:
|
225
271
|
# <string> is a string of valid characters enclosed in quotes
|
226
272
|
# i.e. { name: "John" }
|
227
273
|
# Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
|
@@ -320,7 +366,7 @@ class JSONParser:
|
|
320
366
|
string_acc = string_acc[:-1]
|
321
367
|
if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
|
322
368
|
escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
|
323
|
-
string_acc += escape_seqs.get(char, char)
|
369
|
+
string_acc += escape_seqs.get(char, char) or char
|
324
370
|
self.index += 1
|
325
371
|
char = self.get_char_at()
|
326
372
|
# ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
|
@@ -358,7 +404,29 @@ class JSONParser:
|
|
358
404
|
break
|
359
405
|
i += 1
|
360
406
|
next_c = self.get_char_at(i)
|
361
|
-
if
|
407
|
+
# If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
|
408
|
+
if next_c == "," and self.get_context() == "object_value":
|
409
|
+
i += 1
|
410
|
+
next_c = self.get_char_at(i)
|
411
|
+
while next_c and next_c != rstring_delimiter:
|
412
|
+
i += 1
|
413
|
+
next_c = self.get_char_at(i)
|
414
|
+
# Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
|
415
|
+
i += 1
|
416
|
+
next_c = self.get_char_at(i)
|
417
|
+
while next_c and next_c.isspace():
|
418
|
+
i += 1
|
419
|
+
next_c = self.get_char_at(i)
|
420
|
+
if next_c == "}":
|
421
|
+
# OK this is valid then
|
422
|
+
self.log(
|
423
|
+
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
|
424
|
+
"info",
|
425
|
+
)
|
426
|
+
string_acc += char
|
427
|
+
self.index += 1
|
428
|
+
char = self.get_char_at()
|
429
|
+
elif next_c == rstring_delimiter:
|
362
430
|
if self.get_context() == "object_value":
|
363
431
|
# But this might not be it! This could be just a missing comma
|
364
432
|
# We found a delimiter and we need to check if this is a key
|
@@ -414,7 +482,7 @@ class JSONParser:
|
|
414
482
|
|
415
483
|
return string_acc.rstrip()
|
416
484
|
|
417
|
-
def parse_number(self) -> Union[float, int, str]:
|
485
|
+
def parse_number(self) -> Union[float, int, str, JSONReturnType]:
|
418
486
|
# <number> is a valid real number expressed in one of a number of given formats
|
419
487
|
number_str = ""
|
420
488
|
number_chars = set("0123456789-.eE/,")
|
@@ -447,8 +515,7 @@ class JSONParser:
|
|
447
515
|
def parse_boolean_or_null(self) -> Union[bool, str, None]:
|
448
516
|
# <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
|
449
517
|
starting_index = self.index
|
450
|
-
|
451
|
-
char = self.get_char_at().lower()
|
518
|
+
char = (self.get_char_at() or "").lower()
|
452
519
|
if char == "t":
|
453
520
|
value = ("true", True)
|
454
521
|
elif char == "f":
|
@@ -456,12 +523,12 @@ class JSONParser:
|
|
456
523
|
elif char == "n":
|
457
524
|
value = ("null", None)
|
458
525
|
|
459
|
-
if
|
526
|
+
if value:
|
460
527
|
i = 0
|
461
528
|
while char and i < len(value[0]) and char == value[0][i]:
|
462
529
|
i += 1
|
463
530
|
self.index += 1
|
464
|
-
char = self.get_char_at().lower()
|
531
|
+
char = (self.get_char_at() or "").lower()
|
465
532
|
if i == len(value[0]):
|
466
533
|
return value[1]
|
467
534
|
|
@@ -509,12 +576,12 @@ class JSONParser:
|
|
509
576
|
return ""
|
510
577
|
|
511
578
|
def log(self, text: str, level: str) -> None:
|
512
|
-
if level == self.logger
|
579
|
+
if level == self.logger.log_level:
|
513
580
|
context = ""
|
514
|
-
start = max(self.index - self.logger
|
515
|
-
end = min(self.index + self.logger
|
581
|
+
start = max(self.index - self.logger.window, 0)
|
582
|
+
end = min(self.index + self.logger.window, len(self.json_str))
|
516
583
|
context = self.json_str[start:end]
|
517
|
-
self.logger
|
584
|
+
self.logger.log.append(
|
518
585
|
{
|
519
586
|
"text": text,
|
520
587
|
"context": context,
|
@@ -524,11 +591,11 @@ class JSONParser:
|
|
524
591
|
|
525
592
|
def repair_json(
|
526
593
|
json_str: str = "",
|
527
|
-
return_objects: bool = False,
|
528
|
-
skip_json_loads: bool = False,
|
529
|
-
logging: bool = False,
|
530
|
-
json_fd: TextIO = None,
|
531
|
-
) -> Union[
|
594
|
+
return_objects: Optional[bool] = False,
|
595
|
+
skip_json_loads: Optional[bool] = False,
|
596
|
+
logging: Optional[bool] = False,
|
597
|
+
json_fd: Optional[TextIO] = None,
|
598
|
+
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
532
599
|
"""
|
533
600
|
Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
|
534
601
|
It will return the fixed string by default.
|
@@ -555,7 +622,7 @@ def repair_json(
|
|
555
622
|
|
556
623
|
def loads(
|
557
624
|
json_str: str, skip_json_loads: bool = False, logging: bool = False
|
558
|
-
) -> Union[
|
625
|
+
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
559
626
|
"""
|
560
627
|
This function works like `json.loads()` except that it will fix your JSON in the process.
|
561
628
|
It is a wrapper around the `repair_json()` function with `return_objects=True`.
|
@@ -570,7 +637,7 @@ def loads(
|
|
570
637
|
|
571
638
|
def load(
|
572
639
|
fd: TextIO, skip_json_loads: bool = False, logging: bool = False
|
573
|
-
) -> Union[
|
640
|
+
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
574
641
|
"""
|
575
642
|
This function works like `json.load()` except that it will fix your JSON in the process.
|
576
643
|
It is a wrapper around the `repair_json()` function with `json_fd=fd` and `return_objects=True`.
|
@@ -580,7 +647,7 @@ def load(
|
|
580
647
|
|
581
648
|
def from_file(
|
582
649
|
filename: str, skip_json_loads: bool = False, logging: bool = False
|
583
|
-
) -> Union[
|
650
|
+
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
584
651
|
"""
|
585
652
|
This function is a wrapper around `load()` so you can pass the filename as string
|
586
653
|
"""
|
@@ -589,31 +656,3 @@ def from_file(
|
|
589
656
|
fd.close()
|
590
657
|
|
591
658
|
return jsonobj
|
592
|
-
|
593
|
-
|
594
|
-
class StringFileWrapper:
|
595
|
-
# This is a trick to simplify the code above, transform the filedescriptor handling into an array handling
|
596
|
-
def __init__(self, fd: TextIO) -> None:
|
597
|
-
self.fd = fd
|
598
|
-
self.length = None
|
599
|
-
|
600
|
-
def __getitem__(self, index: int) -> Any:
|
601
|
-
if isinstance(index, slice):
|
602
|
-
self.fd.seek(index.start)
|
603
|
-
value = self.fd.read(index.stop - index.start)
|
604
|
-
self.fd.seek(index.start)
|
605
|
-
return value
|
606
|
-
else:
|
607
|
-
self.fd.seek(index)
|
608
|
-
return self.fd.read(1)
|
609
|
-
|
610
|
-
def __len__(self) -> int:
|
611
|
-
if not self.length:
|
612
|
-
current_position = self.fd.tell()
|
613
|
-
self.fd.seek(0, os.SEEK_END)
|
614
|
-
self.length = self.fd.tell()
|
615
|
-
self.fd.seek(current_position)
|
616
|
-
return self.length
|
617
|
-
|
618
|
-
def __setitem__(self):
|
619
|
-
raise Exception("This is read-only!")
|
@@ -0,0 +1,7 @@
|
|
1
|
+
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
+
json_repair/json_repair.py,sha256=ry94U3QoJwVgyG1qeQNEb8Qt8NtCLpCGR41GBA7tozY,27320
|
3
|
+
json_repair-0.21.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
4
|
+
json_repair-0.21.0.dist-info/METADATA,sha256=obBsHuNN7Ph5zX77VHmER2O9A61F3MXGBreEowdr-so,7333
|
5
|
+
json_repair-0.21.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
6
|
+
json_repair-0.21.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
7
|
+
json_repair-0.21.0.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
-
json_repair/json_repair.py,sha256=zYg4tIwZ4rdVkCQ5XVceNQaOz2MT50O7jHJbJ1EpKhk,25446
|
3
|
-
json_repair-0.20.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
4
|
-
json_repair-0.20.0.dist-info/METADATA,sha256=2WrsjORPx37e4CqdkdiARwB-VoP5EyjsZaS0hcVnVBo,7333
|
5
|
-
json_repair-0.20.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
6
|
-
json_repair-0.20.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
7
|
-
json_repair-0.20.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|