json-repair 0.20.1__py3-none-any.whl → 0.22.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/json_repair.py +102 -65
- {json_repair-0.20.1.dist-info → json_repair-0.22.0.dist-info}/METADATA +1 -1
- json_repair-0.22.0.dist-info/RECORD +7 -0
- json_repair-0.20.1.dist-info/RECORD +0 -7
- {json_repair-0.20.1.dist-info → json_repair-0.22.0.dist-info}/LICENSE +0 -0
- {json_repair-0.20.1.dist-info → json_repair-0.22.0.dist-info}/WHEEL +0 -0
- {json_repair-0.20.1.dist-info → json_repair-0.22.0.dist-info}/top_level.txt +0 -0
json_repair/json_repair.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
"""
|
2
2
|
This module will parse the JSON file following the BNF definition:
|
3
3
|
|
4
|
-
<json> ::= <
|
4
|
+
<json> ::= <container>
|
5
5
|
|
6
6
|
<primitive> ::= <number> | <string> | <boolean>
|
7
7
|
; Where:
|
@@ -24,11 +24,55 @@ All supported use cases are in the unit tests
|
|
24
24
|
|
25
25
|
import os
|
26
26
|
import json
|
27
|
-
from typing import Any, Dict, List, Union, TextIO
|
27
|
+
from typing import Any, Dict, List, Optional, Union, TextIO, Tuple
|
28
|
+
|
29
|
+
|
30
|
+
class StringFileWrapper:
|
31
|
+
# This is a trick to simplify the code: it transforms the file descriptor handling into string handling
|
32
|
+
def __init__(self, fd: TextIO) -> None:
|
33
|
+
self.fd = fd
|
34
|
+
self.length: int = 0
|
35
|
+
|
36
|
+
def __getitem__(self, index: int) -> str:
|
37
|
+
if isinstance(index, slice):
|
38
|
+
self.fd.seek(index.start)
|
39
|
+
value = self.fd.read(index.stop - index.start)
|
40
|
+
self.fd.seek(index.start)
|
41
|
+
return value
|
42
|
+
else:
|
43
|
+
self.fd.seek(index)
|
44
|
+
return self.fd.read(1)
|
45
|
+
|
46
|
+
def __len__(self) -> int:
|
47
|
+
if self.length < 1:
|
48
|
+
current_position = self.fd.tell()
|
49
|
+
self.fd.seek(0, os.SEEK_END)
|
50
|
+
self.length = self.fd.tell()
|
51
|
+
self.fd.seek(current_position)
|
52
|
+
return self.length
|
53
|
+
|
54
|
+
def __setitem__(self) -> None:
|
55
|
+
raise Exception("This is read-only!")
|
56
|
+
|
57
|
+
|
58
|
+
class LoggerConfig:
|
59
|
+
# This is a type class to simplify the declaration
|
60
|
+
def __init__(self, log_level: Optional[str]):
|
61
|
+
self.log: List[Dict[str, str]] = []
|
62
|
+
self.window: int = 10
|
63
|
+
self.log_level: str = log_level if log_level else "none"
|
64
|
+
|
65
|
+
|
66
|
+
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
|
28
67
|
|
29
68
|
|
30
69
|
class JSONParser:
|
31
|
-
def __init__(
|
70
|
+
def __init__(
|
71
|
+
self,
|
72
|
+
json_str: Union[str, StringFileWrapper],
|
73
|
+
json_fd: Optional[TextIO],
|
74
|
+
logging: Optional[bool],
|
75
|
+
) -> None:
|
32
76
|
# The string to parse
|
33
77
|
self.json_str = json_str
|
34
78
|
# Alternatively, the file descriptor with a json file in it
|
@@ -36,26 +80,26 @@ class JSONParser:
|
|
36
80
|
# This is a trick we do to treat the file wrapper as an array
|
37
81
|
self.json_str = StringFileWrapper(json_fd)
|
38
82
|
# Index is our iterator that will keep track of which character we are looking at right now
|
39
|
-
self.index = 0
|
83
|
+
self.index: int = 0
|
40
84
|
# This is used in the object member parsing to manage the special cases of missing quotes in key or value
|
41
|
-
self.context = []
|
85
|
+
self.context: list[str] = []
|
42
86
|
# Use this to log the activity, but only if logging is active
|
43
|
-
self.logger =
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
def parse(self) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
|
50
|
-
if self.logger["log_level"] == "none":
|
87
|
+
self.logger = LoggerConfig(log_level="info" if logging else None)
|
88
|
+
|
89
|
+
def parse(
|
90
|
+
self,
|
91
|
+
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
92
|
+
if self.logger.log_level == "none":
|
51
93
|
return self.parse_json()
|
52
94
|
else:
|
53
|
-
return self.parse_json(), self.logger
|
95
|
+
return self.parse_json(), self.logger.log
|
54
96
|
|
55
97
|
def parse_json(
|
56
98
|
self,
|
57
|
-
) ->
|
99
|
+
) -> JSONReturnType:
|
58
100
|
char = self.get_char_at()
|
101
|
+
# This parser will ignore any basic element (string or number) that is not inside an array or object
|
102
|
+
is_in_context = len(self.context) > 0
|
59
103
|
# False means that we are at the end of the string provided, which is the base case for recursion
|
60
104
|
if char is False:
|
61
105
|
return ""
|
@@ -78,10 +122,10 @@ class JSONParser:
|
|
78
122
|
)
|
79
123
|
return ""
|
80
124
|
# <string> starts with a quote
|
81
|
-
elif char in ['"', "'", "“"] or char.isalpha():
|
125
|
+
elif is_in_context and (char in ['"', "'", "“"] or char.isalpha()):
|
82
126
|
return self.parse_string()
|
83
127
|
# <number> starts with [0-9] or minus
|
84
|
-
elif char.isdigit() or char == "-" or char == ".":
|
128
|
+
elif is_in_context and (char.isdigit() or char == "-" or char == "."):
|
85
129
|
return self.parse_number()
|
86
130
|
# If everything else fails, we just ignore and move on
|
87
131
|
else:
|
@@ -225,7 +269,7 @@ class JSONParser:
|
|
225
269
|
self.reset_context()
|
226
270
|
return arr
|
227
271
|
|
228
|
-
def parse_string(self) -> str:
|
272
|
+
def parse_string(self) -> Union[str, JSONReturnType]:
|
229
273
|
# <string> is a string of valid characters enclosed in quotes
|
230
274
|
# i.e. { name: "John" }
|
231
275
|
# Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
|
@@ -324,7 +368,7 @@ class JSONParser:
|
|
324
368
|
string_acc = string_acc[:-1]
|
325
369
|
if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
|
326
370
|
escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
|
327
|
-
string_acc += escape_seqs.get(char, char)
|
371
|
+
string_acc += escape_seqs.get(char, char) or char
|
328
372
|
self.index += 1
|
329
373
|
char = self.get_char_at()
|
330
374
|
# ChatGPT sometimes forgets to quote stuff in html tags or markdown, so we do this whole thing here
|
@@ -362,7 +406,29 @@ class JSONParser:
|
|
362
406
|
break
|
363
407
|
i += 1
|
364
408
|
next_c = self.get_char_at(i)
|
365
|
-
if
|
409
|
+
# If we stopped for a comma in object_value context, let's check if we find a "} at the end of the string
|
410
|
+
if next_c == "," and self.get_context() == "object_value":
|
411
|
+
i += 1
|
412
|
+
next_c = self.get_char_at(i)
|
413
|
+
while next_c and next_c != rstring_delimiter:
|
414
|
+
i += 1
|
415
|
+
next_c = self.get_char_at(i)
|
416
|
+
# Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
|
417
|
+
i += 1
|
418
|
+
next_c = self.get_char_at(i)
|
419
|
+
while next_c and next_c.isspace():
|
420
|
+
i += 1
|
421
|
+
next_c = self.get_char_at(i)
|
422
|
+
if next_c == "}":
|
423
|
+
# OK this is valid then
|
424
|
+
self.log(
|
425
|
+
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
|
426
|
+
"info",
|
427
|
+
)
|
428
|
+
string_acc += char
|
429
|
+
self.index += 1
|
430
|
+
char = self.get_char_at()
|
431
|
+
elif next_c == rstring_delimiter:
|
366
432
|
if self.get_context() == "object_value":
|
367
433
|
# But this might not be it! This could be just a missing comma
|
368
434
|
# We found a delimiter and we need to check if this is a key
|
@@ -418,7 +484,7 @@ class JSONParser:
|
|
418
484
|
|
419
485
|
return string_acc.rstrip()
|
420
486
|
|
421
|
-
def parse_number(self) -> Union[float, int, str]:
|
487
|
+
def parse_number(self) -> Union[float, int, str, JSONReturnType]:
|
422
488
|
# <number> is a valid real number expressed in one of a number of given formats
|
423
489
|
number_str = ""
|
424
490
|
number_chars = set("0123456789-.eE/,")
|
@@ -451,8 +517,7 @@ class JSONParser:
|
|
451
517
|
def parse_boolean_or_null(self) -> Union[bool, str, None]:
|
452
518
|
# <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
|
453
519
|
starting_index = self.index
|
454
|
-
|
455
|
-
char = self.get_char_at().lower()
|
520
|
+
char = (self.get_char_at() or "").lower()
|
456
521
|
if char == "t":
|
457
522
|
value = ("true", True)
|
458
523
|
elif char == "f":
|
@@ -460,12 +525,12 @@ class JSONParser:
|
|
460
525
|
elif char == "n":
|
461
526
|
value = ("null", None)
|
462
527
|
|
463
|
-
if
|
528
|
+
if value:
|
464
529
|
i = 0
|
465
530
|
while char and i < len(value[0]) and char == value[0][i]:
|
466
531
|
i += 1
|
467
532
|
self.index += 1
|
468
|
-
char = self.get_char_at().lower()
|
533
|
+
char = (self.get_char_at() or "").lower()
|
469
534
|
if i == len(value[0]):
|
470
535
|
return value[1]
|
471
536
|
|
@@ -513,12 +578,12 @@ class JSONParser:
|
|
513
578
|
return ""
|
514
579
|
|
515
580
|
def log(self, text: str, level: str) -> None:
|
516
|
-
if level == self.logger
|
581
|
+
if level == self.logger.log_level:
|
517
582
|
context = ""
|
518
|
-
start = max(self.index - self.logger
|
519
|
-
end = min(self.index + self.logger
|
583
|
+
start = max(self.index - self.logger.window, 0)
|
584
|
+
end = min(self.index + self.logger.window, len(self.json_str))
|
520
585
|
context = self.json_str[start:end]
|
521
|
-
self.logger
|
586
|
+
self.logger.log.append(
|
522
587
|
{
|
523
588
|
"text": text,
|
524
589
|
"context": context,
|
@@ -528,11 +593,11 @@ class JSONParser:
|
|
528
593
|
|
529
594
|
def repair_json(
|
530
595
|
json_str: str = "",
|
531
|
-
return_objects: bool = False,
|
532
|
-
skip_json_loads: bool = False,
|
533
|
-
logging: bool = False,
|
534
|
-
json_fd: TextIO = None,
|
535
|
-
) -> Union[
|
596
|
+
return_objects: Optional[bool] = False,
|
597
|
+
skip_json_loads: Optional[bool] = False,
|
598
|
+
logging: Optional[bool] = False,
|
599
|
+
json_fd: Optional[TextIO] = None,
|
600
|
+
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
536
601
|
"""
|
537
602
|
Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
|
538
603
|
It will return the fixed string by default.
|
@@ -559,7 +624,7 @@ def repair_json(
|
|
559
624
|
|
560
625
|
def loads(
|
561
626
|
json_str: str, skip_json_loads: bool = False, logging: bool = False
|
562
|
-
) -> Union[
|
627
|
+
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
563
628
|
"""
|
564
629
|
This function works like `json.loads()` except that it will fix your JSON in the process.
|
565
630
|
It is a wrapper around the `repair_json()` function with `return_objects=True`.
|
@@ -574,7 +639,7 @@ def loads(
|
|
574
639
|
|
575
640
|
def load(
|
576
641
|
fd: TextIO, skip_json_loads: bool = False, logging: bool = False
|
577
|
-
) -> Union[
|
642
|
+
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
578
643
|
"""
|
579
644
|
This function works like `json.load()` except that it will fix your JSON in the process.
|
580
645
|
It is a wrapper around the `repair_json()` function with `json_fd=fd` and `return_objects=True`.
|
@@ -584,7 +649,7 @@ def load(
|
|
584
649
|
|
585
650
|
def from_file(
|
586
651
|
filename: str, skip_json_loads: bool = False, logging: bool = False
|
587
|
-
) -> Union[
|
652
|
+
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
588
653
|
"""
|
589
654
|
This function is a wrapper around `load()` so you can pass the filename as string
|
590
655
|
"""
|
@@ -593,31 +658,3 @@ def from_file(
|
|
593
658
|
fd.close()
|
594
659
|
|
595
660
|
return jsonobj
|
596
|
-
|
597
|
-
|
598
|
-
class StringFileWrapper:
|
599
|
-
# This is a trick to simplify the code above, transform the filedescriptor handling into an array handling
|
600
|
-
def __init__(self, fd: TextIO) -> None:
|
601
|
-
self.fd = fd
|
602
|
-
self.length = None
|
603
|
-
|
604
|
-
def __getitem__(self, index: int) -> Any:
|
605
|
-
if isinstance(index, slice):
|
606
|
-
self.fd.seek(index.start)
|
607
|
-
value = self.fd.read(index.stop - index.start)
|
608
|
-
self.fd.seek(index.start)
|
609
|
-
return value
|
610
|
-
else:
|
611
|
-
self.fd.seek(index)
|
612
|
-
return self.fd.read(1)
|
613
|
-
|
614
|
-
def __len__(self) -> int:
|
615
|
-
if not self.length:
|
616
|
-
current_position = self.fd.tell()
|
617
|
-
self.fd.seek(0, os.SEEK_END)
|
618
|
-
self.length = self.fd.tell()
|
619
|
-
self.fd.seek(current_position)
|
620
|
-
return self.length
|
621
|
-
|
622
|
-
def __setitem__(self):
|
623
|
-
raise Exception("This is read-only!")
|
@@ -0,0 +1,7 @@
|
|
1
|
+
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
+
json_repair/json_repair.py,sha256=r3Vx2rhFwqL201vXB5hgiJR2zLll_Rx4cn1_xKnLSTo,27501
|
3
|
+
json_repair-0.22.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
4
|
+
json_repair-0.22.0.dist-info/METADATA,sha256=D8wdIzgpU3n8Wzwv9_qCsiNsi5410vq7ouOR7l9zke0,7333
|
5
|
+
json_repair-0.22.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
6
|
+
json_repair-0.22.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
7
|
+
json_repair-0.22.0.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
-
json_repair/json_repair.py,sha256=evtrrG5xGfWBa1tSTW07u03PXP3bGoKsN7A_8WcsN1s,25528
|
3
|
-
json_repair-0.20.1.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
4
|
-
json_repair-0.20.1.dist-info/METADATA,sha256=dFTIO7S7G_bZDgNgWHD7Ey7B5qB2Q_9CXHXGycmldsU,7333
|
5
|
-
json_repair-0.20.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
6
|
-
json_repair-0.20.1.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
7
|
-
json_repair-0.20.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|