json-repair 0.20.1__py3-none-any.whl → 0.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/json_repair.py +97 -62
- {json_repair-0.20.1.dist-info → json_repair-0.21.0.dist-info}/METADATA +1 -1
- json_repair-0.21.0.dist-info/RECORD +7 -0
- json_repair-0.20.1.dist-info/RECORD +0 -7
- {json_repair-0.20.1.dist-info → json_repair-0.21.0.dist-info}/LICENSE +0 -0
- {json_repair-0.20.1.dist-info → json_repair-0.21.0.dist-info}/WHEEL +0 -0
- {json_repair-0.20.1.dist-info → json_repair-0.21.0.dist-info}/top_level.txt +0 -0
json_repair/json_repair.py
CHANGED
@@ -24,11 +24,55 @@ All supported use cases are in the unit tests
|
|
24
24
|
|
25
25
|
import os
|
26
26
|
import json
|
27
|
-
from typing import Any, Dict, List, Union, TextIO
|
27
|
+
from typing import Any, Dict, List, Optional, Union, TextIO, Tuple
|
28
|
+
|
29
|
+
|
30
|
+
class StringFileWrapper:
|
31
|
+
# This is a trick to simplify the code, transform the filedescriptor handling into a string handling
|
32
|
+
def __init__(self, fd: TextIO) -> None:
|
33
|
+
self.fd = fd
|
34
|
+
self.length: int = 0
|
35
|
+
|
36
|
+
def __getitem__(self, index: int) -> str:
|
37
|
+
if isinstance(index, slice):
|
38
|
+
self.fd.seek(index.start)
|
39
|
+
value = self.fd.read(index.stop - index.start)
|
40
|
+
self.fd.seek(index.start)
|
41
|
+
return value
|
42
|
+
else:
|
43
|
+
self.fd.seek(index)
|
44
|
+
return self.fd.read(1)
|
45
|
+
|
46
|
+
def __len__(self) -> int:
|
47
|
+
if self.length < 1:
|
48
|
+
current_position = self.fd.tell()
|
49
|
+
self.fd.seek(0, os.SEEK_END)
|
50
|
+
self.length = self.fd.tell()
|
51
|
+
self.fd.seek(current_position)
|
52
|
+
return self.length
|
53
|
+
|
54
|
+
def __setitem__(self) -> None:
|
55
|
+
raise Exception("This is read-only!")
|
56
|
+
|
57
|
+
|
58
|
+
class LoggerConfig:
|
59
|
+
# This is a type class to simplify the declaration
|
60
|
+
def __init__(self, log_level: Optional[str]):
|
61
|
+
self.log: List[Dict[str, str]] = []
|
62
|
+
self.window: int = 10
|
63
|
+
self.log_level: str = log_level if log_level else "none"
|
64
|
+
|
65
|
+
|
66
|
+
JSONReturnType = Union[Dict[str, Any], List[Any], str, float, int, bool, None]
|
28
67
|
|
29
68
|
|
30
69
|
class JSONParser:
|
31
|
-
def __init__(
|
70
|
+
def __init__(
|
71
|
+
self,
|
72
|
+
json_str: Union[str, StringFileWrapper],
|
73
|
+
json_fd: Optional[TextIO],
|
74
|
+
logging: Optional[bool],
|
75
|
+
) -> None:
|
32
76
|
# The string to parse
|
33
77
|
self.json_str = json_str
|
34
78
|
# Alternatively, the file description with a json file in it
|
@@ -36,25 +80,23 @@ class JSONParser:
|
|
36
80
|
# This is a trick we do to treat the file wrapper as an array
|
37
81
|
self.json_str = StringFileWrapper(json_fd)
|
38
82
|
# Index is our iterator that will keep track of which character we are looking at right now
|
39
|
-
self.index = 0
|
83
|
+
self.index: int = 0
|
40
84
|
# This is used in the object member parsing to manage the special cases of missing quotes in key or value
|
41
|
-
self.context = []
|
85
|
+
self.context: list[str] = []
|
42
86
|
# Use this to log the activity, but only if logging is active
|
43
|
-
self.logger =
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
def parse(self) -> Union[Dict[str, Any], List[Any], str, float, int, bool, None]:
|
50
|
-
if self.logger["log_level"] == "none":
|
87
|
+
self.logger = LoggerConfig(log_level="info" if logging else None)
|
88
|
+
|
89
|
+
def parse(
|
90
|
+
self,
|
91
|
+
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
92
|
+
if self.logger.log_level == "none":
|
51
93
|
return self.parse_json()
|
52
94
|
else:
|
53
|
-
return self.parse_json(), self.logger
|
95
|
+
return self.parse_json(), self.logger.log
|
54
96
|
|
55
97
|
def parse_json(
|
56
98
|
self,
|
57
|
-
) ->
|
99
|
+
) -> JSONReturnType:
|
58
100
|
char = self.get_char_at()
|
59
101
|
# False means that we are at the end of the string provided, is the base case for recursion
|
60
102
|
if char is False:
|
@@ -225,7 +267,7 @@ class JSONParser:
|
|
225
267
|
self.reset_context()
|
226
268
|
return arr
|
227
269
|
|
228
|
-
def parse_string(self) -> str:
|
270
|
+
def parse_string(self) -> Union[str, JSONReturnType]:
|
229
271
|
# <string> is a string of valid characters enclosed in quotes
|
230
272
|
# i.e. { name: "John" }
|
231
273
|
# Somehow all weird cases in an invalid JSON happen to be resolved in this function, so be careful here
|
@@ -324,7 +366,7 @@ class JSONParser:
|
|
324
366
|
string_acc = string_acc[:-1]
|
325
367
|
if char in [rstring_delimiter, "t", "n", "r", "b", "\\"]:
|
326
368
|
escape_seqs = {"t": "\t", "n": "\n", "r": "\r", "b": "\b"}
|
327
|
-
string_acc += escape_seqs.get(char, char)
|
369
|
+
string_acc += escape_seqs.get(char, char) or char
|
328
370
|
self.index += 1
|
329
371
|
char = self.get_char_at()
|
330
372
|
# ChatGPT sometimes forget to quote stuff in html tags or markdown, so we do this whole thing here
|
@@ -362,7 +404,29 @@ class JSONParser:
|
|
362
404
|
break
|
363
405
|
i += 1
|
364
406
|
next_c = self.get_char_at(i)
|
365
|
-
if
|
407
|
+
# If we stopped for a comma in object_value context, let's check if find a "} at the end of the string
|
408
|
+
if next_c == "," and self.get_context() == "object_value":
|
409
|
+
i += 1
|
410
|
+
next_c = self.get_char_at(i)
|
411
|
+
while next_c and next_c != rstring_delimiter:
|
412
|
+
i += 1
|
413
|
+
next_c = self.get_char_at(i)
|
414
|
+
# Ok now I found a delimiter, let's skip whitespaces and see if next we find a }
|
415
|
+
i += 1
|
416
|
+
next_c = self.get_char_at(i)
|
417
|
+
while next_c and next_c.isspace():
|
418
|
+
i += 1
|
419
|
+
next_c = self.get_char_at(i)
|
420
|
+
if next_c == "}":
|
421
|
+
# OK this is valid then
|
422
|
+
self.log(
|
423
|
+
"While parsing a string, we a misplaced quote that would have closed the string but has a different meaning here since this is the last element of the object, ignoring it",
|
424
|
+
"info",
|
425
|
+
)
|
426
|
+
string_acc += char
|
427
|
+
self.index += 1
|
428
|
+
char = self.get_char_at()
|
429
|
+
elif next_c == rstring_delimiter:
|
366
430
|
if self.get_context() == "object_value":
|
367
431
|
# But this might not be it! This could be just a missing comma
|
368
432
|
# We found a delimiter and we need to check if this is a key
|
@@ -418,7 +482,7 @@ class JSONParser:
|
|
418
482
|
|
419
483
|
return string_acc.rstrip()
|
420
484
|
|
421
|
-
def parse_number(self) -> Union[float, int, str]:
|
485
|
+
def parse_number(self) -> Union[float, int, str, JSONReturnType]:
|
422
486
|
# <number> is a valid real number expressed in one of a number of given formats
|
423
487
|
number_str = ""
|
424
488
|
number_chars = set("0123456789-.eE/,")
|
@@ -451,8 +515,7 @@ class JSONParser:
|
|
451
515
|
def parse_boolean_or_null(self) -> Union[bool, str, None]:
|
452
516
|
# <boolean> is one of the literal strings 'true', 'false', or 'null' (unquoted)
|
453
517
|
starting_index = self.index
|
454
|
-
|
455
|
-
char = self.get_char_at().lower()
|
518
|
+
char = (self.get_char_at() or "").lower()
|
456
519
|
if char == "t":
|
457
520
|
value = ("true", True)
|
458
521
|
elif char == "f":
|
@@ -460,12 +523,12 @@ class JSONParser:
|
|
460
523
|
elif char == "n":
|
461
524
|
value = ("null", None)
|
462
525
|
|
463
|
-
if
|
526
|
+
if value:
|
464
527
|
i = 0
|
465
528
|
while char and i < len(value[0]) and char == value[0][i]:
|
466
529
|
i += 1
|
467
530
|
self.index += 1
|
468
|
-
char = self.get_char_at().lower()
|
531
|
+
char = (self.get_char_at() or "").lower()
|
469
532
|
if i == len(value[0]):
|
470
533
|
return value[1]
|
471
534
|
|
@@ -513,12 +576,12 @@ class JSONParser:
|
|
513
576
|
return ""
|
514
577
|
|
515
578
|
def log(self, text: str, level: str) -> None:
|
516
|
-
if level == self.logger
|
579
|
+
if level == self.logger.log_level:
|
517
580
|
context = ""
|
518
|
-
start = max(self.index - self.logger
|
519
|
-
end = min(self.index + self.logger
|
581
|
+
start = max(self.index - self.logger.window, 0)
|
582
|
+
end = min(self.index + self.logger.window, len(self.json_str))
|
520
583
|
context = self.json_str[start:end]
|
521
|
-
self.logger
|
584
|
+
self.logger.log.append(
|
522
585
|
{
|
523
586
|
"text": text,
|
524
587
|
"context": context,
|
@@ -528,11 +591,11 @@ class JSONParser:
|
|
528
591
|
|
529
592
|
def repair_json(
|
530
593
|
json_str: str = "",
|
531
|
-
return_objects: bool = False,
|
532
|
-
skip_json_loads: bool = False,
|
533
|
-
logging: bool = False,
|
534
|
-
json_fd: TextIO = None,
|
535
|
-
) -> Union[
|
594
|
+
return_objects: Optional[bool] = False,
|
595
|
+
skip_json_loads: Optional[bool] = False,
|
596
|
+
logging: Optional[bool] = False,
|
597
|
+
json_fd: Optional[TextIO] = None,
|
598
|
+
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
536
599
|
"""
|
537
600
|
Given a json formatted string, it will try to decode it and, if it fails, it will try to fix it.
|
538
601
|
It will return the fixed string by default.
|
@@ -559,7 +622,7 @@ def repair_json(
|
|
559
622
|
|
560
623
|
def loads(
|
561
624
|
json_str: str, skip_json_loads: bool = False, logging: bool = False
|
562
|
-
) -> Union[
|
625
|
+
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
563
626
|
"""
|
564
627
|
This function works like `json.loads()` except that it will fix your JSON in the process.
|
565
628
|
It is a wrapper around the `repair_json()` function with `return_objects=True`.
|
@@ -574,7 +637,7 @@ def loads(
|
|
574
637
|
|
575
638
|
def load(
|
576
639
|
fd: TextIO, skip_json_loads: bool = False, logging: bool = False
|
577
|
-
) -> Union[
|
640
|
+
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
578
641
|
"""
|
579
642
|
This function works like `json.load()` except that it will fix your JSON in the process.
|
580
643
|
It is a wrapper around the `repair_json()` function with `json_fd=fd` and `return_objects=True`.
|
@@ -584,7 +647,7 @@ def load(
|
|
584
647
|
|
585
648
|
def from_file(
|
586
649
|
filename: str, skip_json_loads: bool = False, logging: bool = False
|
587
|
-
) -> Union[
|
650
|
+
) -> Union[JSONReturnType, Tuple[JSONReturnType, List[Dict[str, str]]]]:
|
588
651
|
"""
|
589
652
|
This function is a wrapper around `load()` so you can pass the filename as string
|
590
653
|
"""
|
@@ -593,31 +656,3 @@ def from_file(
|
|
593
656
|
fd.close()
|
594
657
|
|
595
658
|
return jsonobj
|
596
|
-
|
597
|
-
|
598
|
-
class StringFileWrapper:
|
599
|
-
# This is a trick to simplify the code above, transform the filedescriptor handling into an array handling
|
600
|
-
def __init__(self, fd: TextIO) -> None:
|
601
|
-
self.fd = fd
|
602
|
-
self.length = None
|
603
|
-
|
604
|
-
def __getitem__(self, index: int) -> Any:
|
605
|
-
if isinstance(index, slice):
|
606
|
-
self.fd.seek(index.start)
|
607
|
-
value = self.fd.read(index.stop - index.start)
|
608
|
-
self.fd.seek(index.start)
|
609
|
-
return value
|
610
|
-
else:
|
611
|
-
self.fd.seek(index)
|
612
|
-
return self.fd.read(1)
|
613
|
-
|
614
|
-
def __len__(self) -> int:
|
615
|
-
if not self.length:
|
616
|
-
current_position = self.fd.tell()
|
617
|
-
self.fd.seek(0, os.SEEK_END)
|
618
|
-
self.length = self.fd.tell()
|
619
|
-
self.fd.seek(current_position)
|
620
|
-
return self.length
|
621
|
-
|
622
|
-
def __setitem__(self):
|
623
|
-
raise Exception("This is read-only!")
|
@@ -0,0 +1,7 @@
|
|
1
|
+
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
+
json_repair/json_repair.py,sha256=ry94U3QoJwVgyG1qeQNEb8Qt8NtCLpCGR41GBA7tozY,27320
|
3
|
+
json_repair-0.21.0.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
4
|
+
json_repair-0.21.0.dist-info/METADATA,sha256=obBsHuNN7Ph5zX77VHmER2O9A61F3MXGBreEowdr-so,7333
|
5
|
+
json_repair-0.21.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
6
|
+
json_repair-0.21.0.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
7
|
+
json_repair-0.21.0.dist-info/RECORD,,
|
@@ -1,7 +0,0 @@
|
|
1
|
-
json_repair/__init__.py,sha256=IIzSm1DsCRrr8seF3UeMZXwxcq-tE3j-8d1WBxvEJvE,178
|
2
|
-
json_repair/json_repair.py,sha256=evtrrG5xGfWBa1tSTW07u03PXP3bGoKsN7A_8WcsN1s,25528
|
3
|
-
json_repair-0.20.1.dist-info/LICENSE,sha256=wrjQo8MhNrNCicXtMe3MHmS-fx8AmQk1ue8AQwiiFV8,1076
|
4
|
-
json_repair-0.20.1.dist-info/METADATA,sha256=dFTIO7S7G_bZDgNgWHD7Ey7B5qB2Q_9CXHXGycmldsU,7333
|
5
|
-
json_repair-0.20.1.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
6
|
-
json_repair-0.20.1.dist-info/top_level.txt,sha256=7-VZwZN2CgB_n0NlSLk-rEUFh8ug21lESbsblOYuZqw,12
|
7
|
-
json_repair-0.20.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|