json-repair 0.55.2__py3-none-any.whl → 0.56.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- json_repair/json_parser.py +59 -19
- json_repair/json_repair.py +91 -12
- json_repair/parse_array.py +76 -15
- json_repair/parse_object.py +107 -10
- json_repair/schema_repair.py +508 -0
- json_repair/utils/constants.py +11 -0
- {json_repair-0.55.2.dist-info → json_repair-0.56.0.dist-info}/METADATA +67 -3
- {json_repair-0.55.2.dist-info → json_repair-0.56.0.dist-info}/RECORD +12 -11
- {json_repair-0.55.2.dist-info → json_repair-0.56.0.dist-info}/WHEEL +0 -0
- {json_repair-0.55.2.dist-info → json_repair-0.56.0.dist-info}/entry_points.txt +0 -0
- {json_repair-0.55.2.dist-info → json_repair-0.56.0.dist-info}/licenses/LICENSE +0 -0
- {json_repair-0.55.2.dist-info → json_repair-0.56.0.dist-info}/top_level.txt +0 -0
json_repair/parse_object.py
CHANGED
|
@@ -1,16 +1,63 @@
|
|
|
1
|
-
|
|
1
|
+
import re
|
|
2
|
+
from typing import TYPE_CHECKING, Any
|
|
2
3
|
|
|
3
|
-
from .utils.constants import STRING_DELIMITERS, JSONReturnType
|
|
4
|
+
from .utils.constants import MISSING_VALUE, STRING_DELIMITERS, JSONReturnType
|
|
4
5
|
from .utils.json_context import ContextValues
|
|
5
6
|
|
|
6
7
|
if TYPE_CHECKING:
|
|
7
8
|
from .json_parser import JSONParser
|
|
9
|
+
from .schema_repair import SchemaRepairer
|
|
8
10
|
|
|
9
11
|
|
|
10
|
-
def parse_object(
|
|
12
|
+
def parse_object(
|
|
13
|
+
self: "JSONParser",
|
|
14
|
+
schema: dict[str, Any] | bool | None = None,
|
|
15
|
+
path: str = "$",
|
|
16
|
+
) -> JSONReturnType:
|
|
11
17
|
# <object> ::= '{' [ <member> *(', ' <member>) ] '}' ; A sequence of 'members'
|
|
12
18
|
obj: dict[str, JSONReturnType] = {}
|
|
13
19
|
start_index = self.index
|
|
20
|
+
|
|
21
|
+
# Only activate schema-guided parsing if a repairer is available and schema looks object-like.
|
|
22
|
+
schema_repairer: SchemaRepairer | None = None
|
|
23
|
+
properties: dict[str, Any] = {}
|
|
24
|
+
pattern_properties: dict[str, Any] = {}
|
|
25
|
+
additional_properties: object | None = None
|
|
26
|
+
required: set[str] = set()
|
|
27
|
+
|
|
28
|
+
if schema is not None and schema is not True:
|
|
29
|
+
repairer = self.schema_repairer
|
|
30
|
+
if repairer is not None:
|
|
31
|
+
schema = repairer.resolve_schema(schema)
|
|
32
|
+
if schema is False:
|
|
33
|
+
raise ValueError("Schema does not allow any values.")
|
|
34
|
+
if schema is not True and repairer.is_object_schema(schema):
|
|
35
|
+
schema_repairer = repairer
|
|
36
|
+
properties = schema.get("properties", {})
|
|
37
|
+
if not isinstance(properties, dict):
|
|
38
|
+
properties = {}
|
|
39
|
+
pattern_properties = schema.get("patternProperties", {})
|
|
40
|
+
if not isinstance(pattern_properties, dict):
|
|
41
|
+
pattern_properties = {}
|
|
42
|
+
additional_properties = schema.get("additionalProperties", None)
|
|
43
|
+
required = set(schema.get("required", []))
|
|
44
|
+
|
|
45
|
+
def finalize_obj() -> dict[str, JSONReturnType]:
|
|
46
|
+
if schema_repairer is None:
|
|
47
|
+
return obj
|
|
48
|
+
schema_repairer_local = schema_repairer
|
|
49
|
+
# Enforce required fields and insert defaults for optional properties.
|
|
50
|
+
missing_required = [key for key in required if key not in obj]
|
|
51
|
+
if missing_required:
|
|
52
|
+
raise ValueError(f"Missing required properties at {path}: {', '.join(missing_required)}")
|
|
53
|
+
for key, prop_schema in properties.items():
|
|
54
|
+
if key in obj or key in required:
|
|
55
|
+
continue
|
|
56
|
+
if isinstance(prop_schema, dict) and "default" in prop_schema:
|
|
57
|
+
obj[key] = schema_repairer_local._copy_json_value(prop_schema["default"], f"{path}.{key}", "default")
|
|
58
|
+
schema_repairer_local._log("Inserted default value for missing property", f"{path}.{key}")
|
|
59
|
+
return obj
|
|
60
|
+
|
|
14
61
|
# Stop when you either find the closing parentheses or you have iterated over the entire string
|
|
15
62
|
while (self.get_char_at() or "}") != "}":
|
|
16
63
|
# This is what we expect to find:
|
|
@@ -145,21 +192,71 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
|
|
|
145
192
|
self.skip_whitespaces()
|
|
146
193
|
# Corner case, a lone comma
|
|
147
194
|
value: JSONReturnType = ""
|
|
195
|
+
prop_schema: dict[str, Any] | bool | None = None
|
|
196
|
+
extra_schemas: list[dict[str, Any] | bool | None] = []
|
|
197
|
+
drop_property = False
|
|
198
|
+
|
|
199
|
+
if schema_repairer is not None:
|
|
200
|
+
if key in properties:
|
|
201
|
+
schema_value = properties[key]
|
|
202
|
+
# Schema entries must be dict/bool; reject invalid metadata early.
|
|
203
|
+
if schema_value is not None and not isinstance(schema_value, (dict, bool)):
|
|
204
|
+
raise ValueError("Schema must be an object.")
|
|
205
|
+
prop_schema = schema_value
|
|
206
|
+
else:
|
|
207
|
+
matched = [
|
|
208
|
+
schema_value for pattern, schema_value in pattern_properties.items() if re.search(pattern, key)
|
|
209
|
+
]
|
|
210
|
+
if matched:
|
|
211
|
+
# patternProperties can stack: apply the first schema, then any extras in order.
|
|
212
|
+
primary_schema = matched[0]
|
|
213
|
+
if primary_schema is not None and not isinstance(primary_schema, (dict, bool)):
|
|
214
|
+
raise ValueError("Schema must be an object.")
|
|
215
|
+
prop_schema = primary_schema
|
|
216
|
+
for extra_schema in matched[1:]:
|
|
217
|
+
if extra_schema is not None and not isinstance(extra_schema, (dict, bool)):
|
|
218
|
+
raise ValueError("Schema must be an object.")
|
|
219
|
+
extra_schemas.append(extra_schema)
|
|
220
|
+
else:
|
|
221
|
+
if additional_properties is False:
|
|
222
|
+
# Schema forbids unknown keys: parse but drop this property.
|
|
223
|
+
drop_property = True
|
|
224
|
+
elif isinstance(additional_properties, dict):
|
|
225
|
+
prop_schema = additional_properties
|
|
226
|
+
else:
|
|
227
|
+
prop_schema = True
|
|
228
|
+
|
|
148
229
|
char = self.get_char_at()
|
|
230
|
+
key_path = f"{path}.{key}"
|
|
149
231
|
if char in [",", "}"]:
|
|
150
232
|
self.log(
|
|
151
233
|
f"While parsing an object value we found a stray {char}, ignoring it",
|
|
152
234
|
)
|
|
235
|
+
if schema_repairer is not None:
|
|
236
|
+
# Missing value: fill according to schema (defaults/const/enum/type).
|
|
237
|
+
value = schema_repairer.repair_value(MISSING_VALUE, prop_schema, key_path)
|
|
153
238
|
else:
|
|
154
|
-
|
|
155
|
-
|
|
239
|
+
# Schema-aware parsing guides repairs inside nested values.
|
|
240
|
+
value = self.parse_json(prop_schema, key_path) if schema_repairer is not None else self.parse_json()
|
|
241
|
+
|
|
242
|
+
if schema_repairer is not None and extra_schemas:
|
|
243
|
+
# Apply any additional pattern schemas in order.
|
|
244
|
+
for extra_schema in extra_schemas:
|
|
245
|
+
value = schema_repairer.repair_value(value, extra_schema, key_path)
|
|
246
|
+
|
|
247
|
+
if schema_repairer is None and value == "" and self.strict and self.get_char_at(-1) not in STRING_DELIMITERS:
|
|
156
248
|
self.log(
|
|
157
249
|
"Parsed value is empty in strict mode while parsing object, raising an error",
|
|
158
250
|
)
|
|
159
251
|
raise ValueError("Parsed value is empty in strict mode while parsing object.")
|
|
252
|
+
|
|
160
253
|
# Reset context since our job is done
|
|
161
254
|
self.context.reset()
|
|
162
|
-
|
|
255
|
+
if schema_repairer is None or not drop_property:
|
|
256
|
+
obj[key] = value
|
|
257
|
+
else:
|
|
258
|
+
# Keep parsing but omit forbidden properties to respect the schema.
|
|
259
|
+
schema_repairer._log("Dropped extra property not covered by schema", key_path)
|
|
163
260
|
|
|
164
261
|
if self.get_char_at() in [",", "'", '"']:
|
|
165
262
|
self.index += 1
|
|
@@ -204,17 +301,17 @@ def parse_object(self: "JSONParser") -> JSONReturnType:
|
|
|
204
301
|
|
|
205
302
|
self.skip_whitespaces()
|
|
206
303
|
if self.get_char_at() != ",":
|
|
207
|
-
return
|
|
304
|
+
return finalize_obj()
|
|
208
305
|
self.index += 1
|
|
209
306
|
self.skip_whitespaces()
|
|
210
307
|
if self.get_char_at() not in STRING_DELIMITERS:
|
|
211
|
-
return
|
|
308
|
+
return finalize_obj()
|
|
212
309
|
if not self.strict:
|
|
213
310
|
self.log(
|
|
214
311
|
"Found a comma and string delimiter after object closing brace, checking for additional key-value pairs",
|
|
215
312
|
)
|
|
216
|
-
additional_obj = self.parse_object()
|
|
313
|
+
additional_obj = self.parse_object(schema, path)
|
|
217
314
|
if isinstance(additional_obj, dict):
|
|
218
315
|
obj.update(additional_obj)
|
|
219
316
|
|
|
220
|
-
return
|
|
317
|
+
return finalize_obj()
|
|
@@ -0,0 +1,508 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import copy
|
|
4
|
+
import importlib
|
|
5
|
+
import re
|
|
6
|
+
from types import ModuleType
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
from .utils.constants import MISSING_VALUE, JSONReturnType, MissingValueType
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _require_jsonschema() -> Any:
|
|
13
|
+
try:
|
|
14
|
+
return importlib.import_module("jsonschema")
|
|
15
|
+
except ImportError as exc: # pragma: no cover - optional dependency
|
|
16
|
+
raise ValueError("jsonschema is required when using schema-aware repair.") from exc
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _require_pydantic() -> Any:
|
|
20
|
+
try:
|
|
21
|
+
return importlib.import_module("pydantic")
|
|
22
|
+
except ImportError as exc: # pragma: no cover - optional dependency
|
|
23
|
+
raise ValueError("pydantic is required when using schema models.") from exc
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def load_schema_model(path: str) -> type[Any]:
    """Load a class from a ``module:ClassName`` reference.

    Args:
        path: Import path and class name separated by the first ``:``.

    Returns:
        The class object found in the module's namespace.

    Raises:
        ValueError: if ``path`` has no ``:`` or the name is absent from the
            module namespace or is not a class.
    """
    if ":" not in path:
        raise ValueError("Schema model must be in the form 'module:ClassName'.")
    module_name, _, class_name = path.partition(":")
    module: ModuleType = importlib.import_module(module_name)
    # Look only in the module's own namespace dict, as the lookup is by plain name.
    candidate: object | None = vars(module).get(class_name)
    if isinstance(candidate, type):
        return candidate
    raise ValueError(f"Schema model '{class_name}' not found in module '{module_name}'.")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def normalize_missing_values(value: object) -> JSONReturnType:
    """Return a JSON-compatible copy of ``value`` with missing markers replaced by ``""``.

    Dicts and lists are rebuilt recursively; scalars are returned unchanged.

    Raises:
        ValueError: if a dict key is not a string or a value is not one of
            the accepted JSON types.
    """
    if value is MISSING_VALUE or isinstance(value, MissingValueType):
        return ""
    if value is None or isinstance(value, (str, int, float, bool)):
        return value
    if isinstance(value, list):
        return [normalize_missing_values(element) for element in value]
    if isinstance(value, dict):
        cleaned: dict[str, JSONReturnType] = {}
        for name, element in value.items():
            if not isinstance(name, str):
                raise ValueError("Object keys must be strings.")
            cleaned[name] = normalize_missing_values(element)
        return cleaned
    raise ValueError("Value is not JSON compatible.")
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def schema_from_input(schema: Any) -> dict[str, Any] | bool:
    """Turn a user-supplied schema into a JSON Schema dict or boolean schema.

    Dicts and booleans are returned unchanged. An object exposing
    ``model_json_schema`` is treated as a pydantic model: its generated schema
    is returned, with a ``default`` entry added for every non-required field
    that does not already declare one.

    Raises:
        ValueError: if the input is of no supported kind, or the installed
            pydantic major version is below 2.
    """
    if isinstance(schema, (dict, bool)):
        return schema
    if not hasattr(schema, "model_json_schema"):
        raise ValueError("Schema must be a JSON Schema dict, boolean schema, or pydantic v2 model.")

    pydantic = _require_pydantic()
    version = getattr(pydantic, "VERSION", getattr(pydantic, "__version__", "0"))
    major = int(version.split(".")[0])
    if major < 2:
        raise ValueError("pydantic v2 is required for schema models.")

    schema_dict: dict[str, Any] = schema.model_json_schema()
    if not hasattr(schema, "model_fields"):
        return schema_dict

    properties = schema_dict.setdefault("properties", {})
    if not isinstance(properties, dict):
        # Replace a malformed "properties" entry with a fresh mapping.
        properties = {}
        schema_dict["properties"] = properties

    for name, field in schema.model_fields.items():
        if field.is_required():
            continue
        property_schema = properties.setdefault(name, {})
        if not isinstance(property_schema, dict):
            property_schema = {}
            properties[name] = property_schema
        if "default" in property_schema:
            continue
        # Prefer the factory when the field defines one, else the plain default.
        property_schema["default"] = (
            field.default_factory() if field.default_factory is not None else field.default
        )
    return schema_dict
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
class SchemaRepairer:
|
|
88
|
+
def __init__(self, schema: dict[str, Any] | bool, log: list[dict[str, str]] | None) -> None:
|
|
89
|
+
self.root_schema = schema
|
|
90
|
+
self.log = log
|
|
91
|
+
|
|
92
|
+
def _log(self, text: str, path: str) -> None:
|
|
93
|
+
if self.log is not None:
|
|
94
|
+
self.log.append({"text": text, "context": path})
|
|
95
|
+
|
|
96
|
+
def validate(self, value: JSONReturnType, schema: dict[str, Any] | bool) -> None:
|
|
97
|
+
schema = self.resolve_schema(schema)
|
|
98
|
+
if schema is True:
|
|
99
|
+
return
|
|
100
|
+
if schema is False:
|
|
101
|
+
raise ValueError("Schema does not allow any values.")
|
|
102
|
+
schema_for_validation = self._prepare_schema_for_validation(schema)
|
|
103
|
+
jsonschema = _require_jsonschema()
|
|
104
|
+
validator_cls = jsonschema.validators.validator_for(schema_for_validation)
|
|
105
|
+
validator = validator_cls(schema_for_validation)
|
|
106
|
+
errors = sorted(validator.iter_errors(value), key=lambda e: e.path)
|
|
107
|
+
if errors:
|
|
108
|
+
raise ValueError(errors[0].message)
|
|
109
|
+
|
|
110
|
+
def resolve_schema(self, schema: object | None) -> dict[str, Any] | bool:
|
|
111
|
+
if schema is None:
|
|
112
|
+
return True
|
|
113
|
+
if isinstance(schema, bool):
|
|
114
|
+
return schema
|
|
115
|
+
if not isinstance(schema, dict):
|
|
116
|
+
raise ValueError("Schema must be an object.")
|
|
117
|
+
schema_dict: dict[str, Any] = {}
|
|
118
|
+
for key, value in schema.items():
|
|
119
|
+
if not isinstance(key, str):
|
|
120
|
+
raise ValueError("Schema keys must be strings.")
|
|
121
|
+
schema_dict[key] = value
|
|
122
|
+
while "$ref" in schema_dict:
|
|
123
|
+
ref = schema_dict["$ref"]
|
|
124
|
+
resolved = self._resolve_ref(ref)
|
|
125
|
+
if isinstance(resolved, bool):
|
|
126
|
+
return resolved
|
|
127
|
+
schema_dict = resolved
|
|
128
|
+
return schema_dict
|
|
129
|
+
|
|
130
|
+
def is_object_schema(self, schema: dict[str, Any] | bool | None) -> bool:
|
|
131
|
+
schema = self.resolve_schema(schema)
|
|
132
|
+
if not isinstance(schema, dict):
|
|
133
|
+
return False
|
|
134
|
+
schema_type = schema.get("type")
|
|
135
|
+
if schema_type == "object":
|
|
136
|
+
return True
|
|
137
|
+
if isinstance(schema_type, list) and "object" in schema_type:
|
|
138
|
+
return True
|
|
139
|
+
return any(key in schema for key in ("properties", "patternProperties", "additionalProperties", "required"))
|
|
140
|
+
|
|
141
|
+
def is_array_schema(self, schema: dict[str, Any] | bool | None) -> bool:
|
|
142
|
+
schema = self.resolve_schema(schema)
|
|
143
|
+
if not isinstance(schema, dict):
|
|
144
|
+
return False
|
|
145
|
+
schema_type = schema.get("type")
|
|
146
|
+
if schema_type == "array":
|
|
147
|
+
return True
|
|
148
|
+
if isinstance(schema_type, list) and "array" in schema_type:
|
|
149
|
+
return True
|
|
150
|
+
return "items" in schema
|
|
151
|
+
|
|
152
|
+
def repair_value(self, value: Any, schema: dict[str, Any] | bool | None, path: str) -> JSONReturnType:
|
|
153
|
+
"""Apply schema rules to a parsed value, including unions, coercions, and defaults."""
|
|
154
|
+
schema = self.resolve_schema(schema)
|
|
155
|
+
if schema is True:
|
|
156
|
+
return normalize_missing_values(value)
|
|
157
|
+
if schema is False:
|
|
158
|
+
raise ValueError("Schema does not allow any values.")
|
|
159
|
+
if not schema:
|
|
160
|
+
return normalize_missing_values(value)
|
|
161
|
+
|
|
162
|
+
if value is MISSING_VALUE:
|
|
163
|
+
return self._fill_missing(schema, path)
|
|
164
|
+
|
|
165
|
+
if "allOf" in schema:
|
|
166
|
+
subschemas = schema["allOf"]
|
|
167
|
+
if not subschemas:
|
|
168
|
+
return normalize_missing_values(value)
|
|
169
|
+
repaired = self.repair_value(value, subschemas[0], path)
|
|
170
|
+
for subschema in subschemas[1:]:
|
|
171
|
+
repaired = self.repair_value(repaired, subschema, path)
|
|
172
|
+
return repaired
|
|
173
|
+
|
|
174
|
+
if "oneOf" in schema:
|
|
175
|
+
return self._repair_union(value, schema["oneOf"], path)
|
|
176
|
+
if "anyOf" in schema:
|
|
177
|
+
return self._repair_union(value, schema["anyOf"], path)
|
|
178
|
+
|
|
179
|
+
expected_type = schema.get("type")
|
|
180
|
+
if expected_type is None:
|
|
181
|
+
if self.is_object_schema(schema):
|
|
182
|
+
expected_type = "object"
|
|
183
|
+
elif self.is_array_schema(schema):
|
|
184
|
+
expected_type = "array"
|
|
185
|
+
|
|
186
|
+
if isinstance(expected_type, list):
|
|
187
|
+
return self._repair_type_union(value, expected_type, schema, path)
|
|
188
|
+
|
|
189
|
+
if expected_type == "object":
|
|
190
|
+
repaired = self._repair_object(value, schema, path)
|
|
191
|
+
elif expected_type == "array":
|
|
192
|
+
repaired = self._repair_array(value, schema, path)
|
|
193
|
+
elif isinstance(expected_type, str):
|
|
194
|
+
repaired = self._coerce_scalar(value, expected_type, path)
|
|
195
|
+
else:
|
|
196
|
+
repaired = normalize_missing_values(value)
|
|
197
|
+
|
|
198
|
+
return self._apply_enum_const(repaired, schema, path)
|
|
199
|
+
|
|
200
|
+
def _repair_union(self, value: Any, schemas: list[dict[str, Any] | bool], path: str) -> JSONReturnType:
|
|
201
|
+
last_error: Exception | None = None
|
|
202
|
+
for subschema in schemas:
|
|
203
|
+
try:
|
|
204
|
+
candidate = self.repair_value(copy.deepcopy(value), subschema, path)
|
|
205
|
+
self.validate(candidate, subschema)
|
|
206
|
+
return candidate
|
|
207
|
+
except ValueError as exc:
|
|
208
|
+
last_error = exc
|
|
209
|
+
if last_error:
|
|
210
|
+
raise ValueError(str(last_error)) from last_error
|
|
211
|
+
raise ValueError("No schema matched the value.")
|
|
212
|
+
|
|
213
|
+
def _repair_type_union(
|
|
214
|
+
self,
|
|
215
|
+
value: Any,
|
|
216
|
+
types: list[str],
|
|
217
|
+
schema: dict[str, Any],
|
|
218
|
+
path: str,
|
|
219
|
+
) -> JSONReturnType:
|
|
220
|
+
last_error: Exception | None = None
|
|
221
|
+
for schema_type in types:
|
|
222
|
+
try:
|
|
223
|
+
candidate = self._repair_by_type(value, schema_type, schema, path)
|
|
224
|
+
return self._apply_enum_const(candidate, schema, path)
|
|
225
|
+
except ValueError as exc:
|
|
226
|
+
last_error = exc
|
|
227
|
+
if last_error:
|
|
228
|
+
raise ValueError(str(last_error)) from last_error
|
|
229
|
+
raise ValueError("No schema type matched the value.")
|
|
230
|
+
|
|
231
|
+
def _repair_by_type(self, value: Any, schema_type: str, schema: dict[str, Any], path: str) -> JSONReturnType:
|
|
232
|
+
if schema_type == "array":
|
|
233
|
+
return self._repair_array(value, schema, path)
|
|
234
|
+
if schema_type == "object":
|
|
235
|
+
return self._repair_object(value, schema, path)
|
|
236
|
+
return self._coerce_scalar(value, schema_type, path)
|
|
237
|
+
|
|
238
|
+
def _repair_array(self, value: Any, schema: dict[str, Any], path: str) -> JSONReturnType:
|
|
239
|
+
if isinstance(value, list):
|
|
240
|
+
items: list[JSONReturnType] = value
|
|
241
|
+
else:
|
|
242
|
+
self._log("Wrapped value in array to match schema", path)
|
|
243
|
+
items = [normalize_missing_values(value)]
|
|
244
|
+
items_schema = schema.get("items")
|
|
245
|
+
if items_schema is not None:
|
|
246
|
+
if isinstance(items_schema, list):
|
|
247
|
+
repaired_items: list[JSONReturnType] = []
|
|
248
|
+
for idx, item_schema in enumerate(items_schema):
|
|
249
|
+
if idx >= len(items):
|
|
250
|
+
break
|
|
251
|
+
repaired_items.append(self.repair_value(items[idx], item_schema, f"{path}[{idx}]"))
|
|
252
|
+
additional_items = schema.get("additionalItems")
|
|
253
|
+
if len(items) > len(items_schema):
|
|
254
|
+
tail = items[len(items_schema) :]
|
|
255
|
+
if isinstance(additional_items, dict):
|
|
256
|
+
for offset, item in enumerate(tail, start=len(items_schema)):
|
|
257
|
+
repaired_items.append(self.repair_value(item, additional_items, f"{path}[{offset}]"))
|
|
258
|
+
elif additional_items is True or additional_items is None:
|
|
259
|
+
repaired_items.extend(normalize_missing_values(item) for item in tail)
|
|
260
|
+
else:
|
|
261
|
+
for offset, _item in enumerate(tail, start=len(items_schema)):
|
|
262
|
+
self._log("Dropped extra array item not covered by schema", f"{path}[{offset}]")
|
|
263
|
+
items = repaired_items
|
|
264
|
+
else:
|
|
265
|
+
items = [self.repair_value(item, items_schema, f"{path}[{idx}]") for idx, item in enumerate(items)]
|
|
266
|
+
min_items = schema.get("minItems")
|
|
267
|
+
if min_items is not None and len(items) < min_items:
|
|
268
|
+
raise ValueError(f"Array at {path} does not meet minItems.")
|
|
269
|
+
return items
|
|
270
|
+
|
|
271
|
+
def _repair_object(self, value: Any, schema: dict[str, Any], path: str) -> JSONReturnType:
|
|
272
|
+
if not isinstance(value, dict):
|
|
273
|
+
raise ValueError(f"Expected object at {path}, got {type(value).__name__}.")
|
|
274
|
+
|
|
275
|
+
properties = schema.get("properties", {})
|
|
276
|
+
if not isinstance(properties, dict):
|
|
277
|
+
properties = {}
|
|
278
|
+
required = set(schema.get("required", []))
|
|
279
|
+
pattern_properties = schema.get("patternProperties", {})
|
|
280
|
+
if not isinstance(pattern_properties, dict):
|
|
281
|
+
pattern_properties = {}
|
|
282
|
+
additional_properties = schema.get("additionalProperties")
|
|
283
|
+
|
|
284
|
+
missing_required = [key for key in required if key not in value]
|
|
285
|
+
if missing_required:
|
|
286
|
+
raise ValueError(f"Missing required properties at {path}: {', '.join(missing_required)}")
|
|
287
|
+
|
|
288
|
+
repaired: dict[str, JSONReturnType] = {}
|
|
289
|
+
|
|
290
|
+
for key, prop_schema in properties.items():
|
|
291
|
+
key_path = f"{path}.{key}"
|
|
292
|
+
if key in value:
|
|
293
|
+
repaired[key] = self.repair_value(value[key], prop_schema, key_path)
|
|
294
|
+
elif isinstance(prop_schema, dict) and "default" in prop_schema and key not in required:
|
|
295
|
+
repaired[key] = self._copy_json_value(prop_schema["default"], key_path, "default")
|
|
296
|
+
self._log("Inserted default value for missing property", key_path)
|
|
297
|
+
|
|
298
|
+
for key, raw_value in value.items():
|
|
299
|
+
if key in properties:
|
|
300
|
+
continue
|
|
301
|
+
key_path = f"{path}.{key}"
|
|
302
|
+
matched = [prop_schema for pattern, prop_schema in pattern_properties.items() if re.search(pattern, key)]
|
|
303
|
+
if matched:
|
|
304
|
+
repaired_value = self.repair_value(raw_value, matched[0], key_path)
|
|
305
|
+
for prop_schema in matched[1:]:
|
|
306
|
+
repaired_value = self.repair_value(repaired_value, prop_schema, key_path)
|
|
307
|
+
repaired[key] = repaired_value
|
|
308
|
+
continue
|
|
309
|
+
if isinstance(additional_properties, dict):
|
|
310
|
+
repaired[key] = self.repair_value(raw_value, additional_properties, key_path)
|
|
311
|
+
continue
|
|
312
|
+
if additional_properties is True or additional_properties is None:
|
|
313
|
+
repaired[key] = normalize_missing_values(raw_value)
|
|
314
|
+
continue
|
|
315
|
+
self._log("Dropped extra property not covered by schema", key_path)
|
|
316
|
+
|
|
317
|
+
min_properties = schema.get("minProperties")
|
|
318
|
+
if min_properties is not None and len(repaired) < min_properties:
|
|
319
|
+
raise ValueError(f"Object at {path} does not meet minProperties.")
|
|
320
|
+
return repaired
|
|
321
|
+
|
|
322
|
+
def _fill_missing(self, schema: dict[str, Any], path: str) -> JSONReturnType:
|
|
323
|
+
if "const" in schema:
|
|
324
|
+
# Const/enum/default have priority over type inference.
|
|
325
|
+
self._log("Filled missing value with const", path)
|
|
326
|
+
return self._copy_json_value(schema["const"], path, "const")
|
|
327
|
+
if "enum" in schema:
|
|
328
|
+
enum_values = schema["enum"]
|
|
329
|
+
if not enum_values:
|
|
330
|
+
raise ValueError(f"Enum at {path} has no values.")
|
|
331
|
+
self._log("Filled missing value with first enum value", path)
|
|
332
|
+
return self._copy_json_value(enum_values[0], path, "enum")
|
|
333
|
+
if "default" in schema:
|
|
334
|
+
self._log("Filled missing value with default", path)
|
|
335
|
+
return self._copy_json_value(schema["default"], path, "default")
|
|
336
|
+
|
|
337
|
+
expected_type = schema.get("type")
|
|
338
|
+
if isinstance(expected_type, list):
|
|
339
|
+
for schema_type in expected_type:
|
|
340
|
+
try:
|
|
341
|
+
return self._fill_missing({**schema, "type": schema_type}, path)
|
|
342
|
+
except ValueError:
|
|
343
|
+
continue
|
|
344
|
+
raise ValueError(f"Cannot infer missing value at {path}.")
|
|
345
|
+
|
|
346
|
+
if expected_type is None:
|
|
347
|
+
# Infer container types based on schema shape if type is omitted.
|
|
348
|
+
if self.is_object_schema(schema):
|
|
349
|
+
expected_type = "object"
|
|
350
|
+
elif self.is_array_schema(schema):
|
|
351
|
+
expected_type = "array"
|
|
352
|
+
|
|
353
|
+
if expected_type == "string":
|
|
354
|
+
self._log("Filled missing value with empty string", path)
|
|
355
|
+
return ""
|
|
356
|
+
if expected_type in ("integer", "number"):
|
|
357
|
+
self._log("Filled missing value with 0", path)
|
|
358
|
+
return 0
|
|
359
|
+
if expected_type == "boolean":
|
|
360
|
+
self._log("Filled missing value with false", path)
|
|
361
|
+
return False
|
|
362
|
+
if expected_type == "array":
|
|
363
|
+
min_items = schema.get("minItems")
|
|
364
|
+
if min_items:
|
|
365
|
+
raise ValueError(f"Array at {path} requires at least {min_items} items.")
|
|
366
|
+
self._log("Filled missing value with empty array", path)
|
|
367
|
+
return []
|
|
368
|
+
if expected_type == "object":
|
|
369
|
+
min_properties = schema.get("minProperties")
|
|
370
|
+
if min_properties:
|
|
371
|
+
raise ValueError(f"Object at {path} requires at least {min_properties} properties.")
|
|
372
|
+
self._log("Filled missing value with empty object", path)
|
|
373
|
+
return {}
|
|
374
|
+
if expected_type == "null":
|
|
375
|
+
self._log("Filled missing value with null", path)
|
|
376
|
+
return None
|
|
377
|
+
|
|
378
|
+
raise ValueError(f"Cannot infer missing value at {path}.")
|
|
379
|
+
|
|
380
|
+
def _coerce_scalar(self, value: Any, schema_type: str, path: str) -> JSONReturnType:
|
|
381
|
+
if schema_type == "string":
|
|
382
|
+
if isinstance(value, str):
|
|
383
|
+
return value
|
|
384
|
+
if isinstance(value, (int, float)) and not isinstance(value, bool):
|
|
385
|
+
self._log("Coerced number to string", path)
|
|
386
|
+
return str(value)
|
|
387
|
+
raise ValueError(f"Expected string at {path}.")
|
|
388
|
+
|
|
389
|
+
if schema_type == "integer":
|
|
390
|
+
if isinstance(value, bool):
|
|
391
|
+
raise ValueError(f"Expected integer at {path}.")
|
|
392
|
+
if isinstance(value, int):
|
|
393
|
+
return value
|
|
394
|
+
if isinstance(value, float):
|
|
395
|
+
if value.is_integer():
|
|
396
|
+
self._log("Coerced number to integer", path)
|
|
397
|
+
return int(value)
|
|
398
|
+
raise ValueError(f"Expected integer at {path}.")
|
|
399
|
+
if isinstance(value, str):
|
|
400
|
+
try:
|
|
401
|
+
int_value = int(value)
|
|
402
|
+
except ValueError:
|
|
403
|
+
int_value = None
|
|
404
|
+
if int_value is not None:
|
|
405
|
+
self._log("Coerced string to integer", path)
|
|
406
|
+
return int_value
|
|
407
|
+
try:
|
|
408
|
+
num = float(value)
|
|
409
|
+
except ValueError as exc:
|
|
410
|
+
raise ValueError(f"Expected integer at {path}.") from exc
|
|
411
|
+
if not num.is_integer():
|
|
412
|
+
raise ValueError(f"Expected integer at {path}.")
|
|
413
|
+
self._log("Coerced number to integer", path)
|
|
414
|
+
return int(num)
|
|
415
|
+
raise ValueError(f"Expected integer at {path}.")
|
|
416
|
+
|
|
417
|
+
if schema_type == "number":
|
|
418
|
+
if isinstance(value, bool):
|
|
419
|
+
raise ValueError(f"Expected number at {path}.")
|
|
420
|
+
if isinstance(value, (int, float)):
|
|
421
|
+
return value
|
|
422
|
+
if isinstance(value, str):
|
|
423
|
+
try:
|
|
424
|
+
float_value = float(value)
|
|
425
|
+
except ValueError as exc:
|
|
426
|
+
raise ValueError(f"Expected number at {path}.") from exc
|
|
427
|
+
self._log("Coerced string to number", path)
|
|
428
|
+
return float_value
|
|
429
|
+
raise ValueError(f"Expected number at {path}.")
|
|
430
|
+
|
|
431
|
+
if schema_type == "boolean":
|
|
432
|
+
if isinstance(value, bool):
|
|
433
|
+
return value
|
|
434
|
+
if isinstance(value, str):
|
|
435
|
+
lowered = value.lower()
|
|
436
|
+
if lowered in ("true", "false"):
|
|
437
|
+
self._log("Coerced string to boolean", path)
|
|
438
|
+
return lowered == "true"
|
|
439
|
+
raise ValueError(f"Expected boolean at {path}.")
|
|
440
|
+
|
|
441
|
+
if schema_type == "null":
|
|
442
|
+
if value is None:
|
|
443
|
+
return None
|
|
444
|
+
raise ValueError(f"Expected null at {path}.")
|
|
445
|
+
|
|
446
|
+
raise ValueError(f"Unsupported schema type {schema_type} at {path}.")
|
|
447
|
+
|
|
448
|
+
def _apply_enum_const(self, value: JSONReturnType, schema: dict[str, Any], path: str) -> JSONReturnType:
|
|
449
|
+
if "const" in schema and value != schema["const"]:
|
|
450
|
+
raise ValueError(f"Value at {path} does not match const.")
|
|
451
|
+
if "enum" in schema and value not in schema["enum"]:
|
|
452
|
+
raise ValueError(f"Value at {path} does not match enum.")
|
|
453
|
+
return value
|
|
454
|
+
|
|
455
|
+
def _resolve_ref(self, ref: str) -> dict[str, Any] | bool:
|
|
456
|
+
if not ref.startswith("#/"):
|
|
457
|
+
raise ValueError(f"Unsupported $ref: {ref}")
|
|
458
|
+
parts = ref.lstrip("#/").split("/")
|
|
459
|
+
current: Any = self.root_schema
|
|
460
|
+
for part in parts:
|
|
461
|
+
resolved_part = part.replace("~1", "/").replace("~0", "~")
|
|
462
|
+
if not isinstance(current, dict) or resolved_part not in current:
|
|
463
|
+
raise ValueError(f"Unresolvable $ref: {ref}")
|
|
464
|
+
current = current[resolved_part]
|
|
465
|
+
if isinstance(current, dict):
|
|
466
|
+
return current
|
|
467
|
+
if current is True:
|
|
468
|
+
return True
|
|
469
|
+
if current is False:
|
|
470
|
+
return False
|
|
471
|
+
raise ValueError(f"Unresolvable $ref: {ref}")
|
|
472
|
+
|
|
473
|
+
def _copy_json_value(self, value: Any, path: str, label: str) -> JSONReturnType:
|
|
474
|
+
if value is None or isinstance(value, (str, int, float, bool)):
|
|
475
|
+
return value
|
|
476
|
+
if isinstance(value, list):
|
|
477
|
+
return [self._copy_json_value(item, f"{path}[{idx}]", label) for idx, item in enumerate(value)]
|
|
478
|
+
if isinstance(value, dict):
|
|
479
|
+
copied: dict[str, JSONReturnType] = {}
|
|
480
|
+
for key, item in value.items():
|
|
481
|
+
if not isinstance(key, str):
|
|
482
|
+
raise ValueError(f"{label.capitalize()} value at {path} contains a non-string key.")
|
|
483
|
+
copied[key] = self._copy_json_value(item, f"{path}.{key}", label)
|
|
484
|
+
return copied
|
|
485
|
+
raise ValueError(f"{label.capitalize()} value at {path} is not JSON compatible.")
|
|
486
|
+
|
|
487
|
+
def _prepare_schema_for_validation(self, schema: object) -> dict[str, Any]:
|
|
488
|
+
def normalize(node: Any) -> Any:
|
|
489
|
+
if isinstance(node, dict):
|
|
490
|
+
normalized = {key: normalize(value) for key, value in node.items()}
|
|
491
|
+
items = normalized.get("items")
|
|
492
|
+
if isinstance(items, list):
|
|
493
|
+
normalized.pop("items", None)
|
|
494
|
+
normalized["prefixItems"] = items
|
|
495
|
+
additional_items = normalized.pop("additionalItems", None)
|
|
496
|
+
if additional_items is False:
|
|
497
|
+
normalized["items"] = False
|
|
498
|
+
elif isinstance(additional_items, dict):
|
|
499
|
+
normalized["items"] = additional_items
|
|
500
|
+
return normalized
|
|
501
|
+
if isinstance(node, list):
|
|
502
|
+
return [normalize(item) for item in node]
|
|
503
|
+
return node
|
|
504
|
+
|
|
505
|
+
normalized = normalize(schema)
|
|
506
|
+
if not isinstance(normalized, dict):
|
|
507
|
+
raise ValueError("Schema must be an object.")
|
|
508
|
+
return normalized
|
json_repair/utils/constants.py
CHANGED
|
@@ -1,4 +1,15 @@
|
|
|
1
1
|
from typing import Any
|
|
2
2
|
|
|
3
|
+
|
|
4
|
+
class MissingValueType:
    """Type of the module-level ``MISSING_VALUE`` marker object."""

    def __repr__(self) -> str:
        """Return a fixed, recognisable representation."""
        return "<MISSING_VALUE>"

    def __deepcopy__(self, memo: dict[int, Any]) -> "MissingValueType":
        """Return the same instance, so deep copies keep object identity."""
        return self
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
MISSING_VALUE = MissingValueType()
|
|
13
|
+
|
|
3
14
|
JSONReturnType = dict[str, Any] | list[Any] | str | float | int | bool | None
|
|
4
15
|
STRING_DELIMITERS: list[str] = ['"', "'", "“", "”"]
|