json-repair 0.55.1__tar.gz → 0.56.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {json_repair-0.55.1 → json_repair-0.56.0}/PKG-INFO +70 -3
- json_repair-0.55.1/src/json_repair.egg-info/PKG-INFO → json_repair-0.56.0/README.md +66 -19
- {json_repair-0.55.1 → json_repair-0.56.0}/pyproject.toml +38 -2
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair/json_parser.py +66 -28
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair/json_repair.py +101 -25
- json_repair-0.56.0/src/json_repair/parse_array.py +117 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair/parse_comment.py +1 -2
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair/parse_number.py +1 -2
- json_repair-0.56.0/src/json_repair/parse_object.py +317 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair/parse_string.py +23 -25
- json_repair-0.56.0/src/json_repair/schema_repair.py +508 -0
- json_repair-0.56.0/src/json_repair/utils/constants.py +15 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair/utils/object_comparer.py +1 -1
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair/utils/string_file_wrapper.py +40 -35
- json_repair-0.55.1/README.md → json_repair-0.56.0/src/json_repair.egg-info/PKG-INFO +86 -2
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair.egg-info/SOURCES.txt +5 -0
- json_repair-0.56.0/src/json_repair.egg-info/requires.txt +4 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/tests/test_parse_array.py +8 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/tests/test_parse_object.py +4 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/tests/test_performance.py +3 -5
- json_repair-0.56.0/tests/test_repair_json_cli.py +178 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/tests/test_repair_json_from_file.py +5 -5
- json_repair-0.56.0/tests/test_schema_guided_parse.py +75 -0
- json_repair-0.56.0/tests/test_schema_parser_paths.py +222 -0
- json_repair-0.56.0/tests/test_schema_repairer.py +384 -0
- json_repair-0.55.1/src/json_repair/parse_array.py +0 -56
- json_repair-0.55.1/src/json_repair/parse_object.py +0 -185
- json_repair-0.55.1/src/json_repair/utils/constants.py +0 -4
- json_repair-0.55.1/tests/test_repair_json_cli.py +0 -67
- {json_repair-0.55.1 → json_repair-0.56.0}/LICENSE +0 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/setup.cfg +0 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair/__init__.py +0 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair/__main__.py +0 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair/parse_string_helpers/parse_boolean_or_null.py +0 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair/parse_string_helpers/parse_json_llm_block.py +0 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair/py.typed +0 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair/utils/json_context.py +0 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair.egg-info/dependency_links.txt +0 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair.egg-info/entry_points.txt +0 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/src/json_repair.egg-info/top_level.txt +0 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/tests/test_json_repair.py +0 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/tests/test_parse_comment.py +0 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/tests/test_parse_number.py +0 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/tests/test_parse_string.py +0 -0
- {json_repair-0.55.1 → json_repair-0.56.0}/tests/test_strict_mode.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: json_repair
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.56.0
|
|
4
4
|
Summary: A package to repair broken json strings
|
|
5
5
|
Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
|
|
6
6
|
License-Expression: MIT
|
|
@@ -13,6 +13,9 @@ Classifier: Operating System :: OS Independent
|
|
|
13
13
|
Requires-Python: >=3.10
|
|
14
14
|
Description-Content-Type: text/markdown
|
|
15
15
|
License-File: LICENSE
|
|
16
|
+
Provides-Extra: schema
|
|
17
|
+
Requires-Dist: jsonschema>=4.21; extra == "schema"
|
|
18
|
+
Requires-Dist: pydantic>=2; extra == "schema"
|
|
16
19
|
Dynamic: license-file
|
|
17
20
|
|
|
18
21
|
[](https://pypi.org/project/json-repair/)
|
|
@@ -190,6 +193,58 @@ In strict mode the parser raises `ValueError` as soon as it encounters structura
|
|
|
190
193
|
|
|
191
194
|
Strict mode still honors `skip_json_loads=True`; combining them lets you skip the initial `json.loads` check but still enforce strict parsing rules.
|
|
192
195
|
|
|
196
|
+
### Schema-guided repairs
|
|
197
|
+
|
|
198
|
+
**Alpha feature (not yet in stable releases).** Schema-guided repairs are currently shipped only in alpha builds (e.g., `0.56.0-alpha.*`). The API and behavior may change or break between alpha releases.
|
|
199
|
+
|
|
200
|
+
You can guide repairs with a JSON Schema (or a Pydantic v2 model). When enabled, the parser will:
|
|
201
|
+
|
|
202
|
+
- Fill missing values (defaults, required fields).
|
|
203
|
+
- Coerce scalars where safe (e.g., `"1"` → `1` for integer fields).
|
|
204
|
+
- Drop properties/items that the schema disallows.
|
|
205
|
+
|
|
206
|
+
This is especially useful when you need deterministic, schema-valid outputs for downstream validation, storage, or typed processing. If the input cannot be repaired to satisfy the schema, `json_repair` raises `ValueError`.
|
|
207
|
+
|
|
208
|
+
Install the optional dependencies:
|
|
209
|
+
|
|
210
|
+
pip install 'json-repair[schema]'
|
|
211
|
+
|
|
212
|
+
(For CLI usage, you can also use `pipx install 'json-repair[schema]'`.)
|
|
213
|
+
|
|
214
|
+
Schema guidance is skipped for already-valid JSON unless you pass `skip_json_loads=True` (this forces the parser to run even on valid JSON). Schema guidance is mutually exclusive with `strict=True`.
|
|
215
|
+
|
|
216
|
+
```
|
|
217
|
+
from json_repair import repair_json
|
|
218
|
+
|
|
219
|
+
schema = {
|
|
220
|
+
"type": "object",
|
|
221
|
+
"properties": {"value": {"type": "integer"}},
|
|
222
|
+
"required": ["value"],
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
repair_json('{"value": "1"}', schema=schema, skip_json_loads=True, return_objects=True)
|
|
226
|
+
```
|
|
227
|
+
|
|
228
|
+
Pydantic v2 model example:
|
|
229
|
+
|
|
230
|
+
```
|
|
231
|
+
from pydantic import BaseModel, Field
|
|
232
|
+
from json_repair import repair_json
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
class Payload(BaseModel):
|
|
236
|
+
value: int
|
|
237
|
+
tags: list[str] = Field(default_factory=list)
|
|
238
|
+
|
|
239
|
+
|
|
240
|
+
repair_json(
|
|
241
|
+
'{"value": "1", "tags": }',
|
|
242
|
+
schema=Payload,
|
|
243
|
+
skip_json_loads=True,
|
|
244
|
+
return_objects=True,
|
|
245
|
+
)
|
|
246
|
+
```
|
|
247
|
+
|
|
193
248
|
### Use json_repair with streaming
|
|
194
249
|
|
|
195
250
|
Sometimes you are streaming some data and want to repair the JSON coming from it. Normally this won't work but you can pass `stream_stable` to `repair_json()` or `loads()` to make it work:
|
|
@@ -207,7 +262,9 @@ pipx install json-repair
|
|
|
207
262
|
to know all options available:
|
|
208
263
|
```
|
|
209
264
|
$ json_repair -h
|
|
210
|
-
usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT]
|
|
265
|
+
usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT]
|
|
266
|
+
[--skip-json-loads] [--schema SCHEMA] [--schema-model MODEL]
|
|
267
|
+
[--strict] [filename]
|
|
211
268
|
|
|
212
269
|
Repair and parse JSON files.
|
|
213
270
|
|
|
@@ -221,6 +278,9 @@ options:
|
|
|
221
278
|
If specified, the output will be written to TARGET filename instead of stdout
|
|
222
279
|
--ensure_ascii Pass ensure_ascii=True to json.dumps()
|
|
223
280
|
--indent INDENT Number of spaces for indentation (Default 2)
|
|
281
|
+
--skip-json-loads Skip initial json.loads validation (needed to force schema on valid JSON)
|
|
282
|
+
--schema SCHEMA Path to a JSON Schema file that guides repairs
|
|
283
|
+
--schema-model MODEL Pydantic v2 model in 'module:ClassName' form that guides repairs
|
|
224
284
|
--strict Raise on duplicate keys, missing separators, empty keys/values, and similar structural issues instead of repairing them
|
|
225
285
|
```
|
|
226
286
|
|
|
@@ -274,8 +334,15 @@ If something is wrong (a missing parentheses or quotes for example) it will use
|
|
|
274
334
|
|
|
275
335
|
I am sure some corner cases will be missing; if you have examples, please open an issue or, even better, push a PR.
|
|
276
336
|
|
|
337
|
+
# Contributing
|
|
338
|
+
If you want to contribute, start with `CONTRIBUTING.md` and read the Code Wiki writeup for a tour of the codebase and key entry points: https://codewiki.google/github.com/mangiucugna/json_repair
|
|
339
|
+
|
|
277
340
|
# How to develop
|
|
278
|
-
|
|
341
|
+
Use `uv` to set up the dev environment and run tooling:
|
|
342
|
+
|
|
343
|
+
uv sync --group dev
|
|
344
|
+
uv run pre-commit run --all-files
|
|
345
|
+
uv run pytest
|
|
279
346
|
|
|
280
347
|
Make sure that the GitHub Actions that run after you push a new commit don't fail either.
|
|
281
348
|
|
|
@@ -1,20 +1,3 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: json_repair
|
|
3
|
-
Version: 0.55.1
|
|
4
|
-
Summary: A package to repair broken json strings
|
|
5
|
-
Author-email: Stefano Baccianella <4247706+mangiucugna@users.noreply.github.com>
|
|
6
|
-
License-Expression: MIT
|
|
7
|
-
Project-URL: Homepage, https://github.com/mangiucugna/json_repair/
|
|
8
|
-
Project-URL: Bug Tracker, https://github.com/mangiucugna/json_repair/issues
|
|
9
|
-
Project-URL: Live demo, https://mangiucugna.github.io/json_repair/
|
|
10
|
-
Keywords: JSON,REPAIR,LLM,PARSER
|
|
11
|
-
Classifier: Programming Language :: Python :: 3
|
|
12
|
-
Classifier: Operating System :: OS Independent
|
|
13
|
-
Requires-Python: >=3.10
|
|
14
|
-
Description-Content-Type: text/markdown
|
|
15
|
-
License-File: LICENSE
|
|
16
|
-
Dynamic: license-file
|
|
17
|
-
|
|
18
1
|
[](https://pypi.org/project/json-repair/)
|
|
19
2
|

|
|
20
3
|
[](https://pypi.org/project/json-repair/)
|
|
@@ -190,6 +173,58 @@ In strict mode the parser raises `ValueError` as soon as it encounters structura
|
|
|
190
173
|
|
|
191
174
|
Strict mode still honors `skip_json_loads=True`; combining them lets you skip the initial `json.loads` check but still enforce strict parsing rules.
|
|
192
175
|
|
|
176
|
+
### Schema-guided repairs
|
|
177
|
+
|
|
178
|
+
**Alpha feature (not yet in stable releases).** Schema-guided repairs are currently shipped only in alpha builds (e.g., `0.56.0-alpha.*`). The API and behavior may change or break between alpha releases.
|
|
179
|
+
|
|
180
|
+
You can guide repairs with a JSON Schema (or a Pydantic v2 model). When enabled, the parser will:
|
|
181
|
+
|
|
182
|
+
- Fill missing values (defaults, required fields).
|
|
183
|
+
- Coerce scalars where safe (e.g., `"1"` → `1` for integer fields).
|
|
184
|
+
- Drop properties/items that the schema disallows.
|
|
185
|
+
|
|
186
|
+
This is especially useful when you need deterministic, schema-valid outputs for downstream validation, storage, or typed processing. If the input cannot be repaired to satisfy the schema, `json_repair` raises `ValueError`.
|
|
187
|
+
|
|
188
|
+
Install the optional dependencies:
|
|
189
|
+
|
|
190
|
+
pip install 'json-repair[schema]'
|
|
191
|
+
|
|
192
|
+
(For CLI usage, you can also use `pipx install 'json-repair[schema]'`.)
|
|
193
|
+
|
|
194
|
+
Schema guidance is skipped for already-valid JSON unless you pass `skip_json_loads=True` (this forces the parser to run even on valid JSON). Schema guidance is mutually exclusive with `strict=True`.
|
|
195
|
+
|
|
196
|
+
```
|
|
197
|
+
from json_repair import repair_json
|
|
198
|
+
|
|
199
|
+
schema = {
|
|
200
|
+
"type": "object",
|
|
201
|
+
"properties": {"value": {"type": "integer"}},
|
|
202
|
+
"required": ["value"],
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
repair_json('{"value": "1"}', schema=schema, skip_json_loads=True, return_objects=True)
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
Pydantic v2 model example:
|
|
209
|
+
|
|
210
|
+
```
|
|
211
|
+
from pydantic import BaseModel, Field
|
|
212
|
+
from json_repair import repair_json
|
|
213
|
+
|
|
214
|
+
|
|
215
|
+
class Payload(BaseModel):
|
|
216
|
+
value: int
|
|
217
|
+
tags: list[str] = Field(default_factory=list)
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
repair_json(
|
|
221
|
+
'{"value": "1", "tags": }',
|
|
222
|
+
schema=Payload,
|
|
223
|
+
skip_json_loads=True,
|
|
224
|
+
return_objects=True,
|
|
225
|
+
)
|
|
226
|
+
```
|
|
227
|
+
|
|
193
228
|
### Use json_repair with streaming
|
|
194
229
|
|
|
195
230
|
Sometimes you are streaming some data and want to repair the JSON coming from it. Normally this won't work but you can pass `stream_stable` to `repair_json()` or `loads()` to make it work:
|
|
@@ -207,7 +242,9 @@ pipx install json-repair
|
|
|
207
242
|
to know all options available:
|
|
208
243
|
```
|
|
209
244
|
$ json_repair -h
|
|
210
|
-
usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT]
|
|
245
|
+
usage: json_repair [-h] [-i] [-o TARGET] [--ensure_ascii] [--indent INDENT]
|
|
246
|
+
[--skip-json-loads] [--schema SCHEMA] [--schema-model MODEL]
|
|
247
|
+
[--strict] [filename]
|
|
211
248
|
|
|
212
249
|
Repair and parse JSON files.
|
|
213
250
|
|
|
@@ -221,6 +258,9 @@ options:
|
|
|
221
258
|
If specified, the output will be written to TARGET filename instead of stdout
|
|
222
259
|
--ensure_ascii Pass ensure_ascii=True to json.dumps()
|
|
223
260
|
--indent INDENT Number of spaces for indentation (Default 2)
|
|
261
|
+
--skip-json-loads Skip initial json.loads validation (needed to force schema on valid JSON)
|
|
262
|
+
--schema SCHEMA Path to a JSON Schema file that guides repairs
|
|
263
|
+
--schema-model MODEL Pydantic v2 model in 'module:ClassName' form that guides repairs
|
|
224
264
|
--strict Raise on duplicate keys, missing separators, empty keys/values, and similar structural issues instead of repairing them
|
|
225
265
|
```
|
|
226
266
|
|
|
@@ -274,8 +314,15 @@ If something is wrong (a missing parentheses or quotes for example) it will use
|
|
|
274
314
|
|
|
275
315
|
I am sure some corner cases will be missing; if you have examples, please open an issue or, even better, push a PR.
|
|
276
316
|
|
|
317
|
+
# Contributing
|
|
318
|
+
If you want to contribute, start with `CONTRIBUTING.md` and read the Code Wiki writeup for a tour of the codebase and key entry points: https://codewiki.google/github.com/mangiucugna/json_repair
|
|
319
|
+
|
|
277
320
|
# How to develop
|
|
278
|
-
|
|
321
|
+
Use `uv` to set up the dev environment and run tooling:
|
|
322
|
+
|
|
323
|
+
uv sync --group dev
|
|
324
|
+
uv run pre-commit run --all-files
|
|
325
|
+
uv run pytest
|
|
279
326
|
|
|
280
327
|
Make sure that the GitHub Actions that run after you push a new commit don't fail either.
|
|
281
328
|
|
|
@@ -3,7 +3,7 @@ requires = ["setuptools>=61.0"]
|
|
|
3
3
|
build-backend = "setuptools.build_meta"
|
|
4
4
|
[project]
|
|
5
5
|
name = "json_repair"
|
|
6
|
-
version = "0.
|
|
6
|
+
version = "0.56.0"
|
|
7
7
|
license = "MIT"
|
|
8
8
|
license-files = ["LICENSE"]
|
|
9
9
|
authors = [
|
|
@@ -21,6 +21,33 @@ classifiers = [
|
|
|
21
21
|
"Homepage" = "https://github.com/mangiucugna/json_repair/"
|
|
22
22
|
"Bug Tracker" = "https://github.com/mangiucugna/json_repair/issues"
|
|
23
23
|
"Live demo" = "https://mangiucugna.github.io/json_repair/"
|
|
24
|
+
[project.optional-dependencies]
|
|
25
|
+
schema = [
|
|
26
|
+
"jsonschema>=4.21",
|
|
27
|
+
"pydantic>=2",
|
|
28
|
+
]
|
|
29
|
+
[dependency-groups]
|
|
30
|
+
dev = [
|
|
31
|
+
"coverage",
|
|
32
|
+
"jsonschema",
|
|
33
|
+
"mypy",
|
|
34
|
+
"pydantic",
|
|
35
|
+
"pre-commit",
|
|
36
|
+
"pytest",
|
|
37
|
+
"pytest-benchmark",
|
|
38
|
+
"ty",
|
|
39
|
+
]
|
|
40
|
+
test = [
|
|
41
|
+
"coverage",
|
|
42
|
+
"jsonschema",
|
|
43
|
+
"pydantic",
|
|
44
|
+
"pytest",
|
|
45
|
+
"pytest-benchmark",
|
|
46
|
+
]
|
|
47
|
+
typecheck = [
|
|
48
|
+
"mypy",
|
|
49
|
+
"ty",
|
|
50
|
+
]
|
|
24
51
|
[tool.pytest.ini_options]
|
|
25
52
|
pythonpath = [
|
|
26
53
|
"."
|
|
@@ -69,13 +96,19 @@ target-version = "py313"
|
|
|
69
96
|
# Flake8-bugbear – catches real-world Python footguns - B
|
|
70
97
|
# Flake8-builtins - A
|
|
71
98
|
# Flake8-comprehensions - C4
|
|
99
|
+
# Flake8-blind-except - BLE
|
|
72
100
|
# Flake8-commas - COM
|
|
101
|
+
# Flake8-print - T20
|
|
73
102
|
# Flake8-quotes - Q
|
|
103
|
+
# Flake8-return - RET
|
|
74
104
|
# Flake8-tidy-imports - TID
|
|
75
105
|
# Flake8-unused-arguments - ARG
|
|
106
|
+
# Refurb - FURB
|
|
76
107
|
# Isort - I
|
|
77
108
|
# Mccabe – code complexity warnings - C90
|
|
78
109
|
# PEP 8 Naming convention - N
|
|
110
|
+
# Perflint - PERF
|
|
111
|
+
# Flake8-use-pathlib - PTH
|
|
79
112
|
# Pycodestyle - E, W
|
|
80
113
|
# Pyflakes - F
|
|
81
114
|
# Pylint - PLC, PLE, PLR, PLW
|
|
@@ -83,10 +116,11 @@ target-version = "py313"
|
|
|
83
116
|
# Pyupgrade – safe modernization (e.g., str() → f"") - UP
|
|
84
117
|
# Ruff specific - RUF
|
|
85
118
|
# Simplifications (e.g., if x == True → if x) - SIM
|
|
86
|
-
select = ['A', 'ARG', 'B', 'C4', 'COM', 'C90', 'E', 'F', 'I', 'N', 'PLC', 'PLE', 'PLW', 'PT', 'Q', 'S', 'SIM', 'TID', 'UP', 'W']
|
|
119
|
+
select = ['A', 'ARG', 'B', 'BLE', 'C4', 'COM', 'C90', 'E', 'F', 'FURB', 'I', 'N', 'PERF', 'PLC', 'PLE', 'PLW', 'PT', 'PTH', 'Q', 'RET', 'S', 'SIM', 'T20', 'TID', 'UP', 'W']
|
|
87
120
|
# Only enable these RUF rules
|
|
88
121
|
extend-select = [
|
|
89
122
|
"RUF001", # ambiguous Unicode
|
|
123
|
+
"RUF100", # unused noqa
|
|
90
124
|
"RUF012", # mutable class attributes (should be annotated with ClassVar)
|
|
91
125
|
"RUF013", # implicit Optional (PEP 484 prohibits it)
|
|
92
126
|
"RUF016", # invalid index type (optional)
|
|
@@ -117,5 +151,7 @@ line-ending = "auto"
|
|
|
117
151
|
[tool.ruff.lint.per-file-ignores]
|
|
118
152
|
# Explicit re-exports is fine in __init__.py, still a code smell elsewhere.
|
|
119
153
|
"__init__.py" = ["PLC0414"]
|
|
154
|
+
"src/json_repair/json_repair.py" = ["T201"]
|
|
155
|
+
"tests/profiler.py" = ["T201"]
|
|
120
156
|
[tool.mypy]
|
|
121
157
|
strict = true
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
from
|
|
1
|
+
from collections.abc import Callable
|
|
2
|
+
from typing import TYPE_CHECKING, Any, TextIO
|
|
2
3
|
|
|
3
4
|
from .parse_array import parse_array as _parse_array
|
|
4
5
|
from .parse_comment import parse_comment as _parse_comment
|
|
@@ -10,11 +11,18 @@ from .utils.json_context import JsonContext
|
|
|
10
11
|
from .utils.object_comparer import ObjectComparer
|
|
11
12
|
from .utils.string_file_wrapper import StringFileWrapper
|
|
12
13
|
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from .schema_repair import SchemaRepairer
|
|
16
|
+
|
|
13
17
|
|
|
14
18
|
class JSONParser:
|
|
15
19
|
# Split the parse methods into separate files because this one was like 3000 lines
|
|
16
|
-
def parse_array(
|
|
17
|
-
|
|
20
|
+
def parse_array(
|
|
21
|
+
self,
|
|
22
|
+
schema: dict[str, Any] | bool | None = None,
|
|
23
|
+
path: str = "$",
|
|
24
|
+
) -> list[JSONReturnType]:
|
|
25
|
+
return _parse_array(self, schema, path)
|
|
18
26
|
|
|
19
27
|
def parse_comment(self) -> JSONReturnType:
|
|
20
28
|
return _parse_comment(self)
|
|
@@ -22,8 +30,12 @@ class JSONParser:
|
|
|
22
30
|
def parse_number(self) -> JSONReturnType:
|
|
23
31
|
return _parse_number(self)
|
|
24
32
|
|
|
25
|
-
def parse_object(
|
|
26
|
-
|
|
33
|
+
def parse_object(
|
|
34
|
+
self,
|
|
35
|
+
schema: dict[str, Any] | bool | None = None,
|
|
36
|
+
path: str = "$",
|
|
37
|
+
) -> JSONReturnType:
|
|
38
|
+
return _parse_object(self, schema, path)
|
|
27
39
|
|
|
28
40
|
def parse_string(self) -> JSONReturnType:
|
|
29
41
|
return _parse_string(self)
|
|
@@ -53,8 +65,8 @@ class JSONParser:
|
|
|
53
65
|
# We could add a guard in the code for each call but that would make this code unreadable, so here's this neat trick
|
|
54
66
|
# Replace self.log with a noop
|
|
55
67
|
self.logging = logging
|
|
68
|
+
self.logger: list[dict[str, str]] = []
|
|
56
69
|
if logging:
|
|
57
|
-
self.logger: list[dict[str, str]] = []
|
|
58
70
|
self.log = self._log
|
|
59
71
|
else:
|
|
60
72
|
# No-op
|
|
@@ -71,11 +83,26 @@ class JSONParser:
|
|
|
71
83
|
# may not be desirable in some use cases and the user would prefer json_repair to return an exception.
|
|
72
84
|
# So strict mode was added to disable some of those heuristics.
|
|
73
85
|
self.strict = strict
|
|
86
|
+
self.schema_repairer: SchemaRepairer | None = None
|
|
74
87
|
|
|
75
88
|
def parse(
|
|
76
89
|
self,
|
|
77
|
-
) -> JSONReturnType
|
|
78
|
-
|
|
90
|
+
) -> JSONReturnType:
|
|
91
|
+
return self._parse_top_level(self.parse_json)
|
|
92
|
+
|
|
93
|
+
def parse_with_schema(
|
|
94
|
+
self,
|
|
95
|
+
repairer: "SchemaRepairer",
|
|
96
|
+
schema: dict[str, Any] | bool,
|
|
97
|
+
) -> JSONReturnType:
|
|
98
|
+
"""Parse with schema guidance enabled for all nested values."""
|
|
99
|
+
self.schema_repairer = repairer
|
|
100
|
+
return self._parse_top_level(lambda: self.parse_json(schema, "$"))
|
|
101
|
+
|
|
102
|
+
# Consolidate top-level parsing so we handle multiple sequential JSON values consistently
|
|
103
|
+
# (including update semantics and strict-mode validation).
|
|
104
|
+
def _parse_top_level(self, parse_element: Callable[[], JSONReturnType]) -> JSONReturnType:
|
|
105
|
+
json = parse_element()
|
|
79
106
|
if self.index < len(self.json_str):
|
|
80
107
|
self.log(
|
|
81
108
|
"The parser returned early, checking if there's more json elements",
|
|
@@ -83,19 +110,17 @@ class JSONParser:
|
|
|
83
110
|
json = [json]
|
|
84
111
|
while self.index < len(self.json_str):
|
|
85
112
|
self.context.reset()
|
|
86
|
-
j =
|
|
113
|
+
j = parse_element()
|
|
87
114
|
if j:
|
|
88
115
|
if ObjectComparer.is_same_object(json[-1], j):
|
|
89
|
-
#
|
|
116
|
+
# Treat repeated objects as updates: keep the newest value.
|
|
90
117
|
json.pop()
|
|
91
118
|
else:
|
|
92
119
|
if not json[-1]:
|
|
93
120
|
json.pop()
|
|
94
121
|
json.append(j)
|
|
95
122
|
else:
|
|
96
|
-
# this was a bust, move the index
|
|
97
123
|
self.index += 1
|
|
98
|
-
# If nothing extra was found, don't return an array
|
|
99
124
|
if len(json) == 1:
|
|
100
125
|
self.log(
|
|
101
126
|
"There were no more elements, returning the element without the array",
|
|
@@ -106,38 +131,51 @@ class JSONParser:
|
|
|
106
131
|
"Multiple top-level JSON elements found in strict mode, raising an error",
|
|
107
132
|
)
|
|
108
133
|
raise ValueError("Multiple top-level JSON elements found in strict mode.")
|
|
109
|
-
|
|
110
|
-
return json, self.logger
|
|
111
|
-
else:
|
|
112
|
-
return json
|
|
134
|
+
return json
|
|
113
135
|
|
|
114
136
|
def parse_json(
|
|
115
137
|
self,
|
|
138
|
+
schema: dict[str, Any] | bool | None = None,
|
|
139
|
+
path: str = "$",
|
|
116
140
|
) -> JSONReturnType:
|
|
141
|
+
"""Parse the next JSON value and, when configured, enforce schema constraints."""
|
|
142
|
+
repairer = self.schema_repairer if self.schema_repairer is not None and schema not in (None, True) else None
|
|
143
|
+
if repairer is not None:
|
|
144
|
+
# Resolve references once and decide whether schema-guided repairs are needed.
|
|
145
|
+
schema = repairer.resolve_schema(schema)
|
|
146
|
+
if schema is True:
|
|
147
|
+
repairer = None
|
|
148
|
+
elif schema is False:
|
|
149
|
+
raise ValueError("Schema does not allow any values.")
|
|
150
|
+
|
|
117
151
|
while True:
|
|
118
152
|
char = self.get_char_at()
|
|
119
153
|
# None means that we are at the end of the string provided
|
|
120
154
|
if char is None:
|
|
121
155
|
return ""
|
|
122
156
|
# <object> starts with '{'
|
|
123
|
-
|
|
157
|
+
if char == "{":
|
|
124
158
|
self.index += 1
|
|
125
|
-
|
|
159
|
+
value = self.parse_object(schema, path) if repairer else self.parse_object()
|
|
160
|
+
return repairer.repair_value(value, schema, path) if repairer else value
|
|
126
161
|
# <array> starts with '['
|
|
127
|
-
|
|
162
|
+
if char == "[":
|
|
128
163
|
self.index += 1
|
|
129
|
-
|
|
164
|
+
value = self.parse_array(schema, path) if repairer else self.parse_array()
|
|
165
|
+
return repairer.repair_value(value, schema, path) if repairer else value
|
|
130
166
|
# <string> starts with a quote
|
|
131
|
-
|
|
132
|
-
|
|
167
|
+
if not self.context.empty and (char in STRING_DELIMITERS or char.isalpha()):
|
|
168
|
+
value = self.parse_string()
|
|
169
|
+
return repairer.repair_value(value, schema, path) if repairer else value
|
|
133
170
|
# <number> starts with [0-9] or minus
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
171
|
+
if not self.context.empty and (char.isdigit() or char == "-" or char == "."):
|
|
172
|
+
value = self.parse_number()
|
|
173
|
+
return repairer.repair_value(value, schema, path) if repairer else value
|
|
174
|
+
if char in ["#", "/"]:
|
|
175
|
+
value = self.parse_comment()
|
|
176
|
+
return repairer.repair_value(value, schema, path) if repairer else value
|
|
138
177
|
# If everything else fails, we just ignore and move on
|
|
139
|
-
|
|
140
|
-
self.index += 1
|
|
178
|
+
self.index += 1
|
|
141
179
|
|
|
142
180
|
def get_char_at(self, count: int = 0) -> str | None:
|
|
143
181
|
# Why not use something simpler? Because try/except in python is a faster alternative to an "if" statement that is often True
|