jsonmend 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jsonmend-0.1.0/LICENSE +21 -0
- jsonmend-0.1.0/PKG-INFO +193 -0
- jsonmend-0.1.0/README.md +166 -0
- jsonmend-0.1.0/pyproject.toml +39 -0
- jsonmend-0.1.0/setup.cfg +4 -0
- jsonmend-0.1.0/src/jsonmend/__init__.py +269 -0
- jsonmend-0.1.0/src/jsonmend/_engine.py +1889 -0
- jsonmend-0.1.0/src/jsonmend.egg-info/PKG-INFO +193 -0
- jsonmend-0.1.0/src/jsonmend.egg-info/SOURCES.txt +13 -0
- jsonmend-0.1.0/src/jsonmend.egg-info/dependency_links.txt +1 -0
- jsonmend-0.1.0/src/jsonmend.egg-info/top_level.txt +1 -0
- jsonmend-0.1.0/tests/test_api.py +95 -0
- jsonmend-0.1.0/tests/test_corpus.py +57 -0
- jsonmend-0.1.0/tests/test_robustness.py +144 -0
- jsonmend-0.1.0/tests/test_streaming.py +139 -0
jsonmend-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 adam2go
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
jsonmend-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: jsonmend
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Mends the JSON your LLM almost wrote - fast single-pass repair with true incremental streaming
|
|
5
|
+
Author: adam2go
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/adam2go/jsonmend
|
|
8
|
+
Keywords: json,repair,fix,llm,streaming,incremental,agent,tool-calls
|
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
19
|
+
Classifier: Programming Language :: Python :: Implementation :: CPython
|
|
20
|
+
Classifier: Programming Language :: Python :: Implementation :: PyPy
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
22
|
+
Classifier: Topic :: Text Processing :: Filters
|
|
23
|
+
Requires-Python: >=3.9
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Dynamic: license-file
|
|
27
|
+
|
|
28
|
+
# jsonmend
|
|
29
|
+
|
|
30
|
+
**Mends the JSON your LLM almost wrote.**
|
|
31
|
+
|
|
32
|
+
Truncated tool calls, markdown fences, single quotes, bare keys, Python
|
|
33
|
+
literals, comments, trailing commas, prose around the payload — jsonmend
|
|
34
|
+
turns them into valid JSON. It is a drop-in replacement for
|
|
35
|
+
[json_repair](https://github.com/mangiucugna/json_repair) that is
|
|
36
|
+
**5–10× faster** on batch repair, **~50× faster** on streaming, ships a
|
|
37
|
+
**true incremental streaming API** (O(new bytes) per chunk, not O(buffer)),
|
|
38
|
+
and is the reference implementation of an open, cross-language
|
|
39
|
+
[**conformance corpus**](corpus/) for JSON repair.
|
|
40
|
+
|
|
41
|
+
Pure Python, zero dependencies, zero binaries. Works on CPython 3.9–3.14,
|
|
42
|
+
PyPy, Pyodide/WASM, AWS Lambda — anywhere `pip install` works.
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install jsonmend
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Scoreboard
|
|
49
|
+
|
|
50
|
+
JSON repair has no standard: the same broken input is repaired
|
|
51
|
+
*differently* by the Python and JavaScript incumbents, which is a real
|
|
52
|
+
source of production bugs. The [jsonmend conformance corpus](corpus/)
|
|
53
|
+
(460 cases, 19 categories, CC0) defines repair semantics as data —
|
|
54
|
+
including the genuinely ambiguous cases, where every defensible answer
|
|
55
|
+
is accepted.
|
|
56
|
+
|
|
57
|
+
| | jsonmend 0.1.0 | json_repair 0.60.1 | jsonrepair 3.14.0 (JS) |
|
|
58
|
+
|---|---|---|---|
|
|
59
|
+
| corpus pass rate | **460/460 (100%)** | 320/460 (69.6%) | 354/460 (77.0%) |
|
|
60
|
+
|
|
61
|
+
Per-category breakdown: [corpus/scoreboard.md](corpus/scoreboard.md).
|
|
62
|
+
Reproduce: `python tools/referee.py --write` (needs `pip install
|
|
63
|
+
json_repair` and `npm install jsonrepair`, dev-only).
|
|
64
|
+
|
|
65
|
+
## Performance
|
|
66
|
+
|
|
67
|
+
Median of 7, three independent rounds within ±5%, Python 3.12, M-series
|
|
68
|
+
macOS. All inputs are *broken* JSON (the `json.loads` fast path never
|
|
69
|
+
runs). Verified-then-timed: outputs are checked equal before timing.
|
|
70
|
+
Reproduce: `python tools/bench.py --verify && python tools/bench.py`.
|
|
71
|
+
|
|
72
|
+
| workload | size | jsonmend | json_repair | speedup |
|
|
73
|
+
|---|---|---|---|---|
|
|
74
|
+
| truncated tool call | 1 KB | 0.027 ms | 0.199 ms | **7.3×** |
|
|
75
|
+
| truncated row payload | 75 KB | 1.48 ms | 12.6 ms | **8.5×** |
|
|
76
|
+
| markdown-fenced output | 49 KB | 0.25 ms | 2.6 ms | **10.6×** |
|
|
77
|
+
| dirty (quotes/keys/literals) | 5 KB | 0.38 ms | 2.6 ms | **7.0×** |
|
|
78
|
+
|
|
79
|
+
### Streaming is a different complexity class
|
|
80
|
+
|
|
81
|
+
A streaming UI re-renders the partial value on every chunk. With a batch
|
|
82
|
+
repairer you must re-parse the whole buffer each time — O(n²) total. The
|
|
83
|
+
stateful `Mender` only pays for the new bytes:
|
|
84
|
+
|
|
85
|
+
| workload | jsonmend `Mender` | json_repair (`stream_stable=True`) | |
|
|
86
|
+
|---|---|---|---|
|
|
87
|
+
| 150 KB in 4 KB chunks | 6.9 ms | 323 ms | **47×** |
|
|
88
|
+
| 10 MB in 4 KB chunks | 1.2 s | est. >20 min (quadratic) | — |
|
|
89
|
+
|
|
90
|
+
## Usage
|
|
91
|
+
|
|
92
|
+
### Drop-in for json_repair
|
|
93
|
+
|
|
94
|
+
```python
|
|
95
|
+
# before
|
|
96
|
+
from json_repair import repair_json, loads
|
|
97
|
+
# after — same call sites
|
|
98
|
+
from jsonmend import repair_json, loads
|
|
99
|
+
|
|
100
|
+
repair_json("{'name': 'John', age: 31") # '{"name": "John", "age": 31}'
|
|
101
|
+
loads('```json\n{"ok": true,}\n```') # {'ok': True}
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
`repair_json(json_str, return_objects=..., skip_json_loads=...,
|
|
105
|
+
ensure_ascii=..., **json_dumps_args)`, `loads`, `load(fd)`,
|
|
106
|
+
`from_file(path)` match json_repair's signatures. Valid JSON
|
|
107
|
+
short-circuits through C-speed `json.loads`.
|
|
108
|
+
|
|
109
|
+
### Streaming
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
from jsonmend import Mender
|
|
113
|
+
|
|
114
|
+
m = Mender()
|
|
115
|
+
for chunk in llm_stream: # feed as the tokens arrive
|
|
116
|
+
partial = m.feed(chunk) # best-effort value, O(new bytes)
|
|
117
|
+
render(partial) # e.g. {"answer": "The capital of Fr"}
|
|
118
|
+
value = m.close() # final mended value
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
`feed()` returns a live view that grows in place — including the string
|
|
122
|
+
that is currently streaming in. Any chunking gives byte-identical results
|
|
123
|
+
to batch repair (property-tested over the whole corpus).
|
|
124
|
+
|
|
125
|
+
### Strict mode
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from jsonmend import loads, JSONMendError
|
|
129
|
+
|
|
130
|
+
loads("complete garbage") # "" (json_repair-compatible)
|
|
131
|
+
loads("complete garbage", strict=True) # raises JSONMendError
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## What it fixes
|
|
135
|
+
|
|
136
|
+
truncated objects/arrays/strings/numbers/literals · markdown fences with
|
|
137
|
+
prose around them · single/smart/backtick quotes · unescaped inner quotes
|
|
138
|
+
· missing quotes · bare keys and values · `True/False/None/undefined/NaN/
|
|
139
|
+
Infinity` · `//`, `#`, `/* */` comments · trailing/missing/extra commas ·
|
|
140
|
+
missing colons · mismatched brackets · concatenated/NDJSON documents ·
|
|
141
|
+
string concatenation (`"a" + "b"`) · JSONP/MongoDB wrappers
|
|
142
|
+
(`ObjectId("…")`) · Python tuples/sets · ellipsis placeholders ·
|
|
143
|
+
non-string keys · BOM and exotic whitespace · escaped-JSON documents
|
|
144
|
+
(`{\"a\": 1}`) · broken `\u` escapes and surrogate pairs · 100k-deep
|
|
145
|
+
nesting (no recursion anywhere)
|
|
146
|
+
|
|
147
|
+
## Why it's fast
|
|
148
|
+
|
|
149
|
+
* **One resumable state machine** serves batch and streaming — batch is
|
|
150
|
+
a single feed that never suspends, so there is no streaming tax.
|
|
151
|
+
* **Strings cost one `str.find` + one slice** when clean; never a
|
|
152
|
+
per-character Python loop.
|
|
153
|
+
* **Speculative C parsing**: complete sub-trees inside broken documents
|
|
154
|
+
are recognized and handed to the C `json` scanner, with a salvage step
|
|
155
|
+
that parses the longest clean prefix of a broken container in one shot.
|
|
156
|
+
Semantics-affecting inputs (NaN, control chars, surrogate escapes)
|
|
157
|
+
fall back to the machine, so behavior never changes.
|
|
158
|
+
* **Bounded backtracking**: a string-close decision can revisit one
|
|
159
|
+
recorded candidate quote, never rescan; adversarial quote storms stay
|
|
160
|
+
linear (tested).
|
|
161
|
+
|
|
162
|
+
## Guarantees
|
|
163
|
+
|
|
164
|
+
* Output is always **valid RFC 8259 JSON** (or `""`/an exception). Unlike
|
|
165
|
+
json_repair, `NaN`/`Infinity` never leak into the output text — they
|
|
166
|
+
serialize as `null` (`loads` still gives you the floats).
|
|
167
|
+
* Output is always UTF-8 encodable (lone surrogates are replaced).
|
|
168
|
+
* Never crashes, never recurses: fuzzed and property-tested, 100k-deep
|
|
169
|
+
inputs are fine.
|
|
170
|
+
* `Mender.close()` ≡ batch result, for every chunking (property-tested).
|
|
171
|
+
|
|
172
|
+
## Honest differences vs json_repair
|
|
173
|
+
|
|
174
|
+
* `logging=True` is not supported (it is incompatible with the
|
|
175
|
+
single-pass design and is one reason json_repair is slow); passing it
|
|
176
|
+
raises `TypeError` so you notice.
|
|
177
|
+
* Schema-guided repair (`schema=`) is not implemented in v0.1.
|
|
178
|
+
* json_repair's `stream_stable=True` flag changes how *truncated escapes*
|
|
179
|
+
render mid-stream; jsonmend's `Mender` is always stream-stable.
|
|
180
|
+
* On `ambiguous` corpus cases the libraries may legitimately differ;
|
|
181
|
+
jsonmend's choices are documented case-by-case in the corpus
|
|
182
|
+
rationales.
|
|
183
|
+
|
|
184
|
+
## The corpus is the point
|
|
185
|
+
|
|
186
|
+
If you maintain a JSON-repair library in any language: please steal
|
|
187
|
+
[corpus/](corpus/). It is CC0, the format is three fields, and 460 cases
|
|
188
|
+
with rationales are more valuable than any of our engines. Cross-language
|
|
189
|
+
agreement on repair semantics helps everyone shipping LLM systems.
|
|
190
|
+
|
|
191
|
+
## License
|
|
192
|
+
|
|
193
|
+
MIT. The conformance corpus is CC0.
|
jsonmend-0.1.0/README.md
ADDED
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
# jsonmend
|
|
2
|
+
|
|
3
|
+
**Mends the JSON your LLM almost wrote.**
|
|
4
|
+
|
|
5
|
+
Truncated tool calls, markdown fences, single quotes, bare keys, Python
|
|
6
|
+
literals, comments, trailing commas, prose around the payload — jsonmend
|
|
7
|
+
turns them into valid JSON. It is a drop-in replacement for
|
|
8
|
+
[json_repair](https://github.com/mangiucugna/json_repair) that is
|
|
9
|
+
**5–10× faster** on batch repair, **~50× faster** on streaming, ships a
|
|
10
|
+
**true incremental streaming API** (O(new bytes) per chunk, not O(buffer)),
|
|
11
|
+
and is the reference implementation of an open, cross-language
|
|
12
|
+
[**conformance corpus**](corpus/) for JSON repair.
|
|
13
|
+
|
|
14
|
+
Pure Python, zero dependencies, zero binaries. Works on CPython 3.9–3.14,
|
|
15
|
+
PyPy, Pyodide/WASM, AWS Lambda — anywhere `pip install` works.
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
pip install jsonmend
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
## Scoreboard
|
|
22
|
+
|
|
23
|
+
JSON repair has no standard: the same broken input is repaired
|
|
24
|
+
*differently* by the Python and JavaScript incumbents, which is a real
|
|
25
|
+
source of production bugs. The [jsonmend conformance corpus](corpus/)
|
|
26
|
+
(460 cases, 19 categories, CC0) defines repair semantics as data —
|
|
27
|
+
including the genuinely ambiguous cases, where every defensible answer
|
|
28
|
+
is accepted.
|
|
29
|
+
|
|
30
|
+
| | jsonmend 0.1.0 | json_repair 0.60.1 | jsonrepair 3.14.0 (JS) |
|
|
31
|
+
|---|---|---|---|
|
|
32
|
+
| corpus pass rate | **460/460 (100%)** | 320/460 (69.6%) | 354/460 (77.0%) |
|
|
33
|
+
|
|
34
|
+
Per-category breakdown: [corpus/scoreboard.md](corpus/scoreboard.md).
|
|
35
|
+
Reproduce: `python tools/referee.py --write` (needs `pip install
|
|
36
|
+
json_repair` and `npm install jsonrepair`, dev-only).
|
|
37
|
+
|
|
38
|
+
## Performance
|
|
39
|
+
|
|
40
|
+
Median of 7, three independent rounds within ±5%, Python 3.12, M-series
|
|
41
|
+
macOS. All inputs are *broken* JSON (the `json.loads` fast path never
|
|
42
|
+
runs). Verified-then-timed: outputs are checked equal before timing.
|
|
43
|
+
Reproduce: `python tools/bench.py --verify && python tools/bench.py`.
|
|
44
|
+
|
|
45
|
+
| workload | size | jsonmend | json_repair | speedup |
|
|
46
|
+
|---|---|---|---|---|
|
|
47
|
+
| truncated tool call | 1 KB | 0.027 ms | 0.199 ms | **7.3×** |
|
|
48
|
+
| truncated row payload | 75 KB | 1.48 ms | 12.6 ms | **8.5×** |
|
|
49
|
+
| markdown-fenced output | 49 KB | 0.25 ms | 2.6 ms | **10.6×** |
|
|
50
|
+
| dirty (quotes/keys/literals) | 5 KB | 0.38 ms | 2.6 ms | **7.0×** |
|
|
51
|
+
|
|
52
|
+
### Streaming is a different complexity class
|
|
53
|
+
|
|
54
|
+
A streaming UI re-renders the partial value on every chunk. With a batch
|
|
55
|
+
repairer you must re-parse the whole buffer each time — O(n²) total. The
|
|
56
|
+
stateful `Mender` only pays for the new bytes:
|
|
57
|
+
|
|
58
|
+
| workload | jsonmend `Mender` | json_repair (`stream_stable=True`) | |
|
|
59
|
+
|---|---|---|---|
|
|
60
|
+
| 150 KB in 4 KB chunks | 6.9 ms | 323 ms | **47×** |
|
|
61
|
+
| 10 MB in 4 KB chunks | 1.2 s | est. >20 min (quadratic) | — |
|
|
62
|
+
|
|
63
|
+
## Usage
|
|
64
|
+
|
|
65
|
+
### Drop-in for json_repair
|
|
66
|
+
|
|
67
|
+
```python
|
|
68
|
+
# before
|
|
69
|
+
from json_repair import repair_json, loads
|
|
70
|
+
# after — same call sites
|
|
71
|
+
from jsonmend import repair_json, loads
|
|
72
|
+
|
|
73
|
+
repair_json("{'name': 'John', age: 31") # '{"name": "John", "age": 31}'
|
|
74
|
+
loads('```json\n{"ok": true,}\n```') # {'ok': True}
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
`repair_json(json_str, return_objects=..., skip_json_loads=...,
|
|
78
|
+
ensure_ascii=..., **json_dumps_args)`, `loads`, `load(fd)`,
|
|
79
|
+
`from_file(path)` match json_repair's signatures. Valid JSON
|
|
80
|
+
short-circuits through C-speed `json.loads`.
|
|
81
|
+
|
|
82
|
+
### Streaming
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
from jsonmend import Mender
|
|
86
|
+
|
|
87
|
+
m = Mender()
|
|
88
|
+
for chunk in llm_stream: # feed as the tokens arrive
|
|
89
|
+
partial = m.feed(chunk) # best-effort value, O(new bytes)
|
|
90
|
+
render(partial) # e.g. {"answer": "The capital of Fr"}
|
|
91
|
+
value = m.close() # final mended value
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
`feed()` returns a live view that grows in place — including the string
|
|
95
|
+
that is currently streaming in. Any chunking gives byte-identical results
|
|
96
|
+
to batch repair (property-tested over the whole corpus).
|
|
97
|
+
|
|
98
|
+
### Strict mode
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from jsonmend import loads, JSONMendError
|
|
102
|
+
|
|
103
|
+
loads("complete garbage") # "" (json_repair-compatible)
|
|
104
|
+
loads("complete garbage", strict=True) # raises JSONMendError
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## What it fixes
|
|
108
|
+
|
|
109
|
+
truncated objects/arrays/strings/numbers/literals · markdown fences with
|
|
110
|
+
prose around them · single/smart/backtick quotes · unescaped inner quotes
|
|
111
|
+
· missing quotes · bare keys and values · `True/False/None/undefined/NaN/
|
|
112
|
+
Infinity` · `//`, `#`, `/* */` comments · trailing/missing/extra commas ·
|
|
113
|
+
missing colons · mismatched brackets · concatenated/NDJSON documents ·
|
|
114
|
+
string concatenation (`"a" + "b"`) · JSONP/MongoDB wrappers
|
|
115
|
+
(`ObjectId("…")`) · Python tuples/sets · ellipsis placeholders ·
|
|
116
|
+
non-string keys · BOM and exotic whitespace · escaped-JSON documents
|
|
117
|
+
(`{\"a\": 1}`) · broken `\u` escapes and surrogate pairs · 100k-deep
|
|
118
|
+
nesting (no recursion anywhere)
|
|
119
|
+
|
|
120
|
+
## Why it's fast
|
|
121
|
+
|
|
122
|
+
* **One resumable state machine** serves batch and streaming — batch is
|
|
123
|
+
a single feed that never suspends, so there is no streaming tax.
|
|
124
|
+
* **Strings cost one `str.find` + one slice** when clean; never a
|
|
125
|
+
per-character Python loop.
|
|
126
|
+
* **Speculative C parsing**: complete sub-trees inside broken documents
|
|
127
|
+
are recognized and handed to the C `json` scanner, with a salvage step
|
|
128
|
+
that parses the longest clean prefix of a broken container in one shot.
|
|
129
|
+
Semantics-affecting inputs (NaN, control chars, surrogate escapes)
|
|
130
|
+
fall back to the machine, so behavior never changes.
|
|
131
|
+
* **Bounded backtracking**: a string-close decision can revisit one
|
|
132
|
+
recorded candidate quote, never rescan; adversarial quote storms stay
|
|
133
|
+
linear (tested).
|
|
134
|
+
|
|
135
|
+
## Guarantees
|
|
136
|
+
|
|
137
|
+
* Output is always **valid RFC 8259 JSON** (or `""`/an exception). Unlike
|
|
138
|
+
json_repair, `NaN`/`Infinity` never leak into the output text — they
|
|
139
|
+
serialize as `null` (`loads` still gives you the floats).
|
|
140
|
+
* Output is always UTF-8 encodable (lone surrogates are replaced).
|
|
141
|
+
* Never crashes, never recurses: fuzzed and property-tested, 100k-deep
|
|
142
|
+
inputs are fine.
|
|
143
|
+
* `Mender.close()` ≡ batch result, for every chunking (property-tested).
|
|
144
|
+
|
|
145
|
+
## Honest differences vs json_repair
|
|
146
|
+
|
|
147
|
+
* `logging=True` is not supported (it is incompatible with the
|
|
148
|
+
single-pass design and is one reason json_repair is slow); passing it
|
|
149
|
+
raises `TypeError` so you notice.
|
|
150
|
+
* Schema-guided repair (`schema=`) is not implemented in v0.1.
|
|
151
|
+
* json_repair's `stream_stable=True` flag changes how *truncated escapes*
|
|
152
|
+
render mid-stream; jsonmend's `Mender` is always stream-stable.
|
|
153
|
+
* On `ambiguous` corpus cases the libraries may legitimately differ;
|
|
154
|
+
jsonmend's choices are documented case-by-case in the corpus
|
|
155
|
+
rationales.
|
|
156
|
+
|
|
157
|
+
## The corpus is the point
|
|
158
|
+
|
|
159
|
+
If you maintain a JSON-repair library in any language: please steal
|
|
160
|
+
[corpus/](corpus/). It is CC0, the format is three fields, and 460 cases
|
|
161
|
+
with rationales are more valuable than any of our engines. Cross-language
|
|
162
|
+
agreement on repair semantics helps everyone shipping LLM systems.
|
|
163
|
+
|
|
164
|
+
## License
|
|
165
|
+
|
|
166
|
+
MIT. The conformance corpus is CC0.
|
jsonmend-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "jsonmend"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Mends the JSON your LLM almost wrote - fast single-pass repair with true incremental streaming"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "adam2go" }]
|
|
13
|
+
keywords = ["json", "repair", "fix", "llm", "streaming", "incremental", "agent", "tool-calls"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.9",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Programming Language :: Python :: 3.13",
|
|
24
|
+
"Programming Language :: Python :: 3.14",
|
|
25
|
+
"Programming Language :: Python :: Implementation :: CPython",
|
|
26
|
+
"Programming Language :: Python :: Implementation :: PyPy",
|
|
27
|
+
"Topic :: Software Development :: Libraries",
|
|
28
|
+
"Topic :: Text Processing :: Filters",
|
|
29
|
+
]
|
|
30
|
+
dependencies = []
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
Homepage = "https://github.com/adam2go/jsonmend"
|
|
34
|
+
|
|
35
|
+
[tool.setuptools.packages.find]
|
|
36
|
+
where = ["src"]
|
|
37
|
+
|
|
38
|
+
[tool.pytest.ini_options]
|
|
39
|
+
testpaths = ["tests"]
|
jsonmend-0.1.0/setup.cfg
ADDED
|
jsonmend-0.1.0/src/jsonmend/__init__.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
"""jsonmend — mends the JSON your LLM almost wrote.
|
|
2
|
+
|
|
3
|
+
Batch API (drop-in for json_repair):
|
|
4
|
+
|
|
5
|
+
from jsonmend import repair_json, loads, load, from_file
|
|
6
|
+
|
|
7
|
+
Streaming API (true incremental, O(new bytes) per feed):
|
|
8
|
+
|
|
9
|
+
from jsonmend import Mender
|
|
10
|
+
m = Mender()
|
|
11
|
+
for chunk in stream:
|
|
12
|
+
partial = m.feed(chunk) # best-effort value so far
|
|
13
|
+
value = m.close()
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import json as _json
|
|
19
|
+
import math as _math
|
|
20
|
+
|
|
21
|
+
from ._engine import SKIP, JSONMendError, MendMachine
|
|
22
|
+
|
|
23
|
+
# Package version string.
__version__ = "0.1.0"

# Public names exported by ``from jsonmend import *``.
__all__ = [
    # batch repair
    "repair_json",
    "loads",
    "load",
    "from_file",
    "mend",
    # incremental repair
    "Mender",
    # errors and metadata
    "JSONMendError",
    "__version__",
]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def mend(text, *, strict=False, _doom_hint=None):
    """Repair ``text`` and return the parsed Python value.

    This always runs the repair machine (no ``json.loads`` fast path).

    Args:
        text: The broken JSON. Anything that is not a ``str`` is first
            converted with ``_coerce_text``.
        strict: When true, unmendable input raises instead of returning
            ``""``.
        _doom_hint: Private. Offset handed to the machine as
            ``doomed_from``; the callers in this module pass the position
            at which ``json.loads`` gave up on truncated input.

    Returns:
        The mended value, or ``""`` when the machine reports ``SKIP`` and
        ``strict`` is false.

    Raises:
        JSONMendError: The machine reports ``SKIP`` and ``strict`` is true.
    """
    if not isinstance(text, str):
        text = _coerce_text(text)
    # Spell the byte-order mark as an explicit escape: a literal U+FEFF is
    # invisible in source and is easily lost, which silently turns this
    # guard into a comparison against the empty string that never matches.
    if text.startswith("\ufeff"):
        text = text.lstrip("\ufeff")
        # Presumably the hint was an offset into the unstripped text, so it
        # no longer lines up once the prefix is gone; drop it.
        _doom_hint = None
    machine = MendMachine()
    # NOTE(review): ``final`` looks like a "this is the only feed" flag for
    # batch use; confirm against _engine.
    machine.final = True
    if _doom_hint is not None:
        machine.doomed_from = _doom_hint
    machine.feed(text)
    result = machine.close()
    if result is SKIP:
        if strict:
            raise JSONMendError("no JSON content found in input")
        return ""
    return result
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def loads(json_str, *, skip_json_loads=False, strict=False, **_compat):
    """Repair ``json_str`` and return the resulting Python objects.

    Input that is already valid JSON is returned straight from the
    standard ``json.loads`` unless ``skip_json_loads`` is true; everything
    else is handed to :func:`mend`.

    Args:
        json_str: Text to parse; non-``str`` input is converted with
            ``_coerce_text``.
        skip_json_loads: Bypass the ``json.loads`` attempt.
        strict: Forwarded to :func:`mend`.
        **_compat: Extra keyword arguments are accepted and ignored.
    """
    text = json_str if isinstance(json_str, str) else _coerce_text(json_str)
    if not skip_json_loads:
        try:
            return _json.loads(text)
        except Exception as exc:
            error_pos = getattr(exc, "pos", None)
            ran_off_the_end = error_pos is not None and error_pos >= len(text)
            if ran_off_the_end:
                # truncated input: the machine need not rescan the root
                return mend(text, strict=strict, _doom_hint=error_pos)
    return mend(text, strict=strict)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def repair_json(json_str="", return_objects=False, skip_json_loads=False,
                ensure_ascii=True, strict=False, **json_dumps_args):
    """Repair broken JSON and return it as a JSON string (or as objects).

    API-compatible with ``json_repair.repair_json`` for the core
    parameters. Unlike json_repair, the output is always *valid* JSON:
    non-finite numbers (NaN/Infinity) are serialized as ``null``.

    Args:
        json_str: Text to repair; non-``str`` input is converted with
            ``_coerce_text``.
        return_objects: Return the parsed value instead of a JSON string.
        skip_json_loads: Bypass the initial ``json.loads`` attempt.
        ensure_ascii: Forwarded to the serializer.
        strict: Forwarded to :func:`mend`.
        **json_dumps_args: Forwarded to the serializer, minus the
            ``logging`` and ``stream_stable`` compatibility keys.

    Raises:
        TypeError: A truthy ``logging`` argument was passed.
    """
    if json_dumps_args.pop("logging", False):
        raise TypeError(
            "jsonmend does not support json_repair's logging=True "
            "(incompatible with single-pass repair); remove the flag")
    # Accepted for call-site compatibility and discarded.
    json_dumps_args.pop("stream_stable", None)

    text = json_str if isinstance(json_str, str) else _coerce_text(json_str)

    result = None
    doom_hint = None
    needs_mend = True
    if not skip_json_loads:
        try:
            result = _json.loads(text)
        except Exception as exc:
            error_pos = getattr(exc, "pos", None)
            # Only a failure at (or past) the end of the text yields a hint.
            if error_pos is not None and error_pos >= len(text):
                doom_hint = error_pos
        else:
            needs_mend = False
    if needs_mend:
        result = mend(text, strict=strict, _doom_hint=doom_hint)

    if return_objects:
        return result
    return _dumps(result, ensure_ascii=ensure_ascii, **json_dumps_args)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def load(fd, **kwargs):
    """Read everything from the file-like ``fd`` and repair/parse it.

    Keyword arguments are forwarded to :func:`loads`.
    """
    payload = fd.read()
    return loads(payload, **kwargs)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def from_file(filename, **kwargs):
    """Read the file at ``filename`` and repair/parse its contents.

    Keyword arguments are forwarded to :func:`loads`.
    """
    # "utf-8-sig" drops a leading BOM; newline="" turns off newline
    # translation so the text reaches the parser as written.
    with open(filename, encoding="utf-8-sig", newline="") as handle:
        raw = handle.read()
    return loads(raw, **kwargs)
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
class Mender:
    """Stateful incremental mender.

    Every :meth:`feed` hands one chunk to the underlying machine and
    returns the machine's current best-effort value. The documented
    contract is that a feed costs time proportional to the new bytes only,
    and that the returned value is a *live view* which later feeds may
    extend in place. Call :meth:`close` to obtain the final result.
    """

    def __init__(self):
        self._machine = MendMachine()
        self._closed = False   # set once close() has completed
        self._result = None    # final value, valid only when _closed

    def feed(self, chunk):
        """Feed one chunk and return the current best-effort value.

        Raises:
            ValueError: The mender has already been closed.
        """
        if self._closed:
            raise ValueError("Mender is closed")
        text = chunk if isinstance(chunk, str) else _coerce_text(chunk)
        machine = self._machine
        machine.feed(text)
        return machine.current()

    @property
    def value(self):
        """Current best-effort value, or the final one after close()."""
        return self._result if self._closed else self._machine.current()

    def close(self):
        """Finish parsing and return the final mended value.

        Repeated calls return the stored result. When the machine reports
        ``SKIP`` the result is ``""``.
        """
        if self._closed:
            return self._result
        outcome = self._machine.close()
        self._result = "" if outcome is SKIP else outcome
        self._closed = True
        return self._result
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
# ---------------------------------------------------------------------------
|
|
162
|
+
# serialization helpers
|
|
163
|
+
# ---------------------------------------------------------------------------
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def _coerce_text(obj):
    """Turn a non-``str`` input into text.

    ``bytes`` and ``bytearray`` are decoded as UTF-8 with undecodable
    bytes replaced; any other object goes through ``str()``.
    """
    if not isinstance(obj, (bytes, bytearray)):
        return str(obj)
    return obj.decode("utf-8", errors="replace")
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _sanitize_nonfinite(value):
    """Return a copy of ``value`` with NaN/Infinity floats replaced by None.

    Only ``dict`` and ``list`` containers are walked. The walk uses an
    explicit work list rather than recursion, so nesting depth is not
    limited by the interpreter's recursion limit. Scalars are returned
    as-is (or as ``None`` for a non-finite float).
    """
    if isinstance(value, float):
        return value if _math.isfinite(value) else None
    if not isinstance(value, (dict, list)):
        return value
    root = [] if isinstance(value, list) else {}
    # Each entry pairs a source container with the empty copy to fill.
    todo = [(value, root)]
    while todo:
        src, dst = todo.pop()
        items = src.items() if isinstance(src, dict) else enumerate(src)
        for k, v in items:
            if isinstance(v, float) and not _math.isfinite(v):
                v = None
            elif isinstance(v, dict):
                new = {}
                todo.append((v, new))
                v = new
            elif isinstance(v, list):
                new = []
                todo.append((v, new))
                v = new
            if isinstance(dst, dict):
                dst[k] = v
            else:
                # Items of one list are visited in index order, so a plain
                # append reproduces the original positions.
                dst.append(v)
    return root


def _has_nonfinite(value):
    """Return True if ``value`` contains a NaN/Infinity float anywhere.

    Walks ``dict`` values and ``list`` items iteratively (no recursion).
    """
    todo = [value]
    while todo:
        v = todo.pop()
        if isinstance(v, float):
            if not _math.isfinite(v):
                return True
        elif isinstance(v, dict):
            todo.extend(v.values())
        elif isinstance(v, list):
            todo.extend(v)
    return False


def _dumps(value, ensure_ascii=True, **kw):
    """Serialize ``value`` to JSON text without emitting NaN/Infinity.

    Strategy:

    1. Try ``json.dumps`` with ``allow_nan=False``.
    2. If that raises ``ValueError``, retry on a copy in which non-finite
       floats have been replaced by ``None``.
    3. If either attempt raises ``RecursionError``, fall back to the
       iterative serializer.

    Args:
        value: The object to serialize.
        ensure_ascii: Forwarded to the serializer.
        **kw: Extra ``json.dumps`` arguments. ``separators`` defaults to
            ``(", ", ": ")``. These are NOT applied by the iterative
            fallback, which always uses the default layout.

    Returns:
        The JSON text.
    """
    kw.setdefault("separators", (", ", ": "))
    try:
        try:
            return _json.dumps(value, ensure_ascii=ensure_ascii,
                               allow_nan=False, **kw)
        except ValueError:
            # The retry sits inside the outer ``try`` on purpose: an
            # exception raised in an ``except`` clause is not seen by the
            # sibling clauses of the same ``try``, so a structure that is
            # both deep and contains NaN would otherwise escape as an
            # uncaught RecursionError.
            return _json.dumps(_sanitize_nonfinite(value),
                               ensure_ascii=ensure_ascii, **kw)
    except RecursionError:
        # The iterative serializer maps non-finite floats to null itself,
        # so the unsanitized value can be passed.
        return _iter_dumps(value, ensure_ascii=ensure_ascii)


def _iter_dumps(value, ensure_ascii=True):
    """Iterative serializer for absurdly deep structures.

    Produces the ``", "`` / ``": "`` layout. Non-finite floats are written
    as ``null``. Dict keys are passed straight to the string encoder.
    """
    out = []
    enc = _json.encoder.encode_basestring_ascii if ensure_ascii \
        else _json.encoder.encode_basestring
    # Work stack of (op, payload): "v" = serialize this value,
    # "t" = emit this literal text. Entries are pushed in reverse so they
    # pop in document order.
    stack = [("v", value)]
    while stack:
        op, v = stack.pop()
        if op == "t":
            out.append(v)
            continue
        if isinstance(v, dict):
            out.append("{")
            stack.append(("t", "}"))
            items = list(v.items())
            for idx in range(len(items) - 1, -1, -1):
                k, val = items[idx]
                stack.append(("v", val))
                stack.append(("t", enc(k) + ": "))
                if idx:
                    stack.append(("t", ", "))
            continue
        if isinstance(v, list):
            out.append("[")
            stack.append(("t", "]"))
            for idx in range(len(v) - 1, -1, -1):
                stack.append(("v", v[idx]))
                if idx:
                    stack.append(("t", ", "))
            continue
        # Booleans are tested by identity before the generic fallback so
        # they never print as "True"/"False".
        if v is True:
            out.append("true")
        elif v is False:
            out.append("false")
        elif v is None:
            out.append("null")
        elif isinstance(v, str):
            out.append(enc(v))
        elif isinstance(v, float):
            out.append(repr(v) if _math.isfinite(v) else "null")
        else:
            out.append(str(v))
    return "".join(out)
|