jsonsax 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jsonsax/__init__.py +30 -0
- jsonsax/__main__.py +41 -0
- jsonsax/parser.py +399 -0
- jsonsax/py.typed +0 -0
- jsonsax-0.1.0.dist-info/METADATA +399 -0
- jsonsax-0.1.0.dist-info/RECORD +8 -0
- jsonsax-0.1.0.dist-info/WHEEL +4 -0
- jsonsax-0.1.0.dist-info/licenses/LICENSE +21 -0
jsonsax/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""jsonsax - a lightweight streaming JSON parser.
|
|
2
|
+
|
|
3
|
+
Processes JSON incrementally as characters arrive, firing callbacks for each
|
|
4
|
+
structural event - similar to how a SAX parser works with XML. Instead of
|
|
5
|
+
buffering a whole document and parsing it at the end, you :meth:`~jsonsax.Parser.feed`
|
|
6
|
+
chunks as they stream in and react to events immediately.
|
|
7
|
+
|
|
8
|
+
Perfect for:
|
|
9
|
+
- Reacting to LLM token streams that emit JSON (structured outputs).
|
|
10
|
+
- Parsing JSON payloads larger than memory.
|
|
11
|
+
- Acting on early fields before the rest of the document arrives.
|
|
12
|
+
|
|
13
|
+
Example::
|
|
14
|
+
|
|
15
|
+
from jsonsax import Parser
|
|
16
|
+
|
|
17
|
+
parser = Parser()
|
|
18
|
+
parser.on("value", lambda path, val: print(path, "=", val))
|
|
19
|
+
for chunk in stream:
|
|
20
|
+
parser.feed(chunk)
|
|
21
|
+
parser.close()
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
from .parser import EVENTS, Callback, ParseError, Parser, parse
|
|
27
|
+
|
|
28
|
+
# Public names of the package. ``__version__`` is listed as well so that
# ``from jsonsax import *`` exposes it alongside the parser API.
__all__ = ["EVENTS", "Callback", "ParseError", "Parser", "parse", "__version__"]

# Package version string.
__version__ = "0.1.0"
|
jsonsax/__main__.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
"""Command-line demo: ``python -m jsonsax``.
|
|
2
|
+
|
|
3
|
+
Reads JSON from stdin (or a default sample), feeds it to the parser one
|
|
4
|
+
character at a time, and prints each structural event with its path.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import sys
|
|
10
|
+
|
|
11
|
+
from .parser import Parser
|
|
12
|
+
|
|
13
|
+
# Built-in demo document; main() parses it when stdin is a terminal, i.e. when
# nothing was piped in.
_SAMPLE = '{"title": "RAG", "tags": ["ai", "search"], "score": 9.5, "ok": true}'
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the streaming demo and return a process exit code.

    The document (stdin when piped, otherwise the built-in sample) is fed to
    a :class:`Parser` one character at a time and every event is printed
    together with its path.

    Args:
        argv: Command-line arguments without the program name. ``None``
            means "use ``sys.argv[1:]``". Only ``-h`` / ``--help`` as the
            first argument is recognised.

    Returns:
        ``0`` on success (or after printing help), ``1`` if the input is not
        well-formed JSON.
    """
    # Imported here because the module header only brings in ``Parser``.
    from .parser import ParseError

    argv = sys.argv[1:] if argv is None else argv
    if argv and argv[0] in ("-h", "--help"):
        print(__doc__)
        return 0

    # Read from stdin when piped, otherwise use the built-in sample.
    document = _SAMPLE if sys.stdin.isatty() else sys.stdin.read()

    parser = Parser()
    parser.on("start_object", lambda path: print(f"{path:24} {{"))
    parser.on("end_object", lambda path: print(f"{path:24} }}"))
    parser.on("start_array", lambda path: print(f"{path:24} ["))
    parser.on("end_array", lambda path: print(f"{path:24} ]"))
    parser.on("key", lambda path, key: print(f"{path:24} key={key!r}"))
    parser.on("value", lambda path, val: print(f"{path:24} value={val!r}"))

    try:
        # One character per feed() call on purpose: it demonstrates that the
        # parser copes with input split at arbitrary positions.
        for char in document:
            parser.feed(char)
        parser.close()
    except ParseError as exc:
        # Malformed input is an expected outcome for a CLI; report it and use
        # the exit code instead of dumping a traceback.
        print(f"jsonsax: invalid JSON: {exc}", file=sys.stderr)
        return 1
    return 0
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status.
    raise SystemExit(main())
|
jsonsax/parser.py
ADDED
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
"""Core streaming JSON parser implementation.
|
|
2
|
+
|
|
3
|
+
See :mod:`jsonsax` for the public API and usage examples.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations

import re
from typing import Any, Callable, Optional
|
|
9
|
+
|
|
10
|
+
__all__ = ["EVENTS", "Callback", "ParseError", "Parser", "parse"]

# Event names fired through callbacks.
# Parser creates one handler list per name listed here, and Parser.on()
# rejects any event name that is not in this tuple.
EVENTS = (
    "start_object",  # callback(path)
    "end_object",  # callback(path)
    "start_array",  # callback(path)
    "end_array",  # callback(path)
    "key",  # callback(path, key)
    "value",  # callback(path, value) -> str | int | float | bool | None
)

#: Signature of an event callback. Receives the JSONPath-style location plus,
#: for ``key``/``value`` events, the parsed key or scalar.
#: The parameter list is left open (``...``) because the arity differs per
#: event; the return value of a callback is not used by the parser.
Callback = Callable[..., None]
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class ParseError(ValueError):
    """Signal malformed or incomplete JSON input.

    Derives from :class:`ValueError`, so callers that already catch
    ``ValueError`` handle it without changes.
    """
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Internal parse-state constants.
# Parser._state always holds exactly one of these strings; each one names what
# may legally come next at the current position. The string values also show
# up in "Unexpected value (parser state: ...)" error messages.
_VALUE = "value"  # expecting a value (top level or after ':' / array ',')
_DONE = "done"  # a complete top-level value has been parsed
_OBJ_FIRST = "obj_first"  # just after '{' - expecting a key or '}'
_OBJ_KEY = "obj_key"  # after ',' in object - a key is required (no '}')
_OBJ_COLON = "obj_colon"  # after a key - expecting ':'
_OBJ_NEXT = "obj_next"  # after a value - expecting ',' or '}'
_ARR_FIRST = "arr_first"  # just after '[' - expecting a value or ']'
_ARR_NEXT = "arr_next"  # after a value - expecting ',' or ']'

# States in which a fresh value may legally begin.
_VALUE_STATES = (_VALUE, _ARR_FIRST)

# Characters skipped between tokens.
_WHITESPACE = " \t\n\r"
# Characters that keep an already-started number token going. This is only a
# coarse filter for finding the end of the token; the collected text is
# converted when the token ends.
_NUMBER_CHARS = "0123456789+-.eE"
# Single-character escapes: the character after the backslash mapped to the
# character it stands for. ``\uXXXX`` is not listed; the string scanner
# handles it separately.
_ESCAPES = {
    '"': '"', "\\": "\\", "/": "/",
    "b": "\b", "f": "\f", "n": "\n",
    "r": "\r", "t": "\t",
}
# JSON literal spellings mapped to the Python values reported for them.
_LITERALS = {"true": True, "false": False, "null": None}
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class Parser:
    """Incremental, callback-driven JSON parser.

    Feed it text with :meth:`feed` as it arrives, register handlers with
    :meth:`on`, and call :meth:`close` once the stream ends to confirm the
    document was complete.

    Handlers run synchronously, in registration order, from inside
    :meth:`feed` or :meth:`close`. A number or a ``true``/``false``/``null``
    literal is reported only once the character following it has been seen
    (or at :meth:`close`), because the end of such a token cannot be known
    any earlier.

    Malformed input raises :class:`ParseError` from :meth:`feed` or
    :meth:`close`.
    """

    # pylint: disable=too-many-instance-attributes
    # The instance attributes together form one tokenizer state machine; they
    # are cohesive rather than incidental.

    # Complete JSON number grammar (RFC 8259, section 6). ``int()``/``float()``
    # on their own are too lenient: they accept "01", "1." and "-.5".
    _NUMBER_RE = re.compile(r"-?(?:0|[1-9][0-9]*)(?:\.[0-9]+)?(?:[eE][+-]?[0-9]+)?")
    # Characters allowed in a \uXXXX escape. ``int(text, 16)`` would also
    # accept signs, underscores, surrounding whitespace and an "0x" prefix.
    _HEX_DIGITS = frozenset("0123456789abcdefABCDEF")

    def __init__(self) -> None:
        """Create a parser that expects the start of a top-level value."""
        self._handlers: dict[str, list[Callback]] = {e: [] for e in EVENTS}

        # Parser state machine.
        self._state = _VALUE
        # Stack of containers we're inside. Each frame is either:
        #   ["object", current_key_or_None]
        #   ["array", current_index]
        self._stack: list[list[Any]] = []

        # Token accumulation buffers.
        self._buf = ""  # leftover, not-yet-consumed input
        self._in_string = False
        self._string_chars: list[str] = []
        self._escape = False
        self._unicode: Optional[str] = None  # accumulating \uXXXX digits
        # High surrogate from a \uXXXX escape that is still waiting for its
        # low-surrogate partner (may be split across feed() calls).
        self._high_surrogate: Optional[int] = None
        self._string_is_key = False

        self._in_number = False
        self._number_chars: list[str] = []

        self._in_literal = False  # true/false/null
        self._literal_chars: list[str] = []

        self._closed = False

    # ------------------------------------------------------------------ #
    # Public API
    # ------------------------------------------------------------------ #
    def on(self, event: str, callback: Callback) -> "Parser":
        """Register *callback* for *event*. Returns ``self`` for chaining.

        Several callbacks may be registered for the same event; they are
        called in registration order.

        Raises:
            ValueError: if *event* is not one of :data:`EVENTS`.
        """
        if event not in self._handlers:
            raise ValueError(
                f"Unknown event {event!r}. Valid events: {', '.join(EVENTS)}"
            )
        self._handlers[event].append(callback)
        return self

    def feed(self, chunk: str) -> None:
        """Feed a chunk of JSON text into the parser, firing any events.

        An empty *chunk* is a no-op.

        Raises:
            ParseError: if the parser is already closed or the text is not
                well-formed JSON.
        """
        if self._closed:
            raise ParseError("Cannot feed() a closed parser.")
        if not chunk:
            return
        self._buf += chunk
        self._consume()

    def close(self) -> None:
        """Signal end of input and finalize any pending token.

        Calling it again after a successful close does nothing.

        Raises :class:`ParseError` if the document is incomplete (e.g. an
        unterminated string or an unclosed container).
        """
        if self._closed:
            return

        # A trailing number/literal at EOF must be flushed since its end is
        # otherwise only detected by the following delimiter.
        if self._in_number:
            self._emit_number()
        elif self._in_literal:
            self._emit_literal()

        # Skip trailing whitespace.
        rest = self._buf.lstrip(_WHITESPACE)
        self._buf = ""

        if self._in_string:
            raise ParseError("Unexpected end of input: unterminated string.")
        if rest:
            raise ParseError(f"Trailing data after JSON value: {rest!r}")
        if self._stack:
            raise ParseError("Unexpected end of input: unclosed container.")
        if self._state in _VALUE_STATES:
            raise ParseError("Unexpected end of input: no value parsed.")

        self._closed = True

    @property
    def closed(self) -> bool:
        """Whether :meth:`close` has been called and validated successfully."""
        return self._closed

    # ------------------------------------------------------------------ #
    # Event dispatch
    # ------------------------------------------------------------------ #
    def _fire(self, event: str, *args: Any) -> None:
        """Call every handler registered for *event* with *args*."""
        for callback in self._handlers[event]:
            callback(*args)

    def _path(self) -> str:
        """Build a dotted/bracketed path describing the current location.

        Starts at ``$``; each object frame adds ``.key`` (nothing while no
        key has been read yet) and each array frame adds ``[index]``. Keys
        are inserted verbatim, without quoting or escaping.
        """
        parts: list[str] = ["$"]
        for kind, marker in self._stack:
            if kind == "object":
                parts.append(f".{marker}" if marker is not None else "")
            else:  # array
                parts.append(f"[{marker}]")
        return "".join(parts)

    # ------------------------------------------------------------------ #
    # Core consume loop
    # ------------------------------------------------------------------ #
    def _consume(self) -> None:
        """Process everything in ``self._buf`` and leave the buffer empty.

        Partial tokens are carried between calls in the dedicated
        accumulators (string/number/literal), not in ``self._buf``.
        """
        i = 0
        buf = self._buf
        n = len(buf)

        while i < n:
            ch = buf[i]

            if self._in_string:
                i = self._consume_string(buf, i, n)
                # _consume_string returns the index after the closing quote, or
                # n if the string is not yet complete.
                if self._in_string:
                    # Ran out of input mid-string; nothing left to buffer here.
                    self._buf = ""
                    return
                continue

            if self._in_number:
                if ch in _NUMBER_CHARS:
                    self._number_chars.append(ch)
                    i += 1
                    continue
                # Number terminated by a non-number char; ``i`` is not
                # advanced, so that char is reprocessed on the next pass.
                self._emit_number()
                continue

            if self._in_literal:
                if ch.isalpha():
                    self._literal_chars.append(ch)
                    i += 1
                    continue
                # Same as for numbers: emit, then reprocess the terminator.
                self._emit_literal()
                continue

            if ch in _WHITESPACE:
                i += 1
                continue

            i = self._consume_token(ch, i)

        self._buf = ""

    def _consume_string(self, buf: str, i: int, n: int) -> int:
        """Consume string content starting at *i*. Returns the next index.

        Returns the index just past the closing quote, or *n* when the
        buffer ended first (``self._in_string`` then stays ``True``). Escape
        state, including a half-read ``\\uXXXX`` or a pending high surrogate,
        is kept on the instance so a string may be split anywhere.

        Raises:
            ParseError: on an unknown escape or a non-hex digit in ``\\uXXXX``.
        """
        while i < n:
            ch = buf[i]
            i += 1

            if self._escape:
                self._escape = False
                if ch == "u":
                    self._unicode = ""
                elif ch in _ESCAPES:
                    self._flush_surrogate()
                    self._string_chars.append(_ESCAPES[ch])
                else:
                    raise ParseError(f"Invalid escape sequence: \\{ch}")
                continue

            if self._unicode is not None:
                if ch not in self._HEX_DIGITS:
                    bad = self._unicode + ch
                    raise ParseError(f"Invalid unicode escape: \\u{bad}")
                self._unicode += ch
                if len(self._unicode) == 4:
                    self._append_code_unit(int(self._unicode, 16))
                    self._unicode = None
                continue

            if ch == "\\":
                self._escape = True
                continue

            if ch == '"':
                self._flush_surrogate()
                self._in_string = False
                value = "".join(self._string_chars)
                self._string_chars = []
                self._finish_string(value)
                return i

            self._flush_surrogate()
            self._string_chars.append(ch)

        # Reached end of buffer without a closing quote.
        return i

    def _flush_surrogate(self) -> None:
        """Emit a pending high surrogate that turned out to be unpaired."""
        if self._high_surrogate is not None:
            self._string_chars.append(chr(self._high_surrogate))
            self._high_surrogate = None

    def _append_code_unit(self, unit: int) -> None:
        """Append the UTF-16 code unit from one ``\\uXXXX`` escape.

        A high surrogate is held back until the next escape shows whether a
        low surrogate follows; a matching pair becomes a single code point.
        Unpaired surrogates are passed through unchanged.
        """
        high = self._high_surrogate
        if high is not None:
            self._high_surrogate = None
            if 0xDC00 <= unit <= 0xDFFF:
                combined = 0x10000 + ((high - 0xD800) << 10) + (unit - 0xDC00)
                self._string_chars.append(chr(combined))
                return
            self._string_chars.append(chr(high))
        if 0xD800 <= unit <= 0xDBFF:
            self._high_surrogate = unit
        else:
            self._string_chars.append(chr(unit))

    def _consume_token(self, ch: str, i: int) -> int:
        """Handle a structural char or the start of a scalar. Returns next ``i``.

        Raises:
            ParseError: if *ch* is not allowed in the current state.
        """
        if ch == "{":
            self._expect_value_position()
            # Fire before pushing so the path names the object itself.
            self._fire("start_object", self._path())
            self._stack.append(["object", None])
            self._state = _OBJ_FIRST
            return i + 1

        if ch == "}":
            if self._state not in (_OBJ_FIRST, _OBJ_NEXT) or not self._stack:
                raise ParseError("Unexpected '}'.")
            self._pop_container("object", "end_object")
            return i + 1

        if ch == "[":
            self._expect_value_position()
            self._fire("start_array", self._path())
            self._stack.append(["array", 0])
            self._state = _ARR_FIRST  # expect a value or, immediately, ']'
            return i + 1

        if ch == "]":
            if self._state not in (_ARR_FIRST, _ARR_NEXT) or not self._stack:
                raise ParseError("Unexpected ']'.")
            self._pop_container("array", "end_array")
            return i + 1

        if ch == ",":
            if self._state == _OBJ_NEXT:
                self._state = _OBJ_KEY
            elif self._state == _ARR_NEXT:
                self._stack[-1][1] += 1  # advance index
                # _VALUE (not _ARR_FIRST): a ']' right after ',' is rejected.
                self._state = _VALUE
            else:
                raise ParseError("Unexpected ','.")
            return i + 1

        if ch == ":":
            if self._state != _OBJ_COLON:
                raise ParseError("Unexpected ':'.")
            self._state = _VALUE
            return i + 1

        if ch == '"':
            # Start of a string (either a key or a value).
            if self._state in (_OBJ_FIRST, _OBJ_KEY):
                self._string_is_key = True
            else:
                self._expect_value_position()
                self._string_is_key = False
            self._in_string = True
            self._string_chars = []
            return i + 1

        if ch in "-0123456789":
            self._expect_value_position()
            self._in_number = True
            self._number_chars = [ch]
            return i + 1

        if ch in "tfn":  # true / false / null
            self._expect_value_position()
            self._in_literal = True
            self._literal_chars = [ch]
            return i + 1

        raise ParseError(f"Unexpected character: {ch!r}")

    # ------------------------------------------------------------------ #
    # Token finalizers
    # ------------------------------------------------------------------ #
    def _expect_value_position(self) -> None:
        """Guard: the parser must currently be expecting a value."""
        if self._state not in _VALUE_STATES:
            raise ParseError(f"Unexpected value (parser state: {self._state}).")

    def _finish_string(self, value: str) -> None:
        """Report a completed string as a key or a value and advance state."""
        if self._string_is_key:
            self._stack[-1][1] = value  # record current key on the frame
            self._fire("key", self._path(), value)
            self._state = _OBJ_COLON
        else:
            self._fire("value", self._path(), value)
            self._after_value()

    def _emit_number(self) -> None:
        """Validate, convert and report the accumulated number token.

        The value is an ``int`` when ``int()`` accepts the text and a
        ``float`` otherwise.

        Raises:
            ParseError: if the text does not match the JSON number grammar.
        """
        text = "".join(self._number_chars)
        self._in_number = False
        self._number_chars = []
        if self._NUMBER_RE.fullmatch(text) is None:
            raise ParseError(f"Invalid number: {text!r}")
        try:
            value: Any = int(text)
        except ValueError:
            # Has a fraction or exponent, or int() refused it for another
            # reason; the grammar check above guarantees float() accepts it.
            value = float(text)
        self._fire("value", self._path(), value)
        self._after_value()

    def _emit_literal(self) -> None:
        """Report the accumulated ``true``/``false``/``null`` token.

        Raises:
            ParseError: if the text is not one of the three literals.
        """
        text = "".join(self._literal_chars)
        self._in_literal = False
        self._literal_chars = []
        if text not in _LITERALS:
            raise ParseError(f"Invalid literal: {text!r}")
        self._fire("value", self._path(), _LITERALS[text])
        self._after_value()

    def _after_value(self) -> None:
        """Advance state after a scalar value has been emitted."""
        if not self._stack:
            self._state = _DONE
            return
        kind = self._stack[-1][0]
        self._state = _OBJ_NEXT if kind == "object" else _ARR_NEXT

    def _pop_container(self, kind: str, end_event: str) -> None:
        """Close the innermost container, fire *end_event*, and advance state.

        Raises:
            ParseError: if the innermost container is not of type *kind*.
        """
        # Pop first so the path points at the container itself (via the parent's
        # key/index) rather than at the container's last child.
        frame = self._stack.pop()
        if frame[0] != kind:
            raise ParseError(f"Mismatched container close for {kind!r}.")
        self._fire(end_event, self._path())
        if not self._stack:
            self._state = _DONE
        else:
            parent_kind = self._stack[-1][0]
            self._state = _OBJ_NEXT if parent_kind == "object" else _ARR_NEXT
|
|
385
|
+
|
|
386
|
+
|
|
387
|
+
def parse(text: str, **handlers: Callback) -> Parser:
    """Run a whole document through a fresh parser in a single call.

    Each keyword argument names an event and supplies its callback; the
    text is then fed in one piece and the parser is closed.

    Example::

        parse('{"a": 1}', value=lambda path, v: print(path, v))

    Returns the (closed) parser that was used.
    """
    instance = Parser()
    for name in handlers:
        instance.on(name, handlers[name])
    instance.feed(text)
    instance.close()
    return instance
|
jsonsax/py.typed
ADDED
|
File without changes
|
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: jsonsax
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightweight, dependency-free streaming (SAX-style) JSON parser.
|
|
5
|
+
Project-URL: Homepage, https://github.com/chandrapenugonda/jsonsax
|
|
6
|
+
Project-URL: Repository, https://github.com/chandrapenugonda/jsonsax
|
|
7
|
+
Project-URL: Issues, https://github.com/chandrapenugonda/jsonsax/issues
|
|
8
|
+
Author: chandrapenugonda
|
|
9
|
+
License: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: incremental,json,llm,parser,sax,streaming
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
22
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
23
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
24
|
+
Classifier: Typing :: Typed
|
|
25
|
+
Requires-Python: >=3.9
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: mypy>=1.8; extra == 'dev'
|
|
28
|
+
Requires-Dist: pylint>=3; extra == 'dev'
|
|
29
|
+
Requires-Dist: pytest>=7; extra == 'dev'
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
|
|
32
|
+
# jsonsax
|
|
33
|
+
|
|
34
|
+
**Read JSON while it is still arriving — don't wait for the whole thing.**
|
|
35
|
+
|
|
36
|
+
Imagine someone is reading you a long story out loud, one word at a time. You
|
|
37
|
+
don't wait for them to finish the whole book before you start listening — you
|
|
38
|
+
react to each part as you hear it. `jsonsax` does that for JSON.
|
|
39
|
+
|
|
40
|
+
Normally a computer waits for the *entire* JSON to show up, then reads it.
|
|
41
|
+
`jsonsax` is different: you hand it little pieces as they arrive, and it taps
|
|
42
|
+
you on the shoulder and says *"hey, I just found a name!"*, *"hey, here's a
|
|
43
|
+
number!"* — right away, piece by piece.
|
|
44
|
+
|
|
45
|
+
This style of reading-as-you-go is called a **streaming** (or **SAX-style**)
|
|
46
|
+
parser. (XML has had one for years; this is the same idea for JSON.)
|
|
47
|
+
|
|
48
|
+
### Why would you want that?
|
|
49
|
+
|
|
50
|
+
- 🤖 **Talking to an AI** — chatbots send their answer one word at a time. With
|
|
51
|
+
`jsonsax` you can start using the first part of the answer before the rest
|
|
52
|
+
has even arrived.
|
|
53
|
+
- 🐘 **Huge files** — a JSON file too big to fit in memory? Read it in small
|
|
54
|
+
sips instead of swallowing it whole.
|
|
55
|
+
- ⚡ **Show things sooner** — display the title of an article the instant it
|
|
56
|
+
appears, without waiting for the whole article.
|
|
57
|
+
|
|
58
|
+
---
|
|
59
|
+
|
|
60
|
+
## Install
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pip install jsonsax
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
That's it. No other stuff gets installed — `jsonsax` has **zero dependencies**.
|
|
67
|
+
|
|
68
|
+
---
|
|
69
|
+
|
|
70
|
+
## The tiniest example
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
from jsonsax import parse
|
|
74
|
+
|
|
75
|
+
parse('{"name": "Bo", "age": 5}', value=lambda path, val: print(path, "=", val))
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
Output:
|
|
79
|
+
|
|
80
|
+
```
|
|
81
|
+
$.name = Bo
|
|
82
|
+
$.age = 5
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
`$` means "the start". `$.name` means "the `name` part". Think of it as an
|
|
86
|
+
address that tells you **where** in the JSON you are.
|
|
87
|
+
|
|
88
|
+
---
|
|
89
|
+
|
|
90
|
+
## Feeding it bit by bit (the fun part)
|
|
91
|
+
|
|
92
|
+
Real streams don't arrive all at once. Watch what happens when the JSON shows
|
|
93
|
+
up in messy little chunks — even cut in the middle of a word:
|
|
94
|
+
|
|
95
|
+
```python
|
|
96
|
+
from jsonsax import Parser
|
|
97
|
+
|
|
98
|
+
parser = Parser()
|
|
99
|
+
parser.on("value", lambda path, val: print("found:", path, "=", val))
|
|
100
|
+
|
|
101
|
+
chunks = ['{"tit', 'le": "R', 'AG", "sco', 're": 9.5}']
|
|
102
|
+
for chunk in chunks:
|
|
103
|
+
parser.feed(chunk) # hand over one piece at a time
|
|
104
|
+
parser.close() # tell it "okay, that's everything"
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Output:
|
|
108
|
+
|
|
109
|
+
```
|
|
110
|
+
found: $.title = RAG
|
|
111
|
+
found: $.score = 9.5
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
Even though `"title"` got chopped into `"tit"` + `"le"`, `jsonsax` patiently
|
|
115
|
+
stitched it back together. 🧩
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Listening for different things ("events")
|
|
120
|
+
|
|
121
|
+
You tell `jsonsax` what you care about with `parser.on(...)`. Each time it sees
|
|
122
|
+
that kind of thing, it calls your little function (a *callback*).
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
from jsonsax import Parser
|
|
126
|
+
|
|
127
|
+
parser = Parser()
|
|
128
|
+
parser.on("start_object", lambda path: print(path, "{ ... an object starts"))
|
|
129
|
+
parser.on("end_object", lambda path: print(path, "} ... an object ends"))
|
|
130
|
+
parser.on("start_array", lambda path: print(path, "[ ... a list starts"))
|
|
131
|
+
parser.on("end_array", lambda path: print(path, "] ... a list ends"))
|
|
132
|
+
parser.on("key", lambda path, key: print(path, "key:", key))
|
|
133
|
+
parser.on("value", lambda path, val: print(path, "value:", repr(val)))
|
|
134
|
+
|
|
135
|
+
parse_me = '{"pets": ["cat", "dog"], "happy": true}'
|
|
136
|
+
for ch in parse_me:
|
|
137
|
+
parser.feed(ch)
|
|
138
|
+
parser.close()
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
Output:
|
|
142
|
+
|
|
143
|
+
```
|
|
144
|
+
$ { ... an object starts
|
|
145
|
+
$.pets key: pets
|
|
146
|
+
$.pets [ ... a list starts
|
|
147
|
+
$.pets[0] value: 'cat'
|
|
148
|
+
$.pets[1] value: 'dog'
|
|
149
|
+
$.pets ] ... a list ends
|
|
150
|
+
$.happy key: happy
|
|
151
|
+
$.happy value: True
|
|
152
|
+
$ } ... an object ends
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
See how `$.pets[0]` and `$.pets[1]` count the items in the list, just like
|
|
156
|
+
"first pet" and "second pet"?
|
|
157
|
+
|
|
158
|
+
---
|
|
159
|
+
|
|
160
|
+
## The events you can listen for
|
|
161
|
+
|
|
162
|
+
| Event | You get… | Happens when it sees… |
|
|
163
|
+
| -------------- | --------------- | -------------------------------------- |
|
|
164
|
+
| `start_object` | `path` | a `{` — an object is starting |
|
|
165
|
+
| `end_object` | `path` | a `}` — an object is finished |
|
|
166
|
+
| `start_array` | `path` | a `[` — a list is starting |
|
|
167
|
+
| `end_array` | `path` | a `]` — a list is finished |
|
|
168
|
+
| `key` | `path, key` | a label inside an object (like `name`) |
|
|
169
|
+
| `value` | `path, value` | a real value: text, number, true/false/null |
|
|
170
|
+
|
|
171
|
+
A `value` can be a `str`, an `int`, a `float`, `True`, `False`, or `None`
|
|
172
|
+
(JSON's `null` becomes Python's `None`).
|
|
173
|
+
|
|
174
|
+
---
|
|
175
|
+
|
|
176
|
+
## More examples (little recipes)
|
|
177
|
+
|
|
178
|
+
### 1. Grab just one field, ignore everything else
|
|
179
|
+
|
|
180
|
+
Only want the title? Only listen for it:
|
|
181
|
+
|
|
182
|
+
```python
|
|
183
|
+
from jsonsax import Parser
|
|
184
|
+
|
|
185
|
+
def on_value(path, val):
|
|
186
|
+
if path == "$.title":
|
|
187
|
+
print("The title is:", val)
|
|
188
|
+
|
|
189
|
+
p = Parser()
|
|
190
|
+
p.on("value", on_value)
|
|
191
|
+
p.feed('{"title": "Hello", "body": "long boring text..."}')
|
|
192
|
+
p.close()
|
|
193
|
+
# The title is: Hello
|
|
194
|
+
```
|
|
195
|
+
|
|
196
|
+
### 2. Build a flat `path → value` dictionary as you go
|
|
197
|
+
|
|
198
|
+
```python
|
|
199
|
+
from jsonsax import Parser
|
|
200
|
+
|
|
201
|
+
data = {}
|
|
202
|
+
p = Parser()
|
|
203
|
+
p.on("value", lambda path, val: data.__setitem__(path, val))
|
|
204
|
+
p.feed('{"a": 1, "b": 2, "c": 3}')
|
|
205
|
+
p.close()
|
|
206
|
+
print(data)
|
|
207
|
+
# {'$.a': 1, '$.b': 2, '$.c': 3}
|
|
208
|
+
```
|
|
209
|
+
|
|
210
|
+
### 3. Count the items in a list
|
|
211
|
+
|
|
212
|
+
```python
|
|
213
|
+
from jsonsax import Parser
|
|
214
|
+
|
|
215
|
+
count = 0
|
|
216
|
+
def bump(path, val):
|
|
217
|
+
global count
|
|
218
|
+
count += 1
|
|
219
|
+
|
|
220
|
+
p = Parser()
|
|
221
|
+
p.on("value", bump)
|
|
222
|
+
p.feed('[10, 20, 30, 40, 50]')
|
|
223
|
+
p.close()
|
|
224
|
+
print("items:", count) # items: 5
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
### 4. Deeply nested stuff is no problem
|
|
228
|
+
|
|
229
|
+
```python
|
|
230
|
+
from jsonsax import parse
|
|
231
|
+
|
|
232
|
+
parse(
|
|
233
|
+
'{"user": {"name": "Mia", "tags": ["a", "b"]}}',
|
|
234
|
+
value=lambda path, val: print(path, "=", val),
|
|
235
|
+
)
|
|
236
|
+
# $.user.name = Mia
|
|
237
|
+
# $.user.tags[0] = a
|
|
238
|
+
# $.user.tags[1] = b
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
### 5. All the value types at once
|
|
242
|
+
|
|
243
|
+
```python
|
|
244
|
+
from jsonsax import parse
|
|
245
|
+
|
|
246
|
+
parse(
|
|
247
|
+
'{"text": "hi", "whole": 42, "decimal": 3.14, "yes": true, "no": false, "nothing": null}',
|
|
248
|
+
value=lambda path, val: print(f"{path:14} -> {val!r}"),
|
|
249
|
+
)
|
|
250
|
+
# $.text -> 'hi'
|
|
251
|
+
# $.whole -> 42
|
|
252
|
+
# $.decimal -> 3.14
|
|
253
|
+
# $.yes -> True
|
|
254
|
+
# $.no -> False
|
|
255
|
+
# $.nothing -> None
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
### 6. Reacting to an AI that types its answer slowly
|
|
259
|
+
|
|
260
|
+
This is the big one. Pretend an AI sends its reply word-by-word:
|
|
261
|
+
|
|
262
|
+
```python
|
|
263
|
+
from jsonsax import Parser
|
|
264
|
+
|
|
265
|
+
# These pieces would normally come from the AI, one at a time.
|
|
266
|
+
ai_stream = ['{"head', 'line": "Big New', 's!", "summary": "It happened today."}']
|
|
267
|
+
|
|
268
|
+
p = Parser()
|
|
269
|
+
p.on("value", lambda path, val: print(f"[{path}] arrived: {val}"))
|
|
270
|
+
|
|
271
|
+
for piece in ai_stream:
|
|
272
|
+
p.feed(piece) # the moment a field finishes, you hear about it
|
|
273
|
+
p.close()
|
|
274
|
+
# [$.headline] arrived: Big News!
|
|
275
|
+
# [$.summary] arrived: It happened today.
|
|
276
|
+
```
|
|
277
|
+
|
|
278
|
+
### 7. Chain your setup in one breath
|
|
279
|
+
|
|
280
|
+
`on(...)` hands you the parser back, so you can line them up:
|
|
281
|
+
|
|
282
|
+
```python
|
|
283
|
+
from jsonsax import Parser
|
|
284
|
+
|
|
285
|
+
p = (
|
|
286
|
+
Parser()
|
|
287
|
+
.on("key", lambda path, k: print("key", k))
|
|
288
|
+
.on("value", lambda path, v: print("value", v))
|
|
289
|
+
)
|
|
290
|
+
p.feed('{"x": 1}')
|
|
291
|
+
p.close()
|
|
292
|
+
```
|
|
293
|
+
|
|
294
|
+
---
|
|
295
|
+
|
|
296
|
+
## When the JSON is broken
|
|
297
|
+
|
|
298
|
+
If the JSON is messy or unfinished, `jsonsax` tells you by raising a
|
|
299
|
+
`ParseError` (which is just a special kind of Python `ValueError`). It is
|
|
300
|
+
**strict** on purpose — better to shout early than to quietly hand you wrong data.
|
|
301
|
+
|
|
302
|
+
```python
|
|
303
|
+
from jsonsax import parse, ParseError
|
|
304
|
+
|
|
305
|
+
broken_examples = [
|
|
306
|
+
'{"a": 1,}', # extra comma at the end
|
|
307
|
+
'[1, 2', # forgot to close the list
|
|
308
|
+
'{"a" 1}', # missing the ':' between key and value
|
|
309
|
+
'"never ends', # string with no closing quote
|
|
310
|
+
'true false', # two things glued together
|
|
311
|
+
]
|
|
312
|
+
|
|
313
|
+
for bad in broken_examples:
|
|
314
|
+
try:
|
|
315
|
+
parse(bad)
|
|
316
|
+
except ParseError as error:
|
|
317
|
+
print("rejected:", bad, "->", error)
|
|
318
|
+
```
|
|
319
|
+
|
|
320
|
+
Output (the exact wording may vary slightly):
|
|
321
|
+
|
|
322
|
+
```
|
|
323
|
+
rejected: {"a": 1,} -> Unexpected '}'.
|
|
324
|
+
rejected: [1, 2 -> Unexpected end of input: unclosed container.
|
|
325
|
+
rejected: {"a" 1} -> Unexpected value (parser state: obj_colon).
|
|
326
|
+
rejected: "never ends -> Unexpected end of input: unterminated string.
|
|
327
|
+
rejected: true false -> Unexpected value (parser state: done).
|
|
328
|
+
```
|
|
329
|
+
|
|
330
|
+
> **Always call `parser.close()` at the end.** That's the moment `jsonsax`
|
|
331
|
+
> double-checks that the JSON was actually complete. Forgetting it means you
|
|
332
|
+
> might miss the "you're missing the last `}`!" warning.
|
|
333
|
+
|
|
334
|
+
---
|
|
335
|
+
|
|
336
|
+
## Run it from the terminal (no code needed)
|
|
337
|
+
|
|
338
|
+
You can pipe JSON straight into `jsonsax` to watch the events scroll by:
|
|
339
|
+
|
|
340
|
+
```bash
|
|
341
|
+
echo '{"x": [1, 2, true]}' | python -m jsonsax
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
Output:
|
|
345
|
+
|
|
346
|
+
```
|
|
347
|
+
$ {
|
|
348
|
+
$.x key='x'
|
|
349
|
+
$.x [
|
|
350
|
+
$.x[0] value=1
|
|
351
|
+
$.x[1] value=2
|
|
352
|
+
$.x[2] value=True
|
|
353
|
+
$.x ]
|
|
354
|
+
$ }
|
|
355
|
+
```
|
|
356
|
+
|
|
357
|
+
---
|
|
358
|
+
|
|
359
|
+
## The whole toolbox (quick reference)
|
|
360
|
+
|
|
361
|
+
```python
|
|
362
|
+
from jsonsax import Parser, parse, ParseError, EVENTS
|
|
363
|
+
|
|
364
|
+
parser = Parser() # make a new reader
|
|
365
|
+
parser.on(event, callback) # "when you see <event>, call <callback>" (returns parser)
|
|
366
|
+
parser.feed(chunk) # give it the next piece of text
|
|
367
|
+
parser.close() # "that's all" — checks the JSON was complete
|
|
368
|
+
parser.closed # True after a successful close()
|
|
369
|
+
|
|
370
|
+
parse(text, **handlers) # shortcut: feed + close in one line
|
|
371
|
+
ParseError # raised when the JSON is broken (a subclass of ValueError)
|
|
372
|
+
EVENTS # the tuple of all valid event names
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
Good to know:
|
|
376
|
+
|
|
377
|
+
- **Truly incremental** — chunks can split *anywhere*, even in the middle of a
|
|
378
|
+
word, a number, or a `\uXXXX` escape.
|
|
379
|
+
- **Strict** — rejects trailing commas, missing colons, leftover junk, and
|
|
380
|
+
unfinished strings or brackets.
|
|
381
|
+
- **Typed** — ships with `py.typed`, so type checkers understand it.
|
|
382
|
+
- **Tiny & dependency-free**, works on **Python 3.9+**.
|
|
383
|
+
|
|
384
|
+
---
|
|
385
|
+
|
|
386
|
+
## For developers (working on jsonsax itself)
|
|
387
|
+
|
|
388
|
+
```bash
|
|
389
|
+
pip install -e ".[dev]"
|
|
390
|
+
pytest # run the tests
|
|
391
|
+
pylint src/jsonsax # check the style
|
|
392
|
+
mypy # check the types
|
|
393
|
+
```
|
|
394
|
+
|
|
395
|
+
---
|
|
396
|
+
|
|
397
|
+
## License
|
|
398
|
+
|
|
399
|
+
MIT — free to use, change, and share.
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
jsonsax/__init__.py,sha256=2OiJOFp2ei3n2v1iTVkMJbbsJf_KcoyeU_WpYbKLl98,963
|
|
2
|
+
jsonsax/__main__.py,sha256=q_NaeTuWqEcKQap2jAIqclRjZXaauc06-cV72kFvn3s,1308
|
|
3
|
+
jsonsax/parser.py,sha256=bD6GzCrfeq2aZfUKnsESCgMNqcr1DSqQWqODDQjGk08,13984
|
|
4
|
+
jsonsax/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
5
|
+
jsonsax-0.1.0.dist-info/METADATA,sha256=nw6-w9uLqVlCN0iWFsLKJ52GFNj1LlujhNKVOx7z7jQ,11014
|
|
6
|
+
jsonsax-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
7
|
+
jsonsax-0.1.0.dist-info/licenses/LICENSE,sha256=GJEDtR4g0kMYMwK12RxCZj4LY_QCp0IdCQK0dr8OTNY,1073
|
|
8
|
+
jsonsax-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 chandrapenugonda
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|