xml-sax-ts 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -26,22 +26,40 @@ npm install xml-sax-ts
26
26
 
27
27
  ## Quick start
28
28
 
29
- ### SAX streaming
29
+ ### Token streaming (sync)
30
30
 
31
31
  ```ts
32
- import { XmlSaxParser } from "xml-sax-ts";
32
+ import { CloseTagToken, OpenTagToken, TextToken, XmlSaxParser } from "xml-sax-ts";
33
33
 
34
- const parser = new XmlSaxParser({
35
- onOpenTag: (tag) => console.log("open", tag.name, tag.attributes),
36
- onText: (text) => console.log("text", text),
37
- onCloseTag: (tag) => console.log("close", tag.name),
38
- });
34
+ const parser = new XmlSaxParser();
39
35
 
40
- parser.feed("<root>");
41
- parser.feed("Hello</root>");
36
+ for (const token of parser.feed("<root>")) {
37
+ if (token instanceof OpenTagToken) console.log("open", token.tag.name);
38
+ }
39
+ for (const token of parser.feed("Hello</root>")) {
40
+ if (token instanceof TextToken) console.log("text", token.text);
41
+ if (token instanceof CloseTagToken) console.log("close", token.tag.name);
42
+ }
42
43
  parser.close();
43
44
  ```
44
45
 
46
+ ### Token streaming (async)
47
+
48
+ ```ts
49
+ import { OpenTagToken, tokenizeXmlAsync } from "xml-sax-ts";
50
+
51
+ async function* chunks(): AsyncGenerator<string> {
52
+ yield "<root><item>1</item>";
53
+ yield "<item>2</item></root>";
54
+ }
55
+
56
+ for await (const token of tokenizeXmlAsync(chunks())) {
57
+ if (token instanceof OpenTagToken) {
58
+ console.log(token.depth, token.path.join("/"));
59
+ }
60
+ }
61
+ ```
62
+
45
63
  ### Parse to tree
46
64
 
47
65
  ```ts
@@ -64,19 +82,25 @@ const obj = buildObject(root);
64
82
  ### Streaming object builder
65
83
 
66
84
  ```ts
67
- import { ObjectBuilder, XmlSaxParser } from "xml-sax-ts";
85
+ import { CdataToken, CloseTagToken, OpenTagToken, TextToken, ObjectBuilder, XmlSaxParser } from "xml-sax-ts";
68
86
 
69
87
  const builder = new ObjectBuilder();
70
- const parser = new XmlSaxParser({
71
- onOpenTag: builder.onOpenTag,
72
- onText: builder.onText,
73
- onCdata: builder.onCdata,
74
- onCloseTag: builder.onCloseTag
75
- });
88
+ const parser = new XmlSaxParser();
89
+
90
+ const consume = (token: unknown): void => {
91
+ if (
92
+ token instanceof OpenTagToken ||
93
+ token instanceof TextToken ||
94
+ token instanceof CdataToken ||
95
+ token instanceof CloseTagToken
96
+ ) {
97
+ builder.consume(token);
98
+ }
99
+ };
76
100
 
77
- parser.feed("<root><item>1</item>");
78
- parser.feed("<item>2</item></root>");
79
- parser.close();
101
+ for (const token of parser.feed("<root><item>1</item>")) consume(token);
102
+ for (const token of parser.feed("<item>2</item></root>")) consume(token);
103
+ for (const token of parser.close()) consume(token);
80
104
 
81
105
  const obj = builder.getResult();
82
106
  // { item: ["1", "2"] }
@@ -141,6 +165,47 @@ Quick run (fewer rounds):
141
165
  npm run bench:quick
142
166
  ```
143
167
 
168
+ ### Large file streaming benchmark (2.5 GB)
169
+
170
+ This benchmark parses a synthetic XML document containing 10 large text blobs of 250 MB each.
171
+ The parser runs in true streaming mode and does not require building a full tree in memory.
172
+
173
+ Run in Node:
174
+
175
+ ```bash
176
+ npm run bench:large:node
177
+ ```
178
+
179
+ Optional smaller smoke run:
180
+
181
+ ```bash
182
+ npm run bench:large:node:sample
183
+ ```
184
+
185
+ Environment variables for `bench:large:node`:
186
+
187
+ - `LARGE_BLOB_SIZE_MB` (default `250`)
188
+ - `LARGE_BLOB_COUNT` (default `10`)
189
+ - `LARGE_XML_CHUNK_MB` (default `1`)
190
+ - `LARGE_XML_XMLNS` (`1` enables namespaces; default `false`)
191
+ - `LARGE_XML_COALESCE_TEXT` (`1` enables text coalescing; default `false`)
192
+ - `LARGE_XML_REGENERATE=1` to rebuild the generated dataset
193
+ - `LARGE_XML_FILE` to override the dataset path
194
+
195
+ Browser benchmark page:
196
+
197
+ 1. Build the package: `npm run build`
198
+ 2. Serve the repository root with any static server
199
+ 3. Open `benchmarks/browser/large-bench.html`
200
+ 4. Select a large XML file and run the benchmark
201
+
202
+ Latest large-file run (measured on the maintainer's machine; results vary by hardware):
203
+
204
+ | Environment | Dataset | Parser settings | Elapsed | Throughput | Peak memory |
205
+ | --- | --- | --- | ---: | ---: | ---: |
206
+ | Node v24.7.0 (darwin arm64) | `10 x 250 MB` blobs (`2.44 GB` on disk) | `xmlns=false`, `coalesceText=false`, `trackPosition=false`, `1 MB` read chunks | `1.75 s` | `1,426.44 MB/s` | `217.8 MB RSS` |
207
+ | Browser (local run) | `10 x 250 MB` blobs (`2.44 GB` on disk) | `xmlns=false`, `coalesceText=false`, `trackPosition=false`, `1024 KB` chunk size | `1.97 s` | `1,272.2 MB/s` | `63.1 MB JS heap (end)` |
208
+
144
209
  The benchmark now runs multiple rounds and reports median/mean/stddev for better comparability.
145
210
 
146
211
  - `xml-sax-ts:sax` scenarios measure streaming event parsing
@@ -249,46 +314,72 @@ Current status for this environment: comparable runs show `xml-sax-ts` at `0.971
249
314
  new XmlSaxParser(options?: ParserOptions)
250
315
  ```
251
316
 
252
- | Method. | Description |
253
- | ------------- | ------------------------------------------ |
254
- | `feed(chunk)` | Feed a string chunk to the parser |
255
- | `close()` | Signal end-of-input and validate open tags |
317
+ | Method | Description |
318
+ | --------------------- | ----------------------------------------------------------------------------------- |
319
+ | `feed(chunk)` | Feed one XML chunk and return parsed tokens for that chunk |
320
+ | `close()` | Finalize parsing, validate state, and return remaining tokens plus `EndToken` |
321
+ | `drainTokens()` | Return and clear buffered tokens (usually empty if you consume `feed`/`close` return values) |
322
+ | `[Symbol.iterator]()` | Iterate currently buffered tokens |
323
+ | `iterateChunks(src)` | Async iterator over an `Iterable<string>` or `AsyncIterable<string>` chunk source |
256
324
 
257
325
  #### `ParserOptions`
258
326
 
259
- | Option | Type | Default | Description |
260
- | ----------------------------- | ---------- | ------- | ---------------------------------------------- |
261
- | `xmlns` | `boolean` | `true` | Enable namespace resolution |
262
- | `includeNamespaceAttributes` | `boolean` | `false` | Include `xmlns:*` attributes in tag output |
263
- | `allowDoctype` | `boolean` | `true` | Allow `<!DOCTYPE …>` declarations |
264
- | `coalesceText` | `boolean` | `true` | Merge adjacent text callbacks into one event |
265
- | `trackPosition` | `boolean` | `true` | Track line/column; disable for faster parsing |
266
- | `onOpenTag` | `function` | — | Called for each opening / self-closing tag |
267
- | `onCloseTag` | `function` | — | Called for each closing tag |
268
- | `onText` | `function` | — | Called for text nodes |
269
- | `onCdata` | `function` | — | Called for CDATA sections |
270
- | `onComment` | `function` | — | Called for comments |
271
- | `onProcessingInstruction` | `function` | — | Called for processing instructions (`<?…?>`) |
272
- | `onDoctype` | `function` | — | Called for DOCTYPE declarations |
273
- | `onError` | `function` | — | Called on parse errors |
274
-
275
- By default (`coalesceText: true`), adjacent text chunks are merged and emitted as one `onText` callback per structural boundary. Set `coalesceText: false` to receive text callbacks exactly as chunk boundaries are parsed.
327
+ | Option | Type | Default | Description |
328
+ | ---------------------------- | --------- | ------- | ---------------------------------------------- |
329
+ | `xmlns` | `boolean` | `true` | Enable namespace resolution |
330
+ | `includeNamespaceAttributes` | `boolean` | `false` | Include `xmlns:*` attributes in tag output |
331
+ | `allowDoctype` | `boolean` | `true` | Allow `<!DOCTYPE …>` declarations |
332
+ | `coalesceText` | `boolean` | `true` | Merge adjacent text tokens into a single token |
333
+ | `trackPosition` | `boolean` | `true` | Track line/column; disable for faster parsing |
334
+
335
+ By default (`coalesceText: true`), adjacent text chunks are merged and emitted as one `TextToken` per structural boundary. Set `coalesceText: false` to receive a separate `TextToken` for each text segment exactly as chunk boundaries are parsed.
276
336
 
277
337
  `trackPosition` controls line/column tracking for parser errors. When set to `false`, parsing is faster and `XmlSaxError` still reports `offset`, while `line` and `column` are set to `0`.
278
338
 
279
- Event payload note (breaking change): with `xmlns: false`, parser events now emit plain-mode tag shapes aligned with `saxes` performance semantics.
339
+ Token payload note: with `xmlns: false`, `OpenTagToken` and `CloseTagToken` use plain-mode tag shapes aligned with `saxes` performance semantics.
280
340
 
281
- - `onOpenTag(tag).attributes` values are strings (not `XmlAttribute` objects)
282
- - `onOpenTag(tag)` and `onCloseTag(tag)` omit `prefix`, `local`, and `uri`
341
+ - `OpenTagToken.tag.attributes` values are strings (not `XmlAttribute` objects)
342
+ - `OpenTagToken.tag` and `CloseTagToken.tag` omit `prefix`, `local`, and `uri`
283
343
  - With `xmlns: true`, full namespace metadata remains present
284
344
 
345
+ ### Tokens
346
+
347
+ Token classes:
348
+
349
+ - `OpenTagToken`
350
+ - `CloseTagToken`
351
+ - `TextToken`
352
+ - `CdataToken`
353
+ - `CommentToken`
354
+ - `ProcessingInstructionToken`
355
+ - `DoctypeToken`
356
+ - `EndToken`
357
+
358
+ All token classes derive from `XmlToken` and include:
359
+
360
+ - `kind`
361
+ - `position` (`{ offset, line, column }` when `trackPosition` is enabled)
362
+
363
+ `OpenTagToken` and `CloseTagToken` also include:
364
+
365
+ - `depth`
366
+ - `path`
367
+
368
+ ### `tokenizeXml(xml, options?)`
369
+
370
+ Convenience helper for one-shot tokenization of a complete XML string.
371
+
372
+ ### `tokenizeXmlAsync(chunks, options?)`
373
+
374
+ Convenience async generator for iterating tokens from an `Iterable<string>` or `AsyncIterable<string>` source.
375
+
285
376
  ### `parseXmlString(xml, options?)`
286
377
 
287
378
  Convenience function that parses a complete XML string into an `XmlNode` tree using `XmlSaxParser` + `TreeBuilder` internally.
288
379
 
289
380
  ### `TreeBuilder`
290
381
 
291
- Low-level tree builder. Attach its `onOpenTag`, `onText`, `onCdata`, and `onCloseTag` methods to a parser, then call `getRoot()` to retrieve the resulting `XmlNode`.
382
+ Low-level tree builder. Consume parser tokens via `consume(token)` and call `getRoot()` to retrieve the resulting `XmlNode`.
292
383
 
293
384
  ### `buildObject(root, options?)`
294
385
 
@@ -296,7 +387,7 @@ Projects an `XmlNode` tree into a plain object. Attributes are prefixed (default
296
387
 
297
388
  ### `ObjectBuilder`
298
389
 
299
- Streaming builder that produces the same object shape as `buildObject` without building a full `XmlNode` tree. Attach its `onOpenTag`, `onText`, `onCdata`, and `onCloseTag` methods to the parser.
390
+ Streaming builder that produces the same object shape as `buildObject` without building a full `XmlNode` tree. Consume parser tokens via `consume(token)`.
300
391
 
301
392
  #### `ObjectBuilderOptions`
302
393
 
@@ -345,7 +436,7 @@ Custom error class thrown on parse errors. Includes `offset`, `line`, and `colum
345
436
 
346
437
  ### Exported types
347
438
 
348
- `OpenTag` · `CloseTag` · `XmlAttribute` · `ProcessingInstruction` · `Doctype` · `XmlNode` · `XmlChild` · `XmlPosition` · `ParserOptions` · `SerializeOptions` · `ObjectBuilderOptions` · `ArrayElementSelector` · `XmlObjectMap` · `XmlObjectValue` · `XmlBuilderOptions` · `XmlInputObject` · `XmlInputValue` · `ObjectToXmlOptions`
439
+ `XmlTokenKind` · `XmlAnyToken` · `OpenTag` · `CloseTag` · `XmlAttribute` · `ProcessingInstruction` · `Doctype` · `XmlNode` · `XmlChild` · `XmlPosition` · `XmlChunkIterable` · `ParserOptions` · `SerializeOptions` · `ObjectBuilderOptions` · `ArrayElementSelector` · `XmlObjectMap` · `XmlObjectValue` · `XmlBuilderOptions` · `XmlInputObject` · `XmlInputValue` · `ObjectToXmlOptions`
349
440
 
350
441
  ## Features
351
442