xml-sax-ts 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +137 -46
- package/dist/index.cjs +273 -118
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +67 -28
- package/dist/index.d.ts +67 -28
- package/dist/index.js +263 -119
- package/dist/index.js.map +1 -1
- package/package.json +6 -3
package/README.md
CHANGED
|
@@ -26,22 +26,40 @@ npm install xml-sax-ts
|
|
|
26
26
|
|
|
27
27
|
## Quick start
|
|
28
28
|
|
|
29
|
-
###
|
|
29
|
+
### Token streaming (sync)
|
|
30
30
|
|
|
31
31
|
```ts
|
|
32
|
-
import { XmlSaxParser } from "xml-sax-ts";
|
|
32
|
+
import { CloseTagToken, OpenTagToken, TextToken, XmlSaxParser } from "xml-sax-ts";
|
|
33
33
|
|
|
34
|
-
const parser = new XmlSaxParser(
|
|
35
|
-
onOpenTag: (tag) => console.log("open", tag.name, tag.attributes),
|
|
36
|
-
onText: (text) => console.log("text", text),
|
|
37
|
-
onCloseTag: (tag) => console.log("close", tag.name),
|
|
38
|
-
});
|
|
34
|
+
const parser = new XmlSaxParser();
|
|
39
35
|
|
|
40
|
-
parser.feed("<root>")
|
|
41
|
-
|
|
36
|
+
for (const token of parser.feed("<root>")) {
|
|
37
|
+
if (token instanceof OpenTagToken) console.log("open", token.tag.name);
|
|
38
|
+
}
|
|
39
|
+
for (const token of parser.feed("Hello</root>")) {
|
|
40
|
+
if (token instanceof TextToken) console.log("text", token.text);
|
|
41
|
+
if (token instanceof CloseTagToken) console.log("close", token.tag.name);
|
|
42
|
+
}
|
|
42
43
|
parser.close();
|
|
43
44
|
```
|
|
44
45
|
|
|
46
|
+
### Token streaming (async)
|
|
47
|
+
|
|
48
|
+
```ts
|
|
49
|
+
import { OpenTagToken, tokenizeXmlAsync } from "xml-sax-ts";
|
|
50
|
+
|
|
51
|
+
async function* chunks(): AsyncGenerator<string> {
|
|
52
|
+
yield "<root><item>1</item>";
|
|
53
|
+
yield "<item>2</item></root>";
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
for await (const token of tokenizeXmlAsync(chunks())) {
|
|
57
|
+
if (token instanceof OpenTagToken) {
|
|
58
|
+
console.log(token.depth, token.path.join("/"));
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
```
|
|
62
|
+
|
|
45
63
|
### Parse to tree
|
|
46
64
|
|
|
47
65
|
```ts
|
|
@@ -64,19 +82,25 @@ const obj = buildObject(root);
|
|
|
64
82
|
### Streaming object builder
|
|
65
83
|
|
|
66
84
|
```ts
|
|
67
|
-
import { ObjectBuilder, XmlSaxParser } from "xml-sax-ts";
|
|
85
|
+
import { CdataToken, CloseTagToken, OpenTagToken, TextToken, ObjectBuilder, XmlSaxParser } from "xml-sax-ts";
|
|
68
86
|
|
|
69
87
|
const builder = new ObjectBuilder();
|
|
70
|
-
const parser = new XmlSaxParser(
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
88
|
+
const parser = new XmlSaxParser();
|
|
89
|
+
|
|
90
|
+
const consume = (token: unknown): void => {
|
|
91
|
+
if (
|
|
92
|
+
token instanceof OpenTagToken ||
|
|
93
|
+
token instanceof TextToken ||
|
|
94
|
+
token instanceof CdataToken ||
|
|
95
|
+
token instanceof CloseTagToken
|
|
96
|
+
) {
|
|
97
|
+
builder.consume(token);
|
|
98
|
+
}
|
|
99
|
+
};
|
|
76
100
|
|
|
77
|
-
parser.feed("<root><item>1</item>");
|
|
78
|
-
parser.feed("<item>2</item></root>");
|
|
79
|
-
parser.close();
|
|
101
|
+
for (const token of parser.feed("<root><item>1</item>")) consume(token);
|
|
102
|
+
for (const token of parser.feed("<item>2</item></root>")) consume(token);
|
|
103
|
+
for (const token of parser.close()) consume(token);
|
|
80
104
|
|
|
81
105
|
const obj = builder.getResult();
|
|
82
106
|
// { item: ["1", "2"] }
|
|
@@ -141,6 +165,47 @@ Quick run (fewer rounds):
|
|
|
141
165
|
npm run bench:quick
|
|
142
166
|
```
|
|
143
167
|
|
|
168
|
+
### Large file streaming benchmark (2.5 GB)
|
|
169
|
+
|
|
170
|
+
This benchmark parses a synthetic XML document containing 10 large text blobs of 250 MB each.
|
|
171
|
+
The parser runs in true streaming mode and does not require building a full tree in memory.
|
|
172
|
+
|
|
173
|
+
Run in Node:
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
npm run bench:large:node
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
Optional smaller smoke run:
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
npm run bench:large:node:sample
|
|
183
|
+
```
|
|
184
|
+
|
|
185
|
+
Environment variables for `bench:large:node`:
|
|
186
|
+
|
|
187
|
+
- `LARGE_BLOB_SIZE_MB` (default `250`)
|
|
188
|
+
- `LARGE_BLOB_COUNT` (default `10`)
|
|
189
|
+
- `LARGE_XML_CHUNK_MB` (default `1`)
|
|
190
|
+
- `LARGE_XML_XMLNS` (set to `1` to enable namespace processing; disabled by default)
|
|
191
|
+
- `LARGE_XML_COALESCE_TEXT` (set to `1` to enable text coalescing; disabled by default)
|
|
192
|
+
- `LARGE_XML_REGENERATE=1` to rebuild the generated dataset
|
|
193
|
+
- `LARGE_XML_FILE` to override the dataset path
|
|
194
|
+
|
|
195
|
+
Browser benchmark page:
|
|
196
|
+
|
|
197
|
+
1. Build the package: `npm run build`
|
|
198
|
+
2. Serve the repository root with any static server
|
|
199
|
+
3. Open `benchmarks/browser/large-bench.html`
|
|
200
|
+
4. Select a large XML file and run the benchmark
|
|
201
|
+
|
|
202
|
+
Latest large-file run (measured locally; results vary by machine):
|
|
203
|
+
|
|
204
|
+
| Environment | Dataset | Parser settings | Elapsed | Throughput | Peak memory |
|
|
205
|
+
| --- | --- | --- | ---: | ---: | ---: |
|
|
206
|
+
| Node v24.7.0 (darwin arm64) | `10 x 250 MB` blobs (`2.44 GB` on disk) | `xmlns=false`, `coalesceText=false`, `trackPosition=false`, `1 MB` read chunks | `1.75 s` | `1,426.44 MB/s` | `217.8 MB RSS` |
|
|
207
|
+
| Browser (local run) | `10 x 250 MB` blobs (`2.44 GB` on disk) | `xmlns=false`, `coalesceText=false`, `trackPosition=false`, `1024 KB` chunk size | `1.97 s` | `1,272.2 MB/s` | `63.1 MB JS heap (end)` |
|
|
208
|
+
|
|
144
209
|
The benchmark now runs multiple rounds and reports median/mean/stddev for better comparability.
|
|
145
210
|
|
|
146
211
|
- `xml-sax-ts:sax` scenarios measure streaming event parsing
|
|
@@ -249,46 +314,72 @@ Current status for this environment: comparable runs show `xml-sax-ts` at `0.971
|
|
|
249
314
|
new XmlSaxParser(options?: ParserOptions)
|
|
250
315
|
```
|
|
251
316
|
|
|
252
|
-
| Method.
|
|
253
|
-
|
|
|
254
|
-
| `feed(chunk)`
|
|
255
|
-
| `close()`
|
|
317
|
+
| Method | Description |
|
|
318
|
+
| --------------------- | ----------------------------------------------------------------------------------- |
|
|
319
|
+
| `feed(chunk)` | Feed one XML chunk and return parsed tokens for that chunk |
|
|
320
|
+
| `close()` | Finalize parsing, validate state, and return remaining tokens plus `EndToken` |
|
|
321
|
+
| `drainTokens()` | Return and clear buffered tokens (usually empty if you consume `feed`/`close` return values) |
|
|
322
|
+
| `[Symbol.iterator]()` | Iterate currently buffered tokens |
|
|
323
|
+
| `iterateChunks(src)` | Async iterator over an `Iterable<string>` or `AsyncIterable<string>` chunk source |
|
|
256
324
|
|
|
257
325
|
#### `ParserOptions`
|
|
258
326
|
|
|
259
|
-
| Option
|
|
260
|
-
|
|
|
261
|
-
| `xmlns`
|
|
262
|
-
| `includeNamespaceAttributes`
|
|
263
|
-
| `allowDoctype`
|
|
264
|
-
| `coalesceText`
|
|
265
|
-
| `trackPosition`
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
| `onText` | `function` | — | Called for text nodes |
|
|
269
|
-
| `onCdata` | `function` | — | Called for CDATA sections |
|
|
270
|
-
| `onComment` | `function` | — | Called for comments |
|
|
271
|
-
| `onProcessingInstruction` | `function` | — | Called for processing instructions (`<?…?>`) |
|
|
272
|
-
| `onDoctype` | `function` | — | Called for DOCTYPE declarations |
|
|
273
|
-
| `onError` | `function` | — | Called on parse errors |
|
|
274
|
-
|
|
275
|
-
By default (`coalesceText: false`), streaming input can produce multiple consecutive `onText` callbacks that are logically adjacent. Enable `coalesceText: true` to receive one merged text callback per structural boundary.
|
|
327
|
+
| Option | Type | Default | Description |
|
|
328
|
+
| ---------------------------- | --------- | ------- | ---------------------------------------------- |
|
|
329
|
+
| `xmlns` | `boolean` | `true` | Enable namespace resolution |
|
|
330
|
+
| `includeNamespaceAttributes` | `boolean` | `false` | Include `xmlns:*` attributes in tag output |
|
|
331
|
+
| `allowDoctype` | `boolean` | `true` | Allow `<!DOCTYPE …>` declarations |
|
|
332
|
+
| `coalesceText` | `boolean` | `true` | Merge adjacent text tokens into a single token |
|
|
333
|
+
| `trackPosition` | `boolean` | `true` | Track line/column; disable for faster parsing |
|
|
334
|
+
|
|
335
|
+
By default (`coalesceText: true`), adjacent text chunks are merged and emitted as one `TextToken` per structural boundary. Set `coalesceText: false` to keep chunk-level text tokenization.
|
|
276
336
|
|
|
277
337
|
`trackPosition` controls line/column tracking for parser errors. When set to `false`, parsing is faster and `XmlSaxError` still reports `offset`, while `line` and `column` are set to `0`.
|
|
278
338
|
|
|
279
|
-
|
|
339
|
+
Token payload note: with `xmlns: false`, `OpenTagToken` and `CloseTagToken` carry a lighter, plain-mode tag shape (comparable to what `saxes` emits when namespace processing is disabled):
|
|
280
340
|
|
|
281
|
-
- `
|
|
282
|
-
- `
|
|
341
|
+
- `OpenTagToken.tag.attributes` values are strings (not `XmlAttribute` objects)
|
|
342
|
+
- `OpenTagToken.tag` and `CloseTagToken.tag` omit `prefix`, `local`, and `uri`
|
|
283
343
|
- With `xmlns: true`, full namespace metadata remains present
|
|
284
344
|
|
|
345
|
+
### Tokens
|
|
346
|
+
|
|
347
|
+
Token classes:
|
|
348
|
+
|
|
349
|
+
- `OpenTagToken`
|
|
350
|
+
- `CloseTagToken`
|
|
351
|
+
- `TextToken`
|
|
352
|
+
- `CdataToken`
|
|
353
|
+
- `CommentToken`
|
|
354
|
+
- `ProcessingInstructionToken`
|
|
355
|
+
- `DoctypeToken`
|
|
356
|
+
- `EndToken`
|
|
357
|
+
|
|
358
|
+
All token classes derive from `XmlToken` and include:
|
|
359
|
+
|
|
360
|
+
- `kind`
|
|
361
|
+
- `position` (`{ offset, line, column }` when `trackPosition` is enabled)
|
|
362
|
+
|
|
363
|
+
`OpenTagToken` and `CloseTagToken` also include:
|
|
364
|
+
|
|
365
|
+
- `depth`
|
|
366
|
+
- `path`
|
|
367
|
+
|
|
368
|
+
### `tokenizeXml(xml, options?)`
|
|
369
|
+
|
|
370
|
+
Convenience helper for one-shot tokenization of a complete XML string.
|
|
371
|
+
|
|
372
|
+
### `tokenizeXmlAsync(chunks, options?)`
|
|
373
|
+
|
|
374
|
+
Convenience async generator for iterating tokens from an `Iterable<string>` or `AsyncIterable<string>` source.
|
|
375
|
+
|
|
285
376
|
### `parseXmlString(xml, options?)`
|
|
286
377
|
|
|
287
378
|
Convenience function that parses a complete XML string into an `XmlNode` tree using `XmlSaxParser` + `TreeBuilder` internally.
|
|
288
379
|
|
|
289
380
|
### `TreeBuilder`
|
|
290
381
|
|
|
291
|
-
Low-level tree builder.
|
|
382
|
+
Low-level tree builder. Consume parser tokens via `consume(token)` and call `getRoot()` to retrieve the resulting `XmlNode`.
|
|
292
383
|
|
|
293
384
|
### `buildObject(root, options?)`
|
|
294
385
|
|
|
@@ -296,7 +387,7 @@ Projects an `XmlNode` tree into a plain object. Attributes are prefixed (default
|
|
|
296
387
|
|
|
297
388
|
### `ObjectBuilder`
|
|
298
389
|
|
|
299
|
-
Streaming builder that produces the same object shape as `buildObject` without building a full `XmlNode` tree.
|
|
390
|
+
Streaming builder that produces the same object shape as `buildObject` without building a full `XmlNode` tree. Consume parser tokens via `consume(token)`.
|
|
300
391
|
|
|
301
392
|
#### `ObjectBuilderOptions`
|
|
302
393
|
|
|
@@ -345,7 +436,7 @@ Custom error class thrown on parse errors. Includes `offset`, `line`, and `colum
|
|
|
345
436
|
|
|
346
437
|
### Exported types
|
|
347
438
|
|
|
348
|
-
`OpenTag` · `CloseTag` · `XmlAttribute` · `ProcessingInstruction` · `Doctype` · `XmlNode` · `XmlChild` · `XmlPosition` · `ParserOptions` · `SerializeOptions` · `ObjectBuilderOptions` · `ArrayElementSelector` · `XmlObjectMap` · `XmlObjectValue` · `XmlBuilderOptions` · `XmlInputObject` · `XmlInputValue` · `ObjectToXmlOptions`
|
|
439
|
+
`XmlTokenKind` · `XmlAnyToken` · `OpenTag` · `CloseTag` · `XmlAttribute` · `ProcessingInstruction` · `Doctype` · `XmlNode` · `XmlChild` · `XmlPosition` · `XmlChunkIterable` · `ParserOptions` · `SerializeOptions` · `ObjectBuilderOptions` · `ArrayElementSelector` · `XmlObjectMap` · `XmlObjectValue` · `XmlBuilderOptions` · `XmlInputObject` · `XmlInputValue` · `ObjectToXmlOptions`
|
|
349
440
|
|
|
350
441
|
## Features
|
|
351
442
|
|