@cj-tech-master/excelts 7.4.0 → 7.5.0-canary.20260404054153.f4c5ecc
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/modules/excel/stream/hyperlink-reader.js +1 -1
- package/dist/browser/modules/excel/stream/workbook-reader.browser.js +2 -2
- package/dist/browser/modules/excel/stream/worksheet-reader.js +1 -1
- package/dist/browser/modules/excel/xlsx/xform/base-xform.js +1 -1
- package/dist/browser/modules/xml/dom.d.ts +1 -1
- package/dist/browser/modules/xml/dom.js +8 -5
- package/dist/browser/modules/xml/index.d.ts +1 -1
- package/dist/browser/modules/xml/sax.d.ts +41 -0
- package/dist/browser/modules/xml/sax.js +265 -76
- package/dist/browser/modules/xml/to-object-shared.d.ts +1 -0
- package/dist/browser/modules/xml/to-object-shared.js +2 -1
- package/dist/browser/modules/xml/to-object.js +7 -4
- package/dist/browser/modules/xml/types.d.ts +33 -1
- package/dist/cjs/modules/excel/stream/hyperlink-reader.js +1 -1
- package/dist/cjs/modules/excel/stream/workbook-reader.browser.js +2 -2
- package/dist/cjs/modules/excel/stream/worksheet-reader.js +1 -1
- package/dist/cjs/modules/excel/xlsx/xform/base-xform.js +1 -1
- package/dist/cjs/modules/xml/dom.js +8 -5
- package/dist/cjs/modules/xml/sax.js +265 -76
- package/dist/cjs/modules/xml/to-object-shared.js +2 -1
- package/dist/cjs/modules/xml/to-object.js +7 -4
- package/dist/esm/modules/excel/stream/hyperlink-reader.js +1 -1
- package/dist/esm/modules/excel/stream/workbook-reader.browser.js +2 -2
- package/dist/esm/modules/excel/stream/worksheet-reader.js +1 -1
- package/dist/esm/modules/excel/xlsx/xform/base-xform.js +1 -1
- package/dist/esm/modules/xml/dom.js +8 -5
- package/dist/esm/modules/xml/sax.js +265 -76
- package/dist/esm/modules/xml/to-object-shared.js +2 -1
- package/dist/esm/modules/xml/to-object.js +7 -4
- package/dist/iife/excelts.iife.js +196 -54
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +44 -44
- package/dist/types/modules/xml/dom.d.ts +1 -1
- package/dist/types/modules/xml/index.d.ts +1 -1
- package/dist/types/modules/xml/sax.d.ts +41 -0
- package/dist/types/modules/xml/to-object-shared.d.ts +1 -0
- package/dist/types/modules/xml/types.d.ts +33 -1
- package/package.json +1 -1
|
@@ -43,7 +43,7 @@ class HyperlinkReader extends EventEmitter {
|
|
|
43
43
|
return;
|
|
44
44
|
}
|
|
45
45
|
try {
|
|
46
|
-
const parser = new SaxParser({ position: false });
|
|
46
|
+
const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
|
|
47
47
|
const decoder = new TextDecoder("utf-8", { fatal: true });
|
|
48
48
|
parser.on("opentag", (node) => {
|
|
49
49
|
if (node.name !== "Relationship") {
|
|
@@ -211,7 +211,7 @@ export class WorkbookReaderBase extends EventEmitter {
|
|
|
211
211
|
// For "cache" mode, use direct SAX callbacks (no event objects, no async generator overhead)
|
|
212
212
|
if (this.options.sharedStrings === "cache") {
|
|
213
213
|
const sharedStrings = this.sharedStrings;
|
|
214
|
-
const parser = new SaxParser({ position: false });
|
|
214
|
+
const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
|
|
215
215
|
parser.on("opentag", (node) => {
|
|
216
216
|
switch (node.name) {
|
|
217
217
|
case "b":
|
|
@@ -311,7 +311,7 @@ export class WorkbookReaderBase extends EventEmitter {
|
|
|
311
311
|
return;
|
|
312
312
|
}
|
|
313
313
|
// "emit" mode — must yield, so use direct SAX with per-chunk yield
|
|
314
|
-
const emitParser = new SaxParser();
|
|
314
|
+
const emitParser = new SaxParser({ invalidCharHandling: "skip" });
|
|
315
315
|
const emitDecoder = new TextDecoder("utf-8", { fatal: true });
|
|
316
316
|
let pendingEmits = [];
|
|
317
317
|
emitParser.on("opentag", (node) => {
|
|
@@ -143,7 +143,7 @@ class WorksheetReader extends EventEmitter {
|
|
|
143
143
|
// Direct SAX callback mode — zero intermediate event objects.
|
|
144
144
|
// We collect worksheet events per-chunk and yield them.
|
|
145
145
|
let worksheetEvents = null;
|
|
146
|
-
const parser = new SaxParser({ position: false });
|
|
146
|
+
const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
|
|
147
147
|
parser.on("opentag", (node) => {
|
|
148
148
|
if (emitSheet) {
|
|
149
149
|
switch (node.name) {
|
|
@@ -159,7 +159,7 @@ class BaseXform {
|
|
|
159
159
|
* Use this instead of parse(parseSax(stream)) for hot paths.
|
|
160
160
|
*/
|
|
161
161
|
async parseStreamDirect(stream) {
|
|
162
|
-
const parser = new SaxParser();
|
|
162
|
+
const parser = new SaxParser({ invalidCharHandling: "skip" });
|
|
163
163
|
const decoder = new TextDecoder("utf-8", { fatal: true });
|
|
164
164
|
let done = false;
|
|
165
165
|
let finalModel;
|
|
@@ -52,7 +52,7 @@ declare function walk(element: XmlElement, visitor: (el: XmlElement) => void): v
|
|
|
52
52
|
/**
|
|
53
53
|
* Convert an {@link XmlElement} DOM tree to a plain JavaScript object.
|
|
54
54
|
*
|
|
55
|
-
* Produces
|
|
55
|
+
* Produces a plain JavaScript object where element names become object keys,
|
|
56
56
|
* attributes are prefixed (default `@_`), text-only elements collapse to their
|
|
57
57
|
* string value, and repeated sibling names merge into arrays.
|
|
58
58
|
*
|
|
@@ -87,7 +87,8 @@ function parseXml(xml, options) {
|
|
|
87
87
|
fragment: options?.fragment ?? false,
|
|
88
88
|
xmlns: options?.xmlns ?? false,
|
|
89
89
|
maxDepth: options?.maxDepth,
|
|
90
|
-
maxEntityExpansions: options?.maxEntityExpansions
|
|
90
|
+
maxEntityExpansions: options?.maxEntityExpansions,
|
|
91
|
+
invalidCharHandling: options?.invalidCharHandling
|
|
91
92
|
});
|
|
92
93
|
// Stack of elements being built. The bottom is a synthetic root
|
|
93
94
|
// that collects top-level nodes.
|
|
@@ -256,7 +257,7 @@ function walk(element, visitor) {
|
|
|
256
257
|
/**
|
|
257
258
|
* Convert an {@link XmlElement} DOM tree to a plain JavaScript object.
|
|
258
259
|
*
|
|
259
|
-
* Produces
|
|
260
|
+
* Produces a plain JavaScript object where element names become object keys,
|
|
260
261
|
* attributes are prefixed (default `@_`), text-only elements collapse to their
|
|
261
262
|
* string value, and repeated sibling names merge into arrays.
|
|
262
263
|
*
|
|
@@ -285,9 +286,11 @@ function toPlainObject(element, options) {
|
|
|
285
286
|
// Add attributes — el.attributes is created via Object.create(null)
|
|
286
287
|
// by safeAttributes(), so no prototype keys to guard against.
|
|
287
288
|
let hasAttributes = false;
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
289
|
+
if (!opts.ignoreAttributes) {
|
|
290
|
+
for (const key in el.attributes) {
|
|
291
|
+
obj[opts.attrPrefix + key] = el.attributes[key];
|
|
292
|
+
hasAttributes = true;
|
|
293
|
+
}
|
|
291
294
|
}
|
|
292
295
|
// Collect text and child elements in a single pass.
|
|
293
296
|
let text = "";
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* - Dual-mode: streaming (SAX parser + stream writer) and buffered (DOM parser + writer)
|
|
11
11
|
* - Shared XmlSink interface lets rendering code target both modes transparently
|
|
12
12
|
*/
|
|
13
|
-
export type { XmlAttributes, XmlNodeType, XmlElement, XmlText, XmlCData, XmlComment, XmlProcessingInstruction, XmlNode, XmlDocument, XmlSink, SaxTag, SaxEvent, SaxEventAny, SaxHandlers, SaxOptions, WritableTarget, XmlParseOptions, ToPlainObjectOptions, ParseXmlToObjectOptions } from "./types.js";
|
|
13
|
+
export type { XmlAttributes, XmlNodeType, XmlElement, XmlText, XmlCData, XmlComment, XmlProcessingInstruction, XmlNode, XmlDocument, XmlSink, SaxTag, SaxEvent, SaxEventAny, SaxHandlers, SaxOptions, InvalidCharHandling, WritableTarget, XmlParseOptions, ToPlainObjectOptions, ParseXmlToObjectOptions } from "./types.js";
|
|
14
14
|
export { xmlEncode, xmlDecode, xmlEncodeAttr, validateXmlName, encodeCData, validateCommentText } from "./encode.js";
|
|
15
15
|
export { XmlWriter, StdDocAttributes } from "./writer.js";
|
|
16
16
|
export { XmlStreamWriter } from "./stream-writer.js";
|
|
@@ -37,6 +37,7 @@ declare class SaxParser {
|
|
|
37
37
|
private xmlns;
|
|
38
38
|
private maxDepth;
|
|
39
39
|
private maxEntityExpansions;
|
|
40
|
+
private invalidCharHandling;
|
|
40
41
|
private _entityExpansionCount;
|
|
41
42
|
private _nsStack;
|
|
42
43
|
private state;
|
|
@@ -81,6 +82,46 @@ declare class SaxParser {
|
|
|
81
82
|
fail(message: string): this;
|
|
82
83
|
write(chunk: string | null): this;
|
|
83
84
|
close(): this;
|
|
85
|
+
/**
|
|
86
|
+
* Handle an invalid XML character according to the configured strategy.
|
|
87
|
+
*
|
|
88
|
+
* Called by `getCode()` whenever it reads a character that is not valid in XML 1.0;
|
|
89
|
+
* `getCode()` either returns the result or, in skip mode, advances to the next character.
|
|
90
|
+
*
|
|
91
|
+
* - `"error"`: call `fail()` and return the original code.
|
|
92
|
+
* - `"skip"`: return `-2` as a sentinel (caller handles skip).
|
|
93
|
+
* - `"replace"`: return `REPLACEMENT_CHAR`.
|
|
94
|
+
*
|
|
95
|
+
* Note: `getCode()` handles the skip sentinel with an in-place loop (`continue`) rather than by calling itself recursively.
|
|
96
|
+
*
|
|
97
|
+
* @param code - The invalid character code point.
|
|
98
|
+
* @param kind - Optional description (e.g. "lone surrogate") for error messages.
|
|
99
|
+
* @returns The code point to use.
|
|
100
|
+
*/
|
|
101
|
+
private handleInvalidChar;
|
|
102
|
+
/**
|
|
103
|
+
* Handle an invalid character inside the `handleTextInRoot()` fast loop.
|
|
104
|
+
*
|
|
105
|
+
* Unlike `handleInvalidChar()` (which returns a code point for `getCode()`),
|
|
106
|
+
* this method manages the text accumulation state (`this.text`, `start`) that
|
|
107
|
+
* the fast text loop relies on.
|
|
108
|
+
*
|
|
109
|
+
* - `"error"`: call `fail()`, leave text accumulation unchanged (char stays in output).
|
|
110
|
+
* - `"skip"`: flush text up to the invalid char, skip it, return new `start`.
|
|
111
|
+
* - `"replace"`: flush text up to the invalid char, append U+FFFD, return new `start`.
|
|
112
|
+
*
|
|
113
|
+
* @returns The updated `start` index for the text accumulation loop.
|
|
114
|
+
*/
|
|
115
|
+
private handleInvalidCharInText;
|
|
116
|
+
/**
|
|
117
|
+
* Handle an invalid character inside `sAttribValueQuoted()`.
|
|
118
|
+
*
|
|
119
|
+
* Same pattern as `handleInvalidCharInText()` but for attribute value
|
|
120
|
+
* accumulation (always uses `this.text`, no conditional handler check).
|
|
121
|
+
*
|
|
122
|
+
* @returns The updated `start` index.
|
|
123
|
+
*/
|
|
124
|
+
private handleInvalidCharInAttr;
|
|
84
125
|
private getCode;
|
|
85
126
|
private unget;
|
|
86
127
|
private processState;
|
|
@@ -35,6 +35,8 @@ const GREATER = 0x3e; // >
|
|
|
35
35
|
const QUESTION = 0x3f; // ?
|
|
36
36
|
const OPEN_BRACKET = 0x5b; // [
|
|
37
37
|
const CLOSE_BRACKET = 0x5d; // ]
|
|
38
|
+
const REPLACEMENT_CHAR = 0xfffd; // U+FFFD REPLACEMENT CHARACTER
|
|
39
|
+
const REPLACEMENT_STR = "\uFFFD"; // Pre-allocated string form of U+FFFD
|
|
38
40
|
// =============================================================================
|
|
39
41
|
// Pre-computed Lookup Tables
|
|
40
42
|
// =============================================================================
|
|
@@ -233,6 +235,7 @@ class SaxParser {
|
|
|
233
235
|
this.maxDepth = options?.maxDepth !== undefined ? options.maxDepth : 256;
|
|
234
236
|
this.maxEntityExpansions =
|
|
235
237
|
options?.maxEntityExpansions !== undefined ? options.maxEntityExpansions : 10000;
|
|
238
|
+
this.invalidCharHandling = options?.invalidCharHandling ?? "error";
|
|
236
239
|
this._init();
|
|
237
240
|
}
|
|
238
241
|
get closed() {
|
|
@@ -350,87 +353,215 @@ class SaxParser {
|
|
|
350
353
|
return this.write(null);
|
|
351
354
|
}
|
|
352
355
|
// ===========================================================================
|
|
353
|
-
// Character
|
|
356
|
+
// Invalid Character Handling
|
|
354
357
|
// ===========================================================================
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
358
|
+
/**
|
|
359
|
+
* Handle an invalid XML character according to the configured strategy.
|
|
360
|
+
*
|
|
361
|
+
* Called by `getCode()` whenever it reads a character that is not valid in XML 1.0;
|
|
362
|
+
* `getCode()` either returns the result or, in skip mode, advances to the next character.
|
|
363
|
+
*
|
|
364
|
+
* - `"error"`: call `fail()` and return the original code.
|
|
365
|
+
* - `"skip"`: return `-2` as a sentinel (caller handles skip).
|
|
366
|
+
* - `"replace"`: return `REPLACEMENT_CHAR`.
|
|
367
|
+
*
|
|
368
|
+
* Note: `getCode()` handles the skip sentinel with an in-place loop (`continue`) rather than by calling itself recursively.
|
|
369
|
+
*
|
|
370
|
+
* @param code - The invalid character code point.
|
|
371
|
+
* @param kind - Optional description (e.g. "lone surrogate") for error messages.
|
|
372
|
+
* @returns The code point to use.
|
|
373
|
+
*/
|
|
374
|
+
handleInvalidChar(code, kind) {
|
|
375
|
+
switch (this.invalidCharHandling) {
|
|
376
|
+
case "replace":
|
|
377
|
+
return REPLACEMENT_CHAR;
|
|
378
|
+
case "skip":
|
|
379
|
+
// Caller is responsible for the actual skip logic.
|
|
380
|
+
// We return -2 as a sentinel to tell getCode()'s loop to continue.
|
|
381
|
+
return -2;
|
|
382
|
+
default: {
|
|
383
|
+
// "error" — existing strict behavior
|
|
384
|
+
const label = kind
|
|
385
|
+
? `invalid XML character: ${kind} 0x${code.toString(16)}`
|
|
386
|
+
: `invalid XML character: 0x${code.toString(16)}`;
|
|
387
|
+
this.fail(label);
|
|
388
|
+
return code;
|
|
368
389
|
}
|
|
369
|
-
return code;
|
|
370
390
|
}
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
391
|
+
}
|
|
392
|
+
/**
|
|
393
|
+
* Handle an invalid character inside the `handleTextInRoot()` fast loop.
|
|
394
|
+
*
|
|
395
|
+
* Unlike `handleInvalidChar()` (which returns a code point for `getCode()`),
|
|
396
|
+
* this method manages the text accumulation state (`this.text`, `start`) that
|
|
397
|
+
* the fast text loop relies on.
|
|
398
|
+
*
|
|
399
|
+
* - `"error"`: call `fail()`, leave text accumulation unchanged (char stays in output).
|
|
400
|
+
* - `"skip"`: flush text up to the invalid char, skip it, return new `start`.
|
|
401
|
+
* - `"replace"`: flush text up to the invalid char, append U+FFFD, return new `start`.
|
|
402
|
+
*
|
|
403
|
+
* @returns The updated `start` index for the text accumulation loop.
|
|
404
|
+
*/
|
|
405
|
+
handleInvalidCharInText(code, handler, start, kind) {
|
|
406
|
+
switch (this.invalidCharHandling) {
|
|
407
|
+
case "skip":
|
|
408
|
+
// Flush text accumulated before this invalid char, then skip it
|
|
409
|
+
if (handler && start < this.prevI) {
|
|
410
|
+
this.text += this.chunk.slice(start, this.prevI);
|
|
411
|
+
}
|
|
412
|
+
return this.i;
|
|
413
|
+
case "replace":
|
|
414
|
+
// Flush text accumulated before this invalid char, append replacement
|
|
415
|
+
if (handler) {
|
|
416
|
+
if (start < this.prevI) {
|
|
417
|
+
this.text += this.chunk.slice(start, this.prevI);
|
|
418
|
+
}
|
|
419
|
+
this.text += REPLACEMENT_STR;
|
|
420
|
+
}
|
|
421
|
+
return this.i;
|
|
422
|
+
default: {
|
|
423
|
+
// "error" — existing strict behavior, char stays in output
|
|
424
|
+
const label = kind
|
|
425
|
+
? `invalid XML character: ${kind} 0x${code.toString(16)}`
|
|
426
|
+
: `invalid XML character: 0x${code.toString(16)}`;
|
|
427
|
+
this.fail(label);
|
|
428
|
+
return start;
|
|
375
429
|
}
|
|
376
|
-
return code;
|
|
377
430
|
}
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
431
|
+
}
|
|
432
|
+
/**
|
|
433
|
+
* Handle an invalid character inside `sAttribValueQuoted()`.
|
|
434
|
+
*
|
|
435
|
+
* Same pattern as `handleInvalidCharInText()` but for attribute value
|
|
436
|
+
* accumulation (always uses `this.text`, no conditional handler check).
|
|
437
|
+
*
|
|
438
|
+
* @returns The updated `start` index.
|
|
439
|
+
*/
|
|
440
|
+
handleInvalidCharInAttr(code, start, kind) {
|
|
441
|
+
switch (this.invalidCharHandling) {
|
|
442
|
+
case "skip":
|
|
443
|
+
if (start < this.prevI) {
|
|
444
|
+
this.text += this.chunk.slice(start, this.prevI);
|
|
445
|
+
}
|
|
446
|
+
return this.i;
|
|
447
|
+
case "replace":
|
|
448
|
+
if (start < this.prevI) {
|
|
449
|
+
this.text += this.chunk.slice(start, this.prevI);
|
|
450
|
+
}
|
|
451
|
+
this.text += REPLACEMENT_STR;
|
|
452
|
+
return this.i;
|
|
453
|
+
default: {
|
|
454
|
+
const label = kind
|
|
455
|
+
? `invalid XML character: ${kind} 0x${code.toString(16)}`
|
|
456
|
+
: `invalid XML character: 0x${code.toString(16)}`;
|
|
457
|
+
this.fail(label);
|
|
458
|
+
return start;
|
|
387
459
|
}
|
|
388
|
-
return NL;
|
|
389
460
|
}
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
461
|
+
}
|
|
462
|
+
// ===========================================================================
|
|
463
|
+
// Character Reading
|
|
464
|
+
// ===========================================================================
|
|
465
|
+
getCode() {
|
|
466
|
+
// Loop to handle skip mode: when an invalid char returns -2, we retry
|
|
467
|
+
// with the next character instead of recursing (avoids stack overflow
|
|
468
|
+
// on long runs of consecutive invalid characters).
|
|
469
|
+
for (;;) {
|
|
470
|
+
const { chunk } = this;
|
|
471
|
+
const i = this.i;
|
|
472
|
+
this.prevI = i;
|
|
473
|
+
this.i = i + 1;
|
|
474
|
+
if (i >= chunk.length) {
|
|
475
|
+
return -1;
|
|
476
|
+
}
|
|
477
|
+
const code = chunk.charCodeAt(i);
|
|
478
|
+
// Ultra-fast path: printable ASCII (0x20-0x7E) — the vast majority of XML content.
|
|
479
|
+
// No validation needed; these are always valid XML 1.0 characters.
|
|
480
|
+
if (code >= 0x20 && code <= 0x7e) {
|
|
481
|
+
if (this.trackPosition) {
|
|
482
|
+
this.column++;
|
|
483
|
+
}
|
|
484
|
+
return code;
|
|
396
485
|
}
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
// Handle surrogates
|
|
400
|
-
if (code >= 0xd800 && code <= 0xdbff) {
|
|
401
|
-
const next = chunk.charCodeAt(i + 1);
|
|
402
|
-
if (next >= 0xdc00 && next <= 0xdfff) {
|
|
403
|
-
this.i = i + 2;
|
|
486
|
+
// Secondary fast path: TAB (0x09) — common in attribute values
|
|
487
|
+
if (code === TAB) {
|
|
404
488
|
if (this.trackPosition) {
|
|
405
489
|
this.column++;
|
|
406
490
|
}
|
|
407
|
-
return
|
|
491
|
+
return code;
|
|
408
492
|
}
|
|
409
|
-
//
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
493
|
+
// Handle CR (normalize CR and CR+LF to LF per XML 1.0 §2.11)
|
|
494
|
+
if (code === CR) {
|
|
495
|
+
if (chunk.charCodeAt(i + 1) === NL) {
|
|
496
|
+
this.i = i + 2;
|
|
497
|
+
}
|
|
498
|
+
if (this.trackPosition) {
|
|
499
|
+
this.line++;
|
|
500
|
+
this.column = 0;
|
|
501
|
+
this.positionAtNewLine = this.position;
|
|
502
|
+
}
|
|
503
|
+
return NL;
|
|
504
|
+
}
|
|
505
|
+
// Handle LF
|
|
506
|
+
if (code === NL) {
|
|
507
|
+
if (this.trackPosition) {
|
|
508
|
+
this.line++;
|
|
509
|
+
this.column = 0;
|
|
510
|
+
this.positionAtNewLine = this.position;
|
|
511
|
+
}
|
|
512
|
+
return NL;
|
|
513
|
+
}
|
|
514
|
+
// Handle surrogates
|
|
515
|
+
if (code >= 0xd800 && code <= 0xdbff) {
|
|
516
|
+
const next = chunk.charCodeAt(i + 1);
|
|
517
|
+
if (next >= 0xdc00 && next <= 0xdfff) {
|
|
518
|
+
this.i = i + 2;
|
|
519
|
+
if (this.trackPosition) {
|
|
520
|
+
this.column++;
|
|
521
|
+
}
|
|
522
|
+
return 0x10000 + ((code - 0xd800) * 0x400 + (next - 0xdc00));
|
|
523
|
+
}
|
|
524
|
+
// Lone high surrogate — invalid XML character
|
|
525
|
+
const result = this.handleInvalidChar(code, "lone surrogate");
|
|
526
|
+
if (result !== -2) {
|
|
527
|
+
return result;
|
|
528
|
+
}
|
|
529
|
+
continue; // skip: loop to next char
|
|
530
|
+
}
|
|
531
|
+
// Lone low surrogate — invalid XML character
|
|
532
|
+
if (code >= 0xdc00 && code <= 0xdfff) {
|
|
533
|
+
const result = this.handleInvalidChar(code, "lone surrogate");
|
|
534
|
+
if (result !== -2) {
|
|
535
|
+
return result;
|
|
536
|
+
}
|
|
537
|
+
continue;
|
|
538
|
+
}
|
|
539
|
+
// Non-ASCII outside the surrogate range (0x80-0xD7FF, 0xE000-0xFFFF) — valid XML except 0xFFFE and 0xFFFF
|
|
540
|
+
if (code >= 0x80) {
|
|
541
|
+
if (this.trackPosition) {
|
|
542
|
+
this.column++;
|
|
543
|
+
}
|
|
544
|
+
// Reject 0xFFFE and 0xFFFF
|
|
545
|
+
if (code === 0xfffe || code === 0xffff) {
|
|
546
|
+
const result = this.handleInvalidChar(code);
|
|
547
|
+
if (result !== -2) {
|
|
548
|
+
return result;
|
|
549
|
+
}
|
|
550
|
+
continue;
|
|
551
|
+
}
|
|
552
|
+
return code;
|
|
553
|
+
}
|
|
554
|
+
// Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
|
|
555
|
+
// All invalid in XML 1.0
|
|
418
556
|
if (this.trackPosition) {
|
|
419
557
|
this.column++;
|
|
420
558
|
}
|
|
421
|
-
|
|
422
|
-
if (
|
|
423
|
-
|
|
559
|
+
const result = this.handleInvalidChar(code);
|
|
560
|
+
if (result !== -2) {
|
|
561
|
+
return result;
|
|
424
562
|
}
|
|
425
|
-
|
|
426
|
-
}
|
|
427
|
-
// Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
|
|
428
|
-
// All invalid in XML 1.0
|
|
429
|
-
if (this.trackPosition) {
|
|
430
|
-
this.column++;
|
|
563
|
+
// skip: continue to next char
|
|
431
564
|
}
|
|
432
|
-
this.fail("invalid XML character: 0x" + code.toString(16));
|
|
433
|
-
return code;
|
|
434
565
|
}
|
|
435
566
|
unget() {
|
|
436
567
|
this.i = this.prevI;
|
|
@@ -637,16 +768,16 @@ class SaxParser {
|
|
|
637
768
|
}
|
|
638
769
|
else {
|
|
639
770
|
this.i++;
|
|
640
|
-
this.
|
|
771
|
+
start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
|
|
641
772
|
}
|
|
642
773
|
}
|
|
643
774
|
else if (code >= 0xdc00 && code <= 0xdfff) {
|
|
644
775
|
this.i++;
|
|
645
|
-
this.
|
|
776
|
+
start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
|
|
646
777
|
}
|
|
647
778
|
else if (code === 0xfffe || code === 0xffff) {
|
|
648
779
|
this.i++;
|
|
649
|
-
this.
|
|
780
|
+
start = this.handleInvalidCharInText(code, handler, start);
|
|
650
781
|
}
|
|
651
782
|
else {
|
|
652
783
|
this.i++;
|
|
@@ -662,7 +793,7 @@ class SaxParser {
|
|
|
662
793
|
if (this.trackPosition) {
|
|
663
794
|
this.column++;
|
|
664
795
|
}
|
|
665
|
-
this.
|
|
796
|
+
start = this.handleInvalidCharInText(code, handler, start);
|
|
666
797
|
}
|
|
667
798
|
// End of chunk
|
|
668
799
|
if (handler && start < this.i) {
|
|
@@ -674,14 +805,42 @@ class SaxParser {
|
|
|
674
805
|
let { i: start } = this;
|
|
675
806
|
const handler = this._handlers.text;
|
|
676
807
|
let nonSpace = false;
|
|
808
|
+
const isSkip = this.invalidCharHandling === "skip";
|
|
809
|
+
const isReplace = this.invalidCharHandling === "replace";
|
|
677
810
|
while (true) {
|
|
811
|
+
const iBeforeGet = this.i;
|
|
678
812
|
const c = this.getCode();
|
|
679
813
|
if (c === -1) {
|
|
680
|
-
if (handler && start <
|
|
681
|
-
this.text += chunk.slice(start,
|
|
814
|
+
if (handler && start < iBeforeGet) {
|
|
815
|
+
this.text += chunk.slice(start, iBeforeGet);
|
|
682
816
|
}
|
|
683
817
|
break;
|
|
684
818
|
}
|
|
819
|
+
// In skip mode, getCode() may have internally looped past invalid chars.
|
|
820
|
+
// Flush valid text before the gap and advance start past it.
|
|
821
|
+
if (isSkip && this.prevI > iBeforeGet) {
|
|
822
|
+
if (handler && start < iBeforeGet) {
|
|
823
|
+
this.text += chunk.slice(start, iBeforeGet);
|
|
824
|
+
}
|
|
825
|
+
start = this.prevI;
|
|
826
|
+
}
|
|
827
|
+
// In replace mode, getCode() returns REPLACEMENT_CHAR for invalid chars
|
|
828
|
+
// but the original byte is still in the chunk. Detect this by checking
|
|
829
|
+
// whether getCode() returned REPLACEMENT_CHAR while the raw chunk byte
|
|
830
|
+
// at prevI is NOT U+FFFD (i.e., it was substituted by handleInvalidChar).
|
|
831
|
+
if (isReplace &&
|
|
832
|
+
c === REPLACEMENT_CHAR &&
|
|
833
|
+
chunk.charCodeAt(this.prevI) !== REPLACEMENT_CHAR) {
|
|
834
|
+
if (handler) {
|
|
835
|
+
if (start < this.prevI) {
|
|
836
|
+
this.text += chunk.slice(start, this.prevI);
|
|
837
|
+
}
|
|
838
|
+
this.text += REPLACEMENT_STR;
|
|
839
|
+
}
|
|
840
|
+
start = this.i;
|
|
841
|
+
nonSpace = true;
|
|
842
|
+
continue;
|
|
843
|
+
}
|
|
685
844
|
if (c === LESS) {
|
|
686
845
|
if (handler) {
|
|
687
846
|
const slice = chunk.slice(start, this.prevI);
|
|
@@ -1058,13 +1217,43 @@ class SaxParser {
|
|
|
1058
1217
|
start = this.i;
|
|
1059
1218
|
continue;
|
|
1060
1219
|
}
|
|
1061
|
-
//
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1220
|
+
// Non-ASCII (>= 0x80) — mostly valid, handle inline like handleTextInRoot
|
|
1221
|
+
if (code >= 0x80) {
|
|
1222
|
+
this.prevI = this.i;
|
|
1223
|
+
if (code >= 0xd800 && code <= 0xdbff) {
|
|
1224
|
+
const next = chunk.charCodeAt(this.i + 1);
|
|
1225
|
+
if (next >= 0xdc00 && next <= 0xdfff) {
|
|
1226
|
+
this.i += 2; // valid surrogate pair
|
|
1227
|
+
}
|
|
1228
|
+
else {
|
|
1229
|
+
this.i++;
|
|
1230
|
+
start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
|
|
1231
|
+
}
|
|
1232
|
+
}
|
|
1233
|
+
else if (code >= 0xdc00 && code <= 0xdfff) {
|
|
1234
|
+
this.i++;
|
|
1235
|
+
start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
|
|
1236
|
+
}
|
|
1237
|
+
else if (code === 0xfffe || code === 0xffff) {
|
|
1238
|
+
this.i++;
|
|
1239
|
+
start = this.handleInvalidCharInAttr(code, start);
|
|
1240
|
+
}
|
|
1241
|
+
else {
|
|
1242
|
+
this.i++; // valid non-ASCII BMP char
|
|
1243
|
+
}
|
|
1244
|
+
if (this.trackPosition) {
|
|
1245
|
+
this.column++;
|
|
1246
|
+
}
|
|
1247
|
+
continue;
|
|
1248
|
+
}
|
|
1249
|
+
// Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
|
|
1250
|
+
// All invalid in XML 1.0
|
|
1251
|
+
this.prevI = this.i;
|
|
1252
|
+
this.i++;
|
|
1253
|
+
if (this.trackPosition) {
|
|
1254
|
+
this.column++;
|
|
1066
1255
|
}
|
|
1067
|
-
|
|
1256
|
+
start = this.handleInvalidCharInAttr(code, start);
|
|
1068
1257
|
}
|
|
1069
1258
|
// End of chunk
|
|
1070
1259
|
this.text += chunk.slice(start, this.i);
|
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
import type { ToPlainObjectOptions } from "./types.js";
|
|
8
8
|
/** Options with all defaults resolved — no more `??` checks at hot-path call sites. */
|
|
9
9
|
export interface ResolvedOptions {
|
|
10
|
+
readonly ignoreAttributes: boolean;
|
|
10
11
|
readonly attrPrefix: string;
|
|
11
12
|
readonly textKey: string;
|
|
12
13
|
readonly alwaysArray: boolean;
|
|
@@ -6,6 +6,7 @@
|
|
|
6
6
|
*/
|
|
7
7
|
export function resolveOptions(options) {
|
|
8
8
|
return {
|
|
9
|
+
ignoreAttributes: options?.ignoreAttributes ?? false,
|
|
9
10
|
attrPrefix: options?.attributePrefix ?? "@_",
|
|
10
11
|
textKey: options?.textKey ?? "#text",
|
|
11
12
|
alwaysArray: options?.alwaysArray ?? false,
|
|
@@ -52,7 +53,7 @@ export function resolveValue(obj, text, hasAttributes, hasChildren, opts) {
|
|
|
52
53
|
if (hasText) {
|
|
53
54
|
obj[opts.textKey] = text;
|
|
54
55
|
}
|
|
55
|
-
// Empty element with no attributes → empty string
|
|
56
|
+
// Empty element with no attributes → empty string
|
|
56
57
|
if (!hasAttributes && !hasChildren && !hasText) {
|
|
57
58
|
return "";
|
|
58
59
|
}
|
|
@@ -42,7 +42,8 @@ function parseXmlToObject(xml, options) {
|
|
|
42
42
|
position: false,
|
|
43
43
|
fragment: options?.fragment ?? false,
|
|
44
44
|
maxDepth: options?.maxDepth,
|
|
45
|
-
maxEntityExpansions: options?.maxEntityExpansions
|
|
45
|
+
maxEntityExpansions: options?.maxEntityExpansions,
|
|
46
|
+
invalidCharHandling: options?.invalidCharHandling
|
|
46
47
|
});
|
|
47
48
|
// Stack: bottom is a synthetic root frame that collects the document root.
|
|
48
49
|
const syntheticObj = Object.create(null);
|
|
@@ -64,9 +65,11 @@ function parseXmlToObject(xml, options) {
|
|
|
64
65
|
name: tag.name
|
|
65
66
|
};
|
|
66
67
|
// Write attributes directly into frame.obj
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
68
|
+
if (!opts.ignoreAttributes) {
|
|
69
|
+
for (const key in tag.attributes) {
|
|
70
|
+
frame.obj[opts.attrPrefix + key] = tag.attributes[key];
|
|
71
|
+
frame.hasAttributes = true;
|
|
72
|
+
}
|
|
70
73
|
}
|
|
71
74
|
// Mark parent as having children
|
|
72
75
|
stack[stack.length - 1].hasChildren = true;
|