@cj-tech-master/excelts 7.5.0 → 7.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/modules/excel/stream/hyperlink-reader.js +1 -1
- package/dist/browser/modules/excel/stream/workbook-reader.browser.js +2 -2
- package/dist/browser/modules/excel/stream/worksheet-reader.js +1 -1
- package/dist/browser/modules/excel/xlsx/xform/base-xform.js +1 -1
- package/dist/browser/modules/xml/dom.js +2 -1
- package/dist/browser/modules/xml/index.d.ts +1 -1
- package/dist/browser/modules/xml/sax.d.ts +41 -0
- package/dist/browser/modules/xml/sax.js +265 -76
- package/dist/browser/modules/xml/to-object.js +2 -1
- package/dist/browser/modules/xml/types.d.ts +24 -0
- package/dist/cjs/modules/excel/stream/hyperlink-reader.js +1 -1
- package/dist/cjs/modules/excel/stream/workbook-reader.browser.js +2 -2
- package/dist/cjs/modules/excel/stream/worksheet-reader.js +1 -1
- package/dist/cjs/modules/excel/xlsx/xform/base-xform.js +1 -1
- package/dist/cjs/modules/xml/dom.js +2 -1
- package/dist/cjs/modules/xml/sax.js +265 -76
- package/dist/cjs/modules/xml/to-object.js +2 -1
- package/dist/esm/modules/excel/stream/hyperlink-reader.js +1 -1
- package/dist/esm/modules/excel/stream/workbook-reader.browser.js +2 -2
- package/dist/esm/modules/excel/stream/worksheet-reader.js +1 -1
- package/dist/esm/modules/excel/xlsx/xform/base-xform.js +1 -1
- package/dist/esm/modules/xml/dom.js +2 -1
- package/dist/esm/modules/xml/sax.js +265 -76
- package/dist/esm/modules/xml/to-object.js +2 -1
- package/dist/iife/excelts.iife.js +196 -54
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +44 -44
- package/dist/types/modules/xml/index.d.ts +1 -1
- package/dist/types/modules/xml/sax.d.ts +41 -0
- package/dist/types/modules/xml/types.d.ts +24 -0
- package/package.json +1 -1
|
@@ -43,7 +43,7 @@ class HyperlinkReader extends EventEmitter {
|
|
|
43
43
|
return;
|
|
44
44
|
}
|
|
45
45
|
try {
|
|
46
|
-
const parser = new SaxParser({ position: false });
|
|
46
|
+
const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
|
|
47
47
|
const decoder = new TextDecoder("utf-8", { fatal: true });
|
|
48
48
|
parser.on("opentag", (node) => {
|
|
49
49
|
if (node.name !== "Relationship") {
|
|
@@ -211,7 +211,7 @@ export class WorkbookReaderBase extends EventEmitter {
|
|
|
211
211
|
// For "cache" mode, use direct SAX callbacks (no event objects, no async generator overhead)
|
|
212
212
|
if (this.options.sharedStrings === "cache") {
|
|
213
213
|
const sharedStrings = this.sharedStrings;
|
|
214
|
-
const parser = new SaxParser({ position: false });
|
|
214
|
+
const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
|
|
215
215
|
parser.on("opentag", (node) => {
|
|
216
216
|
switch (node.name) {
|
|
217
217
|
case "b":
|
|
@@ -311,7 +311,7 @@ export class WorkbookReaderBase extends EventEmitter {
|
|
|
311
311
|
return;
|
|
312
312
|
}
|
|
313
313
|
// "emit" mode — must yield, so use direct SAX with per-chunk yield
|
|
314
|
-
const emitParser = new SaxParser();
|
|
314
|
+
const emitParser = new SaxParser({ invalidCharHandling: "skip" });
|
|
315
315
|
const emitDecoder = new TextDecoder("utf-8", { fatal: true });
|
|
316
316
|
let pendingEmits = [];
|
|
317
317
|
emitParser.on("opentag", (node) => {
|
|
@@ -143,7 +143,7 @@ class WorksheetReader extends EventEmitter {
|
|
|
143
143
|
// Direct SAX callback mode — zero intermediate event objects.
|
|
144
144
|
// We collect worksheet events per-chunk and yield them.
|
|
145
145
|
let worksheetEvents = null;
|
|
146
|
-
const parser = new SaxParser({ position: false });
|
|
146
|
+
const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
|
|
147
147
|
parser.on("opentag", (node) => {
|
|
148
148
|
if (emitSheet) {
|
|
149
149
|
switch (node.name) {
|
|
@@ -159,7 +159,7 @@ class BaseXform {
|
|
|
159
159
|
* Use this instead of parse(parseSax(stream)) for hot paths.
|
|
160
160
|
*/
|
|
161
161
|
async parseStreamDirect(stream) {
|
|
162
|
-
const parser = new SaxParser();
|
|
162
|
+
const parser = new SaxParser({ invalidCharHandling: "skip" });
|
|
163
163
|
const decoder = new TextDecoder("utf-8", { fatal: true });
|
|
164
164
|
let done = false;
|
|
165
165
|
let finalModel;
|
|
@@ -87,7 +87,8 @@ function parseXml(xml, options) {
|
|
|
87
87
|
fragment: options?.fragment ?? false,
|
|
88
88
|
xmlns: options?.xmlns ?? false,
|
|
89
89
|
maxDepth: options?.maxDepth,
|
|
90
|
-
maxEntityExpansions: options?.maxEntityExpansions
|
|
90
|
+
maxEntityExpansions: options?.maxEntityExpansions,
|
|
91
|
+
invalidCharHandling: options?.invalidCharHandling
|
|
91
92
|
});
|
|
92
93
|
// Stack of elements being built. The bottom is a synthetic root
|
|
93
94
|
// that collects top-level nodes.
|
|
@@ -10,7 +10,7 @@
|
|
|
10
10
|
* - Dual-mode: streaming (SAX parser + stream writer) and buffered (DOM parser + writer)
|
|
11
11
|
* - Shared XmlSink interface lets rendering code target both modes transparently
|
|
12
12
|
*/
|
|
13
|
-
export type { XmlAttributes, XmlNodeType, XmlElement, XmlText, XmlCData, XmlComment, XmlProcessingInstruction, XmlNode, XmlDocument, XmlSink, SaxTag, SaxEvent, SaxEventAny, SaxHandlers, SaxOptions, WritableTarget, XmlParseOptions, ToPlainObjectOptions, ParseXmlToObjectOptions } from "./types.js";
|
|
13
|
+
export type { XmlAttributes, XmlNodeType, XmlElement, XmlText, XmlCData, XmlComment, XmlProcessingInstruction, XmlNode, XmlDocument, XmlSink, SaxTag, SaxEvent, SaxEventAny, SaxHandlers, SaxOptions, InvalidCharHandling, WritableTarget, XmlParseOptions, ToPlainObjectOptions, ParseXmlToObjectOptions } from "./types.js";
|
|
14
14
|
export { xmlEncode, xmlDecode, xmlEncodeAttr, validateXmlName, encodeCData, validateCommentText } from "./encode.js";
|
|
15
15
|
export { XmlWriter, StdDocAttributes } from "./writer.js";
|
|
16
16
|
export { XmlStreamWriter } from "./stream-writer.js";
|
|
@@ -37,6 +37,7 @@ declare class SaxParser {
|
|
|
37
37
|
private xmlns;
|
|
38
38
|
private maxDepth;
|
|
39
39
|
private maxEntityExpansions;
|
|
40
|
+
private invalidCharHandling;
|
|
40
41
|
private _entityExpansionCount;
|
|
41
42
|
private _nsStack;
|
|
42
43
|
private state;
|
|
@@ -81,6 +82,46 @@ declare class SaxParser {
|
|
|
81
82
|
fail(message: string): this;
|
|
82
83
|
write(chunk: string | null): this;
|
|
83
84
|
close(): this;
|
|
85
|
+
/**
|
|
86
|
+
* Handle an invalid XML character according to the configured strategy.
|
|
87
|
+
*
|
|
88
|
+
* Called from `getCode()`, which loops past skipped characters. The fast text and
|
|
89
|
+
* attribute loops manage their own accumulation and use the `...InText` / `...InAttr` variants.
|
|
90
|
+
*
|
|
91
|
+
* - `"error"`: call `fail()` and return the original code.
|
|
92
|
+
* - `"skip"`: return `-2` as a sentinel (caller handles skip).
|
|
93
|
+
* - `"replace"`: return `REPLACEMENT_CHAR`.
|
|
94
|
+
*
|
|
95
|
+
* Note: `getCode()` calls this method and loops on the `-2` sentinel instead of recursing.
|
|
96
|
+
*
|
|
97
|
+
* @param code - The invalid character code point.
|
|
98
|
+
* @param kind - Optional description (e.g. "lone surrogate") for error messages.
|
|
99
|
+
* @returns The code point to use.
|
|
100
|
+
*/
|
|
101
|
+
private handleInvalidChar;
|
|
102
|
+
/**
|
|
103
|
+
* Handle an invalid character inside the `handleTextInRoot()` fast loop.
|
|
104
|
+
*
|
|
105
|
+
* Unlike `handleInvalidChar()` (which returns a code point for `getCode()`),
|
|
106
|
+
* this method manages the text accumulation state (`this.text`, `start`) that
|
|
107
|
+
* the fast text loop relies on.
|
|
108
|
+
*
|
|
109
|
+
* - `"error"`: call `fail()`, leave text accumulation unchanged (char stays in output).
|
|
110
|
+
* - `"skip"`: flush text up to the invalid char, skip it, return new `start`.
|
|
111
|
+
* - `"replace"`: flush text up to the invalid char, append U+FFFD, return new `start`.
|
|
112
|
+
*
|
|
113
|
+
* @returns The updated `start` index for the text accumulation loop.
|
|
114
|
+
*/
|
|
115
|
+
private handleInvalidCharInText;
|
|
116
|
+
/**
|
|
117
|
+
* Handle an invalid character inside `sAttribValueQuoted()`.
|
|
118
|
+
*
|
|
119
|
+
* Same pattern as `handleInvalidCharInText()` but for attribute value
|
|
120
|
+
* accumulation (always uses `this.text`, no conditional handler check).
|
|
121
|
+
*
|
|
122
|
+
* @returns The updated `start` index.
|
|
123
|
+
*/
|
|
124
|
+
private handleInvalidCharInAttr;
|
|
84
125
|
private getCode;
|
|
85
126
|
private unget;
|
|
86
127
|
private processState;
|
|
@@ -35,6 +35,8 @@ const GREATER = 0x3e; // >
|
|
|
35
35
|
const QUESTION = 0x3f; // ?
|
|
36
36
|
const OPEN_BRACKET = 0x5b; // [
|
|
37
37
|
const CLOSE_BRACKET = 0x5d; // ]
|
|
38
|
+
const REPLACEMENT_CHAR = 0xfffd; // U+FFFD REPLACEMENT CHARACTER
|
|
39
|
+
const REPLACEMENT_STR = "\uFFFD"; // Pre-allocated string form of U+FFFD
|
|
38
40
|
// =============================================================================
|
|
39
41
|
// Pre-computed Lookup Tables
|
|
40
42
|
// =============================================================================
|
|
@@ -233,6 +235,7 @@ class SaxParser {
|
|
|
233
235
|
this.maxDepth = options?.maxDepth !== undefined ? options.maxDepth : 256;
|
|
234
236
|
this.maxEntityExpansions =
|
|
235
237
|
options?.maxEntityExpansions !== undefined ? options.maxEntityExpansions : 10000;
|
|
238
|
+
this.invalidCharHandling = options?.invalidCharHandling ?? "error";
|
|
236
239
|
this._init();
|
|
237
240
|
}
|
|
238
241
|
get closed() {
|
|
@@ -350,87 +353,215 @@ class SaxParser {
|
|
|
350
353
|
return this.write(null);
|
|
351
354
|
}
|
|
352
355
|
// ===========================================================================
|
|
353
|
-
// Character
|
|
356
|
+
// Invalid Character Handling
|
|
354
357
|
// ===========================================================================
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
358
|
+
/**
|
|
359
|
+
* Handle an invalid XML character according to the configured strategy.
|
|
360
|
+
*
|
|
361
|
+
* Called from `getCode()`, which loops past skipped characters. The fast text and
|
|
362
|
+
* attribute loops manage their own accumulation and use the `...InText` / `...InAttr` variants.
|
|
363
|
+
*
|
|
364
|
+
* - `"error"`: call `fail()` and return the original code.
|
|
365
|
+
* - `"skip"`: return `-2` as a sentinel (caller handles skip).
|
|
366
|
+
* - `"replace"`: return `REPLACEMENT_CHAR`.
|
|
367
|
+
*
|
|
368
|
+
* Note: `getCode()` calls this method and loops on the `-2` sentinel instead of recursing.
|
|
369
|
+
*
|
|
370
|
+
* @param code - The invalid character code point.
|
|
371
|
+
* @param kind - Optional description (e.g. "lone surrogate") for error messages.
|
|
372
|
+
* @returns The code point to use.
|
|
373
|
+
*/
|
|
374
|
+
handleInvalidChar(code, kind) {
|
|
375
|
+
switch (this.invalidCharHandling) {
|
|
376
|
+
case "replace":
|
|
377
|
+
return REPLACEMENT_CHAR;
|
|
378
|
+
case "skip":
|
|
379
|
+
// Caller is responsible for the actual skip logic.
|
|
380
|
+
// We return -2 as a sentinel to tell getCode()'s loop to continue.
|
|
381
|
+
return -2;
|
|
382
|
+
default: {
|
|
383
|
+
// "error" — existing strict behavior
|
|
384
|
+
const label = kind
|
|
385
|
+
? `invalid XML character: ${kind} 0x${code.toString(16)}`
|
|
386
|
+
: `invalid XML character: 0x${code.toString(16)}`;
|
|
387
|
+
this.fail(label);
|
|
388
|
+
return code;
|
|
368
389
|
}
|
|
369
|
-
return code;
|
|
370
390
|
}
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
391
|
+
}
|
|
392
|
+
/**
|
|
393
|
+
* Handle an invalid character inside the `handleTextInRoot()` fast loop.
|
|
394
|
+
*
|
|
395
|
+
* Unlike `handleInvalidChar()` (which returns a code point for `getCode()`),
|
|
396
|
+
* this method manages the text accumulation state (`this.text`, `start`) that
|
|
397
|
+
* the fast text loop relies on.
|
|
398
|
+
*
|
|
399
|
+
* - `"error"`: call `fail()`, leave text accumulation unchanged (char stays in output).
|
|
400
|
+
* - `"skip"`: flush text up to the invalid char, skip it, return new `start`.
|
|
401
|
+
* - `"replace"`: flush text up to the invalid char, append U+FFFD, return new `start`.
|
|
402
|
+
*
|
|
403
|
+
* @returns The updated `start` index for the text accumulation loop.
|
|
404
|
+
*/
|
|
405
|
+
handleInvalidCharInText(code, handler, start, kind) {
|
|
406
|
+
switch (this.invalidCharHandling) {
|
|
407
|
+
case "skip":
|
|
408
|
+
// Flush text accumulated before this invalid char, then skip it
|
|
409
|
+
if (handler && start < this.prevI) {
|
|
410
|
+
this.text += this.chunk.slice(start, this.prevI);
|
|
411
|
+
}
|
|
412
|
+
return this.i;
|
|
413
|
+
case "replace":
|
|
414
|
+
// Flush text accumulated before this invalid char, append replacement
|
|
415
|
+
if (handler) {
|
|
416
|
+
if (start < this.prevI) {
|
|
417
|
+
this.text += this.chunk.slice(start, this.prevI);
|
|
418
|
+
}
|
|
419
|
+
this.text += REPLACEMENT_STR;
|
|
420
|
+
}
|
|
421
|
+
return this.i;
|
|
422
|
+
default: {
|
|
423
|
+
// "error" — existing strict behavior, char stays in output
|
|
424
|
+
const label = kind
|
|
425
|
+
? `invalid XML character: ${kind} 0x${code.toString(16)}`
|
|
426
|
+
: `invalid XML character: 0x${code.toString(16)}`;
|
|
427
|
+
this.fail(label);
|
|
428
|
+
return start;
|
|
375
429
|
}
|
|
376
|
-
return code;
|
|
377
430
|
}
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
431
|
+
}
|
|
432
|
+
/**
|
|
433
|
+
* Handle an invalid character inside `sAttribValueQuoted()`.
|
|
434
|
+
*
|
|
435
|
+
* Same pattern as `handleInvalidCharInText()` but for attribute value
|
|
436
|
+
* accumulation (always uses `this.text`, no conditional handler check).
|
|
437
|
+
*
|
|
438
|
+
* @returns The updated `start` index.
|
|
439
|
+
*/
|
|
440
|
+
handleInvalidCharInAttr(code, start, kind) {
|
|
441
|
+
switch (this.invalidCharHandling) {
|
|
442
|
+
case "skip":
|
|
443
|
+
if (start < this.prevI) {
|
|
444
|
+
this.text += this.chunk.slice(start, this.prevI);
|
|
445
|
+
}
|
|
446
|
+
return this.i;
|
|
447
|
+
case "replace":
|
|
448
|
+
if (start < this.prevI) {
|
|
449
|
+
this.text += this.chunk.slice(start, this.prevI);
|
|
450
|
+
}
|
|
451
|
+
this.text += REPLACEMENT_STR;
|
|
452
|
+
return this.i;
|
|
453
|
+
default: {
|
|
454
|
+
const label = kind
|
|
455
|
+
? `invalid XML character: ${kind} 0x${code.toString(16)}`
|
|
456
|
+
: `invalid XML character: 0x${code.toString(16)}`;
|
|
457
|
+
this.fail(label);
|
|
458
|
+
return start;
|
|
387
459
|
}
|
|
388
|
-
return NL;
|
|
389
460
|
}
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
461
|
+
}
|
|
462
|
+
// ===========================================================================
|
|
463
|
+
// Character Reading
|
|
464
|
+
// ===========================================================================
|
|
465
|
+
getCode() {
|
|
466
|
+
// Loop to handle skip mode: when an invalid char returns -2, we retry
|
|
467
|
+
// with the next character instead of recursing (avoids stack overflow
|
|
468
|
+
// on long runs of consecutive invalid characters).
|
|
469
|
+
for (;;) {
|
|
470
|
+
const { chunk } = this;
|
|
471
|
+
const i = this.i;
|
|
472
|
+
this.prevI = i;
|
|
473
|
+
this.i = i + 1;
|
|
474
|
+
if (i >= chunk.length) {
|
|
475
|
+
return -1;
|
|
476
|
+
}
|
|
477
|
+
const code = chunk.charCodeAt(i);
|
|
478
|
+
// Ultra-fast path: printable ASCII (0x20-0x7E) — the vast majority of XML content.
|
|
479
|
+
// No validation needed; these are always valid XML 1.0 characters.
|
|
480
|
+
if (code >= 0x20 && code <= 0x7e) {
|
|
481
|
+
if (this.trackPosition) {
|
|
482
|
+
this.column++;
|
|
483
|
+
}
|
|
484
|
+
return code;
|
|
396
485
|
}
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
// Handle surrogates
|
|
400
|
-
if (code >= 0xd800 && code <= 0xdbff) {
|
|
401
|
-
const next = chunk.charCodeAt(i + 1);
|
|
402
|
-
if (next >= 0xdc00 && next <= 0xdfff) {
|
|
403
|
-
this.i = i + 2;
|
|
486
|
+
// Secondary fast path: TAB (0x09) — common in attribute values
|
|
487
|
+
if (code === TAB) {
|
|
404
488
|
if (this.trackPosition) {
|
|
405
489
|
this.column++;
|
|
406
490
|
}
|
|
407
|
-
return
|
|
491
|
+
return code;
|
|
408
492
|
}
|
|
409
|
-
//
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
493
|
+
// Handle CR (normalize CR and CR+LF to LF per XML 1.0 §2.11)
|
|
494
|
+
if (code === CR) {
|
|
495
|
+
if (chunk.charCodeAt(i + 1) === NL) {
|
|
496
|
+
this.i = i + 2;
|
|
497
|
+
}
|
|
498
|
+
if (this.trackPosition) {
|
|
499
|
+
this.line++;
|
|
500
|
+
this.column = 0;
|
|
501
|
+
this.positionAtNewLine = this.position;
|
|
502
|
+
}
|
|
503
|
+
return NL;
|
|
504
|
+
}
|
|
505
|
+
// Handle LF
|
|
506
|
+
if (code === NL) {
|
|
507
|
+
if (this.trackPosition) {
|
|
508
|
+
this.line++;
|
|
509
|
+
this.column = 0;
|
|
510
|
+
this.positionAtNewLine = this.position;
|
|
511
|
+
}
|
|
512
|
+
return NL;
|
|
513
|
+
}
|
|
514
|
+
// Handle surrogates
|
|
515
|
+
if (code >= 0xd800 && code <= 0xdbff) {
|
|
516
|
+
const next = chunk.charCodeAt(i + 1);
|
|
517
|
+
if (next >= 0xdc00 && next <= 0xdfff) {
|
|
518
|
+
this.i = i + 2;
|
|
519
|
+
if (this.trackPosition) {
|
|
520
|
+
this.column++;
|
|
521
|
+
}
|
|
522
|
+
return 0x10000 + ((code - 0xd800) * 0x400 + (next - 0xdc00));
|
|
523
|
+
}
|
|
524
|
+
// Lone high surrogate — invalid XML character
|
|
525
|
+
const result = this.handleInvalidChar(code, "lone surrogate");
|
|
526
|
+
if (result !== -2) {
|
|
527
|
+
return result;
|
|
528
|
+
}
|
|
529
|
+
continue; // skip: loop to next char
|
|
530
|
+
}
|
|
531
|
+
// Lone low surrogate — invalid XML character
|
|
532
|
+
if (code >= 0xdc00 && code <= 0xdfff) {
|
|
533
|
+
const result = this.handleInvalidChar(code, "lone surrogate");
|
|
534
|
+
if (result !== -2) {
|
|
535
|
+
return result;
|
|
536
|
+
}
|
|
537
|
+
continue;
|
|
538
|
+
}
|
|
539
|
+
// Non-ASCII above surrogate range (0x80-0xD7FF, 0xE000-0xFFFD) — all valid XML
|
|
540
|
+
if (code >= 0x80) {
|
|
541
|
+
if (this.trackPosition) {
|
|
542
|
+
this.column++;
|
|
543
|
+
}
|
|
544
|
+
// Reject 0xFFFE and 0xFFFF
|
|
545
|
+
if (code === 0xfffe || code === 0xffff) {
|
|
546
|
+
const result = this.handleInvalidChar(code);
|
|
547
|
+
if (result !== -2) {
|
|
548
|
+
return result;
|
|
549
|
+
}
|
|
550
|
+
continue;
|
|
551
|
+
}
|
|
552
|
+
return code;
|
|
553
|
+
}
|
|
554
|
+
// Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
|
|
555
|
+
// All invalid in XML 1.0
|
|
418
556
|
if (this.trackPosition) {
|
|
419
557
|
this.column++;
|
|
420
558
|
}
|
|
421
|
-
|
|
422
|
-
if (
|
|
423
|
-
|
|
559
|
+
const result = this.handleInvalidChar(code);
|
|
560
|
+
if (result !== -2) {
|
|
561
|
+
return result;
|
|
424
562
|
}
|
|
425
|
-
|
|
426
|
-
}
|
|
427
|
-
// Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
|
|
428
|
-
// All invalid in XML 1.0
|
|
429
|
-
if (this.trackPosition) {
|
|
430
|
-
this.column++;
|
|
563
|
+
// skip: continue to next char
|
|
431
564
|
}
|
|
432
|
-
this.fail("invalid XML character: 0x" + code.toString(16));
|
|
433
|
-
return code;
|
|
434
565
|
}
|
|
435
566
|
unget() {
|
|
436
567
|
this.i = this.prevI;
|
|
@@ -637,16 +768,16 @@ class SaxParser {
|
|
|
637
768
|
}
|
|
638
769
|
else {
|
|
639
770
|
this.i++;
|
|
640
|
-
this.
|
|
771
|
+
start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
|
|
641
772
|
}
|
|
642
773
|
}
|
|
643
774
|
else if (code >= 0xdc00 && code <= 0xdfff) {
|
|
644
775
|
this.i++;
|
|
645
|
-
this.
|
|
776
|
+
start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
|
|
646
777
|
}
|
|
647
778
|
else if (code === 0xfffe || code === 0xffff) {
|
|
648
779
|
this.i++;
|
|
649
|
-
this.
|
|
780
|
+
start = this.handleInvalidCharInText(code, handler, start);
|
|
650
781
|
}
|
|
651
782
|
else {
|
|
652
783
|
this.i++;
|
|
@@ -662,7 +793,7 @@ class SaxParser {
|
|
|
662
793
|
if (this.trackPosition) {
|
|
663
794
|
this.column++;
|
|
664
795
|
}
|
|
665
|
-
this.
|
|
796
|
+
start = this.handleInvalidCharInText(code, handler, start);
|
|
666
797
|
}
|
|
667
798
|
// End of chunk
|
|
668
799
|
if (handler && start < this.i) {
|
|
@@ -674,14 +805,42 @@ class SaxParser {
|
|
|
674
805
|
let { i: start } = this;
|
|
675
806
|
const handler = this._handlers.text;
|
|
676
807
|
let nonSpace = false;
|
|
808
|
+
const isSkip = this.invalidCharHandling === "skip";
|
|
809
|
+
const isReplace = this.invalidCharHandling === "replace";
|
|
677
810
|
while (true) {
|
|
811
|
+
const iBeforeGet = this.i;
|
|
678
812
|
const c = this.getCode();
|
|
679
813
|
if (c === -1) {
|
|
680
|
-
if (handler && start <
|
|
681
|
-
this.text += chunk.slice(start,
|
|
814
|
+
if (handler && start < iBeforeGet) {
|
|
815
|
+
this.text += chunk.slice(start, iBeforeGet);
|
|
682
816
|
}
|
|
683
817
|
break;
|
|
684
818
|
}
|
|
819
|
+
// In skip mode, getCode() may have internally looped past invalid chars.
|
|
820
|
+
// Flush valid text before the gap and advance start past it.
|
|
821
|
+
if (isSkip && this.prevI > iBeforeGet) {
|
|
822
|
+
if (handler && start < iBeforeGet) {
|
|
823
|
+
this.text += chunk.slice(start, iBeforeGet);
|
|
824
|
+
}
|
|
825
|
+
start = this.prevI;
|
|
826
|
+
}
|
|
827
|
+
// In replace mode, getCode() returns REPLACEMENT_CHAR for invalid chars
|
|
828
|
+
// but the original byte is still in the chunk. Detect this by checking
|
|
829
|
+
// whether getCode() returned REPLACEMENT_CHAR while the raw chunk byte
|
|
830
|
+
// at prevI is NOT U+FFFD (i.e., it was substituted by handleInvalidChar).
|
|
831
|
+
if (isReplace &&
|
|
832
|
+
c === REPLACEMENT_CHAR &&
|
|
833
|
+
chunk.charCodeAt(this.prevI) !== REPLACEMENT_CHAR) {
|
|
834
|
+
if (handler) {
|
|
835
|
+
if (start < this.prevI) {
|
|
836
|
+
this.text += chunk.slice(start, this.prevI);
|
|
837
|
+
}
|
|
838
|
+
this.text += REPLACEMENT_STR;
|
|
839
|
+
}
|
|
840
|
+
start = this.i;
|
|
841
|
+
nonSpace = true;
|
|
842
|
+
continue;
|
|
843
|
+
}
|
|
685
844
|
if (c === LESS) {
|
|
686
845
|
if (handler) {
|
|
687
846
|
const slice = chunk.slice(start, this.prevI);
|
|
@@ -1058,13 +1217,43 @@ class SaxParser {
|
|
|
1058
1217
|
start = this.i;
|
|
1059
1218
|
continue;
|
|
1060
1219
|
}
|
|
1061
|
-
//
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1220
|
+
// Non-ASCII (>= 0x80) — mostly valid, handle inline like handleTextInRoot
|
|
1221
|
+
if (code >= 0x80) {
|
|
1222
|
+
this.prevI = this.i;
|
|
1223
|
+
if (code >= 0xd800 && code <= 0xdbff) {
|
|
1224
|
+
const next = chunk.charCodeAt(this.i + 1);
|
|
1225
|
+
if (next >= 0xdc00 && next <= 0xdfff) {
|
|
1226
|
+
this.i += 2; // valid surrogate pair
|
|
1227
|
+
}
|
|
1228
|
+
else {
|
|
1229
|
+
this.i++;
|
|
1230
|
+
start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
|
|
1231
|
+
}
|
|
1232
|
+
}
|
|
1233
|
+
else if (code >= 0xdc00 && code <= 0xdfff) {
|
|
1234
|
+
this.i++;
|
|
1235
|
+
start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
|
|
1236
|
+
}
|
|
1237
|
+
else if (code === 0xfffe || code === 0xffff) {
|
|
1238
|
+
this.i++;
|
|
1239
|
+
start = this.handleInvalidCharInAttr(code, start);
|
|
1240
|
+
}
|
|
1241
|
+
else {
|
|
1242
|
+
this.i++; // valid non-ASCII BMP char
|
|
1243
|
+
}
|
|
1244
|
+
if (this.trackPosition) {
|
|
1245
|
+
this.column++;
|
|
1246
|
+
}
|
|
1247
|
+
continue;
|
|
1248
|
+
}
|
|
1249
|
+
// Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
|
|
1250
|
+
// All invalid in XML 1.0
|
|
1251
|
+
this.prevI = this.i;
|
|
1252
|
+
this.i++;
|
|
1253
|
+
if (this.trackPosition) {
|
|
1254
|
+
this.column++;
|
|
1066
1255
|
}
|
|
1067
|
-
|
|
1256
|
+
start = this.handleInvalidCharInAttr(code, start);
|
|
1068
1257
|
}
|
|
1069
1258
|
// End of chunk
|
|
1070
1259
|
this.text += chunk.slice(start, this.i);
|
|
@@ -42,7 +42,8 @@ function parseXmlToObject(xml, options) {
|
|
|
42
42
|
position: false,
|
|
43
43
|
fragment: options?.fragment ?? false,
|
|
44
44
|
maxDepth: options?.maxDepth,
|
|
45
|
-
maxEntityExpansions: options?.maxEntityExpansions
|
|
45
|
+
maxEntityExpansions: options?.maxEntityExpansions,
|
|
46
|
+
invalidCharHandling: options?.invalidCharHandling
|
|
46
47
|
});
|
|
47
48
|
// Stack: bottom is a synthetic root frame that collects the document root.
|
|
48
49
|
const syntheticObj = Object.create(null);
|
|
@@ -169,6 +169,15 @@ export interface SaxHandlers {
|
|
|
169
169
|
pi?: (target: string, body: string) => void;
|
|
170
170
|
error?: (err: Error) => void;
|
|
171
171
|
}
|
|
172
|
+
/**
|
|
173
|
+
* Strategy for handling invalid XML characters (control chars, lone surrogates,
|
|
174
|
+
* non-characters like U+FFFE/U+FFFF).
|
|
175
|
+
*
|
|
176
|
+
* - `"error"` — Report via error handler or throw (XML 1.0 strict). **Default.**
|
|
177
|
+
* - `"skip"` — Silently remove the invalid character from the output.
|
|
178
|
+
* - `"replace"` — Replace the invalid character with U+FFFD (REPLACEMENT CHARACTER).
|
|
179
|
+
*/
|
|
180
|
+
export type InvalidCharHandling = "error" | "skip" | "replace";
|
|
172
181
|
/** SAX parser options. */
|
|
173
182
|
export interface SaxOptions {
|
|
174
183
|
/** Track position (line/column) for error messages. Default: true */
|
|
@@ -187,6 +196,17 @@ export interface SaxOptions {
|
|
|
187
196
|
* Default: 10000. Set 0 to disable.
|
|
188
197
|
*/
|
|
189
198
|
maxEntityExpansions?: number;
|
|
199
|
+
/**
|
|
200
|
+
* How to handle invalid XML characters (ASCII control chars, lone surrogates,
|
|
201
|
+
* non-characters U+FFFE/U+FFFF, DEL U+007F, etc.).
|
|
202
|
+
*
|
|
203
|
+
* - `"error"` — Report via error handler or throw. **(Default)**
|
|
204
|
+
* - `"skip"` — Silently discard the character.
|
|
205
|
+
* - `"replace"` — Replace with U+FFFD (REPLACEMENT CHARACTER).
|
|
206
|
+
*
|
|
207
|
+
* @default "error"
|
|
208
|
+
*/
|
|
209
|
+
invalidCharHandling?: InvalidCharHandling;
|
|
190
210
|
}
|
|
191
211
|
/**
|
|
192
212
|
* Minimal writable interface for XmlStreamWriter.
|
|
@@ -265,6 +285,8 @@ export interface ParseXmlToObjectOptions extends ToPlainObjectOptions {
|
|
|
265
285
|
maxDepth?: number;
|
|
266
286
|
/** Maximum total entity expansions. Default: 10000. */
|
|
267
287
|
maxEntityExpansions?: number;
|
|
288
|
+
/** How to handle invalid XML characters. Default: "error". */
|
|
289
|
+
invalidCharHandling?: InvalidCharHandling;
|
|
268
290
|
}
|
|
269
291
|
/** Options for `parseXml()`. */
|
|
270
292
|
export interface XmlParseOptions {
|
|
@@ -282,5 +304,7 @@ export interface XmlParseOptions {
|
|
|
282
304
|
maxDepth?: number;
|
|
283
305
|
/** Maximum total entity expansions. Default: 10000. */
|
|
284
306
|
maxEntityExpansions?: number;
|
|
307
|
+
/** How to handle invalid XML characters. Default: "error". */
|
|
308
|
+
invalidCharHandling?: InvalidCharHandling;
|
|
285
309
|
}
|
|
286
310
|
export {};
|
|
@@ -46,7 +46,7 @@ class HyperlinkReader extends event_emitter_1.EventEmitter {
|
|
|
46
46
|
return;
|
|
47
47
|
}
|
|
48
48
|
try {
|
|
49
|
-
const parser = new sax_1.SaxParser({ position: false });
|
|
49
|
+
const parser = new sax_1.SaxParser({ position: false, invalidCharHandling: "skip" });
|
|
50
50
|
const decoder = new TextDecoder("utf-8", { fatal: true });
|
|
51
51
|
parser.on("opentag", (node) => {
|
|
52
52
|
if (node.name !== "Relationship") {
|
|
@@ -214,7 +214,7 @@ class WorkbookReaderBase extends event_emitter_1.EventEmitter {
|
|
|
214
214
|
// For "cache" mode, use direct SAX callbacks (no event objects, no async generator overhead)
|
|
215
215
|
if (this.options.sharedStrings === "cache") {
|
|
216
216
|
const sharedStrings = this.sharedStrings;
|
|
217
|
-
const parser = new sax_1.SaxParser({ position: false });
|
|
217
|
+
const parser = new sax_1.SaxParser({ position: false, invalidCharHandling: "skip" });
|
|
218
218
|
parser.on("opentag", (node) => {
|
|
219
219
|
switch (node.name) {
|
|
220
220
|
case "b":
|
|
@@ -314,7 +314,7 @@ class WorkbookReaderBase extends event_emitter_1.EventEmitter {
|
|
|
314
314
|
return;
|
|
315
315
|
}
|
|
316
316
|
// "emit" mode — must yield, so use direct SAX with per-chunk yield
|
|
317
|
-
const emitParser = new sax_1.SaxParser();
|
|
317
|
+
const emitParser = new sax_1.SaxParser({ invalidCharHandling: "skip" });
|
|
318
318
|
const emitDecoder = new TextDecoder("utf-8", { fatal: true });
|
|
319
319
|
let pendingEmits = [];
|
|
320
320
|
emitParser.on("opentag", (node) => {
|
|
@@ -146,7 +146,7 @@ class WorksheetReader extends event_emitter_1.EventEmitter {
|
|
|
146
146
|
// Direct SAX callback mode — zero intermediate event objects.
|
|
147
147
|
// We collect worksheet events per-chunk and yield them.
|
|
148
148
|
let worksheetEvents = null;
|
|
149
|
-
const parser = new sax_1.SaxParser({ position: false });
|
|
149
|
+
const parser = new sax_1.SaxParser({ position: false, invalidCharHandling: "skip" });
|
|
150
150
|
parser.on("opentag", (node) => {
|
|
151
151
|
if (emitSheet) {
|
|
152
152
|
switch (node.name) {
|
|
@@ -162,7 +162,7 @@ class BaseXform {
|
|
|
162
162
|
* Use this instead of parse(parseSax(stream)) for hot paths.
|
|
163
163
|
*/
|
|
164
164
|
async parseStreamDirect(stream) {
|
|
165
|
-
const parser = new sax_1.SaxParser();
|
|
165
|
+
const parser = new sax_1.SaxParser({ invalidCharHandling: "skip" });
|
|
166
166
|
const decoder = new TextDecoder("utf-8", { fatal: true });
|
|
167
167
|
let done = false;
|
|
168
168
|
let finalModel;
|