@cj-tech-master/excelts 7.5.0 → 7.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. package/dist/browser/modules/excel/stream/hyperlink-reader.js +1 -1
  2. package/dist/browser/modules/excel/stream/workbook-reader.browser.js +2 -2
  3. package/dist/browser/modules/excel/stream/worksheet-reader.js +1 -1
  4. package/dist/browser/modules/excel/xlsx/xform/base-xform.js +1 -1
  5. package/dist/browser/modules/xml/dom.js +2 -1
  6. package/dist/browser/modules/xml/index.d.ts +1 -1
  7. package/dist/browser/modules/xml/sax.d.ts +41 -0
  8. package/dist/browser/modules/xml/sax.js +265 -76
  9. package/dist/browser/modules/xml/to-object.js +2 -1
  10. package/dist/browser/modules/xml/types.d.ts +24 -0
  11. package/dist/cjs/modules/excel/stream/hyperlink-reader.js +1 -1
  12. package/dist/cjs/modules/excel/stream/workbook-reader.browser.js +2 -2
  13. package/dist/cjs/modules/excel/stream/worksheet-reader.js +1 -1
  14. package/dist/cjs/modules/excel/xlsx/xform/base-xform.js +1 -1
  15. package/dist/cjs/modules/xml/dom.js +2 -1
  16. package/dist/cjs/modules/xml/sax.js +265 -76
  17. package/dist/cjs/modules/xml/to-object.js +2 -1
  18. package/dist/esm/modules/excel/stream/hyperlink-reader.js +1 -1
  19. package/dist/esm/modules/excel/stream/workbook-reader.browser.js +2 -2
  20. package/dist/esm/modules/excel/stream/worksheet-reader.js +1 -1
  21. package/dist/esm/modules/excel/xlsx/xform/base-xform.js +1 -1
  22. package/dist/esm/modules/xml/dom.js +2 -1
  23. package/dist/esm/modules/xml/sax.js +265 -76
  24. package/dist/esm/modules/xml/to-object.js +2 -1
  25. package/dist/iife/excelts.iife.js +196 -54
  26. package/dist/iife/excelts.iife.js.map +1 -1
  27. package/dist/iife/excelts.iife.min.js +44 -44
  28. package/dist/types/modules/xml/index.d.ts +1 -1
  29. package/dist/types/modules/xml/sax.d.ts +41 -0
  30. package/dist/types/modules/xml/types.d.ts +24 -0
  31. package/package.json +1 -1
@@ -43,7 +43,7 @@ class HyperlinkReader extends EventEmitter {
43
43
  return;
44
44
  }
45
45
  try {
46
- const parser = new SaxParser({ position: false });
46
+ const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
47
47
  const decoder = new TextDecoder("utf-8", { fatal: true });
48
48
  parser.on("opentag", (node) => {
49
49
  if (node.name !== "Relationship") {
@@ -211,7 +211,7 @@ export class WorkbookReaderBase extends EventEmitter {
211
211
  // For "cache" mode, use direct SAX callbacks (no event objects, no async generator overhead)
212
212
  if (this.options.sharedStrings === "cache") {
213
213
  const sharedStrings = this.sharedStrings;
214
- const parser = new SaxParser({ position: false });
214
+ const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
215
215
  parser.on("opentag", (node) => {
216
216
  switch (node.name) {
217
217
  case "b":
@@ -311,7 +311,7 @@ export class WorkbookReaderBase extends EventEmitter {
311
311
  return;
312
312
  }
313
313
  // "emit" mode — must yield, so use direct SAX with per-chunk yield
314
- const emitParser = new SaxParser();
314
+ const emitParser = new SaxParser({ invalidCharHandling: "skip" });
315
315
  const emitDecoder = new TextDecoder("utf-8", { fatal: true });
316
316
  let pendingEmits = [];
317
317
  emitParser.on("opentag", (node) => {
@@ -143,7 +143,7 @@ class WorksheetReader extends EventEmitter {
143
143
  // Direct SAX callback mode — zero intermediate event objects.
144
144
  // We collect worksheet events per-chunk and yield them.
145
145
  let worksheetEvents = null;
146
- const parser = new SaxParser({ position: false });
146
+ const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
147
147
  parser.on("opentag", (node) => {
148
148
  if (emitSheet) {
149
149
  switch (node.name) {
@@ -159,7 +159,7 @@ class BaseXform {
159
159
  * Use this instead of parse(parseSax(stream)) for hot paths.
160
160
  */
161
161
  async parseStreamDirect(stream) {
162
- const parser = new SaxParser();
162
+ const parser = new SaxParser({ invalidCharHandling: "skip" });
163
163
  const decoder = new TextDecoder("utf-8", { fatal: true });
164
164
  let done = false;
165
165
  let finalModel;
@@ -87,7 +87,8 @@ function parseXml(xml, options) {
87
87
  fragment: options?.fragment ?? false,
88
88
  xmlns: options?.xmlns ?? false,
89
89
  maxDepth: options?.maxDepth,
90
- maxEntityExpansions: options?.maxEntityExpansions
90
+ maxEntityExpansions: options?.maxEntityExpansions,
91
+ invalidCharHandling: options?.invalidCharHandling
91
92
  });
92
93
  // Stack of elements being built. The bottom is a synthetic root
93
94
  // that collects top-level nodes.
@@ -10,7 +10,7 @@
10
10
  * - Dual-mode: streaming (SAX parser + stream writer) and buffered (DOM parser + writer)
11
11
  * - Shared XmlSink interface lets rendering code target both modes transparently
12
12
  */
13
- export type { XmlAttributes, XmlNodeType, XmlElement, XmlText, XmlCData, XmlComment, XmlProcessingInstruction, XmlNode, XmlDocument, XmlSink, SaxTag, SaxEvent, SaxEventAny, SaxHandlers, SaxOptions, WritableTarget, XmlParseOptions, ToPlainObjectOptions, ParseXmlToObjectOptions } from "./types.js";
13
+ export type { XmlAttributes, XmlNodeType, XmlElement, XmlText, XmlCData, XmlComment, XmlProcessingInstruction, XmlNode, XmlDocument, XmlSink, SaxTag, SaxEvent, SaxEventAny, SaxHandlers, SaxOptions, InvalidCharHandling, WritableTarget, XmlParseOptions, ToPlainObjectOptions, ParseXmlToObjectOptions } from "./types.js";
14
14
  export { xmlEncode, xmlDecode, xmlEncodeAttr, validateXmlName, encodeCData, validateCommentText } from "./encode.js";
15
15
  export { XmlWriter, StdDocAttributes } from "./writer.js";
16
16
  export { XmlStreamWriter } from "./stream-writer.js";
@@ -37,6 +37,7 @@ declare class SaxParser {
37
37
  private xmlns;
38
38
  private maxDepth;
39
39
  private maxEntityExpansions;
40
+ private invalidCharHandling;
40
41
  private _entityExpansionCount;
41
42
  private _nsStack;
42
43
  private state;
@@ -81,6 +82,46 @@ declare class SaxParser {
81
82
  fail(message: string): this;
82
83
  write(chunk: string | null): this;
83
84
  close(): this;
85
+ /**
86
+ * Handle an invalid XML character according to the configured strategy.
87
+ *
88
+ * Called from the `getCode()` loop for each invalid code unit; the fast text and
88
+ * attribute loops use `handleInvalidCharInText()` / `handleInvalidCharInAttr()` instead.
90
+ *
91
+ * - `"error"`: call `fail()` and return the original code.
92
+ * - `"skip"`: return `-2` as a sentinel so the `getCode()` loop advances to the next character.
93
+ * - `"replace"`: return `REPLACEMENT_CHAR`.
94
+ *
95
+ * Note: `getCode()` retries in a loop rather than recursing when the skip sentinel is returned.
96
+ *
97
+ * @param code - The invalid character code point.
98
+ * @param kind - Optional description (e.g. "lone surrogate") for error messages.
99
+ * @returns The code point to use, or `-2` in `"skip"` mode.
100
+ */
101
+ private handleInvalidChar;
102
+ /**
103
+ * Handle an invalid character inside the `handleTextInRoot()` fast loop.
104
+ *
105
+ * Unlike `handleInvalidChar()` (which returns a code point for `getCode()`),
106
+ * this method manages the text accumulation state (`this.text`, `start`) that
107
+ * the fast text loop relies on.
108
+ *
109
+ * - `"error"`: call `fail()`, leave text accumulation unchanged (char stays in output).
110
+ * - `"skip"`: flush text up to the invalid char, skip it, return new `start`.
111
+ * - `"replace"`: flush text up to the invalid char, append U+FFFD, return new `start`.
112
+ *
113
+ * @returns The updated `start` index for the text accumulation loop.
114
+ */
115
+ private handleInvalidCharInText;
116
+ /**
117
+ * Handle an invalid character inside `sAttribValueQuoted()`.
118
+ *
119
+ * Same pattern as `handleInvalidCharInText()` but for attribute value
120
+ * accumulation (always uses `this.text`, no conditional handler check).
121
+ *
122
+ * @returns The updated `start` index.
123
+ */
124
+ private handleInvalidCharInAttr;
84
125
  private getCode;
85
126
  private unget;
86
127
  private processState;
@@ -35,6 +35,8 @@ const GREATER = 0x3e; // >
35
35
  const QUESTION = 0x3f; // ?
36
36
  const OPEN_BRACKET = 0x5b; // [
37
37
  const CLOSE_BRACKET = 0x5d; // ]
38
+ const REPLACEMENT_CHAR = 0xfffd; // U+FFFD REPLACEMENT CHARACTER
39
+ const REPLACEMENT_STR = "\uFFFD"; // Pre-allocated string form of U+FFFD
38
40
  // =============================================================================
39
41
  // Pre-computed Lookup Tables
40
42
  // =============================================================================
@@ -233,6 +235,7 @@ class SaxParser {
233
235
  this.maxDepth = options?.maxDepth !== undefined ? options.maxDepth : 256;
234
236
  this.maxEntityExpansions =
235
237
  options?.maxEntityExpansions !== undefined ? options.maxEntityExpansions : 10000;
238
+ this.invalidCharHandling = options?.invalidCharHandling ?? "error";
236
239
  this._init();
237
240
  }
238
241
  get closed() {
@@ -350,87 +353,215 @@ class SaxParser {
350
353
  return this.write(null);
351
354
  }
352
355
  // ===========================================================================
353
- // Character Reading
356
+ // Invalid Character Handling
354
357
  // ===========================================================================
355
- getCode() {
356
- const { chunk, i } = this;
357
- this.prevI = i;
358
- this.i = i + 1;
359
- if (i >= chunk.length) {
360
- return -1;
361
- }
362
- const code = chunk.charCodeAt(i);
363
- // Ultra-fast path: printable ASCII (0x20-0x7E) — the vast majority of XML content.
364
- // No validation needed; these are always valid XML 1.0 characters.
365
- if (code >= 0x20 && code <= 0x7e) {
366
- if (this.trackPosition) {
367
- this.column++;
358
+ /**
359
+ * Handle an invalid XML character according to the configured strategy.
360
+ *
361
+ * Called from the `getCode()` loop for each invalid code unit; the fast text and
362
+ * attribute loops use `handleInvalidCharInText()` / `handleInvalidCharInAttr()` instead.
363
+ *
364
+ * - `"error"`: call `fail()` and return the original code.
365
+ * - `"skip"`: return `-2` as a sentinel so the `getCode()` loop advances to the next character.
366
+ * - `"replace"`: return `REPLACEMENT_CHAR`.
367
+ *
368
+ * Note: `getCode()` retries in a loop rather than recursing when the skip sentinel is returned.
369
+ *
370
+ * @param code - The invalid character code point.
371
+ * @param kind - Optional description (e.g. "lone surrogate") for error messages.
372
+ * @returns The code point to use, or `-2` in `"skip"` mode.
373
+ */
374
+ handleInvalidChar(code, kind) {
375
+ switch (this.invalidCharHandling) {
376
+ case "replace":
377
+ return REPLACEMENT_CHAR;
378
+ case "skip":
379
+ // Caller is responsible for the actual skip logic.
380
+ // We return -2 as a sentinel to tell getCode()'s loop to continue.
381
+ return -2;
382
+ default: {
383
+ // "error" — existing strict behavior
384
+ const label = kind
385
+ ? `invalid XML character: ${kind} 0x${code.toString(16)}`
386
+ : `invalid XML character: 0x${code.toString(16)}`;
387
+ this.fail(label);
388
+ return code;
368
389
  }
369
- return code;
370
390
  }
371
- // Secondary fast path: TAB (0x09) — common in attribute values
372
- if (code === TAB) {
373
- if (this.trackPosition) {
374
- this.column++;
391
+ }
392
+ /**
393
+ * Handle an invalid character inside the `handleTextInRoot()` fast loop.
394
+ *
395
+ * Unlike `handleInvalidChar()` (which returns a code point for `getCode()`),
396
+ * this method manages the text accumulation state (`this.text`, `start`) that
397
+ * the fast text loop relies on.
398
+ *
399
+ * - `"error"`: call `fail()`, leave text accumulation unchanged (char stays in output).
400
+ * - `"skip"`: flush text up to the invalid char, skip it, return new `start`.
401
+ * - `"replace"`: flush text up to the invalid char, append U+FFFD, return new `start`.
402
+ *
403
+ * @returns The updated `start` index for the text accumulation loop.
404
+ */
405
+ handleInvalidCharInText(code, handler, start, kind) {
406
+ switch (this.invalidCharHandling) {
407
+ case "skip":
408
+ // Flush text accumulated before this invalid char, then skip it
409
+ if (handler && start < this.prevI) {
410
+ this.text += this.chunk.slice(start, this.prevI);
411
+ }
412
+ return this.i;
413
+ case "replace":
414
+ // Flush text accumulated before this invalid char, append replacement
415
+ if (handler) {
416
+ if (start < this.prevI) {
417
+ this.text += this.chunk.slice(start, this.prevI);
418
+ }
419
+ this.text += REPLACEMENT_STR;
420
+ }
421
+ return this.i;
422
+ default: {
423
+ // "error" — existing strict behavior, char stays in output
424
+ const label = kind
425
+ ? `invalid XML character: ${kind} 0x${code.toString(16)}`
426
+ : `invalid XML character: 0x${code.toString(16)}`;
427
+ this.fail(label);
428
+ return start;
375
429
  }
376
- return code;
377
430
  }
378
- // Handle CR (normalize CR and CR+LF to LF per XML 1.0 §2.11)
379
- if (code === CR) {
380
- if (chunk.charCodeAt(i + 1) === NL) {
381
- this.i = i + 2;
382
- }
383
- if (this.trackPosition) {
384
- this.line++;
385
- this.column = 0;
386
- this.positionAtNewLine = this.position;
431
+ }
432
+ /**
433
+ * Handle an invalid character inside `sAttribValueQuoted()`.
434
+ *
435
+ * Same pattern as `handleInvalidCharInText()` but for attribute value
436
+ * accumulation (always uses `this.text`, no conditional handler check).
437
+ *
438
+ * @returns The updated `start` index.
439
+ */
440
+ handleInvalidCharInAttr(code, start, kind) {
441
+ switch (this.invalidCharHandling) {
442
+ case "skip":
443
+ if (start < this.prevI) {
444
+ this.text += this.chunk.slice(start, this.prevI);
445
+ }
446
+ return this.i;
447
+ case "replace":
448
+ if (start < this.prevI) {
449
+ this.text += this.chunk.slice(start, this.prevI);
450
+ }
451
+ this.text += REPLACEMENT_STR;
452
+ return this.i;
453
+ default: {
454
+ const label = kind
455
+ ? `invalid XML character: ${kind} 0x${code.toString(16)}`
456
+ : `invalid XML character: 0x${code.toString(16)}`;
457
+ this.fail(label);
458
+ return start;
387
459
  }
388
- return NL;
389
460
  }
390
- // Handle LF
391
- if (code === NL) {
392
- if (this.trackPosition) {
393
- this.line++;
394
- this.column = 0;
395
- this.positionAtNewLine = this.position;
461
+ }
462
+ // ===========================================================================
463
+ // Character Reading
464
+ // ===========================================================================
465
+ getCode() {
466
+ // Loop to handle skip mode: when an invalid char returns -2, we retry
467
+ // with the next character instead of recursing (avoids stack overflow
468
+ // on long runs of consecutive invalid characters).
469
+ for (;;) {
470
+ const { chunk } = this;
471
+ const i = this.i;
472
+ this.prevI = i;
473
+ this.i = i + 1;
474
+ if (i >= chunk.length) {
475
+ return -1;
476
+ }
477
+ const code = chunk.charCodeAt(i);
478
+ // Ultra-fast path: printable ASCII (0x20-0x7E) — the vast majority of XML content.
479
+ // No validation needed; these are always valid XML 1.0 characters.
480
+ if (code >= 0x20 && code <= 0x7e) {
481
+ if (this.trackPosition) {
482
+ this.column++;
483
+ }
484
+ return code;
396
485
  }
397
- return NL;
398
- }
399
- // Handle surrogates
400
- if (code >= 0xd800 && code <= 0xdbff) {
401
- const next = chunk.charCodeAt(i + 1);
402
- if (next >= 0xdc00 && next <= 0xdfff) {
403
- this.i = i + 2;
486
+ // Secondary fast path: TAB (0x09) — common in attribute values
487
+ if (code === TAB) {
404
488
  if (this.trackPosition) {
405
489
  this.column++;
406
490
  }
407
- return 0x10000 + ((code - 0xd800) * 0x400 + (next - 0xdc00));
491
+ return code;
408
492
  }
409
- // Lone high surrogate invalid XML character
410
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
411
- }
412
- // Lone low surrogate — invalid XML character
413
- if (code >= 0xdc00 && code <= 0xdfff) {
414
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
415
- }
416
- // Non-ASCII above surrogate range (0x80-0xD7FF, 0xE000-0xFFFD) — all valid XML
417
- if (code >= 0x80) {
493
+ // Handle CR (normalize CR and CR+LF to LF per XML 1.0 §2.11)
494
+ if (code === CR) {
495
+ if (chunk.charCodeAt(i + 1) === NL) {
496
+ this.i = i + 2;
497
+ }
498
+ if (this.trackPosition) {
499
+ this.line++;
500
+ this.column = 0;
501
+ this.positionAtNewLine = this.position;
502
+ }
503
+ return NL;
504
+ }
505
+ // Handle LF
506
+ if (code === NL) {
507
+ if (this.trackPosition) {
508
+ this.line++;
509
+ this.column = 0;
510
+ this.positionAtNewLine = this.position;
511
+ }
512
+ return NL;
513
+ }
514
+ // Handle surrogates
515
+ if (code >= 0xd800 && code <= 0xdbff) {
516
+ const next = chunk.charCodeAt(i + 1);
517
+ if (next >= 0xdc00 && next <= 0xdfff) {
518
+ this.i = i + 2;
519
+ if (this.trackPosition) {
520
+ this.column++;
521
+ }
522
+ return 0x10000 + ((code - 0xd800) * 0x400 + (next - 0xdc00));
523
+ }
524
+ // Lone high surrogate — invalid XML character
525
+ const result = this.handleInvalidChar(code, "lone surrogate");
526
+ if (result !== -2) {
527
+ return result;
528
+ }
529
+ continue; // skip: loop to next char
530
+ }
531
+ // Lone low surrogate — invalid XML character
532
+ if (code >= 0xdc00 && code <= 0xdfff) {
533
+ const result = this.handleInvalidChar(code, "lone surrogate");
534
+ if (result !== -2) {
535
+ return result;
536
+ }
537
+ continue;
538
+ }
539
+ // Non-ASCII above surrogate range (0x80-0xD7FF, 0xE000-0xFFFD) — all valid XML
540
+ if (code >= 0x80) {
541
+ if (this.trackPosition) {
542
+ this.column++;
543
+ }
544
+ // Reject 0xFFFE and 0xFFFF
545
+ if (code === 0xfffe || code === 0xffff) {
546
+ const result = this.handleInvalidChar(code);
547
+ if (result !== -2) {
548
+ return result;
549
+ }
550
+ continue;
551
+ }
552
+ return code;
553
+ }
554
+ // Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
555
+ // All invalid in XML 1.0
418
556
  if (this.trackPosition) {
419
557
  this.column++;
420
558
  }
421
- // Reject 0xFFFE and 0xFFFF
422
- if (code === 0xfffe || code === 0xffff) {
423
- this.fail("invalid XML character: 0x" + code.toString(16));
559
+ const result = this.handleInvalidChar(code);
560
+ if (result !== -2) {
561
+ return result;
424
562
  }
425
- return code;
426
- }
427
- // Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
428
- // All invalid in XML 1.0
429
- if (this.trackPosition) {
430
- this.column++;
563
+ // skip: continue to next char
431
564
  }
432
- this.fail("invalid XML character: 0x" + code.toString(16));
433
- return code;
434
565
  }
435
566
  unget() {
436
567
  this.i = this.prevI;
@@ -637,16 +768,16 @@ class SaxParser {
637
768
  }
638
769
  else {
639
770
  this.i++;
640
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
771
+ start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
641
772
  }
642
773
  }
643
774
  else if (code >= 0xdc00 && code <= 0xdfff) {
644
775
  this.i++;
645
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
776
+ start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
646
777
  }
647
778
  else if (code === 0xfffe || code === 0xffff) {
648
779
  this.i++;
649
- this.fail("invalid XML character: 0x" + code.toString(16));
780
+ start = this.handleInvalidCharInText(code, handler, start);
650
781
  }
651
782
  else {
652
783
  this.i++;
@@ -662,7 +793,7 @@ class SaxParser {
662
793
  if (this.trackPosition) {
663
794
  this.column++;
664
795
  }
665
- this.fail("invalid XML character: 0x" + code.toString(16));
796
+ start = this.handleInvalidCharInText(code, handler, start);
666
797
  }
667
798
  // End of chunk
668
799
  if (handler && start < this.i) {
@@ -674,14 +805,42 @@ class SaxParser {
674
805
  let { i: start } = this;
675
806
  const handler = this._handlers.text;
676
807
  let nonSpace = false;
808
+ const isSkip = this.invalidCharHandling === "skip";
809
+ const isReplace = this.invalidCharHandling === "replace";
677
810
  while (true) {
811
+ const iBeforeGet = this.i;
678
812
  const c = this.getCode();
679
813
  if (c === -1) {
680
- if (handler && start < this.i) {
681
- this.text += chunk.slice(start, this.i);
814
+ if (handler && start < iBeforeGet) {
815
+ this.text += chunk.slice(start, iBeforeGet);
682
816
  }
683
817
  break;
684
818
  }
819
+ // In skip mode, getCode() may have internally looped past invalid chars.
820
+ // Flush valid text before the gap and advance start past it.
821
+ if (isSkip && this.prevI > iBeforeGet) {
822
+ if (handler && start < iBeforeGet) {
823
+ this.text += chunk.slice(start, iBeforeGet);
824
+ }
825
+ start = this.prevI;
826
+ }
827
+ // In replace mode, getCode() returns REPLACEMENT_CHAR for invalid chars
828
+ // but the original byte is still in the chunk. Detect this by checking
829
+ // whether getCode() returned REPLACEMENT_CHAR while the raw chunk byte
830
+ // at prevI is NOT U+FFFD (i.e., it was substituted by handleInvalidChar).
831
+ if (isReplace &&
832
+ c === REPLACEMENT_CHAR &&
833
+ chunk.charCodeAt(this.prevI) !== REPLACEMENT_CHAR) {
834
+ if (handler) {
835
+ if (start < this.prevI) {
836
+ this.text += chunk.slice(start, this.prevI);
837
+ }
838
+ this.text += REPLACEMENT_STR;
839
+ }
840
+ start = this.i;
841
+ nonSpace = true;
842
+ continue;
843
+ }
685
844
  if (c === LESS) {
686
845
  if (handler) {
687
846
  const slice = chunk.slice(start, this.prevI);
@@ -1058,13 +1217,43 @@ class SaxParser {
1058
1217
  start = this.i;
1059
1218
  continue;
1060
1219
  }
1061
- // All other chars — fall back to getCode() for validation
1062
- const c = this.getCode();
1063
- if (c === -1) {
1064
- this.text += chunk.slice(start, this.i);
1065
- return;
1220
+ // Non-ASCII (>= 0x80) — mostly valid, handle inline like handleTextInRoot
1221
+ if (code >= 0x80) {
1222
+ this.prevI = this.i;
1223
+ if (code >= 0xd800 && code <= 0xdbff) {
1224
+ const next = chunk.charCodeAt(this.i + 1);
1225
+ if (next >= 0xdc00 && next <= 0xdfff) {
1226
+ this.i += 2; // valid surrogate pair
1227
+ }
1228
+ else {
1229
+ this.i++;
1230
+ start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
1231
+ }
1232
+ }
1233
+ else if (code >= 0xdc00 && code <= 0xdfff) {
1234
+ this.i++;
1235
+ start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
1236
+ }
1237
+ else if (code === 0xfffe || code === 0xffff) {
1238
+ this.i++;
1239
+ start = this.handleInvalidCharInAttr(code, start);
1240
+ }
1241
+ else {
1242
+ this.i++; // valid non-ASCII BMP char
1243
+ }
1244
+ if (this.trackPosition) {
1245
+ this.column++;
1246
+ }
1247
+ continue;
1248
+ }
1249
+ // Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
1250
+ // All invalid in XML 1.0
1251
+ this.prevI = this.i;
1252
+ this.i++;
1253
+ if (this.trackPosition) {
1254
+ this.column++;
1066
1255
  }
1067
- // Just continue — char is already consumed by getCode()
1256
+ start = this.handleInvalidCharInAttr(code, start);
1068
1257
  }
1069
1258
  // End of chunk
1070
1259
  this.text += chunk.slice(start, this.i);
@@ -42,7 +42,8 @@ function parseXmlToObject(xml, options) {
42
42
  position: false,
43
43
  fragment: options?.fragment ?? false,
44
44
  maxDepth: options?.maxDepth,
45
- maxEntityExpansions: options?.maxEntityExpansions
45
+ maxEntityExpansions: options?.maxEntityExpansions,
46
+ invalidCharHandling: options?.invalidCharHandling
46
47
  });
47
48
  // Stack: bottom is a synthetic root frame that collects the document root.
48
49
  const syntheticObj = Object.create(null);
@@ -169,6 +169,15 @@ export interface SaxHandlers {
169
169
  pi?: (target: string, body: string) => void;
170
170
  error?: (err: Error) => void;
171
171
  }
172
+ /**
173
+ * Strategy for handling invalid XML characters (control chars, lone surrogates,
174
+ * non-characters like U+FFFE/U+FFFF).
175
+ *
176
+ * - `"error"` — Report via error handler or throw (XML 1.0 strict). **Default.**
177
+ * - `"skip"` — Silently remove the invalid character from the output.
178
+ * - `"replace"` — Replace the invalid character with U+FFFD (REPLACEMENT CHARACTER).
179
+ */
180
+ export type InvalidCharHandling = "error" | "skip" | "replace";
172
181
  /** SAX parser options. */
173
182
  export interface SaxOptions {
174
183
  /** Track position (line/column) for error messages. Default: true */
@@ -187,6 +196,17 @@ export interface SaxOptions {
187
196
  * Default: 10000. Set 0 to disable.
188
197
  */
189
198
  maxEntityExpansions?: number;
199
+ /**
200
+ * How to handle invalid XML characters (ASCII control chars, lone surrogates,
201
+ * non-characters U+FFFE/U+FFFF, DEL U+007F, etc.).
202
+ *
203
+ * - `"error"` — Report via error handler or throw. **(Default)**
204
+ * - `"skip"` — Silently discard the character.
205
+ * - `"replace"` — Replace with U+FFFD (REPLACEMENT CHARACTER).
206
+ *
207
+ * @default "error"
208
+ */
209
+ invalidCharHandling?: InvalidCharHandling;
190
210
  }
191
211
  /**
192
212
  * Minimal writable interface for XmlStreamWriter.
@@ -265,6 +285,8 @@ export interface ParseXmlToObjectOptions extends ToPlainObjectOptions {
265
285
  maxDepth?: number;
266
286
  /** Maximum total entity expansions. Default: 10000. */
267
287
  maxEntityExpansions?: number;
288
+ /** How to handle invalid XML characters. Default: "error". */
289
+ invalidCharHandling?: InvalidCharHandling;
268
290
  }
269
291
  /** Options for `parseXml()`. */
270
292
  export interface XmlParseOptions {
@@ -282,5 +304,7 @@ export interface XmlParseOptions {
282
304
  maxDepth?: number;
283
305
  /** Maximum total entity expansions. Default: 10000. */
284
306
  maxEntityExpansions?: number;
307
+ /** How to handle invalid XML characters. Default: "error". */
308
+ invalidCharHandling?: InvalidCharHandling;
285
309
  }
286
310
  export {};
@@ -46,7 +46,7 @@ class HyperlinkReader extends event_emitter_1.EventEmitter {
46
46
  return;
47
47
  }
48
48
  try {
49
- const parser = new sax_1.SaxParser({ position: false });
49
+ const parser = new sax_1.SaxParser({ position: false, invalidCharHandling: "skip" });
50
50
  const decoder = new TextDecoder("utf-8", { fatal: true });
51
51
  parser.on("opentag", (node) => {
52
52
  if (node.name !== "Relationship") {
@@ -214,7 +214,7 @@ class WorkbookReaderBase extends event_emitter_1.EventEmitter {
214
214
  // For "cache" mode, use direct SAX callbacks (no event objects, no async generator overhead)
215
215
  if (this.options.sharedStrings === "cache") {
216
216
  const sharedStrings = this.sharedStrings;
217
- const parser = new sax_1.SaxParser({ position: false });
217
+ const parser = new sax_1.SaxParser({ position: false, invalidCharHandling: "skip" });
218
218
  parser.on("opentag", (node) => {
219
219
  switch (node.name) {
220
220
  case "b":
@@ -314,7 +314,7 @@ class WorkbookReaderBase extends event_emitter_1.EventEmitter {
314
314
  return;
315
315
  }
316
316
  // "emit" mode — must yield, so use direct SAX with per-chunk yield
317
- const emitParser = new sax_1.SaxParser();
317
+ const emitParser = new sax_1.SaxParser({ invalidCharHandling: "skip" });
318
318
  const emitDecoder = new TextDecoder("utf-8", { fatal: true });
319
319
  let pendingEmits = [];
320
320
  emitParser.on("opentag", (node) => {
@@ -146,7 +146,7 @@ class WorksheetReader extends event_emitter_1.EventEmitter {
146
146
  // Direct SAX callback mode — zero intermediate event objects.
147
147
  // We collect worksheet events per-chunk and yield them.
148
148
  let worksheetEvents = null;
149
- const parser = new sax_1.SaxParser({ position: false });
149
+ const parser = new sax_1.SaxParser({ position: false, invalidCharHandling: "skip" });
150
150
  parser.on("opentag", (node) => {
151
151
  if (emitSheet) {
152
152
  switch (node.name) {
@@ -162,7 +162,7 @@ class BaseXform {
162
162
  * Use this instead of parse(parseSax(stream)) for hot paths.
163
163
  */
164
164
  async parseStreamDirect(stream) {
165
- const parser = new sax_1.SaxParser();
165
+ const parser = new sax_1.SaxParser({ invalidCharHandling: "skip" });
166
166
  const decoder = new TextDecoder("utf-8", { fatal: true });
167
167
  let done = false;
168
168
  let finalModel;