@cj-tech-master/excelts 7.4.0 → 7.5.0-canary.20260404054153.f4c5ecc

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/dist/browser/modules/excel/stream/hyperlink-reader.js +1 -1
  2. package/dist/browser/modules/excel/stream/workbook-reader.browser.js +2 -2
  3. package/dist/browser/modules/excel/stream/worksheet-reader.js +1 -1
  4. package/dist/browser/modules/excel/xlsx/xform/base-xform.js +1 -1
  5. package/dist/browser/modules/xml/dom.d.ts +1 -1
  6. package/dist/browser/modules/xml/dom.js +8 -5
  7. package/dist/browser/modules/xml/index.d.ts +1 -1
  8. package/dist/browser/modules/xml/sax.d.ts +41 -0
  9. package/dist/browser/modules/xml/sax.js +265 -76
  10. package/dist/browser/modules/xml/to-object-shared.d.ts +1 -0
  11. package/dist/browser/modules/xml/to-object-shared.js +2 -1
  12. package/dist/browser/modules/xml/to-object.js +7 -4
  13. package/dist/browser/modules/xml/types.d.ts +33 -1
  14. package/dist/cjs/modules/excel/stream/hyperlink-reader.js +1 -1
  15. package/dist/cjs/modules/excel/stream/workbook-reader.browser.js +2 -2
  16. package/dist/cjs/modules/excel/stream/worksheet-reader.js +1 -1
  17. package/dist/cjs/modules/excel/xlsx/xform/base-xform.js +1 -1
  18. package/dist/cjs/modules/xml/dom.js +8 -5
  19. package/dist/cjs/modules/xml/sax.js +265 -76
  20. package/dist/cjs/modules/xml/to-object-shared.js +2 -1
  21. package/dist/cjs/modules/xml/to-object.js +7 -4
  22. package/dist/esm/modules/excel/stream/hyperlink-reader.js +1 -1
  23. package/dist/esm/modules/excel/stream/workbook-reader.browser.js +2 -2
  24. package/dist/esm/modules/excel/stream/worksheet-reader.js +1 -1
  25. package/dist/esm/modules/excel/xlsx/xform/base-xform.js +1 -1
  26. package/dist/esm/modules/xml/dom.js +8 -5
  27. package/dist/esm/modules/xml/sax.js +265 -76
  28. package/dist/esm/modules/xml/to-object-shared.js +2 -1
  29. package/dist/esm/modules/xml/to-object.js +7 -4
  30. package/dist/iife/excelts.iife.js +196 -54
  31. package/dist/iife/excelts.iife.js.map +1 -1
  32. package/dist/iife/excelts.iife.min.js +44 -44
  33. package/dist/types/modules/xml/dom.d.ts +1 -1
  34. package/dist/types/modules/xml/index.d.ts +1 -1
  35. package/dist/types/modules/xml/sax.d.ts +41 -0
  36. package/dist/types/modules/xml/to-object-shared.d.ts +1 -0
  37. package/dist/types/modules/xml/types.d.ts +33 -1
  38. package/package.json +1 -1
@@ -43,7 +43,7 @@ class HyperlinkReader extends EventEmitter {
43
43
  return;
44
44
  }
45
45
  try {
46
- const parser = new SaxParser({ position: false });
46
+ const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
47
47
  const decoder = new TextDecoder("utf-8", { fatal: true });
48
48
  parser.on("opentag", (node) => {
49
49
  if (node.name !== "Relationship") {
@@ -211,7 +211,7 @@ export class WorkbookReaderBase extends EventEmitter {
211
211
  // For "cache" mode, use direct SAX callbacks (no event objects, no async generator overhead)
212
212
  if (this.options.sharedStrings === "cache") {
213
213
  const sharedStrings = this.sharedStrings;
214
- const parser = new SaxParser({ position: false });
214
+ const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
215
215
  parser.on("opentag", (node) => {
216
216
  switch (node.name) {
217
217
  case "b":
@@ -311,7 +311,7 @@ export class WorkbookReaderBase extends EventEmitter {
311
311
  return;
312
312
  }
313
313
  // "emit" mode — must yield, so use direct SAX with per-chunk yield
314
- const emitParser = new SaxParser();
314
+ const emitParser = new SaxParser({ invalidCharHandling: "skip" });
315
315
  const emitDecoder = new TextDecoder("utf-8", { fatal: true });
316
316
  let pendingEmits = [];
317
317
  emitParser.on("opentag", (node) => {
@@ -143,7 +143,7 @@ class WorksheetReader extends EventEmitter {
143
143
  // Direct SAX callback mode — zero intermediate event objects.
144
144
  // We collect worksheet events per-chunk and yield them.
145
145
  let worksheetEvents = null;
146
- const parser = new SaxParser({ position: false });
146
+ const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
147
147
  parser.on("opentag", (node) => {
148
148
  if (emitSheet) {
149
149
  switch (node.name) {
@@ -159,7 +159,7 @@ class BaseXform {
159
159
  * Use this instead of parse(parseSax(stream)) for hot paths.
160
160
  */
161
161
  async parseStreamDirect(stream) {
162
- const parser = new SaxParser();
162
+ const parser = new SaxParser({ invalidCharHandling: "skip" });
163
163
  const decoder = new TextDecoder("utf-8", { fatal: true });
164
164
  let done = false;
165
165
  let finalModel;
@@ -52,7 +52,7 @@ declare function walk(element: XmlElement, visitor: (el: XmlElement) => void): v
52
52
  /**
53
53
  * Convert an {@link XmlElement} DOM tree to a plain JavaScript object.
54
54
  *
55
- * Produces output similar to fast-xml-parser: element names become object keys,
55
+ * Produces a plain JavaScript object where element names become object keys,
56
56
  * attributes are prefixed (default `@_`), text-only elements collapse to their
57
57
  * string value, and repeated sibling names merge into arrays.
58
58
  *
@@ -87,7 +87,8 @@ function parseXml(xml, options) {
87
87
  fragment: options?.fragment ?? false,
88
88
  xmlns: options?.xmlns ?? false,
89
89
  maxDepth: options?.maxDepth,
90
- maxEntityExpansions: options?.maxEntityExpansions
90
+ maxEntityExpansions: options?.maxEntityExpansions,
91
+ invalidCharHandling: options?.invalidCharHandling
91
92
  });
92
93
  // Stack of elements being built. The bottom is a synthetic root
93
94
  // that collects top-level nodes.
@@ -256,7 +257,7 @@ function walk(element, visitor) {
256
257
  /**
257
258
  * Convert an {@link XmlElement} DOM tree to a plain JavaScript object.
258
259
  *
259
- * Produces output similar to fast-xml-parser: element names become object keys,
260
+ * Produces a plain JavaScript object where element names become object keys,
260
261
  * attributes are prefixed (default `@_`), text-only elements collapse to their
261
262
  * string value, and repeated sibling names merge into arrays.
262
263
  *
@@ -285,9 +286,11 @@ function toPlainObject(element, options) {
285
286
  // Add attributes — el.attributes is created via Object.create(null)
286
287
  // by safeAttributes(), so no prototype keys to guard against.
287
288
  let hasAttributes = false;
288
- for (const key in el.attributes) {
289
- obj[opts.attrPrefix + key] = el.attributes[key];
290
- hasAttributes = true;
289
+ if (!opts.ignoreAttributes) {
290
+ for (const key in el.attributes) {
291
+ obj[opts.attrPrefix + key] = el.attributes[key];
292
+ hasAttributes = true;
293
+ }
291
294
  }
292
295
  // Collect text and child elements in a single pass.
293
296
  let text = "";
@@ -10,7 +10,7 @@
10
10
  * - Dual-mode: streaming (SAX parser + stream writer) and buffered (DOM parser + writer)
11
11
  * - Shared XmlSink interface lets rendering code target both modes transparently
12
12
  */
13
- export type { XmlAttributes, XmlNodeType, XmlElement, XmlText, XmlCData, XmlComment, XmlProcessingInstruction, XmlNode, XmlDocument, XmlSink, SaxTag, SaxEvent, SaxEventAny, SaxHandlers, SaxOptions, WritableTarget, XmlParseOptions, ToPlainObjectOptions, ParseXmlToObjectOptions } from "./types.js";
13
+ export type { XmlAttributes, XmlNodeType, XmlElement, XmlText, XmlCData, XmlComment, XmlProcessingInstruction, XmlNode, XmlDocument, XmlSink, SaxTag, SaxEvent, SaxEventAny, SaxHandlers, SaxOptions, InvalidCharHandling, WritableTarget, XmlParseOptions, ToPlainObjectOptions, ParseXmlToObjectOptions } from "./types.js";
14
14
  export { xmlEncode, xmlDecode, xmlEncodeAttr, validateXmlName, encodeCData, validateCommentText } from "./encode.js";
15
15
  export { XmlWriter, StdDocAttributes } from "./writer.js";
16
16
  export { XmlStreamWriter } from "./stream-writer.js";
@@ -37,6 +37,7 @@ declare class SaxParser {
37
37
  private xmlns;
38
38
  private maxDepth;
39
39
  private maxEntityExpansions;
40
+ private invalidCharHandling;
40
41
  private _entityExpansionCount;
41
42
  private _nsStack;
42
43
  private state;
@@ -81,6 +82,46 @@ declare class SaxParser {
81
82
  fail(message: string): this;
82
83
  write(chunk: string | null): this;
83
84
  close(): this;
85
+ /**
86
+ * Handle an invalid XML character according to the configured strategy.
87
+ *
88
+ * Called from `getCode()` for each invalid character it encounters. The text and
89
+ * attribute-value fast paths use `handleInvalidCharInText()` / `handleInvalidCharInAttr()` instead.
90
+ *
91
+ * - `"error"`: call `fail()` and return the original code.
92
+ * - `"skip"`: return `-2` as a sentinel so that `getCode()` loops to the next character.
93
+ * - `"replace"`: return `REPLACEMENT_CHAR`.
94
+ *
95
+ * Note: `getCode()` retries in a loop on the `-2` sentinel instead of recursing.
96
+ *
97
+ * @param code - The invalid character code point.
98
+ * @param kind - Optional description (e.g. "lone surrogate") for error messages.
99
+ * @returns The code point to use.
100
+ */
101
+ private handleInvalidChar;
102
+ /**
103
+ * Handle an invalid character inside the `handleTextInRoot()` fast loop.
104
+ *
105
+ * Unlike `handleInvalidChar()` (which returns a code point for `getCode()`),
106
+ * this method manages the text accumulation state (`this.text`, `start`) that
107
+ * the fast text loop relies on.
108
+ *
109
+ * - `"error"`: call `fail()`, leave text accumulation unchanged (char stays in output).
110
+ * - `"skip"`: flush text up to the invalid char, skip it, return new `start`.
111
+ * - `"replace"`: flush text up to the invalid char, append U+FFFD, return new `start`.
112
+ *
113
+ * @returns The updated `start` index for the text accumulation loop.
114
+ */
115
+ private handleInvalidCharInText;
116
+ /**
117
+ * Handle an invalid character inside `sAttribValueQuoted()`.
118
+ *
119
+ * Same pattern as `handleInvalidCharInText()` but for attribute value
120
+ * accumulation (always uses `this.text`, no conditional handler check).
121
+ *
122
+ * @returns The updated `start` index.
123
+ */
124
+ private handleInvalidCharInAttr;
84
125
  private getCode;
85
126
  private unget;
86
127
  private processState;
@@ -35,6 +35,8 @@ const GREATER = 0x3e; // >
35
35
  const QUESTION = 0x3f; // ?
36
36
  const OPEN_BRACKET = 0x5b; // [
37
37
  const CLOSE_BRACKET = 0x5d; // ]
38
+ const REPLACEMENT_CHAR = 0xfffd; // U+FFFD REPLACEMENT CHARACTER
39
+ const REPLACEMENT_STR = "\uFFFD"; // Pre-allocated string form of U+FFFD
38
40
  // =============================================================================
39
41
  // Pre-computed Lookup Tables
40
42
  // =============================================================================
@@ -233,6 +235,7 @@ class SaxParser {
233
235
  this.maxDepth = options?.maxDepth !== undefined ? options.maxDepth : 256;
234
236
  this.maxEntityExpansions =
235
237
  options?.maxEntityExpansions !== undefined ? options.maxEntityExpansions : 10000;
238
+ this.invalidCharHandling = options?.invalidCharHandling ?? "error";
236
239
  this._init();
237
240
  }
238
241
  get closed() {
@@ -350,87 +353,215 @@ class SaxParser {
350
353
  return this.write(null);
351
354
  }
352
355
  // ===========================================================================
353
- // Character Reading
356
+ // Invalid Character Handling
354
357
  // ===========================================================================
355
- getCode() {
356
- const { chunk, i } = this;
357
- this.prevI = i;
358
- this.i = i + 1;
359
- if (i >= chunk.length) {
360
- return -1;
361
- }
362
- const code = chunk.charCodeAt(i);
363
- // Ultra-fast path: printable ASCII (0x20-0x7E) — the vast majority of XML content.
364
- // No validation needed; these are always valid XML 1.0 characters.
365
- if (code >= 0x20 && code <= 0x7e) {
366
- if (this.trackPosition) {
367
- this.column++;
358
+ /**
359
+ * Handle an invalid XML character according to the configured strategy.
360
+ *
361
+ * Called from `getCode()` for each invalid character it encounters. The text and
362
+ * attribute-value fast paths use `handleInvalidCharInText()` / `handleInvalidCharInAttr()` instead.
363
+ *
364
+ * - `"error"`: call `fail()` and return the original code.
365
+ * - `"skip"`: return `-2` as a sentinel so that `getCode()` loops to the next character.
366
+ * - `"replace"`: return `REPLACEMENT_CHAR`.
367
+ *
368
+ * Note: `getCode()` retries in a loop on the `-2` sentinel instead of recursing.
369
+ *
370
+ * @param code - The invalid character code point.
371
+ * @param kind - Optional description (e.g. "lone surrogate") for error messages.
372
+ * @returns The code point to use.
373
+ */
374
+ handleInvalidChar(code, kind) {
375
+ switch (this.invalidCharHandling) {
376
+ case "replace":
377
+ return REPLACEMENT_CHAR;
378
+ case "skip":
379
+ // Caller is responsible for the actual skip logic.
380
+ // We return -2 as a sentinel to tell getCode()'s loop to continue.
381
+ return -2;
382
+ default: {
383
+ // "error" — existing strict behavior
384
+ const label = kind
385
+ ? `invalid XML character: ${kind} 0x${code.toString(16)}`
386
+ : `invalid XML character: 0x${code.toString(16)}`;
387
+ this.fail(label);
388
+ return code;
368
389
  }
369
- return code;
370
390
  }
371
- // Secondary fast path: TAB (0x09) — common in attribute values
372
- if (code === TAB) {
373
- if (this.trackPosition) {
374
- this.column++;
391
+ }
392
+ /**
393
+ * Handle an invalid character inside the `handleTextInRoot()` fast loop.
394
+ *
395
+ * Unlike `handleInvalidChar()` (which returns a code point for `getCode()`),
396
+ * this method manages the text accumulation state (`this.text`, `start`) that
397
+ * the fast text loop relies on.
398
+ *
399
+ * - `"error"`: call `fail()`, leave text accumulation unchanged (char stays in output).
400
+ * - `"skip"`: flush text up to the invalid char, skip it, return new `start`.
401
+ * - `"replace"`: flush text up to the invalid char, append U+FFFD, return new `start`.
402
+ *
403
+ * @returns The updated `start` index for the text accumulation loop.
404
+ */
405
+ handleInvalidCharInText(code, handler, start, kind) {
406
+ switch (this.invalidCharHandling) {
407
+ case "skip":
408
+ // Flush text accumulated before this invalid char, then skip it
409
+ if (handler && start < this.prevI) {
410
+ this.text += this.chunk.slice(start, this.prevI);
411
+ }
412
+ return this.i;
413
+ case "replace":
414
+ // Flush text accumulated before this invalid char, append replacement
415
+ if (handler) {
416
+ if (start < this.prevI) {
417
+ this.text += this.chunk.slice(start, this.prevI);
418
+ }
419
+ this.text += REPLACEMENT_STR;
420
+ }
421
+ return this.i;
422
+ default: {
423
+ // "error" — existing strict behavior, char stays in output
424
+ const label = kind
425
+ ? `invalid XML character: ${kind} 0x${code.toString(16)}`
426
+ : `invalid XML character: 0x${code.toString(16)}`;
427
+ this.fail(label);
428
+ return start;
375
429
  }
376
- return code;
377
430
  }
378
- // Handle CR (normalize CR and CR+LF to LF per XML 1.0 §2.11)
379
- if (code === CR) {
380
- if (chunk.charCodeAt(i + 1) === NL) {
381
- this.i = i + 2;
382
- }
383
- if (this.trackPosition) {
384
- this.line++;
385
- this.column = 0;
386
- this.positionAtNewLine = this.position;
431
+ }
432
+ /**
433
+ * Handle an invalid character inside `sAttribValueQuoted()`.
434
+ *
435
+ * Same pattern as `handleInvalidCharInText()` but for attribute value
436
+ * accumulation (always uses `this.text`, no conditional handler check).
437
+ *
438
+ * @returns The updated `start` index.
439
+ */
440
+ handleInvalidCharInAttr(code, start, kind) {
441
+ switch (this.invalidCharHandling) {
442
+ case "skip":
443
+ if (start < this.prevI) {
444
+ this.text += this.chunk.slice(start, this.prevI);
445
+ }
446
+ return this.i;
447
+ case "replace":
448
+ if (start < this.prevI) {
449
+ this.text += this.chunk.slice(start, this.prevI);
450
+ }
451
+ this.text += REPLACEMENT_STR;
452
+ return this.i;
453
+ default: {
454
+ const label = kind
455
+ ? `invalid XML character: ${kind} 0x${code.toString(16)}`
456
+ : `invalid XML character: 0x${code.toString(16)}`;
457
+ this.fail(label);
458
+ return start;
387
459
  }
388
- return NL;
389
460
  }
390
- // Handle LF
391
- if (code === NL) {
392
- if (this.trackPosition) {
393
- this.line++;
394
- this.column = 0;
395
- this.positionAtNewLine = this.position;
461
+ }
462
+ // ===========================================================================
463
+ // Character Reading
464
+ // ===========================================================================
465
+ getCode() {
466
+ // Loop to handle skip mode: when an invalid char returns -2, we retry
467
+ // with the next character instead of recursing (avoids stack overflow
468
+ // on long runs of consecutive invalid characters).
469
+ for (;;) {
470
+ const { chunk } = this;
471
+ const i = this.i;
472
+ this.prevI = i;
473
+ this.i = i + 1;
474
+ if (i >= chunk.length) {
475
+ return -1;
476
+ }
477
+ const code = chunk.charCodeAt(i);
478
+ // Ultra-fast path: printable ASCII (0x20-0x7E) — the vast majority of XML content.
479
+ // No validation needed; these are always valid XML 1.0 characters.
480
+ if (code >= 0x20 && code <= 0x7e) {
481
+ if (this.trackPosition) {
482
+ this.column++;
483
+ }
484
+ return code;
396
485
  }
397
- return NL;
398
- }
399
- // Handle surrogates
400
- if (code >= 0xd800 && code <= 0xdbff) {
401
- const next = chunk.charCodeAt(i + 1);
402
- if (next >= 0xdc00 && next <= 0xdfff) {
403
- this.i = i + 2;
486
+ // Secondary fast path: TAB (0x09) — common in attribute values
487
+ if (code === TAB) {
404
488
  if (this.trackPosition) {
405
489
  this.column++;
406
490
  }
407
- return 0x10000 + ((code - 0xd800) * 0x400 + (next - 0xdc00));
491
+ return code;
408
492
  }
409
- // Lone high surrogate — invalid XML character
410
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
411
- }
412
- // Lone low surrogate — invalid XML character
413
- if (code >= 0xdc00 && code <= 0xdfff) {
414
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
415
- }
416
- // Non-ASCII above surrogate range (0x80-0xD7FF, 0xE000-0xFFFD) — all valid XML
417
- if (code >= 0x80) {
493
+ // Handle CR (normalize CR and CR+LF to LF per XML 1.0 §2.11)
494
+ if (code === CR) {
495
+ if (chunk.charCodeAt(i + 1) === NL) {
496
+ this.i = i + 2;
497
+ }
498
+ if (this.trackPosition) {
499
+ this.line++;
500
+ this.column = 0;
501
+ this.positionAtNewLine = this.position;
502
+ }
503
+ return NL;
504
+ }
505
+ // Handle LF
506
+ if (code === NL) {
507
+ if (this.trackPosition) {
508
+ this.line++;
509
+ this.column = 0;
510
+ this.positionAtNewLine = this.position;
511
+ }
512
+ return NL;
513
+ }
514
+ // Handle surrogates
515
+ if (code >= 0xd800 && code <= 0xdbff) {
516
+ const next = chunk.charCodeAt(i + 1);
517
+ if (next >= 0xdc00 && next <= 0xdfff) {
518
+ this.i = i + 2;
519
+ if (this.trackPosition) {
520
+ this.column++;
521
+ }
522
+ return 0x10000 + ((code - 0xd800) * 0x400 + (next - 0xdc00));
523
+ }
524
+ // Lone high surrogate — invalid XML character
525
+ const result = this.handleInvalidChar(code, "lone surrogate");
526
+ if (result !== -2) {
527
+ return result;
528
+ }
529
+ continue; // skip: loop to next char
530
+ }
531
+ // Lone low surrogate — invalid XML character
532
+ if (code >= 0xdc00 && code <= 0xdfff) {
533
+ const result = this.handleInvalidChar(code, "lone surrogate");
534
+ if (result !== -2) {
535
+ return result;
536
+ }
537
+ continue;
538
+ }
539
+ // Non-ASCII above surrogate range (0x80-0xD7FF, 0xE000-0xFFFD) — all valid XML
540
+ if (code >= 0x80) {
541
+ if (this.trackPosition) {
542
+ this.column++;
543
+ }
544
+ // Reject 0xFFFE and 0xFFFF
545
+ if (code === 0xfffe || code === 0xffff) {
546
+ const result = this.handleInvalidChar(code);
547
+ if (result !== -2) {
548
+ return result;
549
+ }
550
+ continue;
551
+ }
552
+ return code;
553
+ }
554
+ // Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
555
+ // All invalid in XML 1.0
418
556
  if (this.trackPosition) {
419
557
  this.column++;
420
558
  }
421
- // Reject 0xFFFE and 0xFFFF
422
- if (code === 0xfffe || code === 0xffff) {
423
- this.fail("invalid XML character: 0x" + code.toString(16));
559
+ const result = this.handleInvalidChar(code);
560
+ if (result !== -2) {
561
+ return result;
424
562
  }
425
- return code;
426
- }
427
- // Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
428
- // All invalid in XML 1.0
429
- if (this.trackPosition) {
430
- this.column++;
563
+ // skip: continue to next char
431
564
  }
432
- this.fail("invalid XML character: 0x" + code.toString(16));
433
- return code;
434
565
  }
435
566
  unget() {
436
567
  this.i = this.prevI;
@@ -637,16 +768,16 @@ class SaxParser {
637
768
  }
638
769
  else {
639
770
  this.i++;
640
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
771
+ start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
641
772
  }
642
773
  }
643
774
  else if (code >= 0xdc00 && code <= 0xdfff) {
644
775
  this.i++;
645
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
776
+ start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
646
777
  }
647
778
  else if (code === 0xfffe || code === 0xffff) {
648
779
  this.i++;
649
- this.fail("invalid XML character: 0x" + code.toString(16));
780
+ start = this.handleInvalidCharInText(code, handler, start);
650
781
  }
651
782
  else {
652
783
  this.i++;
@@ -662,7 +793,7 @@ class SaxParser {
662
793
  if (this.trackPosition) {
663
794
  this.column++;
664
795
  }
665
- this.fail("invalid XML character: 0x" + code.toString(16));
796
+ start = this.handleInvalidCharInText(code, handler, start);
666
797
  }
667
798
  // End of chunk
668
799
  if (handler && start < this.i) {
@@ -674,14 +805,42 @@ class SaxParser {
674
805
  let { i: start } = this;
675
806
  const handler = this._handlers.text;
676
807
  let nonSpace = false;
808
+ const isSkip = this.invalidCharHandling === "skip";
809
+ const isReplace = this.invalidCharHandling === "replace";
677
810
  while (true) {
811
+ const iBeforeGet = this.i;
678
812
  const c = this.getCode();
679
813
  if (c === -1) {
680
- if (handler && start < this.i) {
681
- this.text += chunk.slice(start, this.i);
814
+ if (handler && start < iBeforeGet) {
815
+ this.text += chunk.slice(start, iBeforeGet);
682
816
  }
683
817
  break;
684
818
  }
819
+ // In skip mode, getCode() may have internally looped past invalid chars.
820
+ // Flush valid text before the gap and advance start past it.
821
+ if (isSkip && this.prevI > iBeforeGet) {
822
+ if (handler && start < iBeforeGet) {
823
+ this.text += chunk.slice(start, iBeforeGet);
824
+ }
825
+ start = this.prevI;
826
+ }
827
+ // In replace mode, getCode() returns REPLACEMENT_CHAR for invalid chars
828
+ // but the original byte is still in the chunk. Detect this by checking
829
+ // whether getCode() returned REPLACEMENT_CHAR while the raw chunk byte
830
+ // at prevI is NOT U+FFFD (i.e., it was substituted by handleInvalidChar).
831
+ if (isReplace &&
832
+ c === REPLACEMENT_CHAR &&
833
+ chunk.charCodeAt(this.prevI) !== REPLACEMENT_CHAR) {
834
+ if (handler) {
835
+ if (start < this.prevI) {
836
+ this.text += chunk.slice(start, this.prevI);
837
+ }
838
+ this.text += REPLACEMENT_STR;
839
+ }
840
+ start = this.i;
841
+ nonSpace = true;
842
+ continue;
843
+ }
685
844
  if (c === LESS) {
686
845
  if (handler) {
687
846
  const slice = chunk.slice(start, this.prevI);
@@ -1058,13 +1217,43 @@ class SaxParser {
1058
1217
  start = this.i;
1059
1218
  continue;
1060
1219
  }
1061
- // All other charsfall back to getCode() for validation
1062
- const c = this.getCode();
1063
- if (c === -1) {
1064
- this.text += chunk.slice(start, this.i);
1065
- return;
1220
+ // Non-ASCII (>= 0x80)mostly valid, handle inline like handleTextInRoot
1221
+ if (code >= 0x80) {
1222
+ this.prevI = this.i;
1223
+ if (code >= 0xd800 && code <= 0xdbff) {
1224
+ const next = chunk.charCodeAt(this.i + 1);
1225
+ if (next >= 0xdc00 && next <= 0xdfff) {
1226
+ this.i += 2; // valid surrogate pair
1227
+ }
1228
+ else {
1229
+ this.i++;
1230
+ start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
1231
+ }
1232
+ }
1233
+ else if (code >= 0xdc00 && code <= 0xdfff) {
1234
+ this.i++;
1235
+ start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
1236
+ }
1237
+ else if (code === 0xfffe || code === 0xffff) {
1238
+ this.i++;
1239
+ start = this.handleInvalidCharInAttr(code, start);
1240
+ }
1241
+ else {
1242
+ this.i++; // valid non-ASCII BMP char
1243
+ }
1244
+ if (this.trackPosition) {
1245
+ this.column++;
1246
+ }
1247
+ continue;
1248
+ }
1249
+ // Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
1250
+ // All invalid in XML 1.0
1251
+ this.prevI = this.i;
1252
+ this.i++;
1253
+ if (this.trackPosition) {
1254
+ this.column++;
1066
1255
  }
1067
- // Just continue — char is already consumed by getCode()
1256
+ start = this.handleInvalidCharInAttr(code, start);
1068
1257
  }
1069
1258
  // End of chunk
1070
1259
  this.text += chunk.slice(start, this.i);
@@ -7,6 +7,7 @@
7
7
  import type { ToPlainObjectOptions } from "./types.js";
8
8
  /** Options with all defaults resolved — no more `??` checks at hot-path call sites. */
9
9
  export interface ResolvedOptions {
10
+ readonly ignoreAttributes: boolean;
10
11
  readonly attrPrefix: string;
11
12
  readonly textKey: string;
12
13
  readonly alwaysArray: boolean;
@@ -6,6 +6,7 @@
6
6
  */
7
7
  export function resolveOptions(options) {
8
8
  return {
9
+ ignoreAttributes: options?.ignoreAttributes ?? false,
9
10
  attrPrefix: options?.attributePrefix ?? "@_",
10
11
  textKey: options?.textKey ?? "#text",
11
12
  alwaysArray: options?.alwaysArray ?? false,
@@ -52,7 +53,7 @@ export function resolveValue(obj, text, hasAttributes, hasChildren, opts) {
52
53
  if (hasText) {
53
54
  obj[opts.textKey] = text;
54
55
  }
55
- // Empty element with no attributes → empty string (like fast-xml-parser)
56
+ // Empty element with no attributes → empty string
56
57
  if (!hasAttributes && !hasChildren && !hasText) {
57
58
  return "";
58
59
  }
@@ -42,7 +42,8 @@ function parseXmlToObject(xml, options) {
42
42
  position: false,
43
43
  fragment: options?.fragment ?? false,
44
44
  maxDepth: options?.maxDepth,
45
- maxEntityExpansions: options?.maxEntityExpansions
45
+ maxEntityExpansions: options?.maxEntityExpansions,
46
+ invalidCharHandling: options?.invalidCharHandling
46
47
  });
47
48
  // Stack: bottom is a synthetic root frame that collects the document root.
48
49
  const syntheticObj = Object.create(null);
@@ -64,9 +65,11 @@ function parseXmlToObject(xml, options) {
64
65
  name: tag.name
65
66
  };
66
67
  // Write attributes directly into frame.obj
67
- for (const key in tag.attributes) {
68
- frame.obj[opts.attrPrefix + key] = tag.attributes[key];
69
- frame.hasAttributes = true;
68
+ if (!opts.ignoreAttributes) {
69
+ for (const key in tag.attributes) {
70
+ frame.obj[opts.attrPrefix + key] = tag.attributes[key];
71
+ frame.hasAttributes = true;
72
+ }
70
73
  }
71
74
  // Mark parent as having children
72
75
  stack[stack.length - 1].hasChildren = true;