@cj-tech-master/excelts 7.5.0 → 7.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/dist/browser/modules/excel/stream/hyperlink-reader.js +1 -1
  2. package/dist/browser/modules/excel/stream/workbook-reader.browser.js +2 -2
  3. package/dist/browser/modules/excel/stream/worksheet-reader.js +1 -1
  4. package/dist/browser/modules/excel/xlsx/xform/base-xform.js +1 -1
  5. package/dist/browser/modules/xml/dom.js +2 -1
  6. package/dist/browser/modules/xml/index.d.ts +1 -1
  7. package/dist/browser/modules/xml/sax.d.ts +41 -0
  8. package/dist/browser/modules/xml/sax.js +265 -76
  9. package/dist/browser/modules/xml/to-object.js +2 -1
  10. package/dist/browser/modules/xml/types.d.ts +24 -0
  11. package/dist/cjs/modules/excel/stream/hyperlink-reader.js +1 -1
  12. package/dist/cjs/modules/excel/stream/workbook-reader.browser.js +2 -2
  13. package/dist/cjs/modules/excel/stream/worksheet-reader.js +1 -1
  14. package/dist/cjs/modules/excel/xlsx/xform/base-xform.js +1 -1
  15. package/dist/cjs/modules/xml/dom.js +2 -1
  16. package/dist/cjs/modules/xml/sax.js +265 -76
  17. package/dist/cjs/modules/xml/to-object.js +2 -1
  18. package/dist/esm/modules/excel/stream/hyperlink-reader.js +1 -1
  19. package/dist/esm/modules/excel/stream/workbook-reader.browser.js +2 -2
  20. package/dist/esm/modules/excel/stream/worksheet-reader.js +1 -1
  21. package/dist/esm/modules/excel/xlsx/xform/base-xform.js +1 -1
  22. package/dist/esm/modules/xml/dom.js +2 -1
  23. package/dist/esm/modules/xml/sax.js +265 -76
  24. package/dist/esm/modules/xml/to-object.js +2 -1
  25. package/dist/iife/excelts.iife.js +196 -54
  26. package/dist/iife/excelts.iife.js.map +1 -1
  27. package/dist/iife/excelts.iife.min.js +44 -44
  28. package/dist/types/modules/xml/index.d.ts +1 -1
  29. package/dist/types/modules/xml/sax.d.ts +41 -0
  30. package/dist/types/modules/xml/types.d.ts +24 -0
  31. package/package.json +1 -1
@@ -96,7 +96,8 @@ function parseXml(xml, options) {
96
96
  fragment: options?.fragment ?? false,
97
97
  xmlns: options?.xmlns ?? false,
98
98
  maxDepth: options?.maxDepth,
99
- maxEntityExpansions: options?.maxEntityExpansions
99
+ maxEntityExpansions: options?.maxEntityExpansions,
100
+ invalidCharHandling: options?.invalidCharHandling
100
101
  });
101
102
  // Stack of elements being built. The bottom is a synthetic root
102
103
  // that collects top-level nodes.
@@ -40,6 +40,8 @@ const GREATER = 0x3e; // >
40
40
  const QUESTION = 0x3f; // ?
41
41
  const OPEN_BRACKET = 0x5b; // [
42
42
  const CLOSE_BRACKET = 0x5d; // ]
43
+ const REPLACEMENT_CHAR = 0xfffd; // U+FFFD REPLACEMENT CHARACTER
44
+ const REPLACEMENT_STR = "\uFFFD"; // Pre-allocated string form of U+FFFD
43
45
  // =============================================================================
44
46
  // Pre-computed Lookup Tables
45
47
  // =============================================================================
@@ -238,6 +240,7 @@ class SaxParser {
238
240
  this.maxDepth = options?.maxDepth !== undefined ? options.maxDepth : 256;
239
241
  this.maxEntityExpansions =
240
242
  options?.maxEntityExpansions !== undefined ? options.maxEntityExpansions : 10000;
243
+ this.invalidCharHandling = options?.invalidCharHandling ?? "error";
241
244
  this._init();
242
245
  }
243
246
  get closed() {
@@ -355,87 +358,215 @@ class SaxParser {
355
358
  return this.write(null);
356
359
  }
357
360
  // ===========================================================================
358
- // Character Reading
361
+ // Invalid Character Handling
359
362
  // ===========================================================================
360
- getCode() {
361
- const { chunk, i } = this;
362
- this.prevI = i;
363
- this.i = i + 1;
364
- if (i >= chunk.length) {
365
- return -1;
366
- }
367
- const code = chunk.charCodeAt(i);
368
- // Ultra-fast path: printable ASCII (0x20-0x7E) — the vast majority of XML content.
369
- // No validation needed; these are always valid XML 1.0 characters.
370
- if (code >= 0x20 && code <= 0x7e) {
371
- if (this.trackPosition) {
372
- this.column++;
363
+ /**
364
+ * Handle an invalid XML character according to the configured strategy.
365
+ *
366
+ * Used by `getCode()`, whose loop moves on to the next character when this
367
+ * returns the skip sentinel instead of recursing.
368
+ *
369
+ * - `"error"`: call `fail()` and return the original code.
370
+ * - `"skip"`: return `-2` as a sentinel (caller handles skip).
371
+ * - `"replace"`: return `REPLACEMENT_CHAR`.
372
+ *
373
+ * Note: the text and attribute-value fast paths use `handleInvalidCharInText()` and `handleInvalidCharInAttr()` instead.
374
+ *
375
+ * @param code - The invalid character code point.
376
+ * @param kind - Optional description (e.g. "lone surrogate") for error messages.
377
+ * @returns The code point to use, or `-2` in `"skip"` mode.
378
+ */
379
+ handleInvalidChar(code, kind) {
380
+ switch (this.invalidCharHandling) {
381
+ case "replace":
382
+ return REPLACEMENT_CHAR;
383
+ case "skip":
384
+ // Caller is responsible for the actual skip logic.
385
+ // We return -2 as a sentinel to tell getCode()'s loop to continue.
386
+ return -2;
387
+ default: {
388
+ // "error" — existing strict behavior
389
+ const label = kind
390
+ ? `invalid XML character: ${kind} 0x${code.toString(16)}`
391
+ : `invalid XML character: 0x${code.toString(16)}`;
392
+ this.fail(label);
393
+ return code;
373
394
  }
374
- return code;
375
395
  }
376
- // Secondary fast path: TAB (0x09) — common in attribute values
377
- if (code === TAB) {
378
- if (this.trackPosition) {
379
- this.column++;
396
+ }
397
+ /**
398
+ * Handle an invalid character inside the `handleTextInRoot()` fast loop.
399
+ *
400
+ * Unlike `handleInvalidChar()` (which returns a code point for `getCode()`),
401
+ * this method manages the text accumulation state (`this.text`, `start`) that
402
+ * the fast text loop relies on.
403
+ *
404
+ * - `"error"`: call `fail()`, leave text accumulation unchanged (char stays in output).
405
+ * - `"skip"`: flush text up to the invalid char, skip it, return new `start`.
406
+ * - `"replace"`: flush text up to the invalid char, append U+FFFD, return new `start`.
407
+ *
408
+ * @returns The updated `start` index for the text accumulation loop.
409
+ */
410
+ handleInvalidCharInText(code, handler, start, kind) {
411
+ switch (this.invalidCharHandling) {
412
+ case "skip":
413
+ // Flush text accumulated before this invalid char, then skip it
414
+ if (handler && start < this.prevI) {
415
+ this.text += this.chunk.slice(start, this.prevI);
416
+ }
417
+ return this.i;
418
+ case "replace":
419
+ // Flush text accumulated before this invalid char, append replacement
420
+ if (handler) {
421
+ if (start < this.prevI) {
422
+ this.text += this.chunk.slice(start, this.prevI);
423
+ }
424
+ this.text += REPLACEMENT_STR;
425
+ }
426
+ return this.i;
427
+ default: {
428
+ // "error" — existing strict behavior, char stays in output
429
+ const label = kind
430
+ ? `invalid XML character: ${kind} 0x${code.toString(16)}`
431
+ : `invalid XML character: 0x${code.toString(16)}`;
432
+ this.fail(label);
433
+ return start;
380
434
  }
381
- return code;
382
435
  }
383
- // Handle CR (normalize CR and CR+LF to LF per XML 1.0 §2.11)
384
- if (code === CR) {
385
- if (chunk.charCodeAt(i + 1) === NL) {
386
- this.i = i + 2;
387
- }
388
- if (this.trackPosition) {
389
- this.line++;
390
- this.column = 0;
391
- this.positionAtNewLine = this.position;
436
+ }
437
+ /**
438
+ * Handle an invalid character inside `sAttribValueQuoted()`.
439
+ *
440
+ * Same pattern as `handleInvalidCharInText()` but for attribute value
441
+ * accumulation (always uses `this.text`, no conditional handler check).
442
+ *
443
+ * @returns The updated `start` index.
444
+ */
445
+ handleInvalidCharInAttr(code, start, kind) {
446
+ switch (this.invalidCharHandling) {
447
+ case "skip":
448
+ if (start < this.prevI) {
449
+ this.text += this.chunk.slice(start, this.prevI);
450
+ }
451
+ return this.i;
452
+ case "replace":
453
+ if (start < this.prevI) {
454
+ this.text += this.chunk.slice(start, this.prevI);
455
+ }
456
+ this.text += REPLACEMENT_STR;
457
+ return this.i;
458
+ default: {
459
+ const label = kind
460
+ ? `invalid XML character: ${kind} 0x${code.toString(16)}`
461
+ : `invalid XML character: 0x${code.toString(16)}`;
462
+ this.fail(label);
463
+ return start;
392
464
  }
393
- return NL;
394
465
  }
395
- // Handle LF
396
- if (code === NL) {
397
- if (this.trackPosition) {
398
- this.line++;
399
- this.column = 0;
400
- this.positionAtNewLine = this.position;
466
+ }
467
+ // ===========================================================================
468
+ // Character Reading
469
+ // ===========================================================================
470
+ getCode() {
471
+ // Loop to handle skip mode: when an invalid char returns -2, we retry
472
+ // with the next character instead of recursing (avoids stack overflow
473
+ // on long runs of consecutive invalid characters).
474
+ for (;;) {
475
+ const { chunk } = this;
476
+ const i = this.i;
477
+ this.prevI = i;
478
+ this.i = i + 1;
479
+ if (i >= chunk.length) {
480
+ return -1;
481
+ }
482
+ const code = chunk.charCodeAt(i);
483
+ // Ultra-fast path: printable ASCII (0x20-0x7E) — the vast majority of XML content.
484
+ // No validation needed; these are always valid XML 1.0 characters.
485
+ if (code >= 0x20 && code <= 0x7e) {
486
+ if (this.trackPosition) {
487
+ this.column++;
488
+ }
489
+ return code;
401
490
  }
402
- return NL;
403
- }
404
- // Handle surrogates
405
- if (code >= 0xd800 && code <= 0xdbff) {
406
- const next = chunk.charCodeAt(i + 1);
407
- if (next >= 0xdc00 && next <= 0xdfff) {
408
- this.i = i + 2;
491
+ // Secondary fast path: TAB (0x09) — common in attribute values
492
+ if (code === TAB) {
409
493
  if (this.trackPosition) {
410
494
  this.column++;
411
495
  }
412
- return 0x10000 + ((code - 0xd800) * 0x400 + (next - 0xdc00));
496
+ return code;
413
497
  }
414
- // Lone high surrogate — invalid XML character
415
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
416
- }
417
- // Lone low surrogate — invalid XML character
418
- if (code >= 0xdc00 && code <= 0xdfff) {
419
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
420
- }
421
- // Non-ASCII above surrogate range (0x80-0xD7FF, 0xE000-0xFFFD) — all valid XML
422
- if (code >= 0x80) {
498
+ // Handle CR (normalize CR and CR+LF to LF per XML 1.0 §2.11)
499
+ if (code === CR) {
500
+ if (chunk.charCodeAt(i + 1) === NL) {
501
+ this.i = i + 2;
502
+ }
503
+ if (this.trackPosition) {
504
+ this.line++;
505
+ this.column = 0;
506
+ this.positionAtNewLine = this.position;
507
+ }
508
+ return NL;
509
+ }
510
+ // Handle LF
511
+ if (code === NL) {
512
+ if (this.trackPosition) {
513
+ this.line++;
514
+ this.column = 0;
515
+ this.positionAtNewLine = this.position;
516
+ }
517
+ return NL;
518
+ }
519
+ // Handle surrogates
520
+ if (code >= 0xd800 && code <= 0xdbff) {
521
+ const next = chunk.charCodeAt(i + 1);
522
+ if (next >= 0xdc00 && next <= 0xdfff) {
523
+ this.i = i + 2;
524
+ if (this.trackPosition) {
525
+ this.column++;
526
+ }
527
+ return 0x10000 + ((code - 0xd800) * 0x400 + (next - 0xdc00));
528
+ }
529
+ // Lone high surrogate — invalid XML character
530
+ const result = this.handleInvalidChar(code, "lone surrogate");
531
+ if (result !== -2) {
532
+ return result;
533
+ }
534
+ continue; // skip: loop to next char
535
+ }
536
+ // Lone low surrogate — invalid XML character
537
+ if (code >= 0xdc00 && code <= 0xdfff) {
538
+ const result = this.handleInvalidChar(code, "lone surrogate");
539
+ if (result !== -2) {
540
+ return result;
541
+ }
542
+ continue;
543
+ }
544
+ // Non-ASCII above surrogate range (0x80-0xD7FF, 0xE000-0xFFFD) — all valid XML
545
+ if (code >= 0x80) {
546
+ if (this.trackPosition) {
547
+ this.column++;
548
+ }
549
+ // Reject 0xFFFE and 0xFFFF
550
+ if (code === 0xfffe || code === 0xffff) {
551
+ const result = this.handleInvalidChar(code);
552
+ if (result !== -2) {
553
+ return result;
554
+ }
555
+ continue;
556
+ }
557
+ return code;
558
+ }
559
+ // Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
560
+ // All invalid in XML 1.0
423
561
  if (this.trackPosition) {
424
562
  this.column++;
425
563
  }
426
- // Reject 0xFFFE and 0xFFFF
427
- if (code === 0xfffe || code === 0xffff) {
428
- this.fail("invalid XML character: 0x" + code.toString(16));
564
+ const result = this.handleInvalidChar(code);
565
+ if (result !== -2) {
566
+ return result;
429
567
  }
430
- return code;
431
- }
432
- // Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
433
- // All invalid in XML 1.0
434
- if (this.trackPosition) {
435
- this.column++;
568
+ // skip: continue to next char
436
569
  }
437
- this.fail("invalid XML character: 0x" + code.toString(16));
438
- return code;
439
570
  }
440
571
  unget() {
441
572
  this.i = this.prevI;
@@ -642,16 +773,16 @@ class SaxParser {
642
773
  }
643
774
  else {
644
775
  this.i++;
645
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
776
+ start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
646
777
  }
647
778
  }
648
779
  else if (code >= 0xdc00 && code <= 0xdfff) {
649
780
  this.i++;
650
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
781
+ start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
651
782
  }
652
783
  else if (code === 0xfffe || code === 0xffff) {
653
784
  this.i++;
654
- this.fail("invalid XML character: 0x" + code.toString(16));
785
+ start = this.handleInvalidCharInText(code, handler, start);
655
786
  }
656
787
  else {
657
788
  this.i++;
@@ -667,7 +798,7 @@ class SaxParser {
667
798
  if (this.trackPosition) {
668
799
  this.column++;
669
800
  }
670
- this.fail("invalid XML character: 0x" + code.toString(16));
801
+ start = this.handleInvalidCharInText(code, handler, start);
671
802
  }
672
803
  // End of chunk
673
804
  if (handler && start < this.i) {
@@ -679,14 +810,42 @@ class SaxParser {
679
810
  let { i: start } = this;
680
811
  const handler = this._handlers.text;
681
812
  let nonSpace = false;
813
+ const isSkip = this.invalidCharHandling === "skip";
814
+ const isReplace = this.invalidCharHandling === "replace";
682
815
  while (true) {
816
+ const iBeforeGet = this.i;
683
817
  const c = this.getCode();
684
818
  if (c === -1) {
685
- if (handler && start < this.i) {
686
- this.text += chunk.slice(start, this.i);
819
+ if (handler && start < iBeforeGet) {
820
+ this.text += chunk.slice(start, iBeforeGet);
687
821
  }
688
822
  break;
689
823
  }
824
+ // In skip mode, getCode() may have internally looped past invalid chars.
825
+ // Flush valid text before the gap and advance start past it.
826
+ if (isSkip && this.prevI > iBeforeGet) {
827
+ if (handler && start < iBeforeGet) {
828
+ this.text += chunk.slice(start, iBeforeGet);
829
+ }
830
+ start = this.prevI;
831
+ }
832
+ // In replace mode, getCode() returns REPLACEMENT_CHAR for invalid chars
833
+ // but the original byte is still in the chunk. Detect this by checking
834
+ // whether getCode() returned REPLACEMENT_CHAR while the raw chunk byte
835
+ // at prevI is NOT U+FFFD (i.e., it was substituted by handleInvalidChar).
836
+ if (isReplace &&
837
+ c === REPLACEMENT_CHAR &&
838
+ chunk.charCodeAt(this.prevI) !== REPLACEMENT_CHAR) {
839
+ if (handler) {
840
+ if (start < this.prevI) {
841
+ this.text += chunk.slice(start, this.prevI);
842
+ }
843
+ this.text += REPLACEMENT_STR;
844
+ }
845
+ start = this.i;
846
+ nonSpace = true;
847
+ continue;
848
+ }
690
849
  if (c === LESS) {
691
850
  if (handler) {
692
851
  const slice = chunk.slice(start, this.prevI);
@@ -1063,13 +1222,43 @@ class SaxParser {
1063
1222
  start = this.i;
1064
1223
  continue;
1065
1224
  }
1066
- // All other chars — fall back to getCode() for validation
1067
- const c = this.getCode();
1068
- if (c === -1) {
1069
- this.text += chunk.slice(start, this.i);
1070
- return;
1225
+ // Non-ASCII (>= 0x80) — mostly valid, handle inline like handleTextInRoot
1226
+ if (code >= 0x80) {
1227
+ this.prevI = this.i;
1228
+ if (code >= 0xd800 && code <= 0xdbff) {
1229
+ const next = chunk.charCodeAt(this.i + 1);
1230
+ if (next >= 0xdc00 && next <= 0xdfff) {
1231
+ this.i += 2; // valid surrogate pair
1232
+ }
1233
+ else {
1234
+ this.i++;
1235
+ start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
1236
+ }
1237
+ }
1238
+ else if (code >= 0xdc00 && code <= 0xdfff) {
1239
+ this.i++;
1240
+ start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
1241
+ }
1242
+ else if (code === 0xfffe || code === 0xffff) {
1243
+ this.i++;
1244
+ start = this.handleInvalidCharInAttr(code, start);
1245
+ }
1246
+ else {
1247
+ this.i++; // valid non-ASCII BMP char
1248
+ }
1249
+ if (this.trackPosition) {
1250
+ this.column++;
1251
+ }
1252
+ continue;
1253
+ }
1254
+ // Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
1255
+ // All invalid in XML 1.0
1256
+ this.prevI = this.i;
1257
+ this.i++;
1258
+ if (this.trackPosition) {
1259
+ this.column++;
1071
1260
  }
1072
- // Just continue — char is already consumed by getCode()
1261
+ start = this.handleInvalidCharInAttr(code, start);
1073
1262
  }
1074
1263
  // End of chunk
1075
1264
  this.text += chunk.slice(start, this.i);
@@ -45,7 +45,8 @@ function parseXmlToObject(xml, options) {
45
45
  position: false,
46
46
  fragment: options?.fragment ?? false,
47
47
  maxDepth: options?.maxDepth,
48
- maxEntityExpansions: options?.maxEntityExpansions
48
+ maxEntityExpansions: options?.maxEntityExpansions,
49
+ invalidCharHandling: options?.invalidCharHandling
49
50
  });
50
51
  // Stack: bottom is a synthetic root frame that collects the document root.
51
52
  const syntheticObj = Object.create(null);
@@ -43,7 +43,7 @@ class HyperlinkReader extends EventEmitter {
43
43
  return;
44
44
  }
45
45
  try {
46
- const parser = new SaxParser({ position: false });
46
+ const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
47
47
  const decoder = new TextDecoder("utf-8", { fatal: true });
48
48
  parser.on("opentag", (node) => {
49
49
  if (node.name !== "Relationship") {
@@ -211,7 +211,7 @@ export class WorkbookReaderBase extends EventEmitter {
211
211
  // For "cache" mode, use direct SAX callbacks (no event objects, no async generator overhead)
212
212
  if (this.options.sharedStrings === "cache") {
213
213
  const sharedStrings = this.sharedStrings;
214
- const parser = new SaxParser({ position: false });
214
+ const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
215
215
  parser.on("opentag", (node) => {
216
216
  switch (node.name) {
217
217
  case "b":
@@ -311,7 +311,7 @@ export class WorkbookReaderBase extends EventEmitter {
311
311
  return;
312
312
  }
313
313
  // "emit" mode — must yield, so use direct SAX with per-chunk yield
314
- const emitParser = new SaxParser();
314
+ const emitParser = new SaxParser({ invalidCharHandling: "skip" });
315
315
  const emitDecoder = new TextDecoder("utf-8", { fatal: true });
316
316
  let pendingEmits = [];
317
317
  emitParser.on("opentag", (node) => {
@@ -143,7 +143,7 @@ class WorksheetReader extends EventEmitter {
143
143
  // Direct SAX callback mode — zero intermediate event objects.
144
144
  // We collect worksheet events per-chunk and yield them.
145
145
  let worksheetEvents = null;
146
- const parser = new SaxParser({ position: false });
146
+ const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
147
147
  parser.on("opentag", (node) => {
148
148
  if (emitSheet) {
149
149
  switch (node.name) {
@@ -159,7 +159,7 @@ class BaseXform {
159
159
  * Use this instead of parse(parseSax(stream)) for hot paths.
160
160
  */
161
161
  async parseStreamDirect(stream) {
162
- const parser = new SaxParser();
162
+ const parser = new SaxParser({ invalidCharHandling: "skip" });
163
163
  const decoder = new TextDecoder("utf-8", { fatal: true });
164
164
  let done = false;
165
165
  let finalModel;
@@ -87,7 +87,8 @@ function parseXml(xml, options) {
87
87
  fragment: options?.fragment ?? false,
88
88
  xmlns: options?.xmlns ?? false,
89
89
  maxDepth: options?.maxDepth,
90
- maxEntityExpansions: options?.maxEntityExpansions
90
+ maxEntityExpansions: options?.maxEntityExpansions,
91
+ invalidCharHandling: options?.invalidCharHandling
91
92
  });
92
93
  // Stack of elements being built. The bottom is a synthetic root
93
94
  // that collects top-level nodes.