@cj-tech-master/excelts 7.5.0 → 7.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31)
  1. package/dist/browser/modules/excel/stream/hyperlink-reader.js +1 -1
  2. package/dist/browser/modules/excel/stream/workbook-reader.browser.js +2 -2
  3. package/dist/browser/modules/excel/stream/worksheet-reader.js +1 -1
  4. package/dist/browser/modules/excel/xlsx/xform/base-xform.js +1 -1
  5. package/dist/browser/modules/xml/dom.js +2 -1
  6. package/dist/browser/modules/xml/index.d.ts +1 -1
  7. package/dist/browser/modules/xml/sax.d.ts +41 -0
  8. package/dist/browser/modules/xml/sax.js +265 -76
  9. package/dist/browser/modules/xml/to-object.js +2 -1
  10. package/dist/browser/modules/xml/types.d.ts +24 -0
  11. package/dist/cjs/modules/excel/stream/hyperlink-reader.js +1 -1
  12. package/dist/cjs/modules/excel/stream/workbook-reader.browser.js +2 -2
  13. package/dist/cjs/modules/excel/stream/worksheet-reader.js +1 -1
  14. package/dist/cjs/modules/excel/xlsx/xform/base-xform.js +1 -1
  15. package/dist/cjs/modules/xml/dom.js +2 -1
  16. package/dist/cjs/modules/xml/sax.js +265 -76
  17. package/dist/cjs/modules/xml/to-object.js +2 -1
  18. package/dist/esm/modules/excel/stream/hyperlink-reader.js +1 -1
  19. package/dist/esm/modules/excel/stream/workbook-reader.browser.js +2 -2
  20. package/dist/esm/modules/excel/stream/worksheet-reader.js +1 -1
  21. package/dist/esm/modules/excel/xlsx/xform/base-xform.js +1 -1
  22. package/dist/esm/modules/xml/dom.js +2 -1
  23. package/dist/esm/modules/xml/sax.js +265 -76
  24. package/dist/esm/modules/xml/to-object.js +2 -1
  25. package/dist/iife/excelts.iife.js +196 -54
  26. package/dist/iife/excelts.iife.js.map +1 -1
  27. package/dist/iife/excelts.iife.min.js +44 -44
  28. package/dist/types/modules/xml/index.d.ts +1 -1
  29. package/dist/types/modules/xml/sax.d.ts +41 -0
  30. package/dist/types/modules/xml/types.d.ts +24 -0
  31. package/package.json +1 -1
@@ -35,6 +35,8 @@ const GREATER = 0x3e; // >
35
35
  const QUESTION = 0x3f; // ?
36
36
  const OPEN_BRACKET = 0x5b; // [
37
37
  const CLOSE_BRACKET = 0x5d; // ]
38
+ const REPLACEMENT_CHAR = 0xfffd; // U+FFFD REPLACEMENT CHARACTER
39
+ const REPLACEMENT_STR = "\uFFFD"; // Pre-allocated string form of U+FFFD
38
40
  // =============================================================================
39
41
  // Pre-computed Lookup Tables
40
42
  // =============================================================================
@@ -233,6 +235,7 @@ class SaxParser {
233
235
  this.maxDepth = options?.maxDepth !== undefined ? options.maxDepth : 256;
234
236
  this.maxEntityExpansions =
235
237
  options?.maxEntityExpansions !== undefined ? options.maxEntityExpansions : 10000;
238
+ this.invalidCharHandling = options?.invalidCharHandling ?? "error";
236
239
  this._init();
237
240
  }
238
241
  get closed() {
@@ -350,87 +353,215 @@ class SaxParser {
350
353
  return this.write(null);
351
354
  }
352
355
  // ===========================================================================
353
- // Character Reading
356
+ // Invalid Character Handling
354
357
  // ===========================================================================
355
- getCode() {
356
- const { chunk, i } = this;
357
- this.prevI = i;
358
- this.i = i + 1;
359
- if (i >= chunk.length) {
360
- return -1;
361
- }
362
- const code = chunk.charCodeAt(i);
363
- // Ultra-fast path: printable ASCII (0x20-0x7E) — the vast majority of XML content.
364
- // No validation needed; these are always valid XML 1.0 characters.
365
- if (code >= 0x20 && code <= 0x7e) {
366
- if (this.trackPosition) {
367
- this.column++;
358
+ /**
359
+ * Handle an invalid XML character according to the configured strategy.
360
+ *
361
+ * Called from the `getCode()` loop, which either returns the result or, on the
362
+ * `-2` skip sentinel, moves on to the next character.
363
+ *
364
+ * - `"error"`: call `fail()` and return the original code.
365
+ * - `"skip"`: return `-2` as a sentinel (caller handles skip).
366
+ * - `"replace"`: return `REPLACEMENT_CHAR`.
367
+ *
368
+ * Note: `getCode()` handles the `-2` sentinel by looping rather than recursing.
369
+ *
370
+ * @param code - The invalid character code point.
371
+ * @param kind - Optional description (e.g. "lone surrogate") for error messages.
372
+ * @returns The code point to use, or `-2` when the character should be skipped.
373
+ */
374
+ handleInvalidChar(code, kind) {
375
+ switch (this.invalidCharHandling) {
376
+ case "replace":
377
+ return REPLACEMENT_CHAR;
378
+ case "skip":
379
+ // Caller is responsible for the actual skip logic.
380
+ // We return -2 as a sentinel to tell getCode()'s loop to continue.
381
+ return -2;
382
+ default: {
383
+ // "error" — existing strict behavior
384
+ const label = kind
385
+ ? `invalid XML character: ${kind} 0x${code.toString(16)}`
386
+ : `invalid XML character: 0x${code.toString(16)}`;
387
+ this.fail(label);
388
+ return code;
368
389
  }
369
- return code;
370
390
  }
371
- // Secondary fast path: TAB (0x09) — common in attribute values
372
- if (code === TAB) {
373
- if (this.trackPosition) {
374
- this.column++;
391
+ }
392
+ /**
393
+ * Handle an invalid character inside the `handleTextInRoot()` fast loop.
394
+ *
395
+ * Unlike `handleInvalidChar()` (which returns a code point for `getCode()`),
396
+ * this method manages the text accumulation state (`this.text`, `start`) that
397
+ * the fast text loop relies on.
398
+ *
399
+ * - `"error"`: call `fail()`, leave text accumulation unchanged (char stays in output).
400
+ * - `"skip"`: flush text up to the invalid char, skip it, return new `start`.
401
+ * - `"replace"`: flush text up to the invalid char, append U+FFFD, return new `start`.
402
+ *
403
+ * @returns The updated `start` index for the text accumulation loop.
404
+ */
405
+ handleInvalidCharInText(code, handler, start, kind) {
406
+ switch (this.invalidCharHandling) {
407
+ case "skip":
408
+ // Flush text accumulated before this invalid char, then skip it
409
+ if (handler && start < this.prevI) {
410
+ this.text += this.chunk.slice(start, this.prevI);
411
+ }
412
+ return this.i;
413
+ case "replace":
414
+ // Flush text accumulated before this invalid char, append replacement
415
+ if (handler) {
416
+ if (start < this.prevI) {
417
+ this.text += this.chunk.slice(start, this.prevI);
418
+ }
419
+ this.text += REPLACEMENT_STR;
420
+ }
421
+ return this.i;
422
+ default: {
423
+ // "error" — existing strict behavior, char stays in output
424
+ const label = kind
425
+ ? `invalid XML character: ${kind} 0x${code.toString(16)}`
426
+ : `invalid XML character: 0x${code.toString(16)}`;
427
+ this.fail(label);
428
+ return start;
375
429
  }
376
- return code;
377
430
  }
378
- // Handle CR (normalize CR and CR+LF to LF per XML 1.0 §2.11)
379
- if (code === CR) {
380
- if (chunk.charCodeAt(i + 1) === NL) {
381
- this.i = i + 2;
382
- }
383
- if (this.trackPosition) {
384
- this.line++;
385
- this.column = 0;
386
- this.positionAtNewLine = this.position;
431
+ }
432
+ /**
433
+ * Handle an invalid character inside `sAttribValueQuoted()`.
434
+ *
435
+ * Same pattern as `handleInvalidCharInText()` but for attribute value
436
+ * accumulation (always uses `this.text`, no conditional handler check).
437
+ *
438
+ * @returns The updated `start` index.
439
+ */
440
+ handleInvalidCharInAttr(code, start, kind) {
441
+ switch (this.invalidCharHandling) {
442
+ case "skip":
443
+ if (start < this.prevI) {
444
+ this.text += this.chunk.slice(start, this.prevI);
445
+ }
446
+ return this.i;
447
+ case "replace":
448
+ if (start < this.prevI) {
449
+ this.text += this.chunk.slice(start, this.prevI);
450
+ }
451
+ this.text += REPLACEMENT_STR;
452
+ return this.i;
453
+ default: {
454
+ const label = kind
455
+ ? `invalid XML character: ${kind} 0x${code.toString(16)}`
456
+ : `invalid XML character: 0x${code.toString(16)}`;
457
+ this.fail(label);
458
+ return start;
387
459
  }
388
- return NL;
389
460
  }
390
- // Handle LF
391
- if (code === NL) {
392
- if (this.trackPosition) {
393
- this.line++;
394
- this.column = 0;
395
- this.positionAtNewLine = this.position;
461
+ }
462
+ // ===========================================================================
463
+ // Character Reading
464
+ // ===========================================================================
465
+ getCode() {
466
+ // Loop to handle skip mode: when an invalid char returns -2, we retry
467
+ // with the next character instead of recursing (avoids stack overflow
468
+ // on long runs of consecutive invalid characters).
469
+ for (;;) {
470
+ const { chunk } = this;
471
+ const i = this.i;
472
+ this.prevI = i;
473
+ this.i = i + 1;
474
+ if (i >= chunk.length) {
475
+ return -1;
476
+ }
477
+ const code = chunk.charCodeAt(i);
478
+ // Ultra-fast path: printable ASCII (0x20-0x7E) — the vast majority of XML content.
479
+ // No validation needed; these are always valid XML 1.0 characters.
480
+ if (code >= 0x20 && code <= 0x7e) {
481
+ if (this.trackPosition) {
482
+ this.column++;
483
+ }
484
+ return code;
396
485
  }
397
- return NL;
398
- }
399
- // Handle surrogates
400
- if (code >= 0xd800 && code <= 0xdbff) {
401
- const next = chunk.charCodeAt(i + 1);
402
- if (next >= 0xdc00 && next <= 0xdfff) {
403
- this.i = i + 2;
486
+ // Secondary fast path: TAB (0x09) — common in attribute values
487
+ if (code === TAB) {
404
488
  if (this.trackPosition) {
405
489
  this.column++;
406
490
  }
407
- return 0x10000 + ((code - 0xd800) * 0x400 + (next - 0xdc00));
491
+ return code;
408
492
  }
409
- // Lone high surrogate — invalid XML character
410
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
411
- }
412
- // Lone low surrogate — invalid XML character
413
- if (code >= 0xdc00 && code <= 0xdfff) {
414
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
415
- }
416
- // Non-ASCII above surrogate range (0x80-0xD7FF, 0xE000-0xFFFD) — all valid XML
417
- if (code >= 0x80) {
493
+ // Handle CR (normalize CR and CR+LF to LF per XML 1.0 §2.11)
494
+ if (code === CR) {
495
+ if (chunk.charCodeAt(i + 1) === NL) {
496
+ this.i = i + 2;
497
+ }
498
+ if (this.trackPosition) {
499
+ this.line++;
500
+ this.column = 0;
501
+ this.positionAtNewLine = this.position;
502
+ }
503
+ return NL;
504
+ }
505
+ // Handle LF
506
+ if (code === NL) {
507
+ if (this.trackPosition) {
508
+ this.line++;
509
+ this.column = 0;
510
+ this.positionAtNewLine = this.position;
511
+ }
512
+ return NL;
513
+ }
514
+ // Handle surrogates
515
+ if (code >= 0xd800 && code <= 0xdbff) {
516
+ const next = chunk.charCodeAt(i + 1);
517
+ if (next >= 0xdc00 && next <= 0xdfff) {
518
+ this.i = i + 2;
519
+ if (this.trackPosition) {
520
+ this.column++;
521
+ }
522
+ return 0x10000 + ((code - 0xd800) * 0x400 + (next - 0xdc00));
523
+ }
524
+ // Lone high surrogate — invalid XML character
525
+ const result = this.handleInvalidChar(code, "lone surrogate");
526
+ if (result !== -2) {
527
+ return result;
528
+ }
529
+ continue; // skip: loop to next char
530
+ }
531
+ // Lone low surrogate — invalid XML character
532
+ if (code >= 0xdc00 && code <= 0xdfff) {
533
+ const result = this.handleInvalidChar(code, "lone surrogate");
534
+ if (result !== -2) {
535
+ return result;
536
+ }
537
+ continue;
538
+ }
539
+ // Non-ASCII above surrogate range (0x80-0xD7FF, 0xE000-0xFFFD) — all valid XML
540
+ if (code >= 0x80) {
541
+ if (this.trackPosition) {
542
+ this.column++;
543
+ }
544
+ // Reject 0xFFFE and 0xFFFF
545
+ if (code === 0xfffe || code === 0xffff) {
546
+ const result = this.handleInvalidChar(code);
547
+ if (result !== -2) {
548
+ return result;
549
+ }
550
+ continue;
551
+ }
552
+ return code;
553
+ }
554
+ // Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
555
+ // All invalid in XML 1.0
418
556
  if (this.trackPosition) {
419
557
  this.column++;
420
558
  }
421
- // Reject 0xFFFE and 0xFFFF
422
- if (code === 0xfffe || code === 0xffff) {
423
- this.fail("invalid XML character: 0x" + code.toString(16));
559
+ const result = this.handleInvalidChar(code);
560
+ if (result !== -2) {
561
+ return result;
424
562
  }
425
- return code;
426
- }
427
- // Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
428
- // All invalid in XML 1.0
429
- if (this.trackPosition) {
430
- this.column++;
563
+ // skip: continue to next char
431
564
  }
432
- this.fail("invalid XML character: 0x" + code.toString(16));
433
- return code;
434
565
  }
435
566
  unget() {
436
567
  this.i = this.prevI;
@@ -637,16 +768,16 @@ class SaxParser {
637
768
  }
638
769
  else {
639
770
  this.i++;
640
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
771
+ start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
641
772
  }
642
773
  }
643
774
  else if (code >= 0xdc00 && code <= 0xdfff) {
644
775
  this.i++;
645
- this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
776
+ start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
646
777
  }
647
778
  else if (code === 0xfffe || code === 0xffff) {
648
779
  this.i++;
649
- this.fail("invalid XML character: 0x" + code.toString(16));
780
+ start = this.handleInvalidCharInText(code, handler, start);
650
781
  }
651
782
  else {
652
783
  this.i++;
@@ -662,7 +793,7 @@ class SaxParser {
662
793
  if (this.trackPosition) {
663
794
  this.column++;
664
795
  }
665
- this.fail("invalid XML character: 0x" + code.toString(16));
796
+ start = this.handleInvalidCharInText(code, handler, start);
666
797
  }
667
798
  // End of chunk
668
799
  if (handler && start < this.i) {
@@ -674,14 +805,42 @@ class SaxParser {
674
805
  let { i: start } = this;
675
806
  const handler = this._handlers.text;
676
807
  let nonSpace = false;
808
+ const isSkip = this.invalidCharHandling === "skip";
809
+ const isReplace = this.invalidCharHandling === "replace";
677
810
  while (true) {
811
+ const iBeforeGet = this.i;
678
812
  const c = this.getCode();
679
813
  if (c === -1) {
680
- if (handler && start < this.i) {
681
- this.text += chunk.slice(start, this.i);
814
+ if (handler && start < iBeforeGet) {
815
+ this.text += chunk.slice(start, iBeforeGet);
682
816
  }
683
817
  break;
684
818
  }
819
+ // In skip mode, getCode() may have internally looped past invalid chars.
820
+ // Flush valid text before the gap and advance start past it.
821
+ if (isSkip && this.prevI > iBeforeGet) {
822
+ if (handler && start < iBeforeGet) {
823
+ this.text += chunk.slice(start, iBeforeGet);
824
+ }
825
+ start = this.prevI;
826
+ }
827
+ // In replace mode, getCode() returns REPLACEMENT_CHAR for invalid chars
828
+ // but the original byte is still in the chunk. Detect this by checking
829
+ // whether getCode() returned REPLACEMENT_CHAR while the raw chunk byte
830
+ // at prevI is NOT U+FFFD (i.e., it was substituted by handleInvalidChar).
831
+ if (isReplace &&
832
+ c === REPLACEMENT_CHAR &&
833
+ chunk.charCodeAt(this.prevI) !== REPLACEMENT_CHAR) {
834
+ if (handler) {
835
+ if (start < this.prevI) {
836
+ this.text += chunk.slice(start, this.prevI);
837
+ }
838
+ this.text += REPLACEMENT_STR;
839
+ }
840
+ start = this.i;
841
+ nonSpace = true;
842
+ continue;
843
+ }
685
844
  if (c === LESS) {
686
845
  if (handler) {
687
846
  const slice = chunk.slice(start, this.prevI);
@@ -1058,13 +1217,43 @@ class SaxParser {
1058
1217
  start = this.i;
1059
1218
  continue;
1060
1219
  }
1061
- // All other chars — fall back to getCode() for validation
1062
- const c = this.getCode();
1063
- if (c === -1) {
1064
- this.text += chunk.slice(start, this.i);
1065
- return;
1220
+ // Non-ASCII (>= 0x80) — mostly valid, handle inline like handleTextInRoot
1221
+ if (code >= 0x80) {
1222
+ this.prevI = this.i;
1223
+ if (code >= 0xd800 && code <= 0xdbff) {
1224
+ const next = chunk.charCodeAt(this.i + 1);
1225
+ if (next >= 0xdc00 && next <= 0xdfff) {
1226
+ this.i += 2; // valid surrogate pair
1227
+ }
1228
+ else {
1229
+ this.i++;
1230
+ start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
1231
+ }
1232
+ }
1233
+ else if (code >= 0xdc00 && code <= 0xdfff) {
1234
+ this.i++;
1235
+ start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
1236
+ }
1237
+ else if (code === 0xfffe || code === 0xffff) {
1238
+ this.i++;
1239
+ start = this.handleInvalidCharInAttr(code, start);
1240
+ }
1241
+ else {
1242
+ this.i++; // valid non-ASCII BMP char
1243
+ }
1244
+ if (this.trackPosition) {
1245
+ this.column++;
1246
+ }
1247
+ continue;
1248
+ }
1249
+ // Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
1250
+ // All invalid in XML 1.0
1251
+ this.prevI = this.i;
1252
+ this.i++;
1253
+ if (this.trackPosition) {
1254
+ this.column++;
1066
1255
  }
1067
- // Just continue — char is already consumed by getCode()
1256
+ start = this.handleInvalidCharInAttr(code, start);
1068
1257
  }
1069
1258
  // End of chunk
1070
1259
  this.text += chunk.slice(start, this.i);
@@ -42,7 +42,8 @@ function parseXmlToObject(xml, options) {
42
42
  position: false,
43
43
  fragment: options?.fragment ?? false,
44
44
  maxDepth: options?.maxDepth,
45
- maxEntityExpansions: options?.maxEntityExpansions
45
+ maxEntityExpansions: options?.maxEntityExpansions,
46
+ invalidCharHandling: options?.invalidCharHandling
46
47
  });
47
48
  // Stack: bottom is a synthetic root frame that collects the document root.
48
49
  const syntheticObj = Object.create(null);