@cj-tech-master/excelts 7.5.0 → 7.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/modules/excel/stream/hyperlink-reader.js +1 -1
- package/dist/browser/modules/excel/stream/workbook-reader.browser.js +2 -2
- package/dist/browser/modules/excel/stream/worksheet-reader.js +1 -1
- package/dist/browser/modules/excel/xlsx/xform/base-xform.js +1 -1
- package/dist/browser/modules/xml/dom.js +2 -1
- package/dist/browser/modules/xml/index.d.ts +1 -1
- package/dist/browser/modules/xml/sax.d.ts +41 -0
- package/dist/browser/modules/xml/sax.js +265 -76
- package/dist/browser/modules/xml/to-object.js +2 -1
- package/dist/browser/modules/xml/types.d.ts +24 -0
- package/dist/cjs/modules/excel/stream/hyperlink-reader.js +1 -1
- package/dist/cjs/modules/excel/stream/workbook-reader.browser.js +2 -2
- package/dist/cjs/modules/excel/stream/worksheet-reader.js +1 -1
- package/dist/cjs/modules/excel/xlsx/xform/base-xform.js +1 -1
- package/dist/cjs/modules/xml/dom.js +2 -1
- package/dist/cjs/modules/xml/sax.js +265 -76
- package/dist/cjs/modules/xml/to-object.js +2 -1
- package/dist/esm/modules/excel/stream/hyperlink-reader.js +1 -1
- package/dist/esm/modules/excel/stream/workbook-reader.browser.js +2 -2
- package/dist/esm/modules/excel/stream/worksheet-reader.js +1 -1
- package/dist/esm/modules/excel/xlsx/xform/base-xform.js +1 -1
- package/dist/esm/modules/xml/dom.js +2 -1
- package/dist/esm/modules/xml/sax.js +265 -76
- package/dist/esm/modules/xml/to-object.js +2 -1
- package/dist/iife/excelts.iife.js +196 -54
- package/dist/iife/excelts.iife.js.map +1 -1
- package/dist/iife/excelts.iife.min.js +44 -44
- package/dist/types/modules/xml/index.d.ts +1 -1
- package/dist/types/modules/xml/sax.d.ts +41 -0
- package/dist/types/modules/xml/types.d.ts +24 -0
- package/package.json +1 -1
|
@@ -35,6 +35,8 @@ const GREATER = 0x3e; // >
|
|
|
35
35
|
const QUESTION = 0x3f; // ?
|
|
36
36
|
const OPEN_BRACKET = 0x5b; // [
|
|
37
37
|
const CLOSE_BRACKET = 0x5d; // ]
|
|
38
|
+
const REPLACEMENT_CHAR = 0xfffd; // U+FFFD REPLACEMENT CHARACTER
|
|
39
|
+
const REPLACEMENT_STR = "\uFFFD"; // Pre-allocated string form of U+FFFD
|
|
38
40
|
// =============================================================================
|
|
39
41
|
// Pre-computed Lookup Tables
|
|
40
42
|
// =============================================================================
|
|
@@ -233,6 +235,7 @@ class SaxParser {
|
|
|
233
235
|
this.maxDepth = options?.maxDepth !== undefined ? options.maxDepth : 256;
|
|
234
236
|
this.maxEntityExpansions =
|
|
235
237
|
options?.maxEntityExpansions !== undefined ? options.maxEntityExpansions : 10000;
|
|
238
|
+
this.invalidCharHandling = options?.invalidCharHandling ?? "error";
|
|
236
239
|
this._init();
|
|
237
240
|
}
|
|
238
241
|
get closed() {
|
|
@@ -350,87 +353,215 @@ class SaxParser {
|
|
|
350
353
|
return this.write(null);
|
|
351
354
|
}
|
|
352
355
|
// ===========================================================================
|
|
353
|
-
// Character
|
|
356
|
+
// Invalid Character Handling
|
|
354
357
|
// ===========================================================================
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
358
|
+
/**
|
|
359
|
+
* Handle an invalid XML character according to the configured strategy.
|
|
360
|
+
*
|
|
361
|
+
* Used by `getCode()`, which treats the `"skip"` return value as a signal to
|
|
361
|
+
* loop on to the next character instead of returning.
|
|
363
|
+
*
|
|
364
|
+
* - `"error"`: call `fail()` and return the original code.
|
|
365
|
+
* - `"skip"`: return `-2` as a sentinel (caller handles skip).
|
|
366
|
+
* - `"replace"`: return `REPLACEMENT_CHAR`.
|
|
367
|
+
*
|
|
368
|
+
* Note: `getCode()` calls this method and loops on the `-2` sentinel rather than recursing.
|
|
369
|
+
*
|
|
370
|
+
* @param code - The invalid character code point.
|
|
371
|
+
* @param kind - Optional description (e.g. "lone surrogate") for error messages.
|
|
372
|
+
* @returns The code point to use, or `-2` in `"skip"` mode.
|
|
373
|
+
*/
|
|
374
|
+
handleInvalidChar(code, kind) {
|
|
375
|
+
switch (this.invalidCharHandling) {
|
|
376
|
+
case "replace":
|
|
377
|
+
return REPLACEMENT_CHAR;
|
|
378
|
+
case "skip":
|
|
379
|
+
// Caller is responsible for the actual skip logic.
|
|
380
|
+
// We return -2 as a sentinel to tell getCode()'s loop to continue.
|
|
381
|
+
return -2;
|
|
382
|
+
default: {
|
|
383
|
+
// "error" — existing strict behavior
|
|
384
|
+
const label = kind
|
|
385
|
+
? `invalid XML character: ${kind} 0x${code.toString(16)}`
|
|
386
|
+
: `invalid XML character: 0x${code.toString(16)}`;
|
|
387
|
+
this.fail(label);
|
|
388
|
+
return code;
|
|
368
389
|
}
|
|
369
|
-
return code;
|
|
370
390
|
}
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
391
|
+
}
|
|
392
|
+
/**
|
|
393
|
+
* Handle an invalid character inside the `handleTextInRoot()` fast loop.
|
|
394
|
+
*
|
|
395
|
+
* Unlike `handleInvalidChar()` (which returns a code point for `getCode()`),
|
|
396
|
+
* this method manages the text accumulation state (`this.text`, `start`) that
|
|
397
|
+
* the fast text loop relies on.
|
|
398
|
+
*
|
|
399
|
+
* - `"error"`: call `fail()`, leave text accumulation unchanged (char stays in output).
|
|
400
|
+
* - `"skip"`: flush text up to the invalid char, skip it, return new `start`.
|
|
401
|
+
* - `"replace"`: flush text up to the invalid char, append U+FFFD, return new `start`.
|
|
402
|
+
*
|
|
403
|
+
* @returns The updated `start` index for the text accumulation loop.
|
|
404
|
+
*/
|
|
405
|
+
handleInvalidCharInText(code, handler, start, kind) {
|
|
406
|
+
switch (this.invalidCharHandling) {
|
|
407
|
+
case "skip":
|
|
408
|
+
// Flush text accumulated before this invalid char, then skip it
|
|
409
|
+
if (handler && start < this.prevI) {
|
|
410
|
+
this.text += this.chunk.slice(start, this.prevI);
|
|
411
|
+
}
|
|
412
|
+
return this.i;
|
|
413
|
+
case "replace":
|
|
414
|
+
// Flush text accumulated before this invalid char, append replacement
|
|
415
|
+
if (handler) {
|
|
416
|
+
if (start < this.prevI) {
|
|
417
|
+
this.text += this.chunk.slice(start, this.prevI);
|
|
418
|
+
}
|
|
419
|
+
this.text += REPLACEMENT_STR;
|
|
420
|
+
}
|
|
421
|
+
return this.i;
|
|
422
|
+
default: {
|
|
423
|
+
// "error" — existing strict behavior, char stays in output
|
|
424
|
+
const label = kind
|
|
425
|
+
? `invalid XML character: ${kind} 0x${code.toString(16)}`
|
|
426
|
+
: `invalid XML character: 0x${code.toString(16)}`;
|
|
427
|
+
this.fail(label);
|
|
428
|
+
return start;
|
|
375
429
|
}
|
|
376
|
-
return code;
|
|
377
430
|
}
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
431
|
+
}
|
|
432
|
+
/**
|
|
433
|
+
* Handle an invalid character inside `sAttribValueQuoted()`.
|
|
434
|
+
*
|
|
435
|
+
* Same pattern as `handleInvalidCharInText()` but for attribute value
|
|
436
|
+
* accumulation (always uses `this.text`, no conditional handler check).
|
|
437
|
+
*
|
|
438
|
+
* @returns The updated `start` index.
|
|
439
|
+
*/
|
|
440
|
+
handleInvalidCharInAttr(code, start, kind) {
|
|
441
|
+
switch (this.invalidCharHandling) {
|
|
442
|
+
case "skip":
|
|
443
|
+
if (start < this.prevI) {
|
|
444
|
+
this.text += this.chunk.slice(start, this.prevI);
|
|
445
|
+
}
|
|
446
|
+
return this.i;
|
|
447
|
+
case "replace":
|
|
448
|
+
if (start < this.prevI) {
|
|
449
|
+
this.text += this.chunk.slice(start, this.prevI);
|
|
450
|
+
}
|
|
451
|
+
this.text += REPLACEMENT_STR;
|
|
452
|
+
return this.i;
|
|
453
|
+
default: {
|
|
454
|
+
const label = kind
|
|
455
|
+
? `invalid XML character: ${kind} 0x${code.toString(16)}`
|
|
456
|
+
: `invalid XML character: 0x${code.toString(16)}`;
|
|
457
|
+
this.fail(label);
|
|
458
|
+
return start;
|
|
387
459
|
}
|
|
388
|
-
return NL;
|
|
389
460
|
}
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
461
|
+
}
|
|
462
|
+
// ===========================================================================
|
|
463
|
+
// Character Reading
|
|
464
|
+
// ===========================================================================
|
|
465
|
+
getCode() {
|
|
466
|
+
// Loop to handle skip mode: when an invalid char returns -2, we retry
|
|
467
|
+
// with the next character instead of recursing (avoids stack overflow
|
|
468
|
+
// on long runs of consecutive invalid characters).
|
|
469
|
+
for (;;) {
|
|
470
|
+
const { chunk } = this;
|
|
471
|
+
const i = this.i;
|
|
472
|
+
this.prevI = i;
|
|
473
|
+
this.i = i + 1;
|
|
474
|
+
if (i >= chunk.length) {
|
|
475
|
+
return -1;
|
|
476
|
+
}
|
|
477
|
+
const code = chunk.charCodeAt(i);
|
|
478
|
+
// Ultra-fast path: printable ASCII (0x20-0x7E) — the vast majority of XML content.
|
|
479
|
+
// No validation needed; these are always valid XML 1.0 characters.
|
|
480
|
+
if (code >= 0x20 && code <= 0x7e) {
|
|
481
|
+
if (this.trackPosition) {
|
|
482
|
+
this.column++;
|
|
483
|
+
}
|
|
484
|
+
return code;
|
|
396
485
|
}
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
// Handle surrogates
|
|
400
|
-
if (code >= 0xd800 && code <= 0xdbff) {
|
|
401
|
-
const next = chunk.charCodeAt(i + 1);
|
|
402
|
-
if (next >= 0xdc00 && next <= 0xdfff) {
|
|
403
|
-
this.i = i + 2;
|
|
486
|
+
// Secondary fast path: TAB (0x09) — common in attribute values
|
|
487
|
+
if (code === TAB) {
|
|
404
488
|
if (this.trackPosition) {
|
|
405
489
|
this.column++;
|
|
406
490
|
}
|
|
407
|
-
return
|
|
491
|
+
return code;
|
|
408
492
|
}
|
|
409
|
-
//
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
493
|
+
// Handle CR (normalize CR and CR+LF to LF per XML 1.0 §2.11)
|
|
494
|
+
if (code === CR) {
|
|
495
|
+
if (chunk.charCodeAt(i + 1) === NL) {
|
|
496
|
+
this.i = i + 2;
|
|
497
|
+
}
|
|
498
|
+
if (this.trackPosition) {
|
|
499
|
+
this.line++;
|
|
500
|
+
this.column = 0;
|
|
501
|
+
this.positionAtNewLine = this.position;
|
|
502
|
+
}
|
|
503
|
+
return NL;
|
|
504
|
+
}
|
|
505
|
+
// Handle LF
|
|
506
|
+
if (code === NL) {
|
|
507
|
+
if (this.trackPosition) {
|
|
508
|
+
this.line++;
|
|
509
|
+
this.column = 0;
|
|
510
|
+
this.positionAtNewLine = this.position;
|
|
511
|
+
}
|
|
512
|
+
return NL;
|
|
513
|
+
}
|
|
514
|
+
// Handle surrogates
|
|
515
|
+
if (code >= 0xd800 && code <= 0xdbff) {
|
|
516
|
+
const next = chunk.charCodeAt(i + 1);
|
|
517
|
+
if (next >= 0xdc00 && next <= 0xdfff) {
|
|
518
|
+
this.i = i + 2;
|
|
519
|
+
if (this.trackPosition) {
|
|
520
|
+
this.column++;
|
|
521
|
+
}
|
|
522
|
+
return 0x10000 + ((code - 0xd800) * 0x400 + (next - 0xdc00));
|
|
523
|
+
}
|
|
524
|
+
// Lone high surrogate — invalid XML character
|
|
525
|
+
const result = this.handleInvalidChar(code, "lone surrogate");
|
|
526
|
+
if (result !== -2) {
|
|
527
|
+
return result;
|
|
528
|
+
}
|
|
529
|
+
continue; // skip: loop to next char
|
|
530
|
+
}
|
|
531
|
+
// Lone low surrogate — invalid XML character
|
|
532
|
+
if (code >= 0xdc00 && code <= 0xdfff) {
|
|
533
|
+
const result = this.handleInvalidChar(code, "lone surrogate");
|
|
534
|
+
if (result !== -2) {
|
|
535
|
+
return result;
|
|
536
|
+
}
|
|
537
|
+
continue;
|
|
538
|
+
}
|
|
539
|
+
// Non-ASCII outside the surrogate range (0x80-0xD7FF, 0xE000-0xFFFF) — valid XML except 0xFFFE/0xFFFF
|
|
540
|
+
if (code >= 0x80) {
|
|
541
|
+
if (this.trackPosition) {
|
|
542
|
+
this.column++;
|
|
543
|
+
}
|
|
544
|
+
// Reject 0xFFFE and 0xFFFF
|
|
545
|
+
if (code === 0xfffe || code === 0xffff) {
|
|
546
|
+
const result = this.handleInvalidChar(code);
|
|
547
|
+
if (result !== -2) {
|
|
548
|
+
return result;
|
|
549
|
+
}
|
|
550
|
+
continue;
|
|
551
|
+
}
|
|
552
|
+
return code;
|
|
553
|
+
}
|
|
554
|
+
// Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
|
|
555
|
+
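// Treated as invalid here. NOTE(review): the XML 1.0 Char production ([#x20-#xD7FF]) admits 0x7F — confirm rejecting it is intended.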
|
|
418
556
|
if (this.trackPosition) {
|
|
419
557
|
this.column++;
|
|
420
558
|
}
|
|
421
|
-
|
|
422
|
-
if (
|
|
423
|
-
|
|
559
|
+
const result = this.handleInvalidChar(code);
|
|
560
|
+
if (result !== -2) {
|
|
561
|
+
return result;
|
|
424
562
|
}
|
|
425
|
-
|
|
426
|
-
}
|
|
427
|
-
// Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
|
|
428
|
-
// All invalid in XML 1.0
|
|
429
|
-
if (this.trackPosition) {
|
|
430
|
-
this.column++;
|
|
563
|
+
// skip: continue to next char
|
|
431
564
|
}
|
|
432
|
-
this.fail("invalid XML character: 0x" + code.toString(16));
|
|
433
|
-
return code;
|
|
434
565
|
}
|
|
435
566
|
unget() {
|
|
436
567
|
this.i = this.prevI;
|
|
@@ -637,16 +768,16 @@ class SaxParser {
|
|
|
637
768
|
}
|
|
638
769
|
else {
|
|
639
770
|
this.i++;
|
|
640
|
-
this.
|
|
771
|
+
start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
|
|
641
772
|
}
|
|
642
773
|
}
|
|
643
774
|
else if (code >= 0xdc00 && code <= 0xdfff) {
|
|
644
775
|
this.i++;
|
|
645
|
-
this.
|
|
776
|
+
start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
|
|
646
777
|
}
|
|
647
778
|
else if (code === 0xfffe || code === 0xffff) {
|
|
648
779
|
this.i++;
|
|
649
|
-
this.
|
|
780
|
+
start = this.handleInvalidCharInText(code, handler, start);
|
|
650
781
|
}
|
|
651
782
|
else {
|
|
652
783
|
this.i++;
|
|
@@ -662,7 +793,7 @@ class SaxParser {
|
|
|
662
793
|
if (this.trackPosition) {
|
|
663
794
|
this.column++;
|
|
664
795
|
}
|
|
665
|
-
this.
|
|
796
|
+
start = this.handleInvalidCharInText(code, handler, start);
|
|
666
797
|
}
|
|
667
798
|
// End of chunk
|
|
668
799
|
if (handler && start < this.i) {
|
|
@@ -674,14 +805,42 @@ class SaxParser {
|
|
|
674
805
|
let { i: start } = this;
|
|
675
806
|
const handler = this._handlers.text;
|
|
676
807
|
let nonSpace = false;
|
|
808
|
+
const isSkip = this.invalidCharHandling === "skip";
|
|
809
|
+
const isReplace = this.invalidCharHandling === "replace";
|
|
677
810
|
while (true) {
|
|
811
|
+
const iBeforeGet = this.i;
|
|
678
812
|
const c = this.getCode();
|
|
679
813
|
if (c === -1) {
|
|
680
|
-
if (handler && start <
|
|
681
|
-
this.text += chunk.slice(start,
|
|
814
|
+
if (handler && start < iBeforeGet) {
|
|
815
|
+
this.text += chunk.slice(start, iBeforeGet);
|
|
682
816
|
}
|
|
683
817
|
break;
|
|
684
818
|
}
|
|
819
|
+
// In skip mode, getCode() may have internally looped past invalid chars.
|
|
820
|
+
// Flush valid text before the gap and advance start past it.
|
|
821
|
+
if (isSkip && this.prevI > iBeforeGet) {
|
|
822
|
+
if (handler && start < iBeforeGet) {
|
|
823
|
+
this.text += chunk.slice(start, iBeforeGet);
|
|
824
|
+
}
|
|
825
|
+
start = this.prevI;
|
|
826
|
+
}
|
|
827
|
+
// In replace mode, getCode() returns REPLACEMENT_CHAR for invalid chars
|
|
828
|
+
// but the original code unit is still in the chunk. Detect this by checking
|
|
829
|
+
// whether getCode() returned REPLACEMENT_CHAR while the raw chunk code unit
|
|
830
|
+
// at prevI is NOT U+FFFD (i.e., it was substituted by handleInvalidChar).
|
|
831
|
+
if (isReplace &&
|
|
832
|
+
c === REPLACEMENT_CHAR &&
|
|
833
|
+
chunk.charCodeAt(this.prevI) !== REPLACEMENT_CHAR) {
|
|
834
|
+
if (handler) {
|
|
835
|
+
if (start < this.prevI) {
|
|
836
|
+
this.text += chunk.slice(start, this.prevI);
|
|
837
|
+
}
|
|
838
|
+
this.text += REPLACEMENT_STR;
|
|
839
|
+
}
|
|
840
|
+
start = this.i;
|
|
841
|
+
nonSpace = true;
|
|
842
|
+
continue;
|
|
843
|
+
}
|
|
685
844
|
if (c === LESS) {
|
|
686
845
|
if (handler) {
|
|
687
846
|
const slice = chunk.slice(start, this.prevI);
|
|
@@ -1058,13 +1217,43 @@ class SaxParser {
|
|
|
1058
1217
|
start = this.i;
|
|
1059
1218
|
continue;
|
|
1060
1219
|
}
|
|
1061
|
-
//
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1220
|
+
// Non-ASCII (>= 0x80) — mostly valid, handle inline like handleTextInRoot
|
|
1221
|
+
if (code >= 0x80) {
|
|
1222
|
+
this.prevI = this.i;
|
|
1223
|
+
if (code >= 0xd800 && code <= 0xdbff) {
|
|
1224
|
+
const next = chunk.charCodeAt(this.i + 1);
|
|
1225
|
+
if (next >= 0xdc00 && next <= 0xdfff) {
|
|
1226
|
+
this.i += 2; // valid surrogate pair
|
|
1227
|
+
}
|
|
1228
|
+
else {
|
|
1229
|
+
this.i++;
|
|
1230
|
+
start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
|
|
1231
|
+
}
|
|
1232
|
+
}
|
|
1233
|
+
else if (code >= 0xdc00 && code <= 0xdfff) {
|
|
1234
|
+
this.i++;
|
|
1235
|
+
start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
|
|
1236
|
+
}
|
|
1237
|
+
else if (code === 0xfffe || code === 0xffff) {
|
|
1238
|
+
this.i++;
|
|
1239
|
+
start = this.handleInvalidCharInAttr(code, start);
|
|
1240
|
+
}
|
|
1241
|
+
else {
|
|
1242
|
+
this.i++; // valid non-ASCII BMP char
|
|
1243
|
+
}
|
|
1244
|
+
if (this.trackPosition) {
|
|
1245
|
+
this.column++;
|
|
1246
|
+
}
|
|
1247
|
+
continue;
|
|
1248
|
+
}
|
|
1249
|
+
// Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
|
|
1250
|
+
// Treated as invalid here. NOTE(review): 0x7F is a legal (discouraged) XML 1.0 Char — confirm rejecting it is intended.
|
|
1251
|
+
this.prevI = this.i;
|
|
1252
|
+
this.i++;
|
|
1253
|
+
if (this.trackPosition) {
|
|
1254
|
+
this.column++;
|
|
1066
1255
|
}
|
|
1067
|
-
|
|
1256
|
+
start = this.handleInvalidCharInAttr(code, start);
|
|
1068
1257
|
}
|
|
1069
1258
|
// End of chunk
|
|
1070
1259
|
this.text += chunk.slice(start, this.i);
|
|
@@ -42,7 +42,8 @@ function parseXmlToObject(xml, options) {
|
|
|
42
42
|
position: false,
|
|
43
43
|
fragment: options?.fragment ?? false,
|
|
44
44
|
maxDepth: options?.maxDepth,
|
|
45
|
-
maxEntityExpansions: options?.maxEntityExpansions
|
|
45
|
+
maxEntityExpansions: options?.maxEntityExpansions,
|
|
46
|
+
invalidCharHandling: options?.invalidCharHandling
|
|
46
47
|
});
|
|
47
48
|
// Stack: bottom is a synthetic root frame that collects the document root.
|
|
48
49
|
const syntheticObj = Object.create(null);
|