npm - @cj-tech-master/excelts - Versions diffs - 7.5.0 → 7.6.0 - Mend

@cj-tech-master/excelts 7.5.0 → 7.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (31) hide show

package/dist/browser/modules/excel/stream/hyperlink-reader.js +1 -1
package/dist/browser/modules/excel/stream/workbook-reader.browser.js +2 -2
package/dist/browser/modules/excel/stream/worksheet-reader.js +1 -1
package/dist/browser/modules/excel/xlsx/xform/base-xform.js +1 -1
package/dist/browser/modules/xml/dom.js +2 -1
package/dist/browser/modules/xml/index.d.ts +1 -1
package/dist/browser/modules/xml/sax.d.ts +41 -0
package/dist/browser/modules/xml/sax.js +265 -76
package/dist/browser/modules/xml/to-object.js +2 -1
package/dist/browser/modules/xml/types.d.ts +24 -0
package/dist/cjs/modules/excel/stream/hyperlink-reader.js +1 -1
package/dist/cjs/modules/excel/stream/workbook-reader.browser.js +2 -2
package/dist/cjs/modules/excel/stream/worksheet-reader.js +1 -1
package/dist/cjs/modules/excel/xlsx/xform/base-xform.js +1 -1
package/dist/cjs/modules/xml/dom.js +2 -1
package/dist/cjs/modules/xml/sax.js +265 -76
package/dist/cjs/modules/xml/to-object.js +2 -1
package/dist/esm/modules/excel/stream/hyperlink-reader.js +1 -1
package/dist/esm/modules/excel/stream/workbook-reader.browser.js +2 -2
package/dist/esm/modules/excel/stream/worksheet-reader.js +1 -1
package/dist/esm/modules/excel/xlsx/xform/base-xform.js +1 -1
package/dist/esm/modules/xml/dom.js +2 -1
package/dist/esm/modules/xml/sax.js +265 -76
package/dist/esm/modules/xml/to-object.js +2 -1
package/dist/iife/excelts.iife.js +196 -54
package/dist/iife/excelts.iife.js.map +1 -1
package/dist/iife/excelts.iife.min.js +44 -44
package/dist/types/modules/xml/index.d.ts +1 -1
package/dist/types/modules/xml/sax.d.ts +41 -0
package/dist/types/modules/xml/types.d.ts +24 -0
package/package.json +1 -1

package/dist/cjs/modules/xml/dom.js CHANGED Viewed

@@ -96,7 +96,8 @@ function parseXml(xml, options) {
         fragment: options?.fragment ?? false,
         xmlns: options?.xmlns ?? false,
         maxDepth: options?.maxDepth,
-        maxEntityExpansions: options?.maxEntityExpansions
+        maxEntityExpansions: options?.maxEntityExpansions,
+        invalidCharHandling: options?.invalidCharHandling
     });
     // Stack of elements being built. The bottom is a synthetic root
     // that collects top-level nodes.

package/dist/cjs/modules/xml/sax.js CHANGED Viewed

@@ -40,6 +40,8 @@ const GREATER = 0x3e; // >
 const QUESTION = 0x3f; // ?
 const OPEN_BRACKET = 0x5b; // [
 const CLOSE_BRACKET = 0x5d; // ]
+const REPLACEMENT_CHAR = 0xfffd; // U+FFFD REPLACEMENT CHARACTER
+const REPLACEMENT_STR = "\uFFFD"; // Pre-allocated string form of U+FFFD
 // =============================================================================
 // Pre-computed Lookup Tables
 // =============================================================================
@@ -238,6 +240,7 @@ class SaxParser {
         this.maxDepth = options?.maxDepth !== undefined ? options.maxDepth : 256;
         this.maxEntityExpansions =
             options?.maxEntityExpansions !== undefined ? options.maxEntityExpansions : 10000;
+        this.invalidCharHandling = options?.invalidCharHandling ?? "error";
         this._init();
     }
     get closed() {
@@ -355,87 +358,215 @@ class SaxParser {
         return this.write(null);
     }
     // ===========================================================================
-    // Character Reading
+    // Invalid Character Handling
     // ===========================================================================
-    getCode() {
-        const { chunk, i } = this;
-        this.prevI = i;
-        this.i = i + 1;
-        if (i >= chunk.length) {
-            return -1;
-        }
-        const code = chunk.charCodeAt(i);
-        // Ultra-fast path: printable ASCII (0x20-0x7E) — the vast majority of XML content.
-        // No validation needed; these are always valid XML 1.0 characters.
-        if (code >= 0x20 && code <= 0x7e) {
-            if (this.trackPosition) {
-                this.column++;
+    /**
+     * Handle an invalid XML character according to the configured strategy.
+     *
+     * Called from `getCode()`. The text and attribute-value fast paths manage their
+     * own accumulation and use `handleInvalidCharInText()` / `handleInvalidCharInAttr()`.
+     *
+     * - `"error"`: call `fail()` and return the original code.
+     * - `"skip"`: return `-2` as a sentinel (caller handles skip).
+     * - `"replace"`: return `REPLACEMENT_CHAR`.
+     *
+     * Note: `getCode()` loops on the `-2` sentinel instead of recursing.
+     *
+     * @param code - The invalid character code point.
+     * @param kind - Optional description (e.g. "lone surrogate") for error messages.
+     * @returns The code point to use, or `-2` in `"skip"` mode.
+     */
+    handleInvalidChar(code, kind) {
+        switch (this.invalidCharHandling) {
+            case "replace":
+                return REPLACEMENT_CHAR;
+            case "skip":
+                // Caller is responsible for the actual skip logic.
+                // We return -2 as a sentinel to tell getCode()'s loop to continue.
+                return -2;
+            default: {
+                // "error" — existing strict behavior
+                const label = kind
+                    ? `invalid XML character: ${kind} 0x${code.toString(16)}`
+                    : `invalid XML character: 0x${code.toString(16)}`;
+                this.fail(label);
+                return code;
             }
-            return code;
         }
-        // Secondary fast path: TAB (0x09) — common in attribute values
-        if (code === TAB) {
-            if (this.trackPosition) {
-                this.column++;
+    }
+    /**
+     * Handle an invalid character inside the `handleTextInRoot()` fast loop.
+     *
+     * Unlike `handleInvalidChar()` (which returns a code point for `getCode()`),
+     * this method manages the text accumulation state (`this.text`, `start`) that
+     * the fast text loop relies on.
+     *
+     * - `"error"`: call `fail()`, leave text accumulation unchanged (char stays in output).
+     * - `"skip"`: flush text up to the invalid char, skip it, return new `start`.
+     * - `"replace"`: flush text up to the invalid char, append U+FFFD, return new `start`.
+     *
+     * @returns The updated `start` index for the text accumulation loop.
+     */
+    handleInvalidCharInText(code, handler, start, kind) {
+        switch (this.invalidCharHandling) {
+            case "skip":
+                // Flush text accumulated before this invalid char, then skip it
+                if (handler && start < this.prevI) {
+                    this.text += this.chunk.slice(start, this.prevI);
+                }
+                return this.i;
+            case "replace":
+                // Flush text accumulated before this invalid char, append replacement
+                if (handler) {
+                    if (start < this.prevI) {
+                        this.text += this.chunk.slice(start, this.prevI);
+                    }
+                    this.text += REPLACEMENT_STR;
+                }
+                return this.i;
+            default: {
+                // "error" — existing strict behavior, char stays in output
+                const label = kind
+                    ? `invalid XML character: ${kind} 0x${code.toString(16)}`
+                    : `invalid XML character: 0x${code.toString(16)}`;
+                this.fail(label);
+                return start;
             }
-            return code;
         }
-        // Handle CR (normalize CR and CR+LF to LF per XML 1.0 §2.11)
-        if (code === CR) {
-            if (chunk.charCodeAt(i + 1) === NL) {
-                this.i = i + 2;
-            }
-            if (this.trackPosition) {
-                this.line++;
-                this.column = 0;
-                this.positionAtNewLine = this.position;
+    }
+    /**
+     * Handle an invalid character inside `sAttribValueQuoted()`.
+     *
+     * Same pattern as `handleInvalidCharInText()` but for attribute value
+     * accumulation (always uses `this.text`, no conditional handler check).
+     *
+     * @returns The updated `start` index.
+     */
+    handleInvalidCharInAttr(code, start, kind) {
+        switch (this.invalidCharHandling) {
+            case "skip":
+                if (start < this.prevI) {
+                    this.text += this.chunk.slice(start, this.prevI);
+                }
+                return this.i;
+            case "replace":
+                if (start < this.prevI) {
+                    this.text += this.chunk.slice(start, this.prevI);
+                }
+                this.text += REPLACEMENT_STR;
+                return this.i;
+            default: {
+                const label = kind
+                    ? `invalid XML character: ${kind} 0x${code.toString(16)}`
+                    : `invalid XML character: 0x${code.toString(16)}`;
+                this.fail(label);
+                return start;
             }
-            return NL;
         }
-        // Handle LF
-        if (code === NL) {
-            if (this.trackPosition) {
-                this.line++;
-                this.column = 0;
-                this.positionAtNewLine = this.position;
+    }
+    // ===========================================================================
+    // Character Reading
+    // ===========================================================================
+    getCode() {
+        // Loop to handle skip mode: when an invalid char returns -2, we retry
+        // with the next character instead of recursing (avoids stack overflow
+        // on long runs of consecutive invalid characters).
+        for (;;) {
+            const { chunk } = this;
+            const i = this.i;
+            this.prevI = i;
+            this.i = i + 1;
+            if (i >= chunk.length) {
+                return -1;
+            }
+            const code = chunk.charCodeAt(i);
+            // Ultra-fast path: printable ASCII (0x20-0x7E) — the vast majority of XML content.
+            // No validation needed; these are always valid XML 1.0 characters.
+            if (code >= 0x20 && code <= 0x7e) {
+                if (this.trackPosition) {
+                    this.column++;
+                }
+                return code;
             }
-            return NL;
-        }
-        // Handle surrogates
-        if (code >= 0xd800 && code <= 0xdbff) {
-            const next = chunk.charCodeAt(i + 1);
-            if (next >= 0xdc00 && next <= 0xdfff) {
-                this.i = i + 2;
+            // Secondary fast path: TAB (0x09) — common in attribute values
+            if (code === TAB) {
                 if (this.trackPosition) {
                     this.column++;
                 }
-                return 0x10000 + ((code - 0xd800) * 0x400 + (next - 0xdc00));
+                return code;
             }
-            // Lone high surrogate — invalid XML character
-            this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
-        }
-        // Lone low surrogate — invalid XML character
-        if (code >= 0xdc00 && code <= 0xdfff) {
-            this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
-        }
-        // Non-ASCII above surrogate range (0x80-0xD7FF, 0xE000-0xFFFD) — all valid XML
-        if (code >= 0x80) {
+            // Handle CR (normalize CR and CR+LF to LF per XML 1.0 §2.11)
+            if (code === CR) {
+                if (chunk.charCodeAt(i + 1) === NL) {
+                    this.i = i + 2;
+                }
+                if (this.trackPosition) {
+                    this.line++;
+                    this.column = 0;
+                    this.positionAtNewLine = this.position;
+                }
+                return NL;
+            }
+            // Handle LF
+            if (code === NL) {
+                if (this.trackPosition) {
+                    this.line++;
+                    this.column = 0;
+                    this.positionAtNewLine = this.position;
+                }
+                return NL;
+            }
+            // Handle surrogates
+            if (code >= 0xd800 && code <= 0xdbff) {
+                const next = chunk.charCodeAt(i + 1);
+                if (next >= 0xdc00 && next <= 0xdfff) {
+                    this.i = i + 2;
+                    if (this.trackPosition) {
+                        this.column++;
+                    }
+                    return 0x10000 + ((code - 0xd800) * 0x400 + (next - 0xdc00));
+                }
+                // Lone high surrogate — invalid XML character
+                const result = this.handleInvalidChar(code, "lone surrogate");
+                if (result !== -2) {
+                    return result;
+                }
+                continue; // skip: loop to next char
+            }
+            // Lone low surrogate — invalid XML character
+            if (code >= 0xdc00 && code <= 0xdfff) {
+                const result = this.handleInvalidChar(code, "lone surrogate");
+                if (result !== -2) {
+                    return result;
+                }
+                continue;
+            }
+            // Non-ASCII above surrogate range (0x80-0xD7FF, 0xE000-0xFFFD) — all valid XML
+            if (code >= 0x80) {
+                if (this.trackPosition) {
+                    this.column++;
+                }
+                // Reject 0xFFFE and 0xFFFF
+                if (code === 0xfffe || code === 0xffff) {
+                    const result = this.handleInvalidChar(code);
+                    if (result !== -2) {
+                        return result;
+                    }
+                    continue;
+                }
+                return code;
+            }
+            // Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
+            // All invalid in XML 1.0
             if (this.trackPosition) {
                 this.column++;
             }
-            // Reject 0xFFFE and 0xFFFF
-            if (code === 0xfffe || code === 0xffff) {
-                this.fail("invalid XML character: 0x" + code.toString(16));
+            const result = this.handleInvalidChar(code);
+            if (result !== -2) {
+                return result;
             }
-            return code;
-        }
-        // Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
-        // All invalid in XML 1.0
-        if (this.trackPosition) {
-            this.column++;
+            // skip: continue to next char
         }
-        this.fail("invalid XML character: 0x" + code.toString(16));
-        return code;
     }
     unget() {
         this.i = this.prevI;
@@ -642,16 +773,16 @@ class SaxParser {
                     }
                     else {
                         this.i++;
-                        this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
+                        start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
                     }
                 }
                 else if (code >= 0xdc00 && code <= 0xdfff) {
                     this.i++;
-                    this.fail("invalid XML character: lone surrogate 0x" + code.toString(16));
+                    start = this.handleInvalidCharInText(code, handler, start, "lone surrogate");
                 }
                 else if (code === 0xfffe || code === 0xffff) {
                     this.i++;
-                    this.fail("invalid XML character: 0x" + code.toString(16));
+                    start = this.handleInvalidCharInText(code, handler, start);
                 }
                 else {
                     this.i++;
@@ -667,7 +798,7 @@ class SaxParser {
             if (this.trackPosition) {
                 this.column++;
             }
-            this.fail("invalid XML character: 0x" + code.toString(16));
+            start = this.handleInvalidCharInText(code, handler, start);
         }
         // End of chunk
         if (handler && start < this.i) {
@@ -679,14 +810,42 @@ class SaxParser {
         let { i: start } = this;
         const handler = this._handlers.text;
         let nonSpace = false;
+        const isSkip = this.invalidCharHandling === "skip";
+        const isReplace = this.invalidCharHandling === "replace";
         while (true) {
+            const iBeforeGet = this.i;
             const c = this.getCode();
             if (c === -1) {
-                if (handler && start < this.i) {
-                    this.text += chunk.slice(start, this.i);
+                if (handler && start < iBeforeGet) {
+                    this.text += chunk.slice(start, iBeforeGet);
                 }
                 break;
             }
+            // In skip mode, getCode() may have internally looped past invalid chars.
+            // Flush valid text before the gap and advance start past it.
+            if (isSkip && this.prevI > iBeforeGet) {
+                if (handler && start < iBeforeGet) {
+                    this.text += chunk.slice(start, iBeforeGet);
+                }
+                start = this.prevI;
+            }
+            // In replace mode, getCode() returns REPLACEMENT_CHAR for invalid chars
+            // but the original code unit is still in the chunk.  Detect this by checking
+            // whether getCode() returned REPLACEMENT_CHAR while the raw chunk code unit
+            // at prevI is NOT U+FFFD (i.e., it was substituted by handleInvalidChar).
+            if (isReplace &&
+                c === REPLACEMENT_CHAR &&
+                chunk.charCodeAt(this.prevI) !== REPLACEMENT_CHAR) {
+                if (handler) {
+                    if (start < this.prevI) {
+                        this.text += chunk.slice(start, this.prevI);
+                    }
+                    this.text += REPLACEMENT_STR;
+                }
+                start = this.i;
+                nonSpace = true;
+                continue;
+            }
             if (c === LESS) {
                 if (handler) {
                     const slice = chunk.slice(start, this.prevI);
@@ -1063,13 +1222,43 @@ class SaxParser {
                 start = this.i;
                 continue;
             }
-            // All other chars — fall back to getCode() for validation
-            const c = this.getCode();
-            if (c === -1) {
-                this.text += chunk.slice(start, this.i);
-                return;
+            // Non-ASCII (>= 0x80) — mostly valid, handle inline like handleTextInRoot
+            if (code >= 0x80) {
+                this.prevI = this.i;
+                if (code >= 0xd800 && code <= 0xdbff) {
+                    const next = chunk.charCodeAt(this.i + 1);
+                    if (next >= 0xdc00 && next <= 0xdfff) {
+                        this.i += 2; // valid surrogate pair
+                    }
+                    else {
+                        this.i++;
+                        start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
+                    }
+                }
+                else if (code >= 0xdc00 && code <= 0xdfff) {
+                    this.i++;
+                    start = this.handleInvalidCharInAttr(code, start, "lone surrogate");
+                }
+                else if (code === 0xfffe || code === 0xffff) {
+                    this.i++;
+                    start = this.handleInvalidCharInAttr(code, start);
+                }
+                else {
+                    this.i++; // valid non-ASCII BMP char
+                }
+                if (this.trackPosition) {
+                    this.column++;
+                }
+                continue;
+            }
+            // Remaining: ASCII control characters (0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F, 0x7F)
+            // All invalid in XML 1.0
+            this.prevI = this.i;
+            this.i++;
+            if (this.trackPosition) {
+                this.column++;
             }
-            // Just continue — char is already consumed by getCode()
+            start = this.handleInvalidCharInAttr(code, start);
         }
         // End of chunk
         this.text += chunk.slice(start, this.i);

package/dist/cjs/modules/xml/to-object.js CHANGED Viewed

@@ -45,7 +45,8 @@ function parseXmlToObject(xml, options) {
         position: false,
         fragment: options?.fragment ?? false,
         maxDepth: options?.maxDepth,
-        maxEntityExpansions: options?.maxEntityExpansions
+        maxEntityExpansions: options?.maxEntityExpansions,
+        invalidCharHandling: options?.invalidCharHandling
     });
     // Stack: bottom is a synthetic root frame that collects the document root.
     const syntheticObj = Object.create(null);

package/dist/esm/modules/excel/stream/hyperlink-reader.js CHANGED Viewed

@@ -43,7 +43,7 @@ class HyperlinkReader extends EventEmitter {
             return;
         }
         try {
-            const parser = new SaxParser({ position: false });
+            const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
             const decoder = new TextDecoder("utf-8", { fatal: true });
             parser.on("opentag", (node) => {
                 if (node.name !== "Relationship") {

package/dist/esm/modules/excel/stream/workbook-reader.browser.js CHANGED Viewed

@@ -211,7 +211,7 @@ export class WorkbookReaderBase extends EventEmitter {
         // For "cache" mode, use direct SAX callbacks (no event objects, no async generator overhead)
         if (this.options.sharedStrings === "cache") {
             const sharedStrings = this.sharedStrings;
-            const parser = new SaxParser({ position: false });
+            const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
             parser.on("opentag", (node) => {
                 switch (node.name) {
                     case "b":
@@ -311,7 +311,7 @@ export class WorkbookReaderBase extends EventEmitter {
             return;
         }
         // "emit" mode — must yield, so use direct SAX with per-chunk yield
-        const emitParser = new SaxParser();
+        const emitParser = new SaxParser({ invalidCharHandling: "skip" });
         const emitDecoder = new TextDecoder("utf-8", { fatal: true });
         let pendingEmits = [];
         emitParser.on("opentag", (node) => {

package/dist/esm/modules/excel/stream/worksheet-reader.js CHANGED Viewed

@@ -143,7 +143,7 @@ class WorksheetReader extends EventEmitter {
         // Direct SAX callback mode — zero intermediate event objects.
         // We collect worksheet events per-chunk and yield them.
         let worksheetEvents = null;
-        const parser = new SaxParser({ position: false });
+        const parser = new SaxParser({ position: false, invalidCharHandling: "skip" });
         parser.on("opentag", (node) => {
             if (emitSheet) {
                 switch (node.name) {

package/dist/esm/modules/excel/xlsx/xform/base-xform.js CHANGED Viewed

@@ -159,7 +159,7 @@ class BaseXform {
      * Use this instead of parse(parseSax(stream)) for hot paths.
      */
     async parseStreamDirect(stream) {
-        const parser = new SaxParser();
+        const parser = new SaxParser({ invalidCharHandling: "skip" });
         const decoder = new TextDecoder("utf-8", { fatal: true });
         let done = false;
         let finalModel;

package/dist/esm/modules/xml/dom.js CHANGED Viewed

@@ -87,7 +87,8 @@ function parseXml(xml, options) {
         fragment: options?.fragment ?? false,
         xmlns: options?.xmlns ?? false,
         maxDepth: options?.maxDepth,
-        maxEntityExpansions: options?.maxEntityExpansions
+        maxEntityExpansions: options?.maxEntityExpansions,
+        invalidCharHandling: options?.invalidCharHandling
     });
     // Stack of elements being built. The bottom is a synthetic root
     // that collects top-level nodes.