npm - @cj-tech-master/excelts - Versions diffs - 9.5.5 → 9.5.6 - Mend

@cj-tech-master/excelts 9.5.5 → 9.5.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (60) hide show

package/dist/browser/modules/word/convert/html/html-import.js CHANGED Viewed

@@ -67,73 +67,342 @@ function tokenize(html) {
     // instructions before tokenising — none of them should appear as text
     // in the document body. The previous regex treated `<!doctype html>`
     // as a text node containing `"!doctype html>"`.
-    const stripped = html
-        .replace(/<!--[\s\S]*?-->/g, "")
-        .replace(/<!doctype[^>]*>/gi, "")
-        .replace(/<!\[CDATA\[[\s\S]*?\]\]>/g, "")
-        .replace(/<\?[\s\S]*?\?>/g, "");
-    // Match a tag, OR a run of text. Text is anything-up-to-the-next-tag,
-    // with the addition that a `<` not followed by a tag-like character is
-    // treated as literal text (so "1 < 2" / "a<b" / "<<" survive instead
-    // of being silently swallowed).
-    const re = /<\/?([a-zA-Z][a-zA-Z0-9]*)((?:\s+[^>]*?)?)\/?\s*>|((?:[^<]|<(?![/a-zA-Z]))+)/g;
-    const tagRe = /^<(\/?)([a-zA-Z][a-zA-Z0-9]*)((?:\s+[^>]*?)?)(\/?)\s*>$/;
-    let m;
-    while ((m = re.exec(stripped)) !== null) {
-        const fullMatch = m[0];
-        if (m[3] !== undefined) {
-            // Text node
-            const text = decodeHtmlEntities(m[3]);
+    //
+    // We use a single linear scan rather than chained `.replace()` calls so
+    // we are immune to two CodeQL findings:
+    //   - Incomplete multi-character sanitization: chained replaces let
+    //     payloads such as `<!--<!--x-->-->` leak through (each pass only
+    //     removes one layer, leaving `-->` behind).
+    //   - Polynomial regular expression on uncontrolled data: lazy
+    //     quantifiers like `<!--[\s\S]*?-->` exhibit catastrophic
+    //     backtracking on adversarial input.
+    const stripped = stripSgmlNoise(html);
+    // The tokenizer is implemented as a linear index scan rather than a
+    // global regex (`/<\/?…(?:\s+[^>]*?)?\/?\s*>|((?:[^<]|…)+)/g`). The
+    // previous regex form combined an optional lazy attribute span with
+    // an optional `\/?` and optional trailing whitespace, which CodeQL
+    // flagged as polynomial-redos: an adversarial payload such as
+    // `<a` followed by many spaces but no closing `>` triggered
+    // catastrophic backtracking.
+    //
+    // The scan below is strictly O(n):
+    //   - At every position we either advance one character or jump
+    //     forward to the next `<` / `>` via a single `indexOf`.
+    //   - Attribute parsing is delegated to `parseHtmlAttrs`, which is
+    //     itself a linear scanner.
+    const n = stripped.length;
+    let i = 0;
+    while (i < n) {
+        // Scan a text run: everything up to the next position that begins
+        // a tag (`<` followed by a letter, or `</` followed by a letter).
+        // Bare `<` characters and unfinished tag-like fragments are kept
+        // inside the text run so that input such as `1 < 2`, `a<b<c`,
+        // `<<<<` or `<unfinished` (with no closing `>` anywhere) is not
+        // shattered into a stream of single-character runs.
+        if (stripped.charCodeAt(i) !== 0x3c /* '<' */ || !isTagStart(stripped, i)) {
+            const textEnd = scanTextEnd(stripped, i);
+            const raw = stripped.slice(i, textEnd);
+            const text = decodeHtmlEntities(raw);
             if (text) {
                 tokens.push({ type: "text", value: text });
             }
+            i = textEnd;
+            if (i >= n) {
+                break;
+            }
+            // Fall through: position `i` is now at a real tag start.
+        }
+        // We are at '<' that introduces a tag (guaranteed by the
+        // `isTagStart` check above).
+        const next = stripped.charCodeAt(i + 1);
+        const isClose = next === 0x2f; /* '/' */
+        const nameStart = isClose ? i + 2 : i + 1;
+        // Defensive: the loop guard above should already ensure this, but
+        // keep the check so a future refactor cannot silently turn a bare
+        // `<` into an attempted tag parse.
+        if (!isAsciiAlpha(stripped.charCodeAt(nameStart))) {
+            tokens.push({ type: "text", value: "<" });
+            i++;
+            continue;
         }
-        else {
-            const tagMatch = tagRe.exec(fullMatch);
-            if (tagMatch) {
-                const isClose = tagMatch[1] === "/";
-                const tag = tagMatch[2].toLowerCase();
-                const attrStr = tagMatch[3];
-                const selfClose = tagMatch[4] === "/" || VOID_ELEMENTS.has(tag);
-                const attrs = parseHtmlAttrs(attrStr);
-                if (isClose) {
-                    tokens.push({ type: "close", tag, attrs: {} });
-                }
-                else if (selfClose) {
-                    tokens.push({ type: "selfclose", tag, attrs });
-                }
-                else {
-                    tokens.push({ type: "open", tag, attrs });
-                    // Raw-text elements: their body must not be parsed as markup. Skip
-                    // forward to the matching close tag and either capture the body as
-                    // a single text token (for <style>, which is post-processed by
-                    // extractStyleRules) or discard it entirely (for <script>, etc.).
-                    // Without this, embedded scripts would leak into the document body.
-                    if (RAW_TEXT_ELEMENTS.has(tag)) {
-                        const closeRe = new RegExp(`</${tag}\\s*>`, "i");
-                        closeRe.lastIndex = re.lastIndex;
-                        const startBody = re.lastIndex;
-                        const closeMatch = closeRe.exec(stripped);
-                        if (closeMatch) {
-                            const body = stripped.slice(startBody, closeMatch.index);
-                            if (RAW_TEXT_PRESERVE_BODY.has(tag)) {
-                                tokens.push({ type: "text", value: body });
-                            }
-                            tokens.push({ type: "close", tag, attrs: {} });
-                            re.lastIndex = closeMatch.index + closeMatch[0].length;
-                        }
-                        else {
-                            // No closing tag — discard the rest of the input for this
-                            // raw-text element to avoid emitting markup as text.
-                            re.lastIndex = stripped.length;
-                        }
-                    }
+        // Read the tag name: [A-Za-z][A-Za-z0-9]*.
+        let p = nameStart + 1;
+        while (p < n) {
+            const c = stripped.charCodeAt(p);
+            if (!isAsciiAlpha(c) && !isAsciiDigit(c)) {
+                break;
+            }
+            p++;
+        }
+        const tagName = stripped.slice(nameStart, p).toLowerCase();
+        // Find the closing '>' of the tag. We have to be careful not to
+        // mistake a '>' inside a quoted attribute value for the tag end.
+        const tagEnd = findTagEnd(stripped, p);
+        if (tagEnd < 0) {
+            // No closing '>' — the rest of the input is malformed; treat the
+            // remainder as text. (Original regex would simply not match and
+            // leave the same characters as text via the alternation.)
+            const text = decodeHtmlEntities(stripped.slice(i));
+            if (text) {
+                tokens.push({ type: "text", value: text });
+            }
+            // `break` exits the loop directly; no need to assign `i = n`
+            // first (CodeQL js/useless-assignment-to-local).
+            break;
+        }
+        // Inside [p, tagEnd) lie attributes (and possibly a trailing '/').
+        let inner = stripped.slice(p, tagEnd);
+        // Detect self-close: trailing '/'. Strip it so it is not parsed as
+        // an attribute name.
+        let selfClose = false;
+        // Trim trailing whitespace, then a single '/'.
+        let innerEnd = inner.length;
+        while (innerEnd > 0 && isHtmlSpace(inner.charCodeAt(innerEnd - 1))) {
+            innerEnd--;
+        }
+        if (innerEnd > 0 && inner.charCodeAt(innerEnd - 1) === 0x2f) {
+            selfClose = true;
+            innerEnd--;
+        }
+        inner = inner.slice(0, innerEnd);
+        if (isClose) {
+            tokens.push({ type: "close", tag: tagName, attrs: {} });
+            i = tagEnd + 1;
+            continue;
+        }
+        const attrs = parseHtmlAttrs(inner);
+        const isVoidElement = VOID_ELEMENTS.has(tagName);
+        if (selfClose || isVoidElement) {
+            tokens.push({ type: "selfclose", tag: tagName, attrs });
+            i = tagEnd + 1;
+            continue;
+        }
+        tokens.push({ type: "open", tag: tagName, attrs });
+        i = tagEnd + 1;
+        // Raw-text elements: their body must not be parsed as markup.
+        if (RAW_TEXT_ELEMENTS.has(tagName)) {
+            const closeIdx = findRawTextClose(stripped, i, tagName);
+            if (closeIdx === null) {
+                // No closing tag — discard the rest of the input for this
+                // raw-text element to avoid emitting markup as text.
+                i = n;
+            }
+            else {
+                const body = stripped.slice(i, closeIdx.bodyEnd);
+                if (RAW_TEXT_PRESERVE_BODY.has(tagName)) {
+                    tokens.push({ type: "text", value: body });
                 }
+                tokens.push({ type: "close", tag: tagName, attrs: {} });
+                i = closeIdx.next;
             }
         }
     }
     return tokens;
 }
+function isAsciiAlpha(c) {
+    return (c >= 0x41 && c <= 0x5a) || (c >= 0x61 && c <= 0x7a);
+}
+function isAsciiDigit(c) {
+    return c >= 0x30 && c <= 0x39;
+}
+function isHtmlSpace(c) {
+    return c === 0x20 || c === 0x09 || c === 0x0a || c === 0x0d || c === 0x0c;
+}
+/**
+ * Scan forward from `from` to the position of the next '<' that
+ * introduces a tag (i.e. is followed by `[a-zA-Z]` or `/[a-zA-Z]`).
+ * A bare '<' (e.g. in `1 < 2`) is included in the text run.
+ */
+function scanTextEnd(s, from) {
+    const n = s.length;
+    let i = from;
+    while (i < n) {
+        const lt = s.indexOf("<", i);
+        if (lt < 0) {
+            return n;
+        }
+        if (isTagStart(s, lt)) {
+            return lt;
+        }
+        // Bare '<' or `</` not followed by a letter — keep scanning.
+        i = lt + 1;
+    }
+    return n;
+}
+/**
+ * Return true if position `pos` in `s` is `<` followed by a letter
+ * (open tag) or `</` followed by a letter (close tag). Used to
+ * distinguish "real" tag starts from literal `<` characters.
+ */
+function isTagStart(s, pos) {
+    if (s.charCodeAt(pos) !== 0x3c /* '<' */) {
+        return false;
+    }
+    const next = s.charCodeAt(pos + 1);
+    if (isAsciiAlpha(next)) {
+        return true;
+    }
+    if (next === 0x2f /* '/' */ && isAsciiAlpha(s.charCodeAt(pos + 2))) {
+        return true;
+    }
+    return false;
+}
+/**
+ * Find the index of the '>' that closes the tag opened just before
+ * `from`. Honours quoted attribute values so that `<a href="x>y">`
+ * does not stop at the '>' inside quotes.
+ *
+ * Returns -1 if no closing '>' is found before EOF.
+ */
+function findTagEnd(s, from) {
+    const n = s.length;
+    let i = from;
+    while (i < n) {
+        const c = s.charCodeAt(i);
+        if (c === 0x22 /* '"' */ || c === 0x27 /* "'" */) {
+            const close = s.indexOf(c === 0x22 ? '"' : "'", i + 1);
+            if (close < 0) {
+                return -1;
+            }
+            i = close + 1;
+            continue;
+        }
+        if (c === 0x3e /* '>' */) {
+            return i;
+        }
+        i++;
+    }
+    return -1;
+}
+/**
+ * Find the closing tag for a raw-text element (e.g. `</script>`),
+ * starting at `from`. Returns `{ bodyEnd, next }` — `bodyEnd` is the
+ * start of the close-tag literal, `next` the position just past its
+ * '>' — or `null` when no matching close tag exists.
+ *
+ * Implemented with a linear scan (no dynamic `RegExp`) so that
+ * adversarial bodies cannot trigger super-linear runtime.
+ */
+function findRawTextClose(s, from, tagName) {
+    const n = s.length;
+    let i = from;
+    while (i < n) {
+        const lt = s.indexOf("</", i);
+        if (lt < 0) {
+            return null;
+        }
+        const after = lt + 2;
+        // Compare tag name case-insensitively.
+        let ok = true;
+        for (let k = 0; k < tagName.length; k++) {
+            const a = s.charCodeAt(after + k);
+            const aLower = a >= 0x41 && a <= 0x5a ? a + 0x20 : a;
+            if (aLower !== tagName.charCodeAt(k)) {
+                ok = false;
+                break;
+            }
+        }
+        if (!ok) {
+            i = after;
+            continue;
+        }
+        // Skip any trailing whitespace before '>'.
+        let p = after + tagName.length;
+        while (p < n && isHtmlSpace(s.charCodeAt(p))) {
+            p++;
+        }
+        if (p < n && s.charCodeAt(p) === 0x3e /* '>' */) {
+            return { bodyEnd: lt, next: p + 1 };
+        }
+        i = after;
+    }
+    return null;
+}
+/**
+ * Strip HTML comments, doctype declarations, CDATA sections and SGML
+ * processing instructions in a single linear scan.
+ *
+ * A linear scan (vs. chained `String.prototype.replace` with regular
+ * expressions) is required for two reasons:
+ *
+ * 1. **Incomplete multi-character sanitization** — chained replaces are
+ *    each one pass; an attacker can nest the syntax (e.g.
+ *    `<!--<!--x-->-->`) so the outer marker survives after the inner
+ *    one is removed.
+ * 2. **Catastrophic backtracking** — lazy quantifiers such as
+ *    `<!--[\s\S]*?-->` are polynomial-time on adversarial input
+ *    (very long unterminated comments).
+ *
+ * Cost is one `indexOf` per opener, so many unterminated openers rescan the
+ * tail each time; nesting is not unwrapped (`<!--<!--x-->-->` leaves `-->`).
+ */
+function stripSgmlNoise(input) {
+    let out = "";
+    let i = 0;
+    const n = input.length;
+    while (i < n) {
+        if (input.charCodeAt(i) !== 0x3c /* '<' */) {
+            out += input[i];
+            i++;
+            continue;
+        }
+        // Comment: <!-- ... -->
+        // If the closing `-->` is missing the input is malformed. The
+        // previous regex (`/<!--[\s\S]*?-->/g`) simply did not match in that
+        // case and left the text in place; we preserve that behaviour rather
+        // than swallowing the rest of the document, which would silently
+        // change the parse for legitimate inputs that happen to contain a
+        // stray `<!--`.
+        if (input.startsWith("<!--", i)) {
+            const end = input.indexOf("-->", i + 4);
+            if (end < 0) {
+                out += "<";
+                i++;
+                continue;
+            }
+            i = end + 3;
+            continue;
+        }
+        // CDATA: <![CDATA[ ... ]]>
+        if (input.startsWith("<![CDATA[", i)) {
+            const end = input.indexOf("]]>", i + 9);
+            if (end < 0) {
+                out += "<";
+                i++;
+                continue;
+            }
+            i = end + 3;
+            continue;
+        }
+        // Doctype: <!doctype ...> (case-insensitive)
+        if (input.charCodeAt(i + 1) === 0x21 /* '!' */ &&
+            input.slice(i + 2, i + 9).toLowerCase() === "doctype") {
+            const end = input.indexOf(">", i + 9);
+            if (end < 0) {
+                out += "<";
+                i++;
+                continue;
+            }
+            i = end + 1;
+            continue;
+        }
+        // Processing instruction: <? ... ?>
+        if (input.charCodeAt(i + 1) === 0x3f /* '?' */) {
+            const end = input.indexOf("?>", i + 2);
+            if (end < 0) {
+                out += "<";
+                i++;
+                continue;
+            }
+            i = end + 2;
+            continue;
+        }
+        // Not an SGML noise construct — emit the '<' literally and continue.
+        out += "<";
+        i++;
+    }
+    return out;
+}
 /**
  * HTML elements whose body is not parsed as markup. Their content is either
  * preserved (style) for downstream processing or discarded entirely.
@@ -204,26 +473,110 @@ function extractStyleRules(tokens) {
     }
     return result;
 }
+/**
+ * Parse HTML-style attributes from the inside of a start tag, e.g.
+ * `class="x" id='y' disabled href=foo`.
+ *
+ * Implemented as a linear scan rather than the previous global regex
+ * `/([a-zA-Z_][\w-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g` so
+ * adversarial start-tag content cannot trigger polynomial-redos
+ * (CodeQL js/polynomial-redos). Behaviour matches the regex form on
+ * well-formed inputs:
+ *   - Attribute names lower-cased.
+ *   - Double-quoted, single-quoted and unquoted values supported.
+ *   - Boolean attributes (no `=`) yield an empty string value.
+ */
 function parseHtmlAttrs(str) {
     const attrs = {};
-    const re = /([a-zA-Z_][\w-]*)(?:\s*=\s*(?:"([^"]*)"|'([^']*)'|(\S+)))?/g;
-    let m;
-    while ((m = re.exec(str)) !== null) {
-        attrs[m[1].toLowerCase()] = m[2] ?? m[3] ?? m[4] ?? "";
+    const n = str.length;
+    let i = 0;
+    while (i < n) {
+        // Skip whitespace.
+        while (i < n && isHtmlSpace(str.charCodeAt(i))) {
+            i++;
+        }
+        if (i >= n) {
+            break;
+        }
+        // Read attribute name: [A-Za-z_][\w-]*.
+        const nameStart = i;
+        const first = str.charCodeAt(i);
+        if (!isAsciiAlpha(first) && first !== 0x5f /* '_' */) {
+            // Not a valid attribute-name start — skip one char and resync.
+            i++;
+            continue;
+        }
+        i++;
+        while (i < n) {
+            const c = str.charCodeAt(i);
+            if (isAsciiAlpha(c) || isAsciiDigit(c) || c === 0x5f /* '_' */ || c === 0x2d /* '-' */) {
+                i++;
+                continue;
+            }
+            break;
+        }
+        const name = str.slice(nameStart, i).toLowerCase();
+        // Optional `\s*=\s*` then a value.
+        let j = i;
+        while (j < n && isHtmlSpace(str.charCodeAt(j))) {
+            j++;
+        }
+        if (j >= n || str.charCodeAt(j) !== 0x3d /* '=' */) {
+            // Boolean attribute.
+            attrs[name] = "";
+            continue;
+        }
+        j++; // past '='
+        while (j < n && isHtmlSpace(str.charCodeAt(j))) {
+            j++;
+        }
+        if (j >= n) {
+            attrs[name] = "";
+            i = j;
+            continue;
+        }
+        const q = str.charCodeAt(j);
+        if (q === 0x22 /* '"' */ || q === 0x27 /* "'" */) {
+            const close = str.indexOf(q === 0x22 ? '"' : "'", j + 1);
+            if (close < 0) {
+                // Unterminated quoted value — take whatever is left and stop.
+                attrs[name] = str.slice(j + 1);
+                break;
+            }
+            attrs[name] = str.slice(j + 1, close);
+            i = close + 1;
+            continue;
+        }
+        // Unquoted value: run of non-whitespace.
+        const valStart = j;
+        while (j < n && !isHtmlSpace(str.charCodeAt(j))) {
+            j++;
+        }
+        attrs[name] = str.slice(valStart, j);
+        i = j;
     }
     return attrs;
 }
 function decodeHtmlEntities(text) {
-    return text
-        .replace(/&amp;/g, "&")
-        .replace(/&lt;/g, "<")
-        .replace(/&gt;/g, ">")
-        .replace(/&quot;/g, '"')
-        .replace(/&#39;/g, "'")
-        .replace(/&nbsp;/g, "\u00A0")
-        .replace(/&#(\d+);/g, (_, n) => safeFromCodePoint(parseInt(n, 10)))
-        .replace(/&#x([a-fA-F0-9]+);/g, (_, n) => safeFromCodePoint(parseInt(n, 16)))
-        .replace(/&([a-zA-Z]+);/g, (match, name) => HTML_ENTITIES[name] ?? match);
+    // Decode every entity in a single pass. Chaining `.replace()` calls
+    // (first `&amp;` → `&`, then `&lt;` → `<`, …) re-runs the later
+    // replacements over the output of the earlier ones, so input like
+    // `&amp;lt;` would round-trip to `<` instead of the intended `&lt;`.
+    // CodeQL flags this as "Double escaping or unescaping". A single
+    // alternation guarantees each source position is decoded at most once.
+    return text.replace(/&(?:#(\d+)|#[xX]([a-fA-F0-9]+)|([a-zA-Z][a-zA-Z0-9]*));/g, (match, dec, hex, name) => {
+        if (dec !== undefined) {
+            return safeFromCodePoint(parseInt(dec, 10));
+        }
+        if (hex !== undefined) {
+            return safeFromCodePoint(parseInt(hex, 16));
+        }
+        if (name !== undefined) {
+            const replacement = HTML_ENTITIES[name];
+            return replacement ?? match;
+        }
+        return match;
+    });
 }
 /**
  * Convert a numeric character reference to a string. Uses fromCodePoint so
@@ -243,6 +596,16 @@ function safeFromCodePoint(cp) {
 }
 /** Common HTML named entities mapped to their Unicode characters. */
 const HTML_ENTITIES = {
+    // Core XML/HTML entities — these used to be handled as standalone
+    // chained `.replace()` calls in `decodeHtmlEntities`. They must live
+    // in this table so the single-pass decoder can resolve them without
+    // re-running over already-decoded output (CodeQL "double unescaping").
+    amp: "&",
+    lt: "<",
+    gt: ">",
+    quot: '"',
+    apos: "'",
+    nbsp: "\u00A0",
     // Punctuation & Typography
     mdash: "\u2014",
     ndash: "\u2013",

package/dist/browser/modules/word/convert/markdown/markdown-renderer.js CHANGED Viewed

@@ -188,8 +188,13 @@ function renderTable(state, table) {
                     cellParts.push(renderInlineChildren(state, block.children).trim());
                 }
             }
-            // Escape pipe characters to prevent table structure corruption
-            rowTexts.push(cellParts.join(" ").replace(/\|/g, "\\|"));
+            // Escape pipe characters to prevent table structure corruption.
+            // Backslashes must be escaped *first*: if `|` were escaped first,
+            // the backslash pass would also double the `\` just inserted,
+            // turning `\|` into `\\|` (a literal backslash plus a bare pipe)
+            // and breaking GFM tables. CodeQL flags the pipe-only form as
+            // "Incomplete string escaping or encoding".
+            rowTexts.push(cellParts.join(" ").replace(/\\/g, "\\\\").replace(/\|/g, "\\|"));
         }
         grid.push(rowTexts);
     }
@@ -470,7 +475,10 @@ function isMonospaceFont(font) {
     if (typeof font === "string") {
         return isMonospaceFontName(font);
     }
-    if (typeof font === "object" && font !== null) {
+    // `!font` above already discarded `null`; `font !== null` here was
+    // therefore always true and CodeQL flagged it as a comparison
+    // between inconvertible types.
+    if (typeof font === "object") {
         const f = font;
         return (isMonospaceFontName(f.ascii) ||
             isMonospaceFontName(f.hAnsi));

package/dist/browser/modules/word/layout/layout-full.js CHANGED Viewed

@@ -1254,7 +1254,10 @@ function resolveColorHex(color) {
     if (typeof color === "string") {
         return color;
     }
-    if (typeof color === "object" && color !== null && "value" in color) {
+    // The `!color` check above already discarded `null`; an additional
+    // `color !== null` test was always true and CodeQL flagged it as a
+    // comparison between inconvertible types.
+    if (typeof color === "object" && "value" in color) {
         return color.value;
     }
     return undefined;