npm - entities - Versions diffs - 6.0.1 → 7.0.1 - Mend

entities 6.0.1 → 7.0.1

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (99)

package/decode.d.ts +2 -0
package/dist/commonjs/decode-codepoint.d.ts.map +1 -1
package/dist/commonjs/decode-codepoint.js +2 -2
package/dist/commonjs/decode-codepoint.js.map +1 -1
package/dist/commonjs/decode.d.ts +3 -7
package/dist/commonjs/decode.d.ts.map +1 -1
package/dist/commonjs/decode.js +105 -48
package/dist/commonjs/decode.js.map +1 -1
package/dist/commonjs/encode.d.ts.map +1 -1
package/dist/commonjs/encode.js +49 -30
package/dist/commonjs/encode.js.map +1 -1
package/dist/commonjs/escape.d.ts +7 -4
package/dist/commonjs/escape.d.ts.map +1 -1
package/dist/commonjs/escape.js +36 -19
package/dist/commonjs/escape.js.map +1 -1
package/dist/commonjs/generated/decode-data-html.d.ts.map +1 -1
package/dist/commonjs/generated/decode-data-html.js +2 -5
package/dist/commonjs/generated/decode-data-html.js.map +1 -1
package/dist/commonjs/generated/decode-data-xml.d.ts.map +1 -1
package/dist/commonjs/generated/decode-data-xml.js +2 -5
package/dist/commonjs/generated/decode-data-xml.js.map +1 -1
package/dist/commonjs/generated/encode-html.d.ts +1 -6
package/dist/commonjs/generated/encode-html.d.ts.map +1 -1
package/dist/commonjs/generated/encode-html.js +9 -8
package/dist/commonjs/generated/encode-html.js.map +1 -1
package/dist/commonjs/index.d.ts +3 -3
package/dist/commonjs/index.d.ts.map +1 -1
package/dist/commonjs/index.js +19 -19
package/dist/commonjs/index.js.map +1 -1
package/dist/commonjs/internal/bin-trie-flags.d.ts +17 -0
package/dist/commonjs/internal/bin-trie-flags.d.ts.map +1 -0
package/dist/commonjs/internal/bin-trie-flags.js +21 -0
package/dist/commonjs/internal/bin-trie-flags.js.map +1 -0
package/dist/commonjs/internal/decode-shared.d.ts +2 -0
package/dist/commonjs/internal/decode-shared.d.ts.map +1 -0
package/dist/commonjs/internal/decode-shared.js +31 -0
package/dist/commonjs/internal/decode-shared.js.map +1 -0
package/dist/commonjs/internal/encode-shared.d.ts +32 -0
package/dist/commonjs/internal/encode-shared.d.ts.map +1 -0
package/dist/commonjs/internal/encode-shared.js +94 -0
package/dist/commonjs/internal/encode-shared.js.map +1 -0
package/dist/esm/decode-codepoint.d.ts.map +1 -1
package/dist/esm/decode-codepoint.js +2 -2
package/dist/esm/decode-codepoint.js.map +1 -1
package/dist/esm/decode.d.ts +3 -7
package/dist/esm/decode.d.ts.map +1 -1
package/dist/esm/decode.js +96 -39
package/dist/esm/decode.js.map +1 -1
package/dist/esm/encode.d.ts.map +1 -1
package/dist/esm/encode.js +49 -30
package/dist/esm/encode.js.map +1 -1
package/dist/esm/escape.d.ts +7 -4
package/dist/esm/escape.d.ts.map +1 -1
package/dist/esm/escape.js +35 -18
package/dist/esm/escape.js.map +1 -1
package/dist/esm/generated/decode-data-html.d.ts.map +1 -1
package/dist/esm/generated/decode-data-html.js +2 -5
package/dist/esm/generated/decode-data-html.js.map +1 -1
package/dist/esm/generated/decode-data-xml.d.ts.map +1 -1
package/dist/esm/generated/decode-data-xml.js +2 -5
package/dist/esm/generated/decode-data-xml.js.map +1 -1
package/dist/esm/generated/encode-html.d.ts +1 -6
package/dist/esm/generated/encode-html.d.ts.map +1 -1
package/dist/esm/generated/encode-html.js +9 -8
package/dist/esm/generated/encode-html.js.map +1 -1
package/dist/esm/index.d.ts +3 -3
package/dist/esm/index.d.ts.map +1 -1
package/dist/esm/index.js +9 -9
package/dist/esm/index.js.map +1 -1
package/dist/esm/internal/bin-trie-flags.d.ts +17 -0
package/dist/esm/internal/bin-trie-flags.d.ts.map +1 -0
package/dist/esm/internal/bin-trie-flags.js +18 -0
package/dist/esm/internal/bin-trie-flags.js.map +1 -0
package/dist/esm/internal/decode-shared.d.ts +2 -0
package/dist/esm/internal/decode-shared.d.ts.map +1 -0
package/dist/esm/internal/decode-shared.js +28 -0
package/dist/esm/internal/decode-shared.js.map +1 -0
package/dist/esm/internal/encode-shared.d.ts +32 -0
package/dist/esm/internal/encode-shared.d.ts.map +1 -0
package/dist/esm/internal/encode-shared.js +91 -0
package/dist/esm/internal/encode-shared.js.map +1 -0
package/escape.d.ts +2 -0
package/package.json +26 -24
package/readme.md +32 -11
package/src/decode-codepoint.ts +2 -2
package/src/decode.ts +120 -55
package/src/encode.ts +47 -31
package/src/escape.ts +39 -26
package/src/generated/decode-data-html.ts +3 -5
package/src/generated/decode-data-xml.ts +3 -5
package/src/generated/encode-html.ts +14 -14
package/src/index.ts +23 -24
package/src/internal/bin-trie-flags.ts +16 -0
package/src/internal/decode-shared.ts +30 -0
package/src/internal/encode-shared.ts +121 -0
package/src/decode.spec.ts +0 -320
package/src/encode.spec.ts +0 -78
package/src/escape.spec.ts +0 -14
package/src/index.spec.ts +0 -125

package/src/decode.ts (changed)

@@ -1,6 +1,7 @@
+import { fromCodePoint, replaceCodePoint } from "./decode-codepoint.js";
 import { htmlDecodeTree } from "./generated/decode-data-html.js";
 import { xmlDecodeTree } from "./generated/decode-data-xml.js";
-import { replaceCodePoint, fromCodePoint } from "./decode-codepoint.js";
+import { BinTrieFlags } from "./internal/bin-trie-flags.js";
 const enum CharCodes {
     NUM = 35, // "#"
@@ -20,12 +21,6 @@ const enum CharCodes {
 /** Bit that needs to be set to convert an upper case ASCII character to lower case */
 const TO_LOWER_BIT = 0b10_0000;
-export enum BinTrieFlags {
-    VALUE_LENGTH = 0b1100_0000_0000_0000,
-    BRANCH_LENGTH = 0b0011_1111_1000_0000,
-    JUMP_TABLE = 0b0000_0000_0111_1111,
-}
 function isNumber(code: number): boolean {
     return code >= CharCodes.ZERO && code <= CharCodes.NINE;
 }
@@ -89,6 +84,7 @@ export interface EntityErrorProducer {
 export class EntityDecoder {
     constructor(
         /** The tree used to decode entities. */
+        // biome-ignore lint/correctness/noUnusedPrivateClassMembers: False positive
         private readonly decodeTree: Uint16Array,
         /**
          * The function that is called when a codepoint is decoded.
@@ -122,6 +118,8 @@ export class EntityDecoder {
     private excess = 1;
     /** The mode in which the decoder is operating. */
     private decodeMode = DecodingMode.Strict;
+    /** The number of characters that have been consumed in the current run. */
+    private runConsumed = 0;
     /** Resets the instance to make it reusable. */
     startEntity(decodeMode: DecodingMode): void {
@@ -131,6 +129,7 @@ export class EntityDecoder {
         this.treeIndex = 0;
         this.excess = 1;
         this.consumed = 1;
+        this.runConsumed = 0;
     }
     /**
@@ -198,21 +197,6 @@ export class EntityDecoder {
         return this.stateNumericDecimal(input, offset);
     }
-    private addToNumericResult(
-        input: string,
-        start: number,
-        end: number,
-        base: number,
-    ): void {
-        if (start !== end) {
-            const digitCount = end - start;
-            this.result =
-                this.result * Math.pow(base, digitCount) +
-                Number.parseInt(input.substr(start, digitCount), base);
-            this.consumed += digitCount;
-        }
-    }
     /**
      * Parses a hexadecimal numeric entity.
      *
@@ -223,21 +207,22 @@ export class EntityDecoder {
      * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
      */
     private stateNumericHex(input: string, offset: number): number {
-        const startIndex = offset;
         while (offset < input.length) {
             const char = input.charCodeAt(offset);
             if (isNumber(char) || isHexadecimalCharacter(char)) {
-                offset += 1;
+                // Convert hex digit to value (0-15); 'a'/'A' -> 10.
+                const digit =
+                    char <= CharCodes.NINE
+                        ? char - CharCodes.ZERO
+                        : (char | TO_LOWER_BIT) - CharCodes.LOWER_A + 10;
+                this.result = this.result * 16 + digit;
+                this.consumed++;
+                offset++;
             } else {
-                this.addToNumericResult(input, startIndex, offset, 16);
                 return this.emitNumericEntity(char, 3);
             }
         }
-        this.addToNumericResult(input, startIndex, offset, 16);
-        return -1;
+        return -1; // Incomplete entity
     }
     /**
@@ -250,21 +235,17 @@ export class EntityDecoder {
      * @returns The number of characters that were consumed, or -1 if the entity is incomplete.
      */
     private stateNumericDecimal(input: string, offset: number): number {
-        const startIndex = offset;
         while (offset < input.length) {
             const char = input.charCodeAt(offset);
             if (isNumber(char)) {
-                offset += 1;
+                this.result = this.result * 10 + (char - CharCodes.ZERO);
+                this.consumed++;
+                offset++;
             } else {
-                this.addToNumericResult(input, startIndex, offset, 10);
                 return this.emitNumericEntity(char, 2);
             }
         }
-        this.addToNumericResult(input, startIndex, offset, 10);
-        return -1;
+        return -1; // Incomplete entity
     }
     /**
@@ -321,12 +302,84 @@ export class EntityDecoder {
     private stateNamedEntity(input: string, offset: number): number {
         const { decodeTree } = this;
         let current = decodeTree[this.treeIndex];
-        // The mask is the number of bytes of the value, including the current byte.
+        // The length is the number of bytes of the value, including the current byte.
         let valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
-        for (; offset < input.length; offset++, this.excess++) {
+        while (offset < input.length) {
+            // Handle compact runs (possibly inline): valueLength == 0 and SEMI_REQUIRED bit set.
+            if (valueLength === 0 && (current & BinTrieFlags.FLAG13) !== 0) {
+                const runLength =
+                    (current & BinTrieFlags.BRANCH_LENGTH) >> 7; /* 2..63 */
+                // If we are starting a run, check the first char.
+                if (this.runConsumed === 0) {
+                    const firstChar = current & BinTrieFlags.JUMP_TABLE;
+                    if (input.charCodeAt(offset) !== firstChar) {
+                        return this.result === 0
+                            ? 0
+                            : this.emitNotTerminatedNamedEntity();
+                    }
+                    offset++;
+                    this.excess++;
+                    this.runConsumed++;
+                }
+                // Check remaining characters in the run.
+                while (this.runConsumed < runLength) {
+                    if (offset >= input.length) {
+                        return -1;
+                    }
+                    const charIndexInPacked = this.runConsumed - 1;
+                    const packedWord =
+                        decodeTree[
+                            this.treeIndex + 1 + (charIndexInPacked >> 1)
+                        ];
+                    const expectedChar =
+                        charIndexInPacked % 2 === 0
+                            ? packedWord & 0xff
+                            : (packedWord >> 8) & 0xff;
+                    if (input.charCodeAt(offset) !== expectedChar) {
+                        this.runConsumed = 0;
+                        return this.result === 0
+                            ? 0
+                            : this.emitNotTerminatedNamedEntity();
+                    }
+                    offset++;
+                    this.excess++;
+                    this.runConsumed++;
+                }
+                this.runConsumed = 0;
+                this.treeIndex += 1 + (runLength >> 1);
+                current = decodeTree[this.treeIndex];
+                valueLength = (current & BinTrieFlags.VALUE_LENGTH) >> 14;
+            }
+            if (offset >= input.length) break;
             const char = input.charCodeAt(offset);
+            /*
+             * Implicit semicolon handling for nodes that require a semicolon but
+             * don't have an explicit ';' branch stored in the trie. If we have
+             * a value on the current node, it requires a semicolon, and the
+             * current input character is a semicolon, emit the entity using the
+             * current node (without descending further).
+             */
+            if (
+                char === CharCodes.SEMI &&
+                valueLength !== 0 &&
+                (current & BinTrieFlags.FLAG13) !== 0
+            ) {
+                return this.emitNamedEntityData(
+                    this.treeIndex,
+                    valueLength,
+                    this.consumed + this.excess,
+                );
+            }
             this.treeIndex = determineBranch(
                 decodeTree,
                 current,
@@ -361,12 +414,18 @@ export class EntityDecoder {
                 }
                 // If we encounter a non-terminated (legacy) entity while parsing strictly, then ignore it.
-                if (this.decodeMode !== DecodingMode.Strict) {
+                if (
+                    this.decodeMode !== DecodingMode.Strict &&
+                    (current & BinTrieFlags.FLAG13) === 0
+                ) {
                     this.result = this.treeIndex;
                     this.consumed += this.excess;
                     this.excess = 0;
                 }
             }
+            // Increment offset & excess for next iteration
+            offset++;
+            this.excess++;
         }
         return -1;
@@ -407,7 +466,8 @@ export class EntityDecoder {
         this.emitCodePoint(
             valueLength === 1
-                ? decodeTree[result] & ~BinTrieFlags.VALUE_LENGTH
+                ? decodeTree[result] &
+                      ~(BinTrieFlags.VALUE_LENGTH | BinTrieFlags.FLAG13)
                 : decodeTree[result + 1],
             consumed,
         );
@@ -540,22 +600,28 @@ export function determineBranch(
             : decodeTree[nodeIndex + value] - 1;
     }
-    // Case 3: Multiple branches encoded in dictionary
+    // Case 3: Multiple branches encoded in packed dictionary (two keys per uint16)
+    const packedKeySlots = (branchCount + 1) >> 1;
-    // Binary search for the character.
-    let lo = nodeIndex;
-    let hi = lo + branchCount - 1;
+    /*
+     * Treat packed keys as a virtual sorted array of length `branchCount`.
+     * Key(i) = low byte for even i, high byte for odd i in slot i>>1.
+     */
+    let lo = 0;
+    let hi = branchCount - 1;
     while (lo <= hi) {
         const mid = (lo + hi) >>> 1;
-        const midValue = decodeTree[mid];
+        const slot = mid >> 1;
+        const packed = decodeTree[nodeIndex + slot];
+        const midKey = (packed >> ((mid & 1) * 8)) & 0xff;
-        if (midValue < char) {
+        if (midKey < char) {
             lo = mid + 1;
-        } else if (midValue > char) {
+        } else if (midKey > char) {
             hi = mid - 1;
         } else {
-            return decodeTree[mid + branchCount];
+            return decodeTree[nodeIndex + packedKeySlots + mid];
         }
     }
@@ -609,12 +675,11 @@ export function decodeXML(xmlString: string): string {
     return xmlDecoder(xmlString, DecodingMode.Strict);
 }
-// Re-export for use by eg. htmlparser2
-export { htmlDecodeTree } from "./generated/decode-data-html.js";
-export { xmlDecodeTree } from "./generated/decode-data-xml.js";
 export {
     decodeCodePoint,
-    replaceCodePoint,
     fromCodePoint,
+    replaceCodePoint,
 } from "./decode-codepoint.js";
+// Re-export for use by eg. htmlparser2
+export { htmlDecodeTree } from "./generated/decode-data-html.js";
+export { xmlDecodeTree } from "./generated/decode-data-xml.js";

package/src/encode.ts (changed)

@@ -1,7 +1,17 @@
+import { getCodePoint, XML_BITSET_VALUE } from "./escape.js";
 import { htmlTrie } from "./generated/encode-html.js";
-import { xmlReplacer, getCodePoint } from "./escape.js";
-const htmlReplacer = /[\t\n\f!-,./:-@[-`{-}\u0080-\uFFFF]/g;
+/**
+ * We store the characters to consider as a compact bitset for fast lookups.
+ */
+const HTML_BITSET = /* #__PURE__ */ new Uint32Array([
+    0x16_00, // Bits for 09,0A,0C
+    0xfc_00_ff_fe, // 32..63 -> 21-2D (minus space), 2E,2F,3A-3F
+    0xf8_00_00_01, // 64..95 -> 40, 5B-5F
+    0x38_00_00_01, // 96..127-> 60, 7B-7D
+]);
+const XML_BITSET = /* #__PURE__ */ new Uint32Array([0, XML_BITSET_VALUE, 0, 0]);
 /**
  * Encodes all characters in the input using HTML entities. This includes
@@ -15,7 +25,7 @@ const htmlReplacer = /[\t\n\f!-,./:-@[-`{-}\u0080-\uFFFF]/g;
  * (eg. `&#xfc;`) will be used.
  */
 export function encodeHTML(input: string): string {
-    return encodeHTMLTrieRe(htmlReplacer, input);
+    return encodeHTMLTrieRe(HTML_BITSET, input);
 }
 /**
  * Encodes all non-ASCII characters, as well as characters not valid in HTML
@@ -26,52 +36,58 @@ export function encodeHTML(input: string): string {
  * (eg. `&#xfc;`) will be used.
  */
 export function encodeNonAsciiHTML(input: string): string {
-    return encodeHTMLTrieRe(xmlReplacer, input);
+    return encodeHTMLTrieRe(XML_BITSET, input);
 }
-function encodeHTMLTrieRe(regExp: RegExp, input: string): string {
-    let returnValue = "";
-    let lastIndex = 0;
-    let match;
+function encodeHTMLTrieRe(bitset: Uint32Array, input: string): string {
+    let out: string | undefined;
+    let last = 0; // Start of the next untouched slice.
+    const { length } = input;
-    while ((match = regExp.exec(input)) !== null) {
-        const { index } = match;
-        returnValue += input.substring(lastIndex, index);
+    for (let index = 0; index < length; index++) {
         const char = input.charCodeAt(index);
-        let next = htmlTrie.get(char);
+        // Skip ASCII characters that don't need encoding
+        if (char < 0x80 && !((bitset[char >>> 5] >>> char) & 1)) {
+            continue;
+        }
-        if (typeof next === "object") {
-            // We are in a branch. Try to match the next char.
-            if (index + 1 < input.length) {
+        if (out === undefined) out = input.substring(0, index);
+        else if (last !== index) out += input.substring(last, index);
+        let node = htmlTrie.get(char);
+        if (typeof node === "object") {
+            if (index + 1 < length) {
                 const nextChar = input.charCodeAt(index + 1);
                 const value =
-                    typeof next.n === "number"
-                        ? next.n === nextChar
-                            ? next.o
+                    typeof node.next === "number"
+                        ? node.next === nextChar
+                            ? node.nextValue
                             : undefined
-                        : next.n.get(nextChar);
+                        : node.next.get(nextChar);
                 if (value !== undefined) {
-                    returnValue += value;
-                    lastIndex = regExp.lastIndex += 1;
+                    out += value;
+                    index++;
+                    last = index + 1;
                     continue;
                 }
             }
-            next = next.v;
+            node = node.value;
         }
-        // We might have a tree node without a value; skip and use a numeric entity.
-        if (next === undefined) {
+        if (node === undefined) {
             const cp = getCodePoint(input, index);
-            returnValue += `&#x${cp.toString(16)};`;
-            // Increase by 1 if we have a surrogate pair
-            lastIndex = regExp.lastIndex += Number(cp !== char);
+            out += `&#x${cp.toString(16)};`;
+            if (cp !== char) index++;
+            last = index + 1;
         } else {
-            returnValue += next;
-            lastIndex = index + 1;
+            out += node;
+            last = index + 1;
         }
     }
-    return returnValue + input.substr(lastIndex);
+    if (out === undefined) return input;
+    if (last < length) out += input.substr(last);
+    return out;
 }

package/src/escape.ts (changed)

@@ -1,5 +1,3 @@
-export const xmlReplacer: RegExp = /["$&'<>\u0080-\uFFFF]/g;
 const xmlCodeMap = new Map([
     [34, "&quot;"],
     [38, "&amp;"],
@@ -22,39 +20,54 @@ export const getCodePoint: (c: string, index: number) => number =
         : // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
           (input: string, index: number): number => input.codePointAt(index)!;
+/**
+ * Bitset for ASCII characters that need to be escaped in XML.
+ */
+export const XML_BITSET_VALUE = 0x50_00_00_c4; // 32..63 -> 34 ("),38 (&),39 ('),60 (<),62 (>)
 /**
  * Encodes all non-ASCII characters, as well as characters not valid in XML
- * documents using XML entities.
+ * documents using XML entities. Uses a fast bitset scan instead of RegExp.
  *
- * If a character has no equivalent entity, a
- * numeric hexadecimal reference (eg. `&#xfc;`) will be used.
+ * If a character has no equivalent entity, a numeric hexadecimal reference
+ * (eg. `&#xfc;`) will be used.
  */
 export function encodeXML(input: string): string {
-    let returnValue = "";
-    let lastIndex = 0;
-    let match;
+    let out: string | undefined;
+    let last = 0;
+    const { length } = input;
-    while ((match = xmlReplacer.exec(input)) !== null) {
-        const { index } = match;
+    for (let index = 0; index < length; index++) {
         const char = input.charCodeAt(index);
-        const next = xmlCodeMap.get(char);
-        if (next === undefined) {
-            returnValue += `${input.substring(lastIndex, index)}&#x${getCodePoint(
-                input,
-                index,
-            ).toString(16)};`;
-            // Increase by 1 if we have a surrogate pair
-            lastIndex = xmlReplacer.lastIndex += Number(
-                (char & 0xfc_00) === 0xd8_00,
-            );
-        } else {
-            returnValue += input.substring(lastIndex, index) + next;
-            lastIndex = index + 1;
+        // Check for ASCII chars that don't need escaping
+        if (
+            char < 0x80 &&
+            (((XML_BITSET_VALUE >>> char) & 1) === 0 || char >= 64 || char < 32)
+        ) {
+            continue;
         }
+        if (out === undefined) out = input.substring(0, index);
+        else if (last !== index) out += input.substring(last, index);
+        if (char < 64) {
+            // Known replacement
+            out += xmlCodeMap.get(char)!;
+            last = index + 1;
+            continue;
+        }
+        // Non-ASCII: encode as numeric entity (handle surrogate pair)
+        const cp = getCodePoint(input, index);
+        out += `&#x${cp.toString(16)};`;
+        if (cp !== char) index++; // Skip trailing surrogate
+        last = index + 1;
     }
-    return returnValue + input.substr(lastIndex);
+    if (out === undefined) return input;
+    if (last < length) out += input.substr(last);
+    return out;
 }
 /**
@@ -83,7 +96,7 @@ function getEscaper(
     map: Map<number, string>,
 ): (data: string) => string {
     return function escape(data: string): string {
-        let match;
+        let match: RegExpExecArray | null;
         let lastIndex = 0;
         let result = "";