@tkeron/html-parser 0.1.7 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. package/README.md +1 -7
  2. package/bun.lock +5 -0
  3. package/index.ts +4 -0
  4. package/package.json +7 -1
  5. package/src/css-selector.ts +1 -1
  6. package/src/dom-simulator.ts +41 -17
  7. package/src/encoding.ts +39 -0
  8. package/src/index.ts +9 -0
  9. package/src/parser.ts +509 -143
  10. package/src/serializer.ts +450 -0
  11. package/src/tokenizer.ts +190 -118
  12. package/tests/advanced.test.ts +121 -108
  13. package/tests/custom-elements-head.test.ts +105 -0
  14. package/tests/dom-extended.test.ts +12 -12
  15. package/tests/dom-manipulation.test.ts +9 -10
  16. package/tests/dom.test.ts +32 -27
  17. package/tests/helpers/tokenizer-adapter.test.ts +70 -0
  18. package/tests/helpers/tokenizer-adapter.ts +65 -0
  19. package/tests/helpers/tree-adapter.test.ts +39 -0
  20. package/tests/helpers/tree-adapter.ts +60 -0
  21. package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
  22. package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
  23. package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
  24. package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
  25. package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
  26. package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
  27. package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
  28. package/tests/html5lib-data/tree-construction/math.dat +104 -0
  29. package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
  30. package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
  31. package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
  32. package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
  33. package/tests/html5lib-data/tree-construction/svg.dat +104 -0
  34. package/tests/html5lib-data/tree-construction/template.dat +1673 -0
  35. package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
  36. package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
  37. package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
  38. package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
  39. package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
  40. package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
  41. package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
  42. package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
  43. package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
  44. package/tests/parser.test.ts +173 -193
  45. package/tests/serializer-core.test.ts +16 -0
  46. package/tests/serializer-data/core.test +125 -0
  47. package/tests/serializer-data/injectmeta.test +66 -0
  48. package/tests/serializer-data/optionaltags.test +965 -0
  49. package/tests/serializer-data/options.test +60 -0
  50. package/tests/serializer-data/whitespace.test +51 -0
  51. package/tests/serializer-injectmeta.test.ts +16 -0
  52. package/tests/serializer-optionaltags.test.ts +16 -0
  53. package/tests/serializer-options.test.ts +16 -0
  54. package/tests/serializer-whitespace.test.ts +16 -0
  55. package/tests/tokenizer-namedEntities.test.ts +20 -0
  56. package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
  57. package/tests/tokenizer.test.ts +25 -32
  58. package/tests/tree-construction-adoption01.test.ts +37 -0
  59. package/tests/tree-construction-adoption02.test.ts +34 -0
  60. package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
  61. package/tests/tree-construction-entities02.test.ts +33 -0
  62. package/tests/tree-construction-html5test-com.test.ts +32 -0
  63. package/tests/tree-construction-math.test.ts +18 -0
  64. package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
  65. package/tests/tree-construction-noscript01.test.ts +18 -0
  66. package/tests/tree-construction-ruby.test.ts +21 -0
  67. package/tests/tree-construction-scriptdata01.test.ts +21 -0
  68. package/tests/tree-construction-svg.test.ts +21 -0
  69. package/tests/tree-construction-template.test.ts +21 -0
  70. package/tests/tree-construction-tests10.test.ts +21 -0
  71. package/tests/tree-construction-tests11.test.ts +21 -0
  72. package/tests/tree-construction-tests20.test.ts +18 -0
  73. package/tests/tree-construction-tests21.test.ts +18 -0
  74. package/tests/tree-construction-tests23.test.ts +18 -0
  75. package/tests/tree-construction-tests24.test.ts +18 -0
  76. package/tests/tree-construction-tests5.test.ts +21 -0
  77. package/tests/tree-construction-tests6.test.ts +21 -0
  78. package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
  79. package/tests/custom-elements.test.ts +0 -745
  80. package/tests/official/README.md +0 -87
  81. package/tests/official/acid/acid-tests.test.ts +0 -309
  82. package/tests/official/final-output/final-output.test.ts +0 -361
  83. package/tests/official/html5lib/tokenizer-utils.ts +0 -192
  84. package/tests/official/html5lib/tokenizer.test.ts +0 -171
  85. package/tests/official/html5lib/tree-construction-utils.ts +0 -194
  86. package/tests/official/html5lib/tree-construction.test.ts +0 -250
  87. package/tests/official/validator/validator-tests.test.ts +0 -237
  88. package/tests/official/validator-nu/validator-nu.test.ts +0 -335
  89. package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
  90. package/tests/official/wpt/wpt-tests.test.ts +0 -409
package/src/tokenizer.ts CHANGED
@@ -24,69 +24,132 @@ export interface Token {
   isClosing?: boolean;
 }
 
-const HTML_ENTITIES: Record<string, string> = {
-  '&amp;': '&',
-  '&lt;': '<',
-  '&gt;': '>',
-  '&quot;': '"',
-  '&apos;': "'",
-  '&nbsp;': '\u00A0',
-  '&copy;': '©',
-  '&reg;': '®',
-  '&trade;': '™',
-  '&hellip;': '…',
-  '&mdash;': '—',
-  '&ndash;': '–',
-  '&lsquo;': '\u2018',
-  '&rsquo;': '\u2019',
-  '&ldquo;': '\u201C',
-  '&rdquo;': '\u201D',
-  '&not;': '¬'
-};
+import { allNamedEntities } from 'all-named-html-entities';
+
+const HTML_ENTITIES: Record<string, string> = allNamedEntities;
 
 function decodeEntities(text: string): string {
-  let result = text.replace(/\u0000/g, '\uFFFD');
-
-  return result.replace(/&(?:#x([0-9a-fA-F]+);?|#([0-9]+);?|([a-zA-Z][a-zA-Z0-9]*);?)/g, (match, hex, decimal, named) => {
-    if (hex) {
-      return String.fromCharCode(parseInt(hex, 16));
-    }
-    if (decimal) {
-      return String.fromCharCode(parseInt(decimal, 10));
-    }
-    if (named) {
-      if (HTML_ENTITIES[`&${named};`]) {
-        return HTML_ENTITIES[`&${named};`];
-      }
-
-      if (!match.endsWith(';')) {
-        for (let i = named.length; i > 0; i--) {
-          const prefix = named.substring(0, i);
-          if (HTML_ENTITIES[`&${prefix};`]) {
-            const remainder = named.substring(i);
-            return HTML_ENTITIES[`&${prefix};`] + remainder;
+  let result = '';
+  let i = 0;
+  while (i < text.length) {
+    if (text[i] === '&') {
+      let match = '';
+      let j = i + 1;
+      if (text[j] === '#') {
+        j++;
+        if (text[j] === 'x' || text[j] === 'X') {
+          j++;
+          while (j < text.length && /[0-9a-fA-F]/.test(text[j])) {
+            j++;
+          }
+        } else {
+          while (j < text.length && /[0-9]/.test(text[j])) {
+            j++;
+          }
+        }
+        if (text[j] === ';') {
+          j++;
+        }
+        match = text.substring(i, j);
+        const entity = match;
+        if (entity.startsWith('&#x') && entity.endsWith(';')) {
+          const hex = entity.slice(3, -1);
+          result += String.fromCharCode(parseInt(hex, 16));
+          i = j;
+          continue;
+        } else if (entity.startsWith('&#') && entity.endsWith(';')) {
+          const decimal = entity.slice(2, -1);
+          result += String.fromCharCode(parseInt(decimal, 10));
+          i = j;
+          continue;
+        }
+      } else {
+        while (j < text.length && /[a-zA-Z0-9]/.test(text[j])) {
+          j++;
+        }
+        const hasSemi = text[j] === ';';
+        if (hasSemi) {
+          j++;
+        }
+        match = text.substring(i, j);
+        const named = match.slice(1, hasSemi ? -1 : undefined);
+        if (HTML_ENTITIES[named]) {
+          if (hasSemi || (j < text.length && !/[a-zA-Z0-9]/.test(text[j]))) {
+            result += HTML_ENTITIES[named];
+            i = j;
+            continue;
           }
         }
       }
-
-      return match;
+      result += text[i];
+      i++;
+    } else {
+      result += text[i];
+      i++;
     }
-    return match;
-  });
+  }
+  return result.replace(/\u0000/g, '\uFFFD');
 }
 
 function parseAttributes(attributeString: string): Record<string, string> {
   const attributes: Record<string, string> = {};
+  let i = 0;
 
-  const attrRegex = /([a-zA-Z][a-zA-Z0-9\-_:]*)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;
-  let match;
-
-  while ((match = attrRegex.exec(attributeString)) !== null) {
-    const [, name, doubleQuoted, singleQuoted, unquoted] = match;
-    if (name) {
-      const value = doubleQuoted ?? singleQuoted ?? unquoted ?? '';
-      attributes[name.toLowerCase()] = decodeEntities(value);
+  while (i < attributeString.length) {
+    while (i < attributeString.length && /\s/.test(attributeString[i])) {
+      i++;
+    }
+    if (i >= attributeString.length || attributeString[i] === '/' || attributeString[i] === '>') {
+      break;
+    }
+
+    let name = '';
+    while (i < attributeString.length && !/[\s=\/>]/.test(attributeString[i])) {
+      name += attributeString[i];
+      i++;
+    }
+
+    if (!name) {
+      i++;
+      continue;
+    }
+
+    while (i < attributeString.length && /\s/.test(attributeString[i])) {
+      i++;
+    }
+
+    let value = '';
+    if (i < attributeString.length && attributeString[i] === '=') {
+      i++;
+      while (i < attributeString.length && /\s/.test(attributeString[i])) {
+        i++;
+      }
+
+      if (i < attributeString.length) {
+        if (attributeString[i] === '"') {
+          i++;
+          while (i < attributeString.length && attributeString[i] !== '"') {
+            value += attributeString[i];
+            i++;
+          }
+          i++;
+        } else if (attributeString[i] === "'") {
+          i++;
+          while (i < attributeString.length && attributeString[i] !== "'") {
+            value += attributeString[i];
+            i++;
+          }
+          i++;
+        } else {
+          while (i < attributeString.length && !/[\s>]/.test(attributeString[i])) {
+            value += attributeString[i];
+            i++;
+          }
+        }
+      }
     }
+
+    attributes[name.toLowerCase()] = decodeEntities(value);
   }
 
   return attributes;
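
The rewritten decodeEntities walks the input character by character instead of applying one global regex. For named references, the rule that falls out of the code above is: a known name decodes when it ends with a semicolon, or when the bare form is followed by a non-alphanumeric character; unknown names are left untouched. A minimal sketch of just that branch, using a tiny stand-in table rather than the full allNamedEntities map (the names below are illustrative, not part of the package):

    // Sketch only: mirrors the named-reference condition from the diff above,
    // with a small stand-in table instead of the full allNamedEntities map.
    const ENTITIES: Record<string, string> = { amp: '&', lt: '<', copy: '©' };

    function decodeNamedRefs(text: string): string {
      let result = '';
      let i = 0;
      while (i < text.length) {
        if (text[i] === '&') {
          let j = i + 1;
          while (j < text.length && /[a-zA-Z0-9]/.test(text[j])) j++; // greedy name scan
          const hasSemi = text[j] === ';';
          const name = text.slice(i + 1, j);
          if (hasSemi) j++;
          // Decode when terminated by ';', or when the bare name is followed by
          // a non-alphanumeric character (i.e. we are not at the end of input).
          if (ENTITIES[name] && (hasSemi || j < text.length)) {
            result += ENTITIES[name];
            i = j;
            continue;
          }
        }
        result += text[i++];
      }
      return result;
    }

    console.log(decodeNamedRefs('&amp;x'));     // "&x"
    console.log(decodeNamedRefs('&copy 2024')); // "© 2024"  (bare name before a space still decodes)
    console.log(decodeNamedRefs('&ampx;'));     // "&ampx;"  (unknown name is left untouched)
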
@@ -101,79 +164,72 @@ function calculatePosition(text: string, offset: number): Position {
   };
 }
 
+const RAW_TEXT_ELEMENTS = new Set(['script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript']);
+const RCDATA_ELEMENTS = new Set(['textarea', 'title']);
+
 export function tokenize(html: string): Token[] {
   const tokens: Token[] = [];
-  let position = 0;
-
-  const specialCases = [
-    {
-      pattern: /<!DOCTYPE\s+[^>]*>/gi,
-      type: TokenType.DOCTYPE,
-      getValue: (match: string) => {
-        const doctypeMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
-        return doctypeMatch && doctypeMatch[1] ? doctypeMatch[1].toLowerCase() : match;
-      }
-    },
-    {
-      pattern: /<!--([\s\S]*?)(?:-->|$)/g,
-      type: TokenType.COMMENT,
-      getValue: (match: string) => match.slice(4, match.endsWith('-->') ? -3 : match.length)
-    },
-    {
-      pattern: /<!\[CDATA\[([\s\S]*?)\]\]>/g,
-      type: TokenType.CDATA,
-      getValue: (match: string) => match.slice(9, -3)
-    },
-    {
-      pattern: /<\?([^?]*(?:\?(?!>)[^?]*)*)\?>/g,
-      type: TokenType.PROCESSING_INSTRUCTION,
-      getValue: (match: string) => match.slice(0, -2)
-    }
-  ];
-
-  const processedRanges: Array<[number, number]> = [];
-
-  for (const { pattern, type, getValue } of specialCases) {
-    const regex = new RegExp(pattern);
-    let match;
-
-    while ((match = regex.exec(html)) !== null) {
-      const start = match.index;
-      const end = start + match[0].length;
-
-      tokens.push({
-        type,
-        value: getValue(match[0]),
-        position: calculatePosition(html, start)
-      });
-
-      processedRanges.push([start, end]);
-    }
-  }
-
-  processedRanges.sort((a, b) => a[0] - b[0]);
-
   let currentPos = 0;
 
   while (currentPos < html.length) {
-    const inProcessedRange = processedRanges.some(([start, end]) =>
-      currentPos >= start && currentPos < end
-    );
-
-    if (inProcessedRange) {
-      const range = processedRanges.find(([start, end]) =>
-        currentPos >= start && currentPos < end
-      );
-      if (range) {
-        currentPos = range[1];
-      }
-      continue;
-    }
-
     const char = html[currentPos];
 
     if (char === '<') {
-      const tagMatch = html.slice(currentPos).match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
+      const remaining = html.slice(currentPos);
+
+      const doctypeMatch = remaining.match(/^<!DOCTYPE\s+[^>]*>/i);
+      if (doctypeMatch) {
+        const match = doctypeMatch[0];
+        const nameMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
+        tokens.push({
+          type: TokenType.DOCTYPE,
+          value: nameMatch && nameMatch[1] ? nameMatch[1].toLowerCase() : match,
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += match.length;
+        continue;
+      }
+
+      const commentMatch = remaining.match(/^<!--([\s\S]*?)(?:-->|$)/);
+      if (commentMatch) {
+        const match = commentMatch[0];
+        tokens.push({
+          type: TokenType.COMMENT,
+          value: match.slice(4, match.endsWith('-->') ? -3 : match.length),
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += match.length;
+        continue;
+      }
+
+      const cdataMatch = remaining.match(/^<!\[CDATA\[([\s\S]*?)\]\]>/);
+      if (cdataMatch) {
+        const content = cdataMatch[1];
+        tokens.push({
+          type: TokenType.COMMENT,
+          value: '[CDATA[' + content + ']]',
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += cdataMatch[0].length;
+        continue;
+      }
+
+      const piMatch = remaining.match(/^<\?([^>]*)/);
+      if (piMatch) {
+        let consumed = piMatch[0].length;
+        if (remaining[consumed] === '>') {
+          consumed++;
+        }
+        tokens.push({
+          type: TokenType.COMMENT,
+          value: '?' + piMatch[1],
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += consumed;
+        continue;
+      }
+
+      const tagMatch = remaining.match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
 
       if (tagMatch) {
         const fullTag = tagMatch[0];
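
With the old pre-scan removed, doctypes, comments, CDATA sections and processing instructions are now matched inline at the current position, and the latter two are emitted as comment tokens, much like HTML's bogus-comment handling. A usage sketch; the import path assumes you are inside the package source, and would change if tokenize is re-exported from the package root:

    import { tokenize } from './src/tokenizer';

    const tokens = tokenize('<![CDATA[raw <stuff>]]><?xml version="1.0"?>');

    // Both constructs surface as comment tokens, with values
    // '[CDATA[raw <stuff>]]' and '?xml version="1.0"?' (plus the trailing EOF token).
    console.log(tokens.map(t => [t.type, t.value]));
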
@@ -206,6 +262,24 @@ export function tokenize(html: string): Token[] {
         });
 
         currentPos += fullTag.length;
+
+        if (!isClosing && !isSelfClosing && (RAW_TEXT_ELEMENTS.has(tagName) || RCDATA_ELEMENTS.has(tagName))) {
+          const closeTagPattern = new RegExp(`</${tagName}\\s*>`, 'i');
+          const restOfHtml = html.slice(currentPos);
+          const closeMatch = restOfHtml.match(closeTagPattern);
+
+          if (closeMatch && closeMatch.index !== undefined) {
+            const rawContent = restOfHtml.slice(0, closeMatch.index);
+            if (rawContent) {
+              tokens.push({
+                type: TokenType.TEXT,
+                value: RCDATA_ELEMENTS.has(tagName) ? decodeEntities(rawContent) : rawContent,
+                position: calculatePosition(html, currentPos)
+              });
+            }
+            currentPos += rawContent.length;
+          }
+        }
       } else {
        const textStart = currentPos;
        currentPos++;
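
The new RAW_TEXT_ELEMENTS / RCDATA_ELEMENTS branch keeps everything up to the matching close tag as a single text token and decodes entities only for RCDATA elements (title, textarea). A quick illustration, under the same import-path assumption as above:

    import { tokenize } from './src/tokenizer';

    const script = tokenize('<script>if (a<b) document.write("<b>hi</b>");</script>');
    // The "<b>" inside the script body is not tokenized as a tag: the whole
    // body comes back as one raw, undecoded text token.
    console.log(script.map(t => t.value));

    const title = tokenize('<title>Tom &amp; Jerry</title>');
    // RCDATA content is entity-decoded, so the text token's value is "Tom & Jerry".
    console.log(title.map(t => t.value));
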
@@ -241,8 +315,6 @@ export function tokenize(html: string): Token[] {
     }
   }
 
-  tokens.sort((a, b) => a.position.offset - b.position.offset);
-
   tokens.push({
     type: TokenType.EOF,
     value: '',
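
The last hunk drops the closing sort: because every construct is now handled inline during a single left-to-right pass, tokens are already emitted in document order. A small check, under the same import-path assumption:

    import { tokenize } from './src/tokenizer';

    const tokens = tokenize('<!DOCTYPE html><!-- note --><p class="a">Hi &amp; bye</p>');
    const offsets = tokens.map(t => t.position.offset);

    // Offsets are non-decreasing without any post-pass sort.
    console.log(offsets.every((o, idx) => idx === 0 || offsets[idx - 1] <= o)); // true
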