npm - @tkeron/html-parser - Versions diffs - 1.0.0 → 1.1.0 - Mend

@tkeron/html-parser 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/package.json +1 -1
package/src/dom-simulator.ts +3 -1
package/src/parser.ts +34 -2
package/src/tokenizer.ts +131 -75
package/tests/advanced.test.ts +3 -3
package/tests/custom-elements-head.test.ts +105 -0
package/tests/helpers/tree-adapter.test.ts +1 -1
package/tests/helpers/tree-adapter.ts +21 -4
package/tests/parser.test.ts +2 -1
package/tests/tokenizer.test.ts +22 -26
package/tests/tree-construction-html5test-com.test.ts +16 -8
package/tests/custom-elements.test.ts +0 -755

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@tkeron/html-parser",
-  "version": "1.0.0",
+  "version": "1.1.0",
   "description": "A fast and lightweight HTML parser for Bun",
   "main": "index.js",
   "module": "index.ts",

package/src/dom-simulator.ts CHANGED Viewed

@@ -28,7 +28,8 @@ export const enum NodeType {
 export function createElement(
   tagName: string,
-  attributes: Record<string, string> = {}
+  attributes: Record<string, string> = {},
+  namespaceURI?: string
 ): any {
   const innerHTML = "";
   const tagNameLower = tagName.toLowerCase();
@@ -46,6 +47,7 @@ export function createElement(
     nodeName: tagName.toUpperCase(),
     nodeValue: null,
     tagName: tagName.toUpperCase(),
+    namespaceURI: namespaceURI || null,
     attributes: { ...attributes },
     childNodes: [],
     children: [],

package/src/parser.ts CHANGED Viewed

@@ -138,20 +138,35 @@ export function parse(tokens: Token[]): any {
       appendChild(html, body);
       const doctypes: any[] = [];
+      const commentsBeforeHtml: any[] = [];
+      const bodyContent: any[] = [];
       const children = [...state.root.childNodes];
+      let foundElement = false;
       for (const child of children) {
         if (child.nodeType === 10) {
           doctypes.push(child);
+        } else if (child.nodeType === 8 && !foundElement) {
+          commentsBeforeHtml.push(child);
         } else {
-          appendChild(body, child);
+          if (child.nodeType === 1) foundElement = true;
+          bodyContent.push(child);
         }
       }
+      for (const content of bodyContent) {
+        appendChild(body, content);
+      }
       state.root.childNodes = [];
       for (const doctype of doctypes) {
         doctype.parentNode = null;
         appendChild(state.root, doctype);
       }
+      for (const comment of commentsBeforeHtml) {
+        comment.parentNode = null;
+        appendChild(state.root, comment);
+      }
       appendChild(state.root, html);
       state.root.documentElement = html;
       state.root.head = head;
@@ -364,6 +379,10 @@ function parseTokenInInHeadMode(state: ParserState, token: Token): void {
     } else if (tagName === 'meta' || tagName === 'link' || tagName === 'base') {
       parseOpenTag(state, token);
     } else if (tagName === 'head') {
+      // Ignore duplicate <head> tags
+    } else if (tagName.includes('-')) {
+      // Custom elements (tags with hyphens) are valid in <head>
+      parseOpenTag(state, token);
     } else {
       state.stack.pop();
       state.insertionMode = InsertionMode.AfterHead;
@@ -378,6 +397,9 @@ function parseTokenInInHeadMode(state: ParserState, token: Token): void {
       if (currentTagName === tagName) {
         state.stack.pop();
       }
+    } else if (tagName.includes('-') && currentTagName === tagName) {
+      // Handle closing tags for custom elements in <head>
+      state.stack.pop();
     }
   } else if (token.type === TokenType.COMMENT) {
     parseComment(state, token);
@@ -410,6 +432,9 @@ function parseTokenInAfterHeadMode(state: ParserState, token: Token): void {
   }
 }
+const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';
+const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
 function parseTokenInInBodyMode(state: ParserState, token: Token): void {
   if (token.type === TokenType.TAG_OPEN) {
     const tagName = token.value.toLowerCase();
@@ -418,7 +443,14 @@ function parseTokenInInBodyMode(state: ParserState, token: Token): void {
     const currentParent = getCurrentParent(state);
-    const element = createElement(tagName, token.attributes || {});
+    let namespaceURI: string | undefined;
+    if (tagName === 'svg') {
+      namespaceURI = SVG_NAMESPACE;
+    } else if (tagName === 'math') {
+      namespaceURI = MATHML_NAMESPACE;
+    }
+    const element = createElement(tagName, token.attributes || {}, namespaceURI);
     appendChild(currentParent, element);

package/src/tokenizer.ts CHANGED Viewed

@@ -93,16 +93,63 @@ function decodeEntities(text: string): string {
 function parseAttributes(attributeString: string): Record<string, string> {
   const attributes: Record<string, string> = {};
+  let i = 0;
-  const attrRegex = /([a-zA-Z][a-zA-Z0-9\-_:]*)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;
-  let match;
-  while ((match = attrRegex.exec(attributeString)) !== null) {
-    const [, name, doubleQuoted, singleQuoted, unquoted] = match;
-    if (name) {
-      const value = doubleQuoted ?? singleQuoted ?? unquoted ?? '';
-      attributes[name.toLowerCase()] = decodeEntities(value);
+  while (i < attributeString.length) {
+    while (i < attributeString.length && /\s/.test(attributeString[i])) {
+      i++;
     }
+    if (i >= attributeString.length || attributeString[i] === '/' || attributeString[i] === '>') {
+      break;
+    }
+    let name = '';
+    while (i < attributeString.length && !/[\s=\/>]/.test(attributeString[i])) {
+      name += attributeString[i];
+      i++;
+    }
+    if (!name) {
+      i++;
+      continue;
+    }
+    while (i < attributeString.length && /\s/.test(attributeString[i])) {
+      i++;
+    }
+    let value = '';
+    if (i < attributeString.length && attributeString[i] === '=') {
+      i++;
+      while (i < attributeString.length && /\s/.test(attributeString[i])) {
+        i++;
+      }
+      if (i < attributeString.length) {
+        if (attributeString[i] === '"') {
+          i++;
+          while (i < attributeString.length && attributeString[i] !== '"') {
+            value += attributeString[i];
+            i++;
+          }
+          i++;
+        } else if (attributeString[i] === "'") {
+          i++;
+          while (i < attributeString.length && attributeString[i] !== "'") {
+            value += attributeString[i];
+            i++;
+          }
+          i++;
+        } else {
+          while (i < attributeString.length && !/[\s>]/.test(attributeString[i])) {
+            value += attributeString[i];
+            i++;
+          }
+        }
+      }
+    }
+    attributes[name.toLowerCase()] = decodeEntities(value);
   }
   return attributes;
@@ -117,79 +164,72 @@ function calculatePosition(text: string, offset: number): Position {
   };
 }
+const RAW_TEXT_ELEMENTS = new Set(['script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript']);
+const RCDATA_ELEMENTS = new Set(['textarea', 'title']);
 export function tokenize(html: string): Token[] {
   const tokens: Token[] = [];
-  let position = 0;
-  const specialCases = [
-    {
-      pattern: /<!DOCTYPE\s+[^>]*>/gi,
-      type: TokenType.DOCTYPE,
-      getValue: (match: string) => {
-        const doctypeMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
-        return doctypeMatch && doctypeMatch[1] ? doctypeMatch[1].toLowerCase() : match;
-      }
-    },
-    {
-      pattern: /<!--([\s\S]*?)(?:-->|$)/g,
-      type: TokenType.COMMENT,
-      getValue: (match: string) => match.slice(4, match.endsWith('-->') ? -3 : match.length)
-    },
-    {
-      pattern: /<!\[CDATA\[([\s\S]*?)\]\]>/g,
-      type: TokenType.CDATA,
-      getValue: (match: string) => match.slice(9, -3)
-    },
-    {
-      pattern: /<\?([^?]*(?:\?(?!>)[^?]*)*)\?>/g,
-      type: TokenType.PROCESSING_INSTRUCTION,
-      getValue: (match: string) => match.slice(0, -2)
-    }
-  ];
-  const processedRanges: Array<[number, number]> = [];
-  for (const { pattern, type, getValue } of specialCases) {
-    const regex = new RegExp(pattern);
-    let match;
-    while ((match = regex.exec(html)) !== null) {
-      const start = match.index;
-      const end = start + match[0].length;
-      tokens.push({
-        type,
-        value: getValue(match[0]),
-        position: calculatePosition(html, start)
-      });
-      processedRanges.push([start, end]);
-    }
-  }
-  processedRanges.sort((a, b) => a[0] - b[0]);
   let currentPos = 0;
   while (currentPos < html.length) {
-    const inProcessedRange = processedRanges.some(([start, end]) =>
-      currentPos >= start && currentPos < end
-    );
-    if (inProcessedRange) {
-      const range = processedRanges.find(([start, end]) =>
-        currentPos >= start && currentPos < end
-      );
-      if (range) {
-        currentPos = range[1];
-      }
-      continue;
-    }
     const char = html[currentPos];
     if (char === '<') {
-      const tagMatch = html.slice(currentPos).match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
+      const remaining = html.slice(currentPos);
+      const doctypeMatch = remaining.match(/^<!DOCTYPE\s+[^>]*>/i);
+      if (doctypeMatch) {
+        const match = doctypeMatch[0];
+        const nameMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
+        tokens.push({
+          type: TokenType.DOCTYPE,
+          value: nameMatch && nameMatch[1] ? nameMatch[1].toLowerCase() : match,
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += match.length;
+        continue;
+      }
+      const commentMatch = remaining.match(/^<!--([\s\S]*?)(?:-->|$)/);
+      if (commentMatch) {
+        const match = commentMatch[0];
+        tokens.push({
+          type: TokenType.COMMENT,
+          value: match.slice(4, match.endsWith('-->') ? -3 : match.length),
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += match.length;
+        continue;
+      }
+      const cdataMatch = remaining.match(/^<!\[CDATA\[([\s\S]*?)\]\]>/);
+      if (cdataMatch) {
+        const content = cdataMatch[1];
+        tokens.push({
+          type: TokenType.COMMENT,
+          value: '[CDATA[' + content + ']]',
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += cdataMatch[0].length;
+        continue;
+      }
+      const piMatch = remaining.match(/^<\?([^>]*)/);
+      if (piMatch) {
+        let consumed = piMatch[0].length;
+        if (remaining[consumed] === '>') {
+          consumed++;
+        }
+        tokens.push({
+          type: TokenType.COMMENT,
+          value: '?' + piMatch[1],
+          position: calculatePosition(html, currentPos)
+        });
+        currentPos += consumed;
+        continue;
+      }
+      const tagMatch = remaining.match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
       if (tagMatch) {
         const fullTag = tagMatch[0];
@@ -222,6 +262,24 @@ export function tokenize(html: string): Token[] {
         });
         currentPos += fullTag.length;
+        if (!isClosing && !isSelfClosing && (RAW_TEXT_ELEMENTS.has(tagName) || RCDATA_ELEMENTS.has(tagName))) {
+          const closeTagPattern = new RegExp(`</${tagName}\\s*>`, 'i');
+          const restOfHtml = html.slice(currentPos);
+          const closeMatch = restOfHtml.match(closeTagPattern);
+          if (closeMatch && closeMatch.index !== undefined) {
+            const rawContent = restOfHtml.slice(0, closeMatch.index);
+            if (rawContent) {
+              tokens.push({
+                type: TokenType.TEXT,
+                value: RCDATA_ELEMENTS.has(tagName) ? decodeEntities(rawContent) : rawContent,
+                position: calculatePosition(html, currentPos)
+              });
+            }
+            currentPos += rawContent.length;
+          }
+        }
       } else {
         const textStart = currentPos;
         currentPos++;
@@ -257,8 +315,6 @@ export function tokenize(html: string): Token[] {
     }
   }
-  tokens.sort((a, b) => a.position.offset - b.position.offset);
   tokens.push({
     type: TokenType.EOF,
     value: '',

package/tests/advanced.test.ts CHANGED Viewed

@@ -60,7 +60,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
       });
     });
-    it('should handle complex CDATA content', () => {
+    it('should handle complex CDATA content as bogus comment', () => {
       const complexContent = `
         function test() {
           return "<div>HTML inside JS</div>";
@@ -71,8 +71,8 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
       expect(tokens.length).toBeGreaterThan(0);
       const cdataToken = tokens[0]!;
-      expect(cdataToken.type).toBe(TokenType.CDATA);
-      expect(cdataToken.value).toBe(complexContent);
+      expect(cdataToken.type).toBe(TokenType.COMMENT);
+      expect(cdataToken.value).toBe('[CDATA[' + complexContent + ']]');
     });
     it('should handle performance with large documents', () => {

package/tests/custom-elements-head.test.ts ADDED Viewed

@@ -0,0 +1,105 @@
+import { describe, it, expect } from 'bun:test';
+import { parseHTML } from '../index';
+describe('Custom Elements in <head>', () => {
+  it('should keep <meta-tags> custom element in head', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><meta-tags></meta-tags></head><body></body></html>'
+    );
+    const metaTags = doc.head?.querySelector('meta-tags');
+    expect(metaTags).toBeTruthy();
+    expect(metaTags?.parentElement?.tagName).toBe('HEAD');
+  });
+  it('should keep <social-meta> custom element in head', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><social-meta></social-meta></head><body></body></html>'
+    );
+    const socialMeta = doc.head?.querySelector('social-meta');
+    expect(socialMeta).toBeTruthy();
+    expect(socialMeta?.parentElement?.tagName).toBe('HEAD');
+  });
+  it('should keep any <custom-element> with hyphen in head', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><my-component></my-component></head><body></body></html>'
+    );
+    const myComponent = doc.head?.querySelector('my-component');
+    expect(myComponent).toBeTruthy();
+    expect(myComponent?.parentElement?.tagName).toBe('HEAD');
+  });
+  it('should still eject non-custom elements like <div> to body', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><div>test</div></head><body></body></html>'
+    );
+    const divInHead = doc.head?.querySelector('div');
+    const divInBody = doc.body?.querySelector('div');
+    expect(divInHead).toBeFalsy();
+    expect(divInBody).toBeTruthy();
+  });
+  it('should handle nested custom elements in head', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><my-wrapper><inner-comp></inner-comp></my-wrapper></head><body></body></html>'
+    );
+    const myWrapper = doc.head?.querySelector('my-wrapper');
+    expect(myWrapper).toBeTruthy();
+    expect(myWrapper?.parentElement?.tagName).toBe('HEAD');
+    const innerComp = myWrapper?.querySelector('inner-comp');
+    expect(innerComp).toBeTruthy();
+  });
+  it('should keep custom elements with attributes in head', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><seo-meta property="og:title" content="Test"></seo-meta></head><body></body></html>'
+    );
+    const seoMeta = doc.head?.querySelector('seo-meta');
+    expect(seoMeta).toBeTruthy();
+    expect(seoMeta?.getAttribute('property')).toBe('og:title');
+    expect(seoMeta?.getAttribute('content')).toBe('Test');
+    expect(seoMeta?.parentElement?.tagName).toBe('HEAD');
+  });
+  it('should keep self-closing custom elements in head', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><custom-void /></head><body></body></html>'
+    );
+    const customVoid = doc.head?.querySelector('custom-void');
+    expect(customVoid).toBeTruthy();
+    expect(customVoid?.parentElement?.tagName).toBe('HEAD');
+  });
+  it('should handle custom elements mixed with standard head elements', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><title>Test</title><meta-tags></meta-tags><link rel="stylesheet" href="style.css"></head><body></body></html>'
+    );
+    const title = doc.head?.querySelector('title');
+    const metaTags = doc.head?.querySelector('meta-tags');
+    const link = doc.head?.querySelector('link');
+    expect(title).toBeTruthy();
+    expect(metaTags).toBeTruthy();
+    expect(link).toBeTruthy();
+  });
+  it('should handle custom element containing text in head', () => {
+    const doc = parseHTML(
+      '<!DOCTYPE html><html><head><inline-script>console.log("test")</inline-script></head><body></body></html>'
+    );
+    const inlineScript = doc.head?.querySelector('inline-script');
+    expect(inlineScript).toBeTruthy();
+    expect(inlineScript?.parentElement?.tagName).toBe('HEAD');
+  });
+});

package/tests/helpers/tree-adapter.test.ts CHANGED Viewed

@@ -27,7 +27,7 @@ describe('Tree Adapter Tests', () => {
   it('should serialize comment', () => {
     const doc = parseHTML('<div><!-- comment --></div>');
     const serialized = serializeToHtml5lib(doc);
-    expect(serialized).toContain('<!--  -->');
+    expect(serialized).toContain('<!--  comment  -->');
   });
   it('should serialize DOCTYPE', () => {

package/tests/helpers/tree-adapter.ts CHANGED Viewed

@@ -1,6 +1,10 @@
 // tests/helpers/tree-adapter.ts
-export function serializeToHtml5lib(doc: any): string {
+export interface SerializeOptions {
+  skipImplicitDoctype?: boolean;
+}
+export function serializeToHtml5lib(doc: any, options: SerializeOptions = {}): string {
   const lines: string[] = [];
   function serialize(node: any, depth: number): void {
@@ -11,7 +15,17 @@ export function serializeToHtml5lib(doc: any): string {
         serialize(child, depth);
       }
     } else if (node.nodeType === 1) { // ELEMENT
-      lines.push(`${indent}<${node.tagName.toLowerCase()}>`);
+      const tagName = node.tagName.toLowerCase();
+      const ns = node.namespaceURI;
+      let nsPrefix = '';
+      if (ns === 'http://www.w3.org/2000/svg') {
+        nsPrefix = ' svg';
+      } else if (ns === 'http://www.w3.org/1998/Math/MathML') {
+        nsPrefix = ' math';
+      }
+      lines.push(`${indent}<${tagName}${nsPrefix}>`);
       // Atributos en orden alfabético
       const attrs = Object.entries(node.attributes || {}).sort(([a], [b]) => a.localeCompare(b));
@@ -32,9 +46,12 @@ export function serializeToHtml5lib(doc: any): string {
     } else if (node.nodeType === 3) { // TEXT
       lines.push(`${indent}"${node.textContent}"`);
     } else if (node.nodeType === 8) { // COMMENT
-      lines.push(`${indent}<!-- ${node.textContent} -->`);
+      const commentData = node.data || node.nodeValue || node.textContent || '';
+      lines.push(`${indent}<!-- ${commentData} -->`);
     } else if (node.nodeType === 10) { // DOCTYPE
-      lines.push(`${indent}<!DOCTYPE ${node.name || 'html'}>`);
+      if (!options.skipImplicitDoctype) {
+        lines.push(`${indent}<!DOCTYPE ${node.name || 'html'}>`);
+      }
     }
   }

package/tests/parser.test.ts CHANGED Viewed

@@ -18,7 +18,8 @@ function parseToAST(html: string): ASTNode {
   if (htmlEl) {
     const bodyEl = htmlEl.children?.find(c => c.tagName === 'body');
     if (bodyEl && bodyEl.children) {
-      return { type: ASTNodeType.Document, children: bodyEl.children };
+      const nonHtmlChildren = ast.children?.filter(c => c.tagName !== 'html' && c.type !== 'doctype') || [];
+      return { type: ASTNodeType.Document, children: [...nonHtmlChildren, ...bodyEl.children] };
     }
   }
   return ast;

package/tests/tokenizer.test.ts CHANGED Viewed

@@ -198,21 +198,21 @@ describe('HTML Tokenizer', () => {
     });
   });
-  describe('CDATA Sections', () => {
-    it('should parse CDATA sections', () => {
+  describe('CDATA Sections (HTML5: treated as bogus comments)', () => {
+    it('should parse CDATA sections as bogus comments in HTML5', () => {
       const tokens = tokenize('<![CDATA[Some data]]>');
       expect(tokens[0]).toEqual({
-        type: TokenType.CDATA,
-        value: 'Some data',
+        type: TokenType.COMMENT,
+        value: '[CDATA[Some data]]',
         position: expect.any(Object)
       });
     });
-    it('should handle CDATA with special characters', () => {
+    it('should handle CDATA with special characters as bogus comment', () => {
       const tokens = tokenize('<![CDATA[<script>alert("test");</script>]]>');
-      expect(tokens[0]?.value).toBe('<script>alert("test");</script>');
+      expect(tokens[0]?.value).toBe('[CDATA[<script>alert("test");</script>]]');
     });
   });
@@ -235,22 +235,22 @@ describe('HTML Tokenizer', () => {
     });
   });
-  describe('Processing Instructions', () => {
-    it('should parse XML processing instruction', () => {
+  describe('Processing Instructions (HTML5: treated as bogus comments)', () => {
+    it('should parse XML processing instruction as bogus comment', () => {
       const tokens = tokenize('<?xml version="1.0" encoding="UTF-8"?>');
       expect(tokens[0]).toEqual({
-        type: TokenType.PROCESSING_INSTRUCTION,
-        value: '<?xml version="1.0" encoding="UTF-8"',
+        type: TokenType.COMMENT,
+        value: '?xml version="1.0" encoding="UTF-8"?',
         position: expect.any(Object)
       });
     });
-    it('should parse PHP-style processing instruction', () => {
+    it('should parse PHP-style processing instruction as bogus comment', () => {
       const tokens = tokenize('<?php echo "Hello"; ?>');
-      expect(tokens[0]?.type).toBe(TokenType.PROCESSING_INSTRUCTION);
-      expect(tokens[0]?.value).toBe('<?php echo "Hello"; ');
+      expect(tokens[0]?.type).toBe(TokenType.COMMENT);
+      expect(tokens[0]?.value).toBe('?php echo "Hello"; ?');
     });
   });
@@ -429,7 +429,7 @@ describe('HTML Tokenizer', () => {
       });
     });
-    it('should handle CDATA with complex content', () => {
+    it('should handle CDATA as bogus comment with complex content', () => {
       const complexContent = `
         function it() {
           return "<div>HTML inside JS</div>";
@@ -440,11 +440,11 @@ describe('HTML Tokenizer', () => {
       const tokens = tokenize(`<![CDATA[${complexContent}]]>`);
       const cdataToken = tokens[0]!;
-      expect(cdataToken.type).toBe(TokenType.CDATA);
-      expect(cdataToken.value).toBe(complexContent);
+      expect(cdataToken.type).toBe(TokenType.COMMENT);
+      expect(cdataToken.value).toBe('[CDATA[' + complexContent + ']]');
     });
-    it('should handle processing instructions with various formats', () => {
+    it('should handle processing instructions as bogus comments', () => {
       const tests = [
         { input: '<?xml version="1.0" encoding="UTF-8"?>', expected: 'xml' },
         { input: '<?xml-stylesheet type="text/xsl" href="style.xsl"?>', expected: 'xml' },
@@ -456,7 +456,7 @@ describe('HTML Tokenizer', () => {
         const tokens = tokenize(test.input);
         const piToken = tokens[0]!;
-        expect(piToken.type).toBe(TokenType.PROCESSING_INSTRUCTION);
+        expect(piToken.type).toBe(TokenType.COMMENT);
         expect(piToken.value.toLowerCase()).toContain(test.expected);
       });
     });
@@ -478,15 +478,13 @@ describe('HTML Tokenizer', () => {
       });
     });
-    it('should handle mixed content with all token types', () => {
+    it('should handle mixed content with all token types (HTML5 mode)', () => {
       const html = `
-        <?xml version="1.0"?>
         <!DOCTYPE html>
         <!-- Main document -->
         <html lang="en">
           <head>
             <title>Test &amp; Demo</title>
-            <![CDATA[Some raw data]]>
           </head>
           <body>
             <h1>Hello World</h1>
@@ -500,27 +498,25 @@ describe('HTML Tokenizer', () => {
       const tokens = tokenize(html);
       const tokenCounts = {
-        [TokenType.PROCESSING_INSTRUCTION]: 0,
         [TokenType.DOCTYPE]: 0,
         [TokenType.COMMENT]: 0,
         [TokenType.TAG_OPEN]: 0,
         [TokenType.TAG_CLOSE]: 0,
         [TokenType.TEXT]: 0,
-        [TokenType.CDATA]: 0,
         [TokenType.EOF]: 0
       };
       tokens.forEach(token => {
-        tokenCounts[token.type]++;
+        if (token.type in tokenCounts) {
+          tokenCounts[token.type]++;
+        }
       });
-      expect(tokenCounts[TokenType.PROCESSING_INSTRUCTION]).toBeGreaterThan(0);
       expect(tokenCounts[TokenType.DOCTYPE]).toBeGreaterThan(0);
       expect(tokenCounts[TokenType.COMMENT]).toBeGreaterThan(0);
       expect(tokenCounts[TokenType.TAG_OPEN]).toBeGreaterThan(0);
       expect(tokenCounts[TokenType.TAG_CLOSE]).toBeGreaterThan(0);
       expect(tokenCounts[TokenType.TEXT]).toBeGreaterThan(0);
-      expect(tokenCounts[TokenType.CDATA]).toBeGreaterThan(0);
       expect(tokenCounts[TokenType.EOF]).toBe(1);
     });
   })