npm - @tkeron/html-parser - Versions diffs - 0.1.5 → 1.0.0 - Mend

@tkeron/html-parser 0.1.5 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (91)

package/README.md +1 -7
package/bun.lock +8 -3
package/index.ts +4 -0
package/package.json +13 -6
package/src/css-selector.ts +45 -27
package/src/dom-simulator.ts +162 -20
package/src/encoding.ts +39 -0
package/src/index.ts +9 -0
package/src/parser.ts +478 -183
package/src/serializer.ts +450 -0
package/src/tokenizer.ts +59 -139
package/tests/advanced.test.ts +119 -106
package/tests/custom-elements.test.ts +172 -162
package/tests/dom-extended.test.ts +12 -12
package/tests/dom-manipulation.test.ts +637 -0
package/tests/dom.test.ts +32 -27
package/tests/helpers/tokenizer-adapter.test.ts +70 -0
package/tests/helpers/tokenizer-adapter.ts +65 -0
package/tests/helpers/tree-adapter.test.ts +39 -0
package/tests/helpers/tree-adapter.ts +43 -0
package/tests/html5lib-data/tokenizer/namedEntities.test +42422 -0
package/tests/html5lib-data/tokenizer/pendingSpecChanges.test +9 -0
package/tests/html5lib-data/tree-construction/adoption01.dat +354 -0
package/tests/html5lib-data/tree-construction/adoption02.dat +39 -0
package/tests/html5lib-data/tree-construction/domjs-unsafe.dat +0 -0
package/tests/html5lib-data/tree-construction/entities02.dat +309 -0
package/tests/html5lib-data/tree-construction/html5test-com.dat +301 -0
package/tests/html5lib-data/tree-construction/math.dat +104 -0
package/tests/html5lib-data/tree-construction/namespace-sensitivity.dat +22 -0
package/tests/html5lib-data/tree-construction/noscript01.dat +237 -0
package/tests/html5lib-data/tree-construction/ruby.dat +302 -0
package/tests/html5lib-data/tree-construction/scriptdata01.dat +372 -0
package/tests/html5lib-data/tree-construction/svg.dat +104 -0
package/tests/html5lib-data/tree-construction/template.dat +1673 -0
package/tests/html5lib-data/tree-construction/tests10.dat +853 -0
package/tests/html5lib-data/tree-construction/tests11.dat +523 -0
package/tests/html5lib-data/tree-construction/tests20.dat +842 -0
package/tests/html5lib-data/tree-construction/tests21.dat +306 -0
package/tests/html5lib-data/tree-construction/tests23.dat +168 -0
package/tests/html5lib-data/tree-construction/tests24.dat +79 -0
package/tests/html5lib-data/tree-construction/tests5.dat +210 -0
package/tests/html5lib-data/tree-construction/tests6.dat +663 -0
package/tests/html5lib-data/tree-construction/tests_innerHTML_1.dat +844 -0
package/tests/parser.test.ts +172 -193
package/tests/selectors.test.ts +64 -1
package/tests/serializer-core.test.ts +16 -0
package/tests/serializer-data/core.test +125 -0
package/tests/serializer-data/injectmeta.test +66 -0
package/tests/serializer-data/optionaltags.test +965 -0
package/tests/serializer-data/options.test +60 -0
package/tests/serializer-data/whitespace.test +51 -0
package/tests/serializer-injectmeta.test.ts +16 -0
package/tests/serializer-optionaltags.test.ts +16 -0
package/tests/serializer-options.test.ts +16 -0
package/tests/serializer-whitespace.test.ts +16 -0
package/tests/tokenizer-namedEntities.test.ts +20 -0
package/tests/tokenizer-pendingSpecChanges.test.ts +20 -0
package/tests/tokenizer.test.ts +83 -0
package/tests/tree-construction-adoption01.test.ts +37 -0
package/tests/tree-construction-adoption02.test.ts +34 -0
package/tests/tree-construction-domjs-unsafe.test.ts +24 -0
package/tests/tree-construction-entities02.test.ts +33 -0
package/tests/tree-construction-html5test-com.test.ts +24 -0
package/tests/tree-construction-math.test.ts +18 -0
package/tests/tree-construction-namespace-sensitivity.test.ts +18 -0
package/tests/tree-construction-noscript01.test.ts +18 -0
package/tests/tree-construction-ruby.test.ts +21 -0
package/tests/tree-construction-scriptdata01.test.ts +21 -0
package/tests/tree-construction-svg.test.ts +21 -0
package/tests/tree-construction-template.test.ts +21 -0
package/tests/tree-construction-tests10.test.ts +21 -0
package/tests/tree-construction-tests11.test.ts +21 -0
package/tests/tree-construction-tests20.test.ts +18 -0
package/tests/tree-construction-tests21.test.ts +18 -0
package/tests/tree-construction-tests23.test.ts +18 -0
package/tests/tree-construction-tests24.test.ts +18 -0
package/tests/tree-construction-tests5.test.ts +21 -0
package/tests/tree-construction-tests6.test.ts +21 -0
package/tests/tree-construction-tests_innerHTML_1.test.ts +21 -0
package/tests/void-elements.test.ts +471 -0
package/tests/official/README.md +0 -87
package/tests/official/acid/acid-tests.test.ts +0 -309
package/tests/official/final-output/final-output.test.ts +0 -361
package/tests/official/html5lib/tokenizer-utils.ts +0 -192
package/tests/official/html5lib/tokenizer.test.ts +0 -171
package/tests/official/html5lib/tree-construction-utils.ts +0 -194
package/tests/official/html5lib/tree-construction.test.ts +0 -250
package/tests/official/validator/validator-tests.test.ts +0 -237
package/tests/official/validator-nu/validator-nu.test.ts +0 -335
package/tests/official/whatwg/whatwg-tests.test.ts +0 -205
package/tests/official/wpt/wpt-tests.test.ts +0 -409

package/src/parser.ts CHANGED

@@ -1,38 +1,14 @@
 import type { Token } from './tokenizer.js';
 import { TokenType } from './tokenizer.js';
-export interface ASTNode {
-  type: ASTNodeType;
-  tagName?: string;
-  attributes?: Record<string, string>;
-  children?: ASTNode[];
-  content?: string;
-  parent?: ASTNode;
-  isSelfClosing?: boolean;
-  position?: {
-    start: number;
-    end: number;
-    line: number;
-    column: number;
-  };
-}
-export enum ASTNodeType {
-  DOCUMENT = 'DOCUMENT',
-  ELEMENT = 'ELEMENT',
-  TEXT = 'TEXT',
-  COMMENT = 'COMMENT',
-  CDATA = 'CDATA',
-  DOCTYPE = 'DOCTYPE',
-  PROCESSING_INSTRUCTION = 'PROCESSING_INSTRUCTION'
-}
+import { createDocument, createElement, createTextNode, createComment, createDoctype, appendChild } from './dom-simulator.js';
 export interface ParserState {
   tokens: Token[];
   position: number;
   length: number;
-  stack: ASTNode[];
-  root: ASTNode;
+  stack: any[]; // DOM elements
+  root: any; // Document
+  insertionMode: InsertionMode;
   errors: ParseError[];
 }
@@ -44,6 +20,32 @@ export interface ParseError {
   severity: 'error' | 'warning';
 }
+export enum InsertionMode {
+  Initial = 'initial',
+  BeforeHtml = 'beforeHtml',
+  BeforeHead = 'beforeHead',
+  InHead = 'inHead',
+  AfterHead = 'afterHead',
+  InBody = 'inBody'
+}
+export enum ASTNodeType {
+  Document = 'document',
+  Element = 'element',
+  Text = 'text',
+  Comment = 'comment',
+  Doctype = 'doctype',
+  CDATA = 'cdata'
+}
+export interface ASTNode {
+  type: ASTNodeType;
+  tagName?: string;
+  value?: string;
+  attributes?: Record<string, string>;
+  children?: ASTNode[];
+}
 const VOID_ELEMENTS = new Set([
   'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
   'link', 'meta', 'param', 'source', 'track', 'wbr'
@@ -57,7 +59,41 @@ const AUTO_CLOSE_RULES: Record<string, string[]> = {
   'li': ['li'],
   'dt': ['dt', 'dd'],
   'dd': ['dt', 'dd'],
+  'address': ['p'],
+  'article': ['p'],
+  'aside': ['p'],
+  'blockquote': ['p'],
+  'center': ['p'],
+  'details': ['p'],
+  'dialog': ['p'],
+  'dir': ['p'],
+  'div': ['p'],
+  'dl': ['p'],
+  'fieldset': ['p'],
+  'figcaption': ['p'],
+  'figure': ['p'],
+  'footer': ['p'],
+  'form': ['p'],
+  'h1': ['p'],
+  'h2': ['p'],
+  'h3': ['p'],
+  'h4': ['p'],
+  'h5': ['p'],
+  'h6': ['p'],
+  'header': ['p'],
+  'hgroup': ['p'],
+  'hr': ['p'],
+  'listing': ['p'],
+  'main': ['p'],
+  'menu': ['p'],
+  'nav': ['p'],
+  'ol': ['p'],
   'p': ['p'],
+  'pre': ['p'],
+  'section': ['p'],
+  'summary': ['p'],
+  'table': ['p'],
+  'ul': ['p'],
   'rt': ['rt', 'rp'],
   'rp': ['rt', 'rp'],
   'optgroup': ['optgroup'],
@@ -70,7 +106,7 @@ const AUTO_CLOSE_RULES: Record<string, string[]> = {
   'th': ['td', 'th']
 };
-export function parse(tokens: Token[]): ASTNode {
+export function parse(tokens: Token[]): any {
   const state = createParserState(tokens);
   while (state.position < state.length) {
@@ -84,21 +120,119 @@ export function parse(tokens: Token[]): ASTNode {
     advance(state);
   }
+  // Create implicit html, head, body if needed
+  if (state.root.childNodes && state.root.childNodes.length > 0) {
+    let hasHtml = false;
+    for (const child of state.root.childNodes) {
+      if (child.nodeType === 1 && child.tagName === 'HTML') {
+        hasHtml = true;
+        state.root.documentElement = child;
+        break;
+      }
+    }
+    if (!hasHtml) {
+      const html = createElement('html', {});
+      const head = createElement('head', {});
+      const body = createElement('body', {});
+      appendChild(html, head);
+      appendChild(html, body);
+      const doctypes: any[] = [];
+      const children = [...state.root.childNodes];
+      for (const child of children) {
+        if (child.nodeType === 10) {
+          doctypes.push(child);
+        } else {
+          appendChild(body, child);
+        }
+      }
+      state.root.childNodes = [];
+      for (const doctype of doctypes) {
+        doctype.parentNode = null;
+        appendChild(state.root, doctype);
+      }
+      appendChild(state.root, html);
+      state.root.documentElement = html;
+      state.root.head = head;
+      state.root.body = body;
+    }
+  }
   while (state.stack.length > 1) {
     const unclosedElement = state.stack.pop()!;
     const currentToken = getCurrentToken(state);
-    addError(state, `Unclosed tag: ${unclosedElement.tagName}`, currentToken?.position?.start || 0);
+    addError(state, `Unclosed tag: ${unclosedElement.tagName}`, currentToken?.position?.offset || 0);
   }
   return state.root;
 }
+export function domToAST(dom: any): ASTNode {
+  function convert(node: any): ASTNode | null {
+    if (!node) return null;
+    if (node.nodeType === 9) {
+      const children: ASTNode[] = [];
+      if (node.childNodes) {
+        for (const child of node.childNodes) {
+          const converted = convert(child);
+          if (converted) children.push(converted);
+        }
+      }
+      return {
+        type: ASTNodeType.Document,
+        children
+      };
+    }
+    if (node.nodeType === 1) {
+      const children: ASTNode[] = [];
+      if (node.childNodes) {
+        for (const child of node.childNodes) {
+          const converted = convert(child);
+          if (converted) children.push(converted);
+        }
+      }
+      const tagName = node.tagName?.toLowerCase();
+      return {
+        type: ASTNodeType.Element,
+        tagName,
+        attributes: node.attributes || {},
+        children,
+        isSelfClosing: VOID_ELEMENTS.has(tagName)
+      } as ASTNode & { isSelfClosing: boolean };
+    }
+    if (node.nodeType === 3) {
+      return {
+        type: ASTNodeType.Text,
+        content: node.nodeValue || ''
+      } as ASTNode & { content: string };
+    }
+    if (node.nodeType === 8) {
+      return {
+        type: ASTNodeType.Comment,
+        content: node.nodeValue || ''
+      } as ASTNode & { content: string };
+    }
+    if (node.nodeType === 10) {
+      return {
+        type: ASTNodeType.Doctype,
+        content: node.name || 'html'
+      } as ASTNode & { content: string };
+    }
+    return null;
+  }
+  return convert(dom) || { type: ASTNodeType.Document, children: [] };
+}
 function createParserState(tokens: Token[]): ParserState {
-  const root: ASTNode = {
-    type: ASTNodeType.DOCUMENT,
-    children: [],
-    tagName: '#document'
-  };
+  const root = createDocument();
   return {
     tokens,
@@ -106,81 +240,223 @@ function createParserState(tokens: Token[]): ParserState {
     length: tokens.length,
     stack: [root],
     root,
+    insertionMode: InsertionMode.Initial,
     errors: []
   };
 }
 function parseToken(state: ParserState, token: Token): void {
-  switch (token.type) {
-    case TokenType.TAG_OPEN:
-      parseOpenTag(state, token);
+  switch (state.insertionMode) {
+    case InsertionMode.Initial:
+      parseTokenInInitialMode(state, token);
       break;
-    case TokenType.TAG_CLOSE:
-      parseCloseTag(state, token);
+    case InsertionMode.BeforeHtml:
+      parseTokenInBeforeHtmlMode(state, token);
       break;
-    case TokenType.TEXT:
-      parseText(state, token);
-      break;
-    case TokenType.COMMENT:
-      parseComment(state, token);
+    case InsertionMode.BeforeHead:
+      parseTokenInBeforeHeadMode(state, token);
       break;
-    case TokenType.CDATA:
-      parseCDATA(state, token);
+    case InsertionMode.InHead:
+      parseTokenInInHeadMode(state, token);
       break;
-    case TokenType.DOCTYPE:
-      parseDoctype(state, token);
+    case InsertionMode.AfterHead:
+      parseTokenInAfterHeadMode(state, token);
       break;
-    case TokenType.PROCESSING_INSTRUCTION:
-      parseProcessingInstruction(state, token);
+    case InsertionMode.InBody:
+      parseTokenInInBodyMode(state, token);
       break;
+    default:
+      parseTokenInInBodyMode(state, token); // fallback
   }
 }
-function parseOpenTag(state: ParserState, token: Token): void {
-  const tagName = token.value.toLowerCase();
-  handleAutoClosing(state, tagName);
-  const currentParent = getCurrentParent(state);
+function parseTokenInInitialMode(state: ParserState, token: Token): void {
+  if (token.type === TokenType.DOCTYPE) {
+    // TODO: Create DOCTYPE node
+    parseDoctype(state, token);
+    state.insertionMode = InsertionMode.BeforeHtml;
+  } else if (token.type === TokenType.COMMENT) {
+    parseComment(state, token);
+  } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
+    // Ignore whitespace
+  } else {
+    // No DOCTYPE, create implicit DOCTYPE and switch to BeforeHtml
+    const doctype = createDoctype('html');
+    appendChild(state.root, doctype);
+    state.insertionMode = InsertionMode.BeforeHtml;
+    parseToken(state, token); // Re-parse in new mode
+  }
+}
-  const element: ASTNode = {
-    type: ASTNodeType.ELEMENT,
-    tagName,
-    attributes: token.attributes || {},
-    children: [],
-    parent: currentParent,
-    isSelfClosing: token.isSelfClosing || VOID_ELEMENTS.has(tagName),
-    position: token.position
-  };
+function parseTokenInBeforeHtmlMode(state: ParserState, token: Token): void {
+  if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'html') {
+    const html = createElement('html', token.attributes || {});
+    appendChild(state.root, html);
+    state.root.documentElement = html;
+    state.stack.push(html);
+    state.insertionMode = InsertionMode.BeforeHead;
+  } else if (token.type === TokenType.COMMENT) {
+    parseComment(state, token);
+  } else if (token.type === TokenType.DOCTYPE) {
+    // Ignore
+  } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
+    // Ignore whitespace
+  } else {
+    const html = createElement('html', {});
+    appendChild(state.root, html);
+    state.root.documentElement = html;
+    state.stack.push(html);
+    state.insertionMode = InsertionMode.BeforeHead;
+    parseToken(state, token);
+  }
+}
-  if (currentParent.children) {
-    currentParent.children.push(element);
+function parseTokenInBeforeHeadMode(state: ParserState, token: Token): void {
+  if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'head') {
+    const head = createElement('head', token.attributes || {});
+    appendChild(getCurrentParent(state), head);
+    state.root.head = head;
+    state.stack.push(head);
+    state.insertionMode = InsertionMode.InHead;
+  } else if (token.type === TokenType.COMMENT) {
+    parseComment(state, token);
+  } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
+    // Ignore whitespace
+  } else {
+    const head = createElement('head', {});
+    appendChild(getCurrentParent(state), head);
+    state.root.head = head;
+    state.stack.push(head);
+    state.insertionMode = InsertionMode.InHead;
+    parseToken(state, token);
   }
+}
-  if (!element.isSelfClosing) {
+function parseOpenTag(state: ParserState, token: Token): void {
+  const tagName = token.value.toLowerCase();
+  const currentParent = getCurrentParent(state);
+  const element = createElement(tagName, token.attributes || {});
+  appendChild(currentParent, element);
+  if (!token.isSelfClosing && !VOID_ELEMENTS.has(tagName)) {
     state.stack.push(element);
   }
 }
-function parseCloseTag(state: ParserState, token: Token): void {
-  const tagName = token.value.toLowerCase();
+function parseTokenInInHeadMode(state: ParserState, token: Token): void {
+  const currentElement = getCurrentElement(state);
+  const currentTagName = currentElement?.tagName?.toLowerCase();
-  let found = false;
-  for (let i = state.stack.length - 1; i >= 0; i--) {
-    const element = state.stack[i]!;
-    if (element.tagName === tagName) {
-      while (state.stack.length > i + 1) {
-        const unclosedElement = state.stack.pop()!;
-        addError(state, `Unclosed tag: ${unclosedElement.tagName}`, token.position?.start || 0);
-      }
+  if (RAW_TEXT_ELEMENTS.has(currentTagName)) {
+    if (token.type === TokenType.TEXT) {
+      parseText(state, token);
+      return;
+    } else if (token.type === TokenType.TAG_CLOSE && token.value.toLowerCase() === currentTagName) {
       state.stack.pop();
-      found = true;
-      break;
+      return;
+    }
+  }
+  if (token.type === TokenType.TAG_OPEN) {
+    const tagName = token.value.toLowerCase();
+    if (tagName === 'title' || tagName === 'style' || tagName === 'script' || tagName === 'noscript') {
+      parseOpenTag(state, token);
+    } else if (tagName === 'meta' || tagName === 'link' || tagName === 'base') {
+      parseOpenTag(state, token);
+    } else if (tagName === 'head') {
+    } else {
+      state.stack.pop();
+      state.insertionMode = InsertionMode.AfterHead;
+      parseToken(state, token);
+    }
+  } else if (token.type === TokenType.TAG_CLOSE) {
+    const tagName = token.value.toLowerCase();
+    if (tagName === 'head') {
+      state.stack.pop();
+      state.insertionMode = InsertionMode.AfterHead;
+    } else if (tagName === 'title' || tagName === 'style' || tagName === 'script' || tagName === 'noscript') {
+      if (currentTagName === tagName) {
+        state.stack.pop();
+      }
     }
+  } else if (token.type === TokenType.COMMENT) {
+    parseComment(state, token);
+  } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
+  } else {
+    state.stack.pop();
+    state.insertionMode = InsertionMode.AfterHead;
+    parseToken(state, token);
   }
+}
-  if (!found) {
-    addError(state, `Unexpected closing tag: ${tagName}`, token.position?.start || 0);
+function parseTokenInAfterHeadMode(state: ParserState, token: Token): void {
+  if (token.type === TokenType.TAG_OPEN && token.value.toLowerCase() === 'body') {
+    const body = createElement('body', token.attributes || {});
+    appendChild(getCurrentParent(state), body);
+    state.root.body = body;
+    state.stack.push(body);
+    state.insertionMode = InsertionMode.InBody;
+  } else if (token.type === TokenType.COMMENT) {
+    parseComment(state, token);
+  } else if (token.type === TokenType.TEXT && token.value.trim() === '') {
+    // Ignore whitespace
+  } else {
+    const body = createElement('body', {});
+    appendChild(getCurrentParent(state), body);
+    state.root.body = body;
+    state.stack.push(body);
+    state.insertionMode = InsertionMode.InBody;
+    parseToken(state, token);
+  }
+}
+function parseTokenInInBodyMode(state: ParserState, token: Token): void {
+  if (token.type === TokenType.TAG_OPEN) {
+    const tagName = token.value.toLowerCase();
+    handleAutoClosing(state, tagName);
+    const currentParent = getCurrentParent(state);
+    const element = createElement(tagName, token.attributes || {});
+    appendChild(currentParent, element);
+    if (!token.isSelfClosing && !VOID_ELEMENTS.has(tagName)) {
+      state.stack.push(element);
+    }
+  } else if (token.type === TokenType.TAG_CLOSE) {
+    const tagName = token.value.toLowerCase();
+    // Generate implied end tags
+    const impliedEndTags = ['dd', 'dt', 'li', 'option', 'optgroup', 'p', 'rb', 'rp', 'rt', 'rtc'];
+    while (state.stack.length > 1) { // Don't pop document
+      const currentElement = getCurrentElement(state);
+      if (!currentElement || !impliedEndTags.includes(currentElement.tagName.toLowerCase()) || currentElement.tagName.toLowerCase() === tagName) {
+        break;
+      }
+      state.stack.pop();
+      addError(state, `Implied end tag: ${currentElement.tagName}`, token.position?.offset || 0);
+    }
+    const currentElement = getCurrentElement(state);
+    if (currentElement && currentElement.tagName.toLowerCase() === tagName) {
+      state.stack.pop();
+    } else {
+      // For now, just ignore unmatched closing tags
+      // TODO: Implement full adoption agency algorithm
+      addError(state, `Unmatched closing tag: ${tagName}`, token.position?.offset || 0);
+    }
+  } else if (token.type === TokenType.TEXT) {
+    parseText(state, token);
+  } else if (token.type === TokenType.COMMENT) {
+    parseComment(state, token);
+  } else if (token.type === TokenType.CDATA) {
+    parseCDATA(state, token);
+  } else if (token.type === TokenType.DOCTYPE) {
+    // Ignore
+  } else if (token.type === TokenType.PROCESSING_INSTRUCTION) {
+    parseProcessingInstruction(state, token);
   }
 }
@@ -192,76 +468,134 @@ function parseText(state: ParserState, token: Token): void {
     return;
   }
-  const textNode: ASTNode = {
-    type: ASTNodeType.TEXT,
-    content,
-    parent: currentParent,
-    position: token.position
-  };
-  if (currentParent.children) {
-    currentParent.children.push(textNode);
-  }
+  const textNode = createTextNode(content);
+  appendChild(currentParent, textNode);
 }
 function parseComment(state: ParserState, token: Token): void {
   const currentParent = getCurrentParent(state);
-  const commentNode: ASTNode = {
-    type: ASTNodeType.COMMENT,
-    content: token.value,
-    parent: currentParent,
-    position: token.position
-  };
-  if (currentParent.children) {
-    currentParent.children.push(commentNode);
-  }
+  const commentNode = createComment(token.value);
+  appendChild(currentParent, commentNode);
 }
 function parseCDATA(state: ParserState, token: Token): void {
-  const currentParent = getCurrentParent(state);
-  const cdataNode: ASTNode = {
-    type: ASTNodeType.CDATA,
-    content: token.value,
-    parent: currentParent,
-    position: token.position
-  };
-  if (currentParent.children) {
-    currentParent.children.push(cdataNode);
-  }
+  // TODO: implement CDATA
 }
 function parseDoctype(state: ParserState, token: Token): void {
-  const currentParent = getCurrentParent(state);
-  const doctypeNode: ASTNode = {
-    type: ASTNodeType.DOCTYPE,
-    content: token.value,
-    parent: currentParent,
-    position: token.position
-  };
-  if (currentParent.children) {
-    currentParent.children.push(doctypeNode);
-  }
+  const doctype = createDoctype(token.value || 'html');
+  appendChild(state.root, doctype);
+  state.root.doctype = doctype;
 }
 function parseProcessingInstruction(state: ParserState, token: Token): void {
-  const currentParent = getCurrentParent(state);
-  const piNode: ASTNode = {
-    type: ASTNodeType.PROCESSING_INSTRUCTION,
-    content: token.value,
-    parent: currentParent,
-    position: token.position
-  };
+  // TODO: implement ProcessingInstruction
+}
-  if (currentParent.children) {
-    currentParent.children.push(piNode);
+function runAdoptionAgencyAlgorithm(state: ParserState, tagName: string, token: Token): void {
+  // HTML5 Adoption Agency Algorithm - simplified but more correct implementation
+  // 1. If the current node is an HTML element whose tag name matches the token's tag name,
+  // then pop the current node off the stack of open elements and abort these steps.
+  const currentElement = getCurrentElement(state);
+  if (currentElement && currentElement.tagName.toLowerCase() === tagName) {
+    state.stack.pop();
+    return;
   }
+  // 2. Let outer loop counter be 0
+  let outerLoopCounter = 0;
+  const formattingElements = ['a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'];
+  while (outerLoopCounter < 8) { // Prevent infinite loops
+    outerLoopCounter++;
+    // 3. Let the formatting element be the last element in the list of active formatting elements
+    // that is between the end of the list and the last scope marker or the start of the list,
+    // if any, that has the same tag name as the token.
+    // For simplicity, find the innermost element with matching tag name
+    let formattingElementIndex = -1;
+    for (let i = state.stack.length - 1; i >= 0; i--) {
+      const element = state.stack[i];
+      if (element.tagName && element.tagName.toLowerCase() === tagName && formattingElements.includes(tagName)) {
+        formattingElementIndex = i;
+        break;
+      }
+    }
+    if (formattingElementIndex === -1) {
+      // No formatting element found, just find any element with matching tag name
+      for (let i = state.stack.length - 1; i >= 0; i--) {
+        const element = state.stack[i];
+        if (element.tagName && element.tagName.toLowerCase() === tagName) {
+          formattingElementIndex = i;
+          break;
+        }
+      }
+    }
+    if (formattingElementIndex === -1) {
+      // No matching element found, ignore the token
+      addError(state, `Stray end tag: ${tagName}`, token.position?.offset || 0);
+      return;
+    }
+    const formattingElement = state.stack[formattingElementIndex];
+    // 4. If there is no element in the stack of open elements that has the same tag name as the
+    // formatting element, then remove the element from the list of active formatting elements
+    // and abort these steps.
+    let openElementIndex = -1;
+    for (let i = state.stack.length - 1; i >= 0; i--) {
+      if (state.stack[i] === formattingElement) {
+        openElementIndex = i;
+        break;
+      }
+    }
+    if (openElementIndex === -1) {
+      // Element not in stack, ignore
+      return;
+    }
+    // 5. If the element is not in the stack of open elements, then this is a parse error;
+    // remove the element from the list of active formatting elements and abort these steps.
+    // (Already checked above)
+    // 6. Let the furthest block be the topmost node in the stack of open elements that is lower
+    // in the stack than the formatting element, and is an element in the special category.
+    const specialElements = ['address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'listing', 'main', 'menu', 'nav', 'ol', 'p', 'pre', 'section', 'table', 'ul', 'xmp'];
+    let furthestBlockIndex = -1;
+    for (let i = openElementIndex + 1; i < state.stack.length; i++) {
+      const element = state.stack[i];
+      if (element.tagName && specialElements.includes(element.tagName.toLowerCase())) {
+        furthestBlockIndex = i;
+        break;
+      }
+    }
+    if (furthestBlockIndex === -1) {
+      // No special element found, just pop elements until we reach the formatting element
+      while (state.stack.length > openElementIndex + 1) {
+        state.stack.pop();
+      }
+      state.stack.pop(); // Pop the formatting element
+      return;
+    }
+    // 7. Simplified: just pop everything until the formatting element
+    while (state.stack.length > openElementIndex + 1) {
+      state.stack.pop();
+    }
+    state.stack.pop(); // Pop the formatting element
+    return;
+  }
+  // If we get here, something went wrong, ignore the token
+  addError(state, `Adoption agency gave up on: ${tagName}`, token.position?.offset || 0);
 }
 function handleAutoClosing(state: ParserState, tagName: string): void {
@@ -269,19 +603,19 @@ function handleAutoClosing(state: ParserState, tagName: string): void {
   if (!autoCloseList) return;
   const currentElement = getCurrentElement(state);
-  if (currentElement && currentElement.tagName && autoCloseList.includes(currentElement.tagName)) {
+  if (currentElement && currentElement.tagName && autoCloseList.includes(currentElement.tagName.toLowerCase())) {
     state.stack.pop();
   }
 }
-function getCurrentParent(state: ParserState): ASTNode {
-  return state.stack[state.stack.length - 1]!;
+function getCurrentParent(state: ParserState): any {
+  return state.stack[state.stack.length - 1];
 }
-function getCurrentElement(state: ParserState): ASTNode | null {
+function getCurrentElement(state: ParserState): any {
   for (let i = state.stack.length - 1; i >= 0; i--) {
-    const element = state.stack[i]!;
-    if (element.type === ASTNodeType.ELEMENT) {
+    const element = state.stack[i];
+    if (element.nodeType === 1) { // ELEMENT_NODE
       return element;
     }
   }
@@ -306,7 +640,7 @@ function addError(state: ParserState, message: string, position: number): void {
   });
 }
-function shouldSkipWhitespace(parent: ASTNode): boolean {
+function shouldSkipWhitespace(parent: any): boolean {
   const skipWhitespaceIn = new Set([
     'html', 'head', 'body', 'table', 'tbody', 'thead', 'tfoot', 'tr',
     'ul', 'ol', 'dl', 'select', 'optgroup'
@@ -314,42 +648,3 @@ function shouldSkipWhitespace(parent: ASTNode): boolean {
   return parent.tagName ? skipWhitespaceIn.has(parent.tagName) : false;
 }
-export function traverseAST(node: ASTNode, callback: (node: ASTNode) => void): void {
-  callback(node);
-  if (node.children) {
-    for (const child of node.children) {
-      traverseAST(child, callback);
-    }
-  }
-}
-export function findNodesByTagName(root: ASTNode, tagName: string): ASTNode[] {
-  const results: ASTNode[] = [];
-  traverseAST(root, (node) => {
-    if (node.type === ASTNodeType.ELEMENT && node.tagName === tagName.toLowerCase()) {
-      results.push(node);
-    }
-  });
-  return results;
-}
-export function findNodesByAttribute(root: ASTNode, attrName: string, attrValue?: string): ASTNode[] {
-  const results: ASTNode[] = [];
-  traverseAST(root, (node) => {
-    if (node.type === ASTNodeType.ELEMENT && node.attributes) {
-      const hasAttr = attrName in node.attributes;
-      const valueMatches = attrValue === undefined || node.attributes[attrName] === attrValue;
-      if (hasAttr && valueMatches) {
-        results.push(node);
-      }
-    }
-  });
-  return results;
-}