npm - @tkeron/html-parser - Versions diffs - 0.1.0 - Mend

@tkeron/html-parser 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

package/.github/workflows/npm_deploy.yml +24 -0
package/LICENSE +21 -0
package/README.md +120 -0
package/bun.lock +29 -0
package/index.ts +18 -0
package/package.json +25 -0
package/src/css-selector.ts +172 -0
package/src/dom-simulator.ts +592 -0
package/src/dom-types.ts +78 -0
package/src/parser.ts +355 -0
package/src/tokenizer.ts +413 -0
package/tests/advanced.test.ts +487 -0
package/tests/api-integration.test.ts +114 -0
package/tests/dom-extended.test.ts +173 -0
package/tests/dom.test.ts +482 -0
package/tests/google-dom.test.ts +118 -0
package/tests/google-homepage.txt +13 -0
package/tests/official/README.md +87 -0
package/tests/official/acid/acid-tests.test.ts +309 -0
package/tests/official/final-output/final-output.test.ts +361 -0
package/tests/official/html5lib/tokenizer-utils.ts +204 -0
package/tests/official/html5lib/tokenizer.test.ts +184 -0
package/tests/official/html5lib/tree-construction-utils.ts +208 -0
package/tests/official/html5lib/tree-construction.test.ts +250 -0
package/tests/official/validator/validator-tests.test.ts +237 -0
package/tests/official/validator-nu/validator-nu.test.ts +335 -0
package/tests/official/whatwg/whatwg-tests.test.ts +205 -0
package/tests/official/wpt/wpt-tests.test.ts +409 -0
package/tests/parser.test.ts +642 -0
package/tests/selectors.test.ts +65 -0
package/tests/test-page-0.txt +362 -0
package/tests/tokenizer.test.ts +666 -0
package/tsconfig.json +25 -0

package/tests/official/html5lib/tokenizer.test.ts ADDED Viewed

@@ -0,0 +1,184 @@
+import { describe, it } from 'bun:test';
+import {
+  loadHTML5libTokenizerTests,
+  runHTML5libTokenizerTestSuite,
+  type HTML5libTokenizerTestSuite
+} from './tokenizer-utils';
+// Hand-picked cases in the HTML5lib tokenizer test shape, inlined so the suite needs no fixture files
+const basicTokenizerTests: HTML5libTokenizerTestSuite = {
+  tests: [
+    {
+      description: "Correct Doctype lowercase",
+      input: "<!DOCTYPE html>",
+      output: [["DOCTYPE", "html", null, null, true]]
+    },
+    {
+      description: "Correct Doctype uppercase",
+      input: "<!DOCTYPE HTML>",
+      output: [["DOCTYPE", "html", null, null, true]]
+    },
+    {
+      description: "Single Start Tag",
+      input: "<h>",
+      output: [["StartTag", "h", {}]]
+    },
+    {
+      description: "Start Tag w/attribute",
+      input: "<h a='b'>",
+      output: [["StartTag", "h", { a: "b" }]]
+    },
+    {
+      description: "Start/End Tag",
+      input: "<h></h>",
+      output: [["StartTag", "h", {}], ["EndTag", "h"]]
+    },
+    {
+      description: "Simple comment",
+      input: "<!--comment-->",
+      output: [["Comment", "comment"]]
+    },
+    {
+      description: "Character data",
+      input: "Hello World",
+      output: [["Character", "Hello World"]]
+    },
+    {
+      description: "Multiple attributes",
+      input: "<h a='b' c='d'>",
+      output: [["StartTag", "h", { a: "b", c: "d" }]]
+    },
+    {
+      description: "Self-closing tag",
+      input: "<br/>",
+      output: [["StartTag", "br", {}, true]]
+    },
+    {
+      description: "Empty comment",
+      input: "<!---->",
+      output: [["Comment", ""]]
+    },
+    {
+      description: "Text with entities",
+      input: "&amp;&lt;&gt;",
+      output: [["Character", "&<>"]]
+    },
+    {
+      description: "Numeric entity",
+      input: "&#65;",
+      output: [["Character", "A"]]
+    },
+    {
+      description: "Hex entity",
+      input: "&#x41;",
+      output: [["Character", "A"]]
+    },
+    {
+      description: "Unquoted attribute",
+      input: "<h a=b>",
+      output: [["StartTag", "h", { a: "b" }]]
+    },
+    {
+      description: "Tag with mixed case",
+      input: "<DiV>",
+      output: [["StartTag", "div", {}]]
+    }
+  ]
+};
+// Character-reference cases: named references with and without ';', plus bare '&' and '&#' inputs
+const entityTests: HTML5libTokenizerTestSuite = {
+  tests: [
+    {
+      description: "Entity with trailing semicolon",
+      input: "I'm &not;it",
+      output: [["Character", "I'm ¬it"]]
+    },
+    {
+      description: "Entity without trailing semicolon",
+      input: "I'm &notit",
+      output: [["Character", "I'm ¬it"]],
+      errors: [
+        { code: "missing-semicolon-after-character-reference", line: 1, col: 9 }
+      ]
+    },
+    {
+      description: "Ampersand EOF",
+      input: "&",
+      output: [["Character", "&"]]
+    },
+    {
+      description: "Unfinished entity",
+      input: "&f",
+      output: [["Character", "&f"]]
+    },
+    {
+      description: "Ampersand, number sign",
+      input: "&#",
+      output: [["Character", "&#"]],
+      errors: [
+        { code: "absence-of-digits-in-numeric-character-reference", line: 1, col: 3 }
+      ]
+    }
+  ]
+};
+// Comment cases: dash handling, an unterminated comment, the short "<!-->" form and a nested opener
+const commentTests: HTML5libTokenizerTestSuite = {
+  tests: [
+    {
+      description: "Comment, Central dash no space",
+      input: "<!----->",
+      output: [["Comment", "-"]]
+    },
+    {
+      description: "Comment, two central dashes",
+      input: "<!-- --comment -->",
+      output: [["Comment", " --comment "]]
+    },
+    {
+      description: "Unfinished comment",
+      input: "<!--comment",
+      output: [["Comment", "comment"]],
+      errors: [
+        { code: "eof-in-comment", line: 1, col: 12 }
+      ]
+    },
+    {
+      description: "Short comment",
+      input: "<!-->",
+      output: [["Comment", ""]],
+      errors: [
+        { code: "abrupt-closing-of-empty-comment", line: 1, col: 5 }
+      ]
+    },
+    {
+      description: "Nested comment",
+      input: "<!-- <!--test-->",
+      output: [["Comment", " <!--test"]],
+      errors: [
+        { code: "nested-comment", line: 1, col: 10 }
+      ]
+    }
+  ]
+};
+// Register every embedded suite, in order, under a single top-level group
+describe('HTML5lib Tokenizer Tests', () => {
+  const embeddedSuites: Array<[HTML5libTokenizerTestSuite, string]> = [
+    [basicTokenizerTests, 'Basic Tokenizer'],
+    [entityTests, 'Entity Handling'],
+    [commentTests, 'Comment Handling']
+  ];
+  for (const [suite, label] of embeddedSuites) {
+    runHTML5libTokenizerTestSuite(suite, label);
+  }
+});
+// Smoke test for the loader entry point that external test files would go through
+describe('HTML5lib External Tests', () => {
+  it('should be able to load external test files', async () => {
+    // Real HTML5lib files would be read from disk and handed to the loader, e.g.:
+    //   const testData = await Bun.file('/path/to/test1.test').text();
+    //   await loadHTML5libTokenizerTests(testData, 'External Test');
+    // Until such files are available, feed the loader a serialized embedded suite instead
+    const serializedSuite = JSON.stringify(basicTokenizerTests);
+    await loadHTML5libTokenizerTests(serializedSuite, 'Loaded Basic Tests');
+    // NOTE(review): there is no assertion here; the test passes whenever the loader does not throw or reject
+  });
+});

package/tests/official/html5lib/tree-construction-utils.ts ADDED Viewed

@@ -0,0 +1,208 @@
+import { expect, describe, it } from 'bun:test';
+import { parse } from '../../../src/parser';
+import { tokenize } from '../../../src/tokenizer';
+import type { ASTNode } from '../../../src/parser';
+// HTML5lib tree construction test format: one test case as produced by parseHTML5libDATFile
+export interface HTML5libTreeTest {
+  data: string; // lines of the #data section joined with '\n'; this is the input that gets tokenized and parsed
+  errors: string[]; // non-blank lines of the #errors section (empty array when there are none)
+  newErrors?: string[]; // non-blank lines of the #new-errors section, when that section is present
+  documentFragment?: string; // content of the #document-fragment section, when that section is present
+  scriptOff?: boolean; // set to true when a #script-off header appears in the case
+  scriptOn?: boolean; // set to true when a #script-on header appears in the case
+  document: string; // lines of the #document section joined with '\n'; the expected tree dump
+}
+/**
+ * Parses HTML5lib DAT format test files.
+ *
+ * The input is scanned line by line. A line starting with `#` is a section
+ * header (`#data`, `#errors`, `#new-errors`, `#document-fragment`,
+ * `#script-off`, `#script-on`, `#document`); every other line belongs to the
+ * section opened by the most recent header. A `#data` header always starts a
+ * new test case, so cases are split correctly whether or not a blank line
+ * separates them, and blank lines inside `#data` or `#document` are kept.
+ *
+ * @param content - Full text of a DAT file; `\n` and `\r\n` line endings are accepted.
+ * @returns The test cases that have both a non-empty `#data` and a non-empty
+ *   `#document` section, in file order. Incomplete cases are skipped.
+ */
+export function parseHTML5libDATFile(content: string): HTML5libTreeTest[] {
+  const tests: HTML5libTreeTest[] = [];
+  let test: Partial<HTML5libTreeTest> = {
+    errors: [] // Initialize errors as empty array
+  };
+  let currentSection = '';
+  let currentContent: string[] = [];
+  // Stores the lines collected so far into the field of the section that is currently open.
+  const saveSection = (): void => {
+    switch (currentSection) {
+      case 'data':
+        test.data = currentContent.join('\n');
+        break;
+      case 'errors':
+        test.errors = currentContent.filter(l => l.trim());
+        break;
+      case 'new-errors':
+        test.newErrors = currentContent.filter(l => l.trim());
+        break;
+      case 'document-fragment':
+        test.documentFragment = currentContent.join('\n');
+        break;
+      case 'document': {
+        // Blank lines after the tree are separators between cases, not part of the tree.
+        let end = currentContent.length;
+        while (end > 0 && currentContent[end - 1]?.trim() === '') {
+          end--;
+        }
+        test.document = currentContent.slice(0, end).join('\n');
+        break;
+      }
+    }
+  };
+  // Closes the case being built, keeps it only when it is complete, and starts a fresh one.
+  const finishTest = (): void => {
+    saveSection();
+    if (test.data && test.document) {
+      tests.push(test as HTML5libTreeTest);
+    }
+    test = { errors: [] };
+    currentSection = '';
+    currentContent = [];
+  };
+  for (const line of content.split(/\r?\n/)) {
+    if (!line.startsWith('#')) {
+      currentContent.push(line);
+      continue;
+    }
+    const header = line.substring(1);
+    if (header === 'data') {
+      // A new #data header ends the previous case even without a blank separator line.
+      finishTest();
+    } else {
+      saveSection();
+    }
+    // Start new section
+    currentSection = header;
+    currentContent = [];
+    // Script flags are bare headers: seeing one is enough to set the flag.
+    if (header === 'script-off') {
+      test.scriptOff = true;
+    } else if (header === 'script-on') {
+      test.scriptOn = true;
+    }
+  }
+  // Flush the case that was still open when the input ended.
+  finishTest();
+  return tests;
+}
+/**
+ * Converts AST to HTML5lib tree format.
+ *
+ * Each node becomes one line prefixed with `| ` plus two spaces per nesting
+ * level; element attributes follow their element, one level deeper, ordered
+ * by attribute name. A DOCUMENT node emits no line of its own, so its
+ * children are rendered at the document's own depth (top-level nodes read
+ * `| <html>`, as in HTML5lib expected trees). TEXT nodes whose content is
+ * empty or whitespace-only are omitted.
+ *
+ * @param node - Root of the (sub)tree to serialize.
+ * @param depth - Nesting level of `node`; defaults to 0 for the root.
+ * @returns The tree lines in document order.
+ */
+export function convertASTToHTML5libTree(node: ASTNode, depth: number = 0): string[] {
+  const lines: string[] = [];
+  const indent = '| ' + '  '.repeat(depth);
+  switch (node.type) {
+    case 'DOCUMENT':
+      // Document node doesn't have a line representation
+      break;
+    case 'DOCTYPE':
+      lines.push(`${indent}<!DOCTYPE ${node.tagName || 'html'}>`);
+      break;
+    case 'ELEMENT': {
+      const tagName = node.tagName || 'unknown';
+      lines.push(`${indent}<${tagName}>`);
+      // Add attributes, ordered by name only (the value must not influence the order)
+      if (node.attributes) {
+        const byName = Object.entries(node.attributes).sort(([a], [b]) =>
+          a < b ? -1 : a > b ? 1 : 0
+        );
+        for (const [name, value] of byName) {
+          lines.push(`${indent}  ${name}="${value}"`);
+        }
+      }
+      break;
+    }
+    case 'TEXT':
+      if (node.content && node.content.trim()) {
+        lines.push(`${indent}"${node.content}"`);
+      }
+      break;
+    case 'COMMENT':
+      lines.push(`${indent}<!-- ${node.content || ''} -->`);
+      break;
+    case 'CDATA':
+      lines.push(`${indent}<![CDATA[${node.content || ''}]]>`);
+      break;
+  }
+  // Add children. A DOCUMENT node contributes no line, so its children stay at the same depth.
+  const childDepth = node.type === 'DOCUMENT' ? depth : depth + 1;
+  if (node.children) {
+    for (const child of node.children) {
+      lines.push(...convertASTToHTML5libTree(child, childDepth));
+    }
+  }
+  return lines;
+}
+/**
+ * Canonicalizes a tree dump so two dumps can be compared as plain strings:
+ * every line is trimmed and lines that end up empty are discarded.
+ */
+export function normalizeHTML5libTree(tree: string): string {
+  const kept: string[] = [];
+  for (const rawLine of tree.split('\n')) {
+    const trimmed = rawLine.trim();
+    if (trimmed.length > 0) {
+      kept.push(trimmed);
+    }
+  }
+  return kept.join('\n');
+}
+/**
+ * Runs a single HTML5lib tree construction test.
+ *
+ * Registers one `it` case that tokenizes and parses `test.data`, serializes
+ * the resulting AST with `convertASTToHTML5libTree`, and expects the result
+ * to equal `test.document` once both sides have been normalized.
+ *
+ * NOTE(review): `documentFragment`, `scriptOff`, `scriptOn` and the expected
+ * `errors` are not consulted here; every case goes through the same
+ * `tokenize` then `parse` call.
+ *
+ * @param test - Parsed DAT test case.
+ * @param testName - Title of the registered test.
+ */
+export function runHTML5libTreeTest(test: HTML5libTreeTest, testName: string): void {
+  it(testName, () => {
+    const { data, document: expectedTree } = test;
+    // Parse the HTML
+    const tokens = tokenize(data);
+    const ast = parse(tokens);
+    // Convert to HTML5lib tree format
+    const actualTreeLines = convertASTToHTML5libTree(ast);
+    const actualTree = actualTreeLines.join('\n');
+    // Normalize both trees for comparison
+    const normalizedActual = normalizeHTML5libTree(actualTree);
+    const normalizedExpected = normalizeHTML5libTree(expectedTree);
+    // Compare trees
+    expect(normalizedActual).toBe(normalizedExpected);
+  });
+}
+/**
+ * Registers a `describe` group for a list of tree construction tests, one
+ * test per entry, each titled with its 1-based position and a preview of its input
+ */
+export function runHTML5libTreeTestSuite(tests: HTML5libTreeTest[], suiteName: string): void {
+  const groupTitle = `HTML5lib Tree Construction Tests: ${suiteName}`;
+  describe(groupTitle, () => {
+    tests.forEach((treeTest, position) => {
+      // Preview: first 50 characters of the input with newlines flattened to spaces
+      const preview = treeTest.data.substring(0, 50).replace(/\n/g, ' ');
+      runHTML5libTreeTest(treeTest, `Test ${position + 1}: ${preview}...`);
+    });
+  });
+}
+/**
+ * Convenience entry point: parses DAT-formatted text and immediately
+ * registers the resulting cases as a suite named `suiteName`
+ */
+export async function loadHTML5libTreeTests(testData: string, suiteName: string): Promise<void> {
+  runHTML5libTreeTestSuite(parseHTML5libDATFile(testData), suiteName);
+}
+/**
+ * Checks that a test case is usable: `data` and `document` must be
+ * non-empty and `errors` must be defined (an empty list is fine)
+ */
+export function validateHTML5libTreeTest(test: HTML5libTreeTest): boolean {
+  if (!test.data || !test.document) {
+    return false;
+  }
+  return test.errors !== undefined;
+}

package/tests/official/html5lib/tree-construction.test.ts ADDED Viewed

@@ -0,0 +1,250 @@
+import { describe, it, expect } from "bun:test";
+import {
+  loadHTML5libTreeTests,
+  runHTML5libTreeTestSuite,
+  parseHTML5libDATFile,
+  type HTML5libTreeTest,
+} from "./tree-construction-utils";
+// Sample HTML5lib tree construction tests in DAT format.
+// Each case lists its input (#data), its expected parse errors (#errors) and its expected tree (#document).
+// NOTE(review): the last case places the comment after <html>; the HTML parsing algorithm appends a comment
+// seen before any element to the Document ahead of <html>. Confirm this expected tree is intended.
+const basicTreeTestData = `#data
+Test
+#errors
+(1,0): expected-doctype-but-got-chars
+#document
+| <html>
+|   <head>
+|   <body>
+|     "Test"
+#data
+<p>One<p>Two
+#errors
+(1,3): expected-doctype-but-got-start-tag
+#document
+| <html>
+|   <head>
+|   <body>
+|     <p>
+|       "One"
+|     <p>
+|       "Two"
+#data
+<html>
+#errors
+(1,6): expected-doctype-but-got-start-tag
+#document
+| <html>
+|   <head>
+|   <body>
+#data
+<head>
+#errors
+(1,6): expected-doctype-but-got-start-tag
+#document
+| <html>
+|   <head>
+|   <body>
+#data
+<body>
+#errors
+(1,6): expected-doctype-but-got-start-tag
+#document
+| <html>
+|   <head>
+|   <body>
+#data
+<html><head></head><body></body>
+#errors
+(1,6): expected-doctype-but-got-start-tag
+#document
+| <html>
+|   <head>
+|   <body>
+#data
+Line1<br>Line2
+#errors
+(1,0): expected-doctype-but-got-chars
+#document
+| <html>
+|   <head>
+|   <body>
+|     "Line1"
+|     <br>
+|     "Line2"
+#data
+<div>hello</div>
+#errors
+(1,5): expected-doctype-but-got-start-tag
+#document
+| <html>
+|   <head>
+|   <body>
+|     <div>
+|       "hello"
+#data
+<p><b>bold</b></p>
+#errors
+(1,3): expected-doctype-but-got-start-tag
+#document
+| <html>
+|   <head>
+|   <body>
+|     <p>
+|       <b>
+|         "bold"
+#data
+<!--comment-->
+#errors
+(1,0): expected-doctype-but-got-chars
+#document
+| <html>
+|   <head>
+|   <body>
+| <!-- comment -->`;
+// DAT cases whose input starts with a DOCTYPE. Every case has an empty #errors section and expects a bare
+// "<!DOCTYPE html>" line in the tree, including the PUBLIC and SYSTEM variants.
+const doctypeTestData = `#data
+<!DOCTYPE html>
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+|   <head>
+|   <body>
+#data
+<!DOCTYPE html><html><head><title>Test</title></head><body><p>Hello</p></body></html>
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+|   <head>
+|     <title>
+|       "Test"
+|   <body>
+|     <p>
+|       "Hello"
+#data
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+|   <head>
+|   <body>
+#data
+<!DOCTYPE html SYSTEM "about:legacy-compat">
+#errors
+#document
+| <!DOCTYPE html>
+| <html>
+|   <head>
+|   <body>
+#data
+// DAT cases with mis-nested markup. Each lists several expected parse errors and the tree expected for that input.
+const errorHandlingTestData = `#data
+<b><table><td></b><i></table>
+#errors
+(1,3): expected-doctype-but-got-start-tag
+(1,14): unexpected-cell-in-table-body
+(1,18): unexpected-end-tag
+(1,29): unexpected-cell-end-tag
+(1,29): expected-closing-tag-but-got-eof
+#document
+| <html>
+|   <head>
+|   <body>
+|     <b>
+|       <table>
+|         <tbody>
+|           <tr>
+|             <td>
+|               <i>
+#data
+<p><b><div><marquee></p></b></div>
+#errors
+(1,3): expected-doctype-but-got-start-tag
+(1,11): unexpected-end-tag
+(1,24): unexpected-end-tag
+(1,28): unexpected-end-tag
+(1,34): end-tag-too-early
+(1,34): expected-closing-tag-but-got-eof
+#document
+| <html>
+|   <head>
+|   <body>
+|     <p>
+|       <b>
+|     <div>
+|       <b>
+|         <marquee>
+|           <p>
+#data
+<a><p><a></a></p></a>
+#errors
+(1,3): expected-doctype-but-got-start-tag
+(1,9): unexpected-start-tag-implies-end-tag
+(1,9): adoption-agency-1.3
+(1,21): unexpected-end-tag
+#document
+| <html>
+|   <head>
+|   <body>
+|     <a>
+|     <p>
+|       <a>
+|     <a>`;
+// Sanity checks for the DAT parser against the embedded fixtures
+describe("HTML5lib Tree Construction Tests", () => {
+  it("should parse DAT format correctly", () => {
+    const parsed = parseHTML5libDATFile(basicTreeTestData);
+    expect(parsed.length).toBeGreaterThan(0);
+    // Inspect the leading case of the basic fixtures
+    const [first] = parsed;
+    if (first) {
+      expect(first.data).toBe("Test");
+      expect(first.errors.length).toBeGreaterThan(0);
+      expect(first.document).toContain("<html>");
+    }
+  });
+  it("should handle doctype tests", () => {
+    const parsed = parseHTML5libDATFile(doctypeTestData);
+    expect(parsed.length).toBeGreaterThan(0);
+    // The leading doctype case carries no expected errors
+    const [first] = parsed;
+    if (first) {
+      expect(first.data).toBe("<!DOCTYPE html>");
+      expect(first.errors.length).toBe(0);
+      expect(first.document).toContain("<!DOCTYPE html>");
+    }
+  });
+  it("should handle error cases", () => {
+    const parsed = parseHTML5libDATFile(errorHandlingTestData);
+    expect(parsed.length).toBeGreaterThan(0);
+    // The leading error case must expose its error list, starting with the missing-doctype entry
+    const [first] = parsed;
+    if (first) {
+      expect(first.errors.length).toBeGreaterThan(0);
+      const leadingError = first.errors[0];
+      expect(leadingError).toContain(
+        "expected-doctype-but-got-start-tag"
+      );
+    }
+  });
+});