@tkeron/html-parser 0.1.4 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  import { describe, it, expect } from 'bun:test';
2
2
  import { parseHTML } from '../../../index';
3
3
 
4
- // Helper function to normalize text for comparison
4
+
5
5
  function normalizeText(text: string): string {
6
6
  return text
7
7
  .replace(/\s+/g, ' ')
@@ -16,7 +16,7 @@ describe('Final HTML Output Validation', () => {
16
16
  const document = parseHTML(html);
17
17
 
18
18
  expect(document).toBeDefined();
19
- expect(document.nodeType).toBe(9); // DOCUMENT_NODE
19
+ expect(document.nodeType).toBe(9);
20
20
 
21
21
  const div = document.querySelector('div');
22
22
  expect(div).toBeDefined();
@@ -101,9 +101,9 @@ describe('Final HTML Output Validation', () => {
101
101
  expect(p).toBeDefined();
102
102
  expect(p?.textContent).toBe('Content');
103
103
 
104
- // Check for comment node
104
+
105
105
  const commentNode = div?.childNodes[0];
106
- expect(commentNode?.nodeType).toBe(8); // COMMENT_NODE
106
+ expect(commentNode?.nodeType).toBe(8);
107
107
  });
108
108
  });
109
109
 
@@ -135,16 +135,16 @@ describe('Final HTML Output Validation', () => {
135
135
  expect(div).toBeDefined();
136
136
  expect(div?.childNodes.length).toBe(3);
137
137
 
138
- // First text node
139
- expect(div?.childNodes[0]?.nodeType).toBe(3); // TEXT_NODE
138
+
139
+ expect(div?.childNodes[0]?.nodeType).toBe(3);
140
140
  expect(div?.childNodes[0]?.textContent).toBe('Text before ');
141
141
 
142
- // Span element
143
- expect(div?.childNodes[1]?.nodeType).toBe(1); // ELEMENT_NODE
142
+
143
+ expect(div?.childNodes[1]?.nodeType).toBe(1);
144
144
  expect((div?.childNodes[1] as Element)?.tagName).toBe('SPAN');
145
145
 
146
- // Last text node
147
- expect(div?.childNodes[2]?.nodeType).toBe(3); // TEXT_NODE
146
+
147
+ expect(div?.childNodes[2]?.nodeType).toBe(3);
148
148
  expect(div?.childNodes[2]?.textContent).toBe(' text after');
149
149
  });
150
150
 
@@ -264,7 +264,7 @@ describe('Final HTML Output Validation', () => {
264
264
  const document = parseHTML(malformedHTML);
265
265
 
266
266
  expect(document).toBeDefined();
267
- expect(document.nodeType).toBe(9); // DOCUMENT_NODE
267
+ expect(document.nodeType).toBe(9);
268
268
 
269
269
  const divs = document.querySelectorAll('div');
270
270
  expect(divs.length).toBeGreaterThan(0);
@@ -296,7 +296,7 @@ describe('Final HTML Output Validation', () => {
296
296
  const p = document.querySelector('p');
297
297
  expect(p).toBeDefined();
298
298
  expect(p?.textContent).toContain('Special chars:');
299
- // The exact entity handling depends on your implementation
299
+
300
300
  });
301
301
 
302
302
  it('should handle multiple top-level elements', () => {
@@ -321,17 +321,17 @@ describe('Final HTML Output Validation', () => {
321
321
 
322
322
  const document = parseHTML(html);
323
323
 
324
- // Test getElementById
324
+
325
325
  const byId = document.getElementById('test');
326
326
  expect(byId).toBeDefined();
327
327
  expect(byId?.tagName).toBe('DIV');
328
328
 
329
- // Test querySelector
329
+
330
330
  const bySelector = document.querySelector('.container');
331
331
  expect(bySelector).toBeDefined();
332
332
  expect(bySelector?.id).toBe('test');
333
333
 
334
- // Test querySelectorAll
334
+
335
335
  const byClass = document.querySelectorAll('.text');
336
336
  expect(byClass.length).toBe(1);
337
337
  expect(byClass[0]?.textContent).toBe('Hello');
@@ -2,7 +2,7 @@ import { expect, describe, it } from 'bun:test';
2
2
  import { tokenize, TokenType } from '../../../src/tokenizer';
3
3
  import type { Token } from '../../../src/tokenizer';
4
4
 
5
- // HTML5lib tokenizer test format
5
+
6
6
  export interface HTML5libTokenizerTest {
7
7
  description: string;
8
8
  input: string;
@@ -18,12 +18,12 @@ export interface HTML5libTokenizerTestSuite {
18
18
  }
19
19
 
20
20
  export type HTML5libTokenOutput =
21
- | ['StartTag', string, Record<string, string>] // StartTag without self-closing
22
- | ['StartTag', string, Record<string, string>, boolean] // StartTag with self-closing
23
- | ['EndTag', string] // EndTag
24
- | ['Comment', string] // Comment
25
- | ['Character', string] // Character
26
- | ['DOCTYPE', string, string | null, string | null, boolean]; // DOCTYPE
21
+ | ['StartTag', string, Record<string, string>]
22
+ | ['StartTag', string, Record<string, string>, boolean]
23
+ | ['EndTag', string]
24
+ | ['Comment', string]
25
+ | ['Character', string]
26
+ | ['DOCTYPE', string, string | null, string | null, boolean];
27
27
 
28
28
  export interface HTML5libError {
29
29
  code: string;
@@ -31,9 +31,7 @@ export interface HTML5libError {
31
31
  col: number;
32
32
  }
33
33
 
34
- /**
35
- * Converts HTML5lib token format to our internal token format
36
- */
34
+
37
35
  export function convertHTML5libToken(html5libToken: HTML5libTokenOutput): Partial<Token> {
38
36
  const type = html5libToken[0];
39
37
  const nameOrData = html5libToken[1];
@@ -78,9 +76,7 @@ export function convertHTML5libToken(html5libToken: HTML5libTokenOutput): Partia
78
76
  }
79
77
  }
80
78
 
81
- /**
82
- * Converts our internal token format to HTML5lib format for comparison
83
- */
79
+
84
80
  export function convertToHTML5libToken(token: Token): HTML5libTokenOutput {
85
81
  switch (token.type) {
86
82
  case TokenType.DOCTYPE:
@@ -102,9 +98,7 @@ export function convertToHTML5libToken(token: Token): HTML5libTokenOutput {
102
98
  }
103
99
  }
104
100
 
105
- /**
106
- * Normalizes adjacent character tokens as per HTML5lib spec
107
- */
101
+
108
102
  export function normalizeCharacterTokens(tokens: Token[]): Token[] {
109
103
  const normalized: Token[] = [];
110
104
  let currentText = '';
@@ -140,13 +134,11 @@ export function normalizeCharacterTokens(tokens: Token[]): Token[] {
140
134
  return normalized;
141
135
  }
142
136
 
143
- /**
144
- * Runs a single HTML5lib tokenizer test
145
- */
137
+
146
138
  export function runHTML5libTokenizerTest(test: HTML5libTokenizerTest): void {
147
139
  const { description, input, output: expectedOutput, initialStates = ['Data state'] } = test;
148
140
 
149
- // Process double-escaped input if needed
141
+
150
142
  let processedInput = input;
151
143
  if (test.doubleEscaped) {
152
144
  processedInput = processedInput.replace(/\\u([0-9a-fA-F]{4})/g, (match, hex) => {
@@ -156,16 +148,16 @@ export function runHTML5libTokenizerTest(test: HTML5libTokenizerTest): void {
156
148
 
157
149
  for (const initialState of initialStates) {
158
150
  it(`${description} (${initialState})`, () => {
159
- // Tokenize the input
151
+
160
152
  const tokens = tokenize(processedInput);
161
153
 
162
- // Normalize character tokens
154
+
163
155
  const normalizedTokens = normalizeCharacterTokens(tokens);
164
156
 
165
- // Convert to HTML5lib format for comparison
157
+
166
158
  const actualOutput = normalizedTokens.map(convertToHTML5libToken);
167
159
 
168
- // Process expected output if double-escaped
160
+
169
161
  let processedExpectedOutput = expectedOutput;
170
162
  if (test.doubleEscaped) {
171
163
  processedExpectedOutput = expectedOutput.map(token => {
@@ -178,15 +170,13 @@ export function runHTML5libTokenizerTest(test: HTML5libTokenizerTest): void {
178
170
  });
179
171
  }
180
172
 
181
- // Compare outputs
173
+
182
174
  expect(actualOutput).toEqual(processedExpectedOutput);
183
175
  });
184
176
  }
185
177
  }
186
178
 
187
- /**
188
- * Runs all tests from an HTML5lib tokenizer test suite
189
- */
179
+
190
180
  export function runHTML5libTokenizerTestSuite(testSuite: HTML5libTokenizerTestSuite, suiteName: string): void {
191
181
  describe(`HTML5lib Tokenizer Tests: ${suiteName}`, () => {
192
182
  testSuite.tests.forEach(test => {
@@ -195,9 +185,7 @@ export function runHTML5libTokenizerTestSuite(testSuite: HTML5libTokenizerTestSu
195
185
  });
196
186
  }
197
187
 
198
- /**
199
- * Loads and runs HTML5lib tokenizer tests from JSON
200
- */
188
+
201
189
  export async function loadHTML5libTokenizerTests(testData: string, suiteName: string): Promise<void> {
202
190
  const testSuite: HTML5libTokenizerTestSuite = JSON.parse(testData);
203
191
  runHTML5libTokenizerTestSuite(testSuite, suiteName);
@@ -5,7 +5,7 @@ import {
5
5
  type HTML5libTokenizerTestSuite
6
6
  } from './tokenizer-utils';
7
7
 
8
- // Sample HTML5lib tokenizer tests embedded directly
8
+
9
9
  const basicTokenizerTests: HTML5libTokenizerTestSuite = {
10
10
  "tests": [
11
11
  {
@@ -86,7 +86,7 @@ const basicTokenizerTests: HTML5libTokenizerTestSuite = {
86
86
  ]
87
87
  };
88
88
 
89
- // Entity tests
89
+
90
90
  const entityTests: HTML5libTokenizerTestSuite = {
91
91
  "tests": [
92
92
  {
@@ -123,7 +123,7 @@ const entityTests: HTML5libTokenizerTestSuite = {
123
123
  ]
124
124
  };
125
125
 
126
- // Comment tests
126
+
127
127
  const commentTests: HTML5libTokenizerTestSuite = {
128
128
  "tests": [
129
129
  {
@@ -163,7 +163,7 @@ const commentTests: HTML5libTokenizerTestSuite = {
163
163
  ]
164
164
  };
165
165
 
166
- // Run the embedded tests
166
+
167
167
  describe('HTML5lib Tokenizer Tests', () => {
168
168
  runHTML5libTokenizerTestSuite(basicTokenizerTests, 'Basic Tokenizer');
169
169
  runHTML5libTokenizerTestSuite(entityTests, 'Entity Handling');
@@ -3,7 +3,7 @@ import { parse } from '../../../src/parser';
3
3
  import { tokenize } from '../../../src/tokenizer';
4
4
  import type { ASTNode } from '../../../src/parser';
5
5
 
6
- // HTML5lib tree construction test format
6
+
7
7
  export interface HTML5libTreeTest {
8
8
  data: string;
9
9
  errors: string[];
@@ -14,9 +14,7 @@ export interface HTML5libTreeTest {
14
14
  document: string;
15
15
  }
16
16
 
17
- /**
18
- * Parses HTML5lib DAT format test files
19
- */
17
+
20
18
  export function parseHTML5libDATFile(content: string): HTML5libTreeTest[] {
21
19
  const tests: HTML5libTreeTest[] = [];
22
20
  const sections = content.split('\n\n').filter(section => section.trim());
@@ -24,7 +22,7 @@ export function parseHTML5libDATFile(content: string): HTML5libTreeTest[] {
24
22
  for (const section of sections) {
25
23
  const lines = section.split('\n');
26
24
  const test: Partial<HTML5libTreeTest> = {
27
- errors: [] // Initialize errors as empty array
25
+ errors: []
28
26
  };
29
27
 
30
28
  let currentSection = '';
@@ -32,7 +30,7 @@ export function parseHTML5libDATFile(content: string): HTML5libTreeTest[] {
32
30
 
33
31
  for (const line of lines) {
34
32
  if (line.startsWith('#')) {
35
- // Save previous section
33
+
36
34
  if (currentSection) {
37
35
  switch (currentSection) {
38
36
  case 'data':
@@ -53,11 +51,11 @@ export function parseHTML5libDATFile(content: string): HTML5libTreeTest[] {
53
51
  }
54
52
  }
55
53
 
56
- // Start new section
54
+
57
55
  currentSection = line.substring(1);
58
56
  currentContent = [];
59
57
 
60
- // Handle script flags
58
+
61
59
  if (currentSection === 'script-off') {
62
60
  test.scriptOff = true;
63
61
  } else if (currentSection === 'script-on') {
@@ -68,7 +66,7 @@ export function parseHTML5libDATFile(content: string): HTML5libTreeTest[] {
68
66
  }
69
67
  }
70
68
 
71
- // Save last section
69
+
72
70
  if (currentSection) {
73
71
  switch (currentSection) {
74
72
  case 'data':
@@ -97,16 +95,14 @@ export function parseHTML5libDATFile(content: string): HTML5libTreeTest[] {
97
95
  return tests;
98
96
  }
99
97
 
100
- /**
101
- * Converts AST to HTML5lib tree format
102
- */
98
+
103
99
  export function convertASTToHTML5libTree(node: ASTNode, depth: number = 0): string[] {
104
100
  const lines: string[] = [];
105
101
  const indent = '| ' + ' '.repeat(depth);
106
102
 
107
103
  switch (node.type) {
108
104
  case 'DOCUMENT':
109
- // Document node doesn't have a line representation
105
+
110
106
  break;
111
107
  case 'DOCTYPE':
112
108
  lines.push(`${indent}<!DOCTYPE ${node.tagName || 'html'}>`);
@@ -115,7 +111,7 @@ export function convertASTToHTML5libTree(node: ASTNode, depth: number = 0): stri
115
111
  const tagName = node.tagName || 'unknown';
116
112
  lines.push(`${indent}<${tagName}>`);
117
113
 
118
- // Add attributes
114
+
119
115
  if (node.attributes) {
120
116
  for (const [name, value] of Object.entries(node.attributes).sort()) {
121
117
  lines.push(`${indent} ${name}="${value}"`);
@@ -135,7 +131,7 @@ export function convertASTToHTML5libTree(node: ASTNode, depth: number = 0): stri
135
131
  break;
136
132
  }
137
133
 
138
- // Add children
134
+
139
135
  if (node.children) {
140
136
  for (const child of node.children) {
141
137
  lines.push(...convertASTToHTML5libTree(child, depth + 1));
@@ -145,9 +141,7 @@ export function convertASTToHTML5libTree(node: ASTNode, depth: number = 0): stri
145
141
  return lines;
146
142
  }
147
143
 
148
- /**
149
- * Normalizes HTML5lib tree format for comparison
150
- */
144
+
151
145
  export function normalizeHTML5libTree(tree: string): string {
152
146
  return tree
153
147
  .split('\n')
@@ -156,33 +150,29 @@ export function normalizeHTML5libTree(tree: string): string {
156
150
  .join('\n');
157
151
  }
158
152
 
159
- /**
160
- * Runs a single HTML5lib tree construction test
161
- */
153
+
162
154
  export function runHTML5libTreeTest(test: HTML5libTreeTest, testName: string): void {
163
155
  it(testName, () => {
164
156
  const { data, document: expectedTree, documentFragment, scriptOff, scriptOn } = test;
165
157
 
166
- // Parse the HTML
158
+
167
159
  const tokens = tokenize(data);
168
160
  const ast = parse(tokens);
169
161
 
170
- // Convert to HTML5lib tree format
162
+
171
163
  const actualTreeLines = convertASTToHTML5libTree(ast);
172
164
  const actualTree = actualTreeLines.join('\n');
173
165
 
174
- // Normalize both trees for comparison
166
+
175
167
  const normalizedActual = normalizeHTML5libTree(actualTree);
176
168
  const normalizedExpected = normalizeHTML5libTree(expectedTree);
177
169
 
178
- // Compare trees
170
+
179
171
  expect(normalizedActual).toBe(normalizedExpected);
180
172
  });
181
173
  }
182
174
 
183
- /**
184
- * Runs all tests from an HTML5lib tree construction test suite
185
- */
175
+
186
176
  export function runHTML5libTreeTestSuite(tests: HTML5libTreeTest[], suiteName: string): void {
187
177
  describe(`HTML5lib Tree Construction Tests: ${suiteName}`, () => {
188
178
  tests.forEach((test, index) => {
@@ -192,17 +182,13 @@ export function runHTML5libTreeTestSuite(tests: HTML5libTreeTest[], suiteName: s
192
182
  });
193
183
  }
194
184
 
195
- /**
196
- * Loads and runs HTML5lib tree construction tests from DAT format
197
- */
185
+
198
186
  export async function loadHTML5libTreeTests(testData: string, suiteName: string): Promise<void> {
199
187
  const tests = parseHTML5libDATFile(testData);
200
188
  runHTML5libTreeTestSuite(tests, suiteName);
201
189
  }
202
190
 
203
- /**
204
- * Validates HTML5lib tree construction test format
205
- */
191
+
206
192
  export function validateHTML5libTreeTest(test: HTML5libTreeTest): boolean {
207
193
  return !!(test.data && test.document && test.errors !== undefined);
208
194
  }
@@ -6,7 +6,7 @@ import {
6
6
  type HTML5libTreeTest,
7
7
  } from "./tree-construction-utils";
8
8
 
9
- // Sample HTML5lib tree construction tests in DAT format
9
+
10
10
  const basicTreeTestData = `#data
11
11
  Test
12
12
  #errors
@@ -206,13 +206,13 @@ const errorHandlingTestData = `#data
206
206
  | <a>
207
207
  | <a>`;
208
208
 
209
- // Run the embedded tests
209
+
210
210
  describe("HTML5lib Tree Construction Tests", () => {
211
211
  it("should parse DAT format correctly", () => {
212
212
  const tests = parseHTML5libDATFile(basicTreeTestData);
213
213
  expect(tests.length).toBeGreaterThan(0);
214
214
 
215
- // Check first test
215
+
216
216
  const firstTest = tests[0];
217
217
  if (firstTest) {
218
218
  expect(firstTest.data).toBe("Test");
@@ -225,7 +225,7 @@ describe("HTML5lib Tree Construction Tests", () => {
225
225
  const tests = parseHTML5libDATFile(doctypeTestData);
226
226
  expect(tests.length).toBeGreaterThan(0);
227
227
 
228
- // Check first doctype test
228
+
229
229
  const firstTest = tests[0];
230
230
  if (firstTest) {
231
231
  expect(firstTest.data).toBe("<!DOCTYPE html>");
@@ -238,7 +238,7 @@ describe("HTML5lib Tree Construction Tests", () => {
238
238
  const tests = parseHTML5libDATFile(errorHandlingTestData);
239
239
  expect(tests.length).toBeGreaterThan(0);
240
240
 
241
- // Check error handling
241
+
242
242
  const firstTest = tests[0];
243
243
  if (firstTest) {
244
244
  expect(firstTest.errors.length).toBeGreaterThan(0);
@@ -42,7 +42,7 @@ describe('Validator.nu Tests', () => {
42
42
  });
43
43
 
44
44
  it('should handle content model violations', () => {
45
- // These should parse but may generate warnings in a full validator
45
+
46
46
  const contentModelHTML = `
47
47
  <p>
48
48
  <div>Block inside paragraph</div>
@@ -53,7 +53,7 @@ describe('Validator.nu Tests', () => {
53
53
  `;
54
54
 
55
55
  const document = parseHTML(contentModelHTML);
56
- // const ast = parse(tokens);
56
+
57
57
 
58
58
  expect(document).toBeDefined();
59
59
  });
@@ -67,7 +67,7 @@ describe('Validator.nu Tests', () => {
67
67
  `;
68
68
 
69
69
  const document = parseHTML(obsoleteHTML);
70
- // const ast = parse(tokens);
70
+
71
71
 
72
72
  expect(document).toBeDefined();
73
73
  });
@@ -85,7 +85,7 @@ describe('Validator.nu Tests', () => {
85
85
  `;
86
86
 
87
87
  const document = parseHTML(deprecatedHTML);
88
- // const ast = parse(tokens);
88
+
89
89
 
90
90
  expect(document).toBeDefined();
91
91
  });
@@ -114,7 +114,7 @@ describe('Validator.nu Tests', () => {
114
114
  `;
115
115
 
116
116
  const document = parseHTML(formHTML);
117
- // const ast = parse(tokens);
117
+
118
118
 
119
119
  expect(document).toBeDefined();
120
120
  });
@@ -136,7 +136,7 @@ describe('Validator.nu Tests', () => {
136
136
  `;
137
137
 
138
138
  const document = parseHTML(mediaHTML);
139
- // const ast = parse(tokens);
139
+
140
140
 
141
141
  expect(document).toBeDefined();
142
142
  });
@@ -162,7 +162,7 @@ describe('Validator.nu Tests', () => {
162
162
  `;
163
163
 
164
164
  const document = parseHTML(semanticHTML);
165
- // const ast = parse(tokens);
165
+
166
166
 
167
167
  expect(document).toBeDefined();
168
168
  });
@@ -184,7 +184,7 @@ describe('Validator.nu Tests', () => {
184
184
  `;
185
185
 
186
186
  const document = parseHTML(interactiveHTML);
187
- // const ast = parse(tokens);
187
+
188
188
 
189
189
  expect(document).toBeDefined();
190
190
  });
@@ -200,7 +200,7 @@ describe('Validator.nu Tests', () => {
200
200
  `;
201
201
 
202
202
  const document = parseHTML(unclosedHTML);
203
- // const ast = parse(tokens);
203
+
204
204
 
205
205
  expect(document).toBeDefined();
206
206
  });
@@ -215,7 +215,7 @@ describe('Validator.nu Tests', () => {
215
215
  `;
216
216
 
217
217
  const document = parseHTML(mismatchedHTML);
218
- // const ast = parse(tokens);
218
+
219
219
 
220
220
  expect(document).toBeDefined();
221
221
  });
@@ -229,7 +229,7 @@ describe('Validator.nu Tests', () => {
229
229
  `;
230
230
 
231
231
  const document = parseHTML(invalidNestingHTML);
232
- // const ast = parse(tokens);
232
+
233
233
 
234
234
  expect(document).toBeDefined();
235
235
  });
@@ -135,10 +135,10 @@ describe('Web Platform Tests (WPT) Compliance', () => {
135
135
 
136
136
  it('should handle numeric character references', () => {
137
137
  const numericRefs = [
138
- '&#65;', // A
139
- '&#x41;', // A (hex)
140
- '&#8364;', // Euro symbol
141
- '&#x20AC;' // Euro symbol (hex)
138
+ '&#65;',
139
+ '&#x41;',
140
+ '&#8364;',
141
+ '&#x20AC;'
142
142
  ];
143
143
 
144
144
  numericRefs.forEach(ref => {
@@ -404,6 +404,6 @@ describe('WPT Integration Tests', () => {
404
404
 
405
405
  expect(ast).toBeDefined();
406
406
  expect((ast as any).children?.length).toBeGreaterThan(0);
407
- expect(end - start).toBeLessThan(1000); // Should parse within 1 second
407
+ expect(end - start).toBeLessThan(1000);
408
408
  });
409
409
  });