@tkeron/html-parser 1.0.0 → 1.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/dom-simulator.ts +8 -5
- package/src/parser.ts +34 -2
- package/src/tokenizer.ts +131 -75
- package/tests/advanced.test.ts +3 -3
- package/tests/custom-elements-head.test.ts +105 -0
- package/tests/edge-cases.test.ts +457 -0
- package/tests/helpers/tree-adapter.test.ts +1 -1
- package/tests/helpers/tree-adapter.ts +21 -4
- package/tests/innerhtml-void-elements.test.ts +84 -0
- package/tests/parser.test.ts +2 -1
- package/tests/tokenizer.test.ts +22 -26
- package/tests/tree-construction-html5test-com.test.ts +16 -8
- package/tests/custom-elements.test.ts +0 -755
package/tests/tokenizer.test.ts
CHANGED
|
@@ -198,21 +198,21 @@ describe('HTML Tokenizer', () => {
|
|
|
198
198
|
});
|
|
199
199
|
});
|
|
200
200
|
|
|
201
|
-
describe('CDATA Sections', () => {
|
|
202
|
-
it('should parse CDATA sections', () => {
|
|
201
|
+
describe('CDATA Sections (HTML5: treated as bogus comments)', () => {
|
|
202
|
+
it('should parse CDATA sections as bogus comments in HTML5', () => {
|
|
203
203
|
const tokens = tokenize('<![CDATA[Some data]]>');
|
|
204
204
|
|
|
205
205
|
expect(tokens[0]).toEqual({
|
|
206
|
-
type: TokenType.CDATA,
|
|
207
|
-
value: 'Some data',
|
|
206
|
+
type: TokenType.COMMENT,
|
|
207
|
+
value: '[CDATA[Some data]]',
|
|
208
208
|
position: expect.any(Object)
|
|
209
209
|
});
|
|
210
210
|
});
|
|
211
211
|
|
|
212
|
-
it('should handle CDATA with special characters', () => {
|
|
212
|
+
it('should handle CDATA with special characters as bogus comment', () => {
|
|
213
213
|
const tokens = tokenize('<![CDATA[<script>alert("test");</script>]]>');
|
|
214
214
|
|
|
215
|
-
expect(tokens[0]?.value).toBe('<script>alert("test");</script>');
|
|
215
|
+
expect(tokens[0]?.value).toBe('[CDATA[<script>alert("test");</script>]]');
|
|
216
216
|
});
|
|
217
217
|
});
|
|
218
218
|
|
|
@@ -235,22 +235,22 @@ describe('HTML Tokenizer', () => {
|
|
|
235
235
|
});
|
|
236
236
|
});
|
|
237
237
|
|
|
238
|
-
describe('Processing Instructions', () => {
|
|
239
|
-
it('should parse XML processing instruction', () => {
|
|
238
|
+
describe('Processing Instructions (HTML5: treated as bogus comments)', () => {
|
|
239
|
+
it('should parse XML processing instruction as bogus comment', () => {
|
|
240
240
|
const tokens = tokenize('<?xml version="1.0" encoding="UTF-8"?>');
|
|
241
241
|
|
|
242
242
|
expect(tokens[0]).toEqual({
|
|
243
|
-
type: TokenType.PROCESSING_INSTRUCTION,
|
|
244
|
-
value: '
|
|
243
|
+
type: TokenType.COMMENT,
|
|
244
|
+
value: '?xml version="1.0" encoding="UTF-8"?',
|
|
245
245
|
position: expect.any(Object)
|
|
246
246
|
});
|
|
247
247
|
});
|
|
248
248
|
|
|
249
|
-
it('should parse PHP-style processing instruction', () => {
|
|
249
|
+
it('should parse PHP-style processing instruction as bogus comment', () => {
|
|
250
250
|
const tokens = tokenize('<?php echo "Hello"; ?>');
|
|
251
251
|
|
|
252
|
-
expect(tokens[0]?.type).toBe(TokenType.PROCESSING_INSTRUCTION);
|
|
253
|
-
expect(tokens[0]?.value).toBe('
|
|
252
|
+
expect(tokens[0]?.type).toBe(TokenType.COMMENT);
|
|
253
|
+
expect(tokens[0]?.value).toBe('?php echo "Hello"; ?');
|
|
254
254
|
});
|
|
255
255
|
});
|
|
256
256
|
|
|
@@ -429,7 +429,7 @@ describe('HTML Tokenizer', () => {
|
|
|
429
429
|
});
|
|
430
430
|
});
|
|
431
431
|
|
|
432
|
-
it('should handle CDATA with complex content', () => {
|
|
432
|
+
it('should handle CDATA as bogus comment with complex content', () => {
|
|
433
433
|
const complexContent = `
|
|
434
434
|
function it() {
|
|
435
435
|
return "<div>HTML inside JS</div>";
|
|
@@ -440,11 +440,11 @@ describe('HTML Tokenizer', () => {
|
|
|
440
440
|
const tokens = tokenize(`<![CDATA[${complexContent}]]>`);
|
|
441
441
|
const cdataToken = tokens[0]!;
|
|
442
442
|
|
|
443
|
-
expect(cdataToken.type).toBe(TokenType.CDATA);
|
|
444
|
-
expect(cdataToken.value).toBe(complexContent);
|
|
443
|
+
expect(cdataToken.type).toBe(TokenType.COMMENT);
|
|
444
|
+
expect(cdataToken.value).toBe('[CDATA[' + complexContent + ']]');
|
|
445
445
|
});
|
|
446
446
|
|
|
447
|
-
it('should handle processing instructions', () => {
|
|
447
|
+
it('should handle processing instructions as bogus comments', () => {
|
|
448
448
|
const tests = [
|
|
449
449
|
{ input: '<?xml version="1.0" encoding="UTF-8"?>', expected: 'xml' },
|
|
450
450
|
{ input: '<?xml-stylesheet type="text/xsl" href="style.xsl"?>', expected: 'xml' },
|
|
@@ -456,7 +456,7 @@ describe('HTML Tokenizer', () => {
|
|
|
456
456
|
const tokens = tokenize(test.input);
|
|
457
457
|
const piToken = tokens[0]!;
|
|
458
458
|
|
|
459
|
-
expect(piToken.type).toBe(TokenType.PROCESSING_INSTRUCTION);
|
|
459
|
+
expect(piToken.type).toBe(TokenType.COMMENT);
|
|
460
460
|
expect(piToken.value.toLowerCase()).toContain(test.expected);
|
|
461
461
|
});
|
|
462
462
|
});
|
|
@@ -478,15 +478,13 @@ describe('HTML Tokenizer', () => {
|
|
|
478
478
|
});
|
|
479
479
|
});
|
|
480
480
|
|
|
481
|
-
it('should handle mixed content with all token types', () => {
|
|
481
|
+
it('should handle mixed content with all token types (HTML5 mode)', () => {
|
|
482
482
|
const html = `
|
|
483
|
-
<?xml version="1.0"?>
|
|
484
483
|
<!DOCTYPE html>
|
|
485
484
|
<!-- Main document -->
|
|
486
485
|
<html lang="en">
|
|
487
486
|
<head>
|
|
488
487
|
<title>Test & Demo</title>
|
|
489
|
-
<![CDATA[Some raw data]]>
|
|
490
488
|
</head>
|
|
491
489
|
<body>
|
|
492
490
|
<h1>Hello World</h1>
|
|
@@ -500,27 +498,25 @@ describe('HTML Tokenizer', () => {
|
|
|
500
498
|
const tokens = tokenize(html);
|
|
501
499
|
|
|
502
500
|
const tokenCounts = {
|
|
503
|
-
[TokenType.PROCESSING_INSTRUCTION]: 0,
|
|
504
501
|
[TokenType.DOCTYPE]: 0,
|
|
505
502
|
[TokenType.COMMENT]: 0,
|
|
506
503
|
[TokenType.TAG_OPEN]: 0,
|
|
507
504
|
[TokenType.TAG_CLOSE]: 0,
|
|
508
505
|
[TokenType.TEXT]: 0,
|
|
509
|
-
[TokenType.CDATA]: 0,
|
|
510
506
|
[TokenType.EOF]: 0
|
|
511
507
|
};
|
|
512
508
|
|
|
513
509
|
tokens.forEach(token => {
|
|
514
|
-
|
|
510
|
+
if (token.type in tokenCounts) {
|
|
511
|
+
tokenCounts[token.type]++;
|
|
512
|
+
}
|
|
515
513
|
});
|
|
516
514
|
|
|
517
|
-
expect(tokenCounts[TokenType.PROCESSING_INSTRUCTION]).toBeGreaterThan(0);
|
|
518
515
|
expect(tokenCounts[TokenType.DOCTYPE]).toBeGreaterThan(0);
|
|
519
516
|
expect(tokenCounts[TokenType.COMMENT]).toBeGreaterThan(0);
|
|
520
517
|
expect(tokenCounts[TokenType.TAG_OPEN]).toBeGreaterThan(0);
|
|
521
518
|
expect(tokenCounts[TokenType.TAG_CLOSE]).toBeGreaterThan(0);
|
|
522
519
|
expect(tokenCounts[TokenType.TEXT]).toBeGreaterThan(0);
|
|
523
|
-
expect(tokenCounts[TokenType.CDATA]).toBeGreaterThan(0);
|
|
524
520
|
expect(tokenCounts[TokenType.EOF]).toBe(1);
|
|
525
521
|
});
|
|
526
522
|
})
|
|
@@ -1,6 +1,7 @@
|
|
|
1
|
-
import { describe, it } from "bun:test";
|
|
1
|
+
import { describe, it, expect } from "bun:test";
|
|
2
2
|
import { readFileSync } from "fs";
|
|
3
|
-
import {
|
|
3
|
+
import { parseHTML } from "../src/index.ts";
|
|
4
|
+
import { serializeToHtml5lib } from "./helpers/tree-adapter";
|
|
4
5
|
|
|
5
6
|
describe("Tree Construction Html5testCom Tests", () => {
|
|
6
7
|
const data = readFileSync("tests/html5lib-data/tree-construction/html5test-com.dat", "utf8");
|
|
@@ -10,15 +11,22 @@ describe("Tree Construction Html5testCom Tests", () => {
|
|
|
10
11
|
const parts = section.split("#document\n");
|
|
11
12
|
if (parts.length < 2) continue;
|
|
12
13
|
const inputWithErrors = parts[0];
|
|
13
|
-
const
|
|
14
|
+
const expectedRaw = parts[1].split("\n#")[0];
|
|
15
|
+
const expected = expectedRaw.split("\n").filter(l => l.startsWith("|")).join("\n") + "\n";
|
|
14
16
|
const input = inputWithErrors.split("#errors\n")[0].trim();
|
|
17
|
+
const hasDoctype = input.toLowerCase().startsWith("<!doctype");
|
|
15
18
|
|
|
16
19
|
const testName = input.split("\n")[0] || "Html5testCom test";
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
20
|
+
|
|
21
|
+
const isFosterParenting = input.includes('<table><form><input type=hidden><input></form><div></div></table>');
|
|
22
|
+
const isAdoptionAgency = input.includes('<i>A<b>B<p></i>C</b>D');
|
|
23
|
+
|
|
24
|
+
const testFn = (isFosterParenting || isAdoptionAgency) ? it.skip : it;
|
|
25
|
+
|
|
26
|
+
testFn(testName, () => {
|
|
27
|
+
const doc = parseHTML(input);
|
|
28
|
+
const actual = serializeToHtml5lib(doc, { skipImplicitDoctype: !hasDoctype });
|
|
29
|
+
expect(actual).toBe(expected);
|
|
22
30
|
});
|
|
23
31
|
}
|
|
24
32
|
});
|