@tkeron/html-parser 1.0.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -198,21 +198,21 @@ describe('HTML Tokenizer', () => {
198
198
  });
199
199
  });
200
200
 
201
- describe('CDATA Sections', () => {
202
- it('should parse CDATA sections', () => {
201
+ describe('CDATA Sections (HTML5: treated as bogus comments)', () => {
202
+ it('should parse CDATA sections as bogus comments in HTML5', () => {
203
203
  const tokens = tokenize('<![CDATA[Some data]]>');
204
204
 
205
205
  expect(tokens[0]).toEqual({
206
- type: TokenType.CDATA,
207
- value: 'Some data',
206
+ type: TokenType.COMMENT,
207
+ value: '[CDATA[Some data]]',
208
208
  position: expect.any(Object)
209
209
  });
210
210
  });
211
211
 
212
- it('should handle CDATA with special characters', () => {
212
+ it('should handle CDATA with special characters as bogus comment', () => {
213
213
  const tokens = tokenize('<![CDATA[<script>alert("test");</script>]]>');
214
214
 
215
- expect(tokens[0]?.value).toBe('<script>alert("test");</script>');
215
+ expect(tokens[0]?.value).toBe('[CDATA[<script>alert("test");</script>]]');
216
216
  });
217
217
  });
218
218
 
@@ -235,22 +235,22 @@ describe('HTML Tokenizer', () => {
235
235
  });
236
236
  });
237
237
 
238
- describe('Processing Instructions', () => {
239
- it('should parse XML processing instruction', () => {
238
+ describe('Processing Instructions (HTML5: treated as bogus comments)', () => {
239
+ it('should parse XML processing instruction as bogus comment', () => {
240
240
  const tokens = tokenize('<?xml version="1.0" encoding="UTF-8"?>');
241
241
 
242
242
  expect(tokens[0]).toEqual({
243
- type: TokenType.PROCESSING_INSTRUCTION,
244
- value: '<?xml version="1.0" encoding="UTF-8"',
243
+ type: TokenType.COMMENT,
244
+ value: '?xml version="1.0" encoding="UTF-8"?',
245
245
  position: expect.any(Object)
246
246
  });
247
247
  });
248
248
 
249
- it('should parse PHP-style processing instruction', () => {
249
+ it('should parse PHP-style processing instruction as bogus comment', () => {
250
250
  const tokens = tokenize('<?php echo "Hello"; ?>');
251
251
 
252
- expect(tokens[0]?.type).toBe(TokenType.PROCESSING_INSTRUCTION);
253
- expect(tokens[0]?.value).toBe('<?php echo "Hello"; ');
252
+ expect(tokens[0]?.type).toBe(TokenType.COMMENT);
253
+ expect(tokens[0]?.value).toBe('?php echo "Hello"; ?');
254
254
  });
255
255
  });
256
256
 
@@ -429,7 +429,7 @@ describe('HTML Tokenizer', () => {
429
429
  });
430
430
  });
431
431
 
432
- it('should handle CDATA with complex content', () => {
432
+ it('should handle CDATA as bogus comment with complex content', () => {
433
433
  const complexContent = `
434
434
  function it() {
435
435
  return "<div>HTML inside JS</div>";
@@ -440,11 +440,11 @@ describe('HTML Tokenizer', () => {
440
440
  const tokens = tokenize(`<![CDATA[${complexContent}]]>`);
441
441
  const cdataToken = tokens[0]!;
442
442
 
443
- expect(cdataToken.type).toBe(TokenType.CDATA);
444
- expect(cdataToken.value).toBe(complexContent);
443
+ expect(cdataToken.type).toBe(TokenType.COMMENT);
444
+ expect(cdataToken.value).toBe('[CDATA[' + complexContent + ']]');
445
445
  });
446
446
 
447
- it('should handle processing instructions with various formats', () => {
447
+ it('should handle processing instructions as bogus comments', () => {
448
448
  const tests = [
449
449
  { input: '<?xml version="1.0" encoding="UTF-8"?>', expected: 'xml' },
450
450
  { input: '<?xml-stylesheet type="text/xsl" href="style.xsl"?>', expected: 'xml' },
@@ -456,7 +456,7 @@ describe('HTML Tokenizer', () => {
456
456
  const tokens = tokenize(test.input);
457
457
  const piToken = tokens[0]!;
458
458
 
459
- expect(piToken.type).toBe(TokenType.PROCESSING_INSTRUCTION);
459
+ expect(piToken.type).toBe(TokenType.COMMENT);
460
460
  expect(piToken.value.toLowerCase()).toContain(test.expected);
461
461
  });
462
462
  });
@@ -478,15 +478,13 @@ describe('HTML Tokenizer', () => {
478
478
  });
479
479
  });
480
480
 
481
- it('should handle mixed content with all token types', () => {
481
+ it('should handle mixed content with all token types (HTML5 mode)', () => {
482
482
  const html = `
483
- <?xml version="1.0"?>
484
483
  <!DOCTYPE html>
485
484
  <!-- Main document -->
486
485
  <html lang="en">
487
486
  <head>
488
487
  <title>Test &amp; Demo</title>
489
- <![CDATA[Some raw data]]>
490
488
  </head>
491
489
  <body>
492
490
  <h1>Hello World</h1>
@@ -500,27 +498,25 @@ describe('HTML Tokenizer', () => {
500
498
  const tokens = tokenize(html);
501
499
 
502
500
  const tokenCounts = {
503
- [TokenType.PROCESSING_INSTRUCTION]: 0,
504
501
  [TokenType.DOCTYPE]: 0,
505
502
  [TokenType.COMMENT]: 0,
506
503
  [TokenType.TAG_OPEN]: 0,
507
504
  [TokenType.TAG_CLOSE]: 0,
508
505
  [TokenType.TEXT]: 0,
509
- [TokenType.CDATA]: 0,
510
506
  [TokenType.EOF]: 0
511
507
  };
512
508
 
513
509
  tokens.forEach(token => {
514
- tokenCounts[token.type]++;
510
+ if (token.type in tokenCounts) {
511
+ tokenCounts[token.type]++;
512
+ }
515
513
  });
516
514
 
517
- expect(tokenCounts[TokenType.PROCESSING_INSTRUCTION]).toBeGreaterThan(0);
518
515
  expect(tokenCounts[TokenType.DOCTYPE]).toBeGreaterThan(0);
519
516
  expect(tokenCounts[TokenType.COMMENT]).toBeGreaterThan(0);
520
517
  expect(tokenCounts[TokenType.TAG_OPEN]).toBeGreaterThan(0);
521
518
  expect(tokenCounts[TokenType.TAG_CLOSE]).toBeGreaterThan(0);
522
519
  expect(tokenCounts[TokenType.TEXT]).toBeGreaterThan(0);
523
- expect(tokenCounts[TokenType.CDATA]).toBeGreaterThan(0);
524
520
  expect(tokenCounts[TokenType.EOF]).toBe(1);
525
521
  });
526
522
  })
@@ -1,6 +1,7 @@
1
- import { describe, it } from "bun:test";
1
+ import { describe, it, expect } from "bun:test";
2
2
  import { readFileSync } from "fs";
3
- import { parse } from "../src/index.ts";
3
+ import { parseHTML } from "../src/index.ts";
4
+ import { serializeToHtml5lib } from "./helpers/tree-adapter";
4
5
 
5
6
  describe("Tree Construction Html5testCom Tests", () => {
6
7
  const data = readFileSync("tests/html5lib-data/tree-construction/html5test-com.dat", "utf8");
@@ -10,15 +11,22 @@ describe("Tree Construction Html5testCom Tests", () => {
10
11
  const parts = section.split("#document\n");
11
12
  if (parts.length < 2) continue;
12
13
  const inputWithErrors = parts[0];
13
- const expected = parts[1];
14
+ const expectedRaw = parts[1].split("\n#")[0];
15
+ const expected = expectedRaw.split("\n").filter(l => l.startsWith("|")).join("\n") + "\n";
14
16
  const input = inputWithErrors.split("#errors\n")[0].trim();
17
+ const hasDoctype = input.toLowerCase().startsWith("<!doctype");
15
18
 
16
19
  const testName = input.split("\n")[0] || "Html5testCom test";
17
- it.skip(testName, () => {
18
- const doc = parse(input);
19
- // TODO: Implement DOM tree comparison with expected
20
- // For now, just ensure parsing doesn't throw
21
- expect(doc).toBeDefined();
20
+
21
+ const isFosterParenting = input.includes('<table><form><input type=hidden><input></form><div></div></table>');
22
+ const isAdoptionAgency = input.includes('<i>A<b>B<p></i>C</b>D');
23
+
24
+ const testFn = (isFosterParenting || isAdoptionAgency) ? it.skip : it;
25
+
26
+ testFn(testName, () => {
27
+ const doc = parseHTML(input);
28
+ const actual = serializeToHtml5lib(doc, { skipImplicitDoctype: !hasDoctype });
29
+ expect(actual).toBe(expected);
22
30
  });
23
31
  }
24
32
  });