@tkeron/html-parser 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tkeron/html-parser",
3
- "version": "1.0.0",
3
+ "version": "1.1.0",
4
4
  "description": "A fast and lightweight HTML parser for Bun",
5
5
  "main": "index.js",
6
6
  "module": "index.ts",
@@ -28,7 +28,8 @@ export const enum NodeType {
28
28
 
29
29
  export function createElement(
30
30
  tagName: string,
31
- attributes: Record<string, string> = {}
31
+ attributes: Record<string, string> = {},
32
+ namespaceURI?: string
32
33
  ): any {
33
34
  const innerHTML = "";
34
35
  const tagNameLower = tagName.toLowerCase();
@@ -46,6 +47,7 @@ export function createElement(
46
47
  nodeName: tagName.toUpperCase(),
47
48
  nodeValue: null,
48
49
  tagName: tagName.toUpperCase(),
50
+ namespaceURI: namespaceURI || null,
49
51
  attributes: { ...attributes },
50
52
  childNodes: [],
51
53
  children: [],
package/src/parser.ts CHANGED
@@ -138,20 +138,35 @@ export function parse(tokens: Token[]): any {
138
138
  appendChild(html, body);
139
139
 
140
140
  const doctypes: any[] = [];
141
+ const commentsBeforeHtml: any[] = [];
142
+ const bodyContent: any[] = [];
141
143
  const children = [...state.root.childNodes];
144
+
145
+ let foundElement = false;
142
146
  for (const child of children) {
143
147
  if (child.nodeType === 10) {
144
148
  doctypes.push(child);
149
+ } else if (child.nodeType === 8 && !foundElement) {
150
+ commentsBeforeHtml.push(child);
145
151
  } else {
146
- appendChild(body, child);
152
+ if (child.nodeType === 1) foundElement = true;
153
+ bodyContent.push(child);
147
154
  }
148
155
  }
149
156
 
157
+ for (const content of bodyContent) {
158
+ appendChild(body, content);
159
+ }
160
+
150
161
  state.root.childNodes = [];
151
162
  for (const doctype of doctypes) {
152
163
  doctype.parentNode = null;
153
164
  appendChild(state.root, doctype);
154
165
  }
166
+ for (const comment of commentsBeforeHtml) {
167
+ comment.parentNode = null;
168
+ appendChild(state.root, comment);
169
+ }
155
170
  appendChild(state.root, html);
156
171
  state.root.documentElement = html;
157
172
  state.root.head = head;
@@ -364,6 +379,10 @@ function parseTokenInInHeadMode(state: ParserState, token: Token): void {
364
379
  } else if (tagName === 'meta' || tagName === 'link' || tagName === 'base') {
365
380
  parseOpenTag(state, token);
366
381
  } else if (tagName === 'head') {
382
+ // Ignore duplicate <head> tags
383
+ } else if (tagName.includes('-')) {
384
+ // Custom elements (tags with hyphens) are valid in <head>
385
+ parseOpenTag(state, token);
367
386
  } else {
368
387
  state.stack.pop();
369
388
  state.insertionMode = InsertionMode.AfterHead;
@@ -378,6 +397,9 @@ function parseTokenInInHeadMode(state: ParserState, token: Token): void {
378
397
  if (currentTagName === tagName) {
379
398
  state.stack.pop();
380
399
  }
400
+ } else if (tagName.includes('-') && currentTagName === tagName) {
401
+ // Handle closing tags for custom elements in <head>
402
+ state.stack.pop();
381
403
  }
382
404
  } else if (token.type === TokenType.COMMENT) {
383
405
  parseComment(state, token);
@@ -410,6 +432,9 @@ function parseTokenInAfterHeadMode(state: ParserState, token: Token): void {
410
432
  }
411
433
  }
412
434
 
435
+ const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';
436
+ const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
437
+
413
438
  function parseTokenInInBodyMode(state: ParserState, token: Token): void {
414
439
  if (token.type === TokenType.TAG_OPEN) {
415
440
  const tagName = token.value.toLowerCase();
@@ -418,7 +443,14 @@ function parseTokenInInBodyMode(state: ParserState, token: Token): void {
418
443
 
419
444
  const currentParent = getCurrentParent(state);
420
445
 
421
- const element = createElement(tagName, token.attributes || {});
446
+ let namespaceURI: string | undefined;
447
+ if (tagName === 'svg') {
448
+ namespaceURI = SVG_NAMESPACE;
449
+ } else if (tagName === 'math') {
450
+ namespaceURI = MATHML_NAMESPACE;
451
+ }
452
+
453
+ const element = createElement(tagName, token.attributes || {}, namespaceURI);
422
454
 
423
455
  appendChild(currentParent, element);
424
456
 
package/src/tokenizer.ts CHANGED
@@ -93,16 +93,63 @@ function decodeEntities(text: string): string {
93
93
 
94
94
  function parseAttributes(attributeString: string): Record<string, string> {
95
95
  const attributes: Record<string, string> = {};
96
+ let i = 0;
96
97
 
97
- const attrRegex = /([a-zA-Z][a-zA-Z0-9\-_:]*)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;
98
- let match;
99
-
100
- while ((match = attrRegex.exec(attributeString)) !== null) {
101
- const [, name, doubleQuoted, singleQuoted, unquoted] = match;
102
- if (name) {
103
- const value = doubleQuoted ?? singleQuoted ?? unquoted ?? '';
104
- attributes[name.toLowerCase()] = decodeEntities(value);
98
+ while (i < attributeString.length) {
99
+ while (i < attributeString.length && /\s/.test(attributeString[i])) {
100
+ i++;
105
101
  }
102
+ if (i >= attributeString.length || attributeString[i] === '/' || attributeString[i] === '>') {
103
+ break;
104
+ }
105
+
106
+ let name = '';
107
+ while (i < attributeString.length && !/[\s=\/>]/.test(attributeString[i])) {
108
+ name += attributeString[i];
109
+ i++;
110
+ }
111
+
112
+ if (!name) {
113
+ i++;
114
+ continue;
115
+ }
116
+
117
+ while (i < attributeString.length && /\s/.test(attributeString[i])) {
118
+ i++;
119
+ }
120
+
121
+ let value = '';
122
+ if (i < attributeString.length && attributeString[i] === '=') {
123
+ i++;
124
+ while (i < attributeString.length && /\s/.test(attributeString[i])) {
125
+ i++;
126
+ }
127
+
128
+ if (i < attributeString.length) {
129
+ if (attributeString[i] === '"') {
130
+ i++;
131
+ while (i < attributeString.length && attributeString[i] !== '"') {
132
+ value += attributeString[i];
133
+ i++;
134
+ }
135
+ i++;
136
+ } else if (attributeString[i] === "'") {
137
+ i++;
138
+ while (i < attributeString.length && attributeString[i] !== "'") {
139
+ value += attributeString[i];
140
+ i++;
141
+ }
142
+ i++;
143
+ } else {
144
+ while (i < attributeString.length && !/[\s>]/.test(attributeString[i])) {
145
+ value += attributeString[i];
146
+ i++;
147
+ }
148
+ }
149
+ }
150
+ }
151
+
152
+ attributes[name.toLowerCase()] = decodeEntities(value);
106
153
  }
107
154
 
108
155
  return attributes;
@@ -117,79 +164,72 @@ function calculatePosition(text: string, offset: number): Position {
117
164
  };
118
165
  }
119
166
 
167
+ const RAW_TEXT_ELEMENTS = new Set(['script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript']);
168
+ const RCDATA_ELEMENTS = new Set(['textarea', 'title']);
169
+
120
170
  export function tokenize(html: string): Token[] {
121
171
  const tokens: Token[] = [];
122
- let position = 0;
123
-
124
- const specialCases = [
125
- {
126
- pattern: /<!DOCTYPE\s+[^>]*>/gi,
127
- type: TokenType.DOCTYPE,
128
- getValue: (match: string) => {
129
- const doctypeMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
130
- return doctypeMatch && doctypeMatch[1] ? doctypeMatch[1].toLowerCase() : match;
131
- }
132
- },
133
- {
134
- pattern: /<!--([\s\S]*?)(?:-->|$)/g,
135
- type: TokenType.COMMENT,
136
- getValue: (match: string) => match.slice(4, match.endsWith('-->') ? -3 : match.length)
137
- },
138
- {
139
- pattern: /<!\[CDATA\[([\s\S]*?)\]\]>/g,
140
- type: TokenType.CDATA,
141
- getValue: (match: string) => match.slice(9, -3)
142
- },
143
- {
144
- pattern: /<\?([^?]*(?:\?(?!>)[^?]*)*)\?>/g,
145
- type: TokenType.PROCESSING_INSTRUCTION,
146
- getValue: (match: string) => match.slice(0, -2)
147
- }
148
- ];
149
-
150
- const processedRanges: Array<[number, number]> = [];
151
-
152
- for (const { pattern, type, getValue } of specialCases) {
153
- const regex = new RegExp(pattern);
154
- let match;
155
-
156
- while ((match = regex.exec(html)) !== null) {
157
- const start = match.index;
158
- const end = start + match[0].length;
159
-
160
- tokens.push({
161
- type,
162
- value: getValue(match[0]),
163
- position: calculatePosition(html, start)
164
- });
165
-
166
- processedRanges.push([start, end]);
167
- }
168
- }
169
-
170
- processedRanges.sort((a, b) => a[0] - b[0]);
171
-
172
172
  let currentPos = 0;
173
173
 
174
174
  while (currentPos < html.length) {
175
- const inProcessedRange = processedRanges.some(([start, end]) =>
176
- currentPos >= start && currentPos < end
177
- );
178
-
179
- if (inProcessedRange) {
180
- const range = processedRanges.find(([start, end]) =>
181
- currentPos >= start && currentPos < end
182
- );
183
- if (range) {
184
- currentPos = range[1];
185
- }
186
- continue;
187
- }
188
-
189
175
  const char = html[currentPos];
190
176
 
191
177
  if (char === '<') {
192
- const tagMatch = html.slice(currentPos).match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
178
+ const remaining = html.slice(currentPos);
179
+
180
+ const doctypeMatch = remaining.match(/^<!DOCTYPE\s+[^>]*>/i);
181
+ if (doctypeMatch) {
182
+ const match = doctypeMatch[0];
183
+ const nameMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
184
+ tokens.push({
185
+ type: TokenType.DOCTYPE,
186
+ value: nameMatch && nameMatch[1] ? nameMatch[1].toLowerCase() : match,
187
+ position: calculatePosition(html, currentPos)
188
+ });
189
+ currentPos += match.length;
190
+ continue;
191
+ }
192
+
193
+ const commentMatch = remaining.match(/^<!--([\s\S]*?)(?:-->|$)/);
194
+ if (commentMatch) {
195
+ const match = commentMatch[0];
196
+ tokens.push({
197
+ type: TokenType.COMMENT,
198
+ value: match.slice(4, match.endsWith('-->') ? -3 : match.length),
199
+ position: calculatePosition(html, currentPos)
200
+ });
201
+ currentPos += match.length;
202
+ continue;
203
+ }
204
+
205
+ const cdataMatch = remaining.match(/^<!\[CDATA\[([\s\S]*?)\]\]>/);
206
+ if (cdataMatch) {
207
+ const content = cdataMatch[1];
208
+ tokens.push({
209
+ type: TokenType.COMMENT,
210
+ value: '[CDATA[' + content + ']]',
211
+ position: calculatePosition(html, currentPos)
212
+ });
213
+ currentPos += cdataMatch[0].length;
214
+ continue;
215
+ }
216
+
217
+ const piMatch = remaining.match(/^<\?([^>]*)/);
218
+ if (piMatch) {
219
+ let consumed = piMatch[0].length;
220
+ if (remaining[consumed] === '>') {
221
+ consumed++;
222
+ }
223
+ tokens.push({
224
+ type: TokenType.COMMENT,
225
+ value: '?' + piMatch[1],
226
+ position: calculatePosition(html, currentPos)
227
+ });
228
+ currentPos += consumed;
229
+ continue;
230
+ }
231
+
232
+ const tagMatch = remaining.match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
193
233
 
194
234
  if (tagMatch) {
195
235
  const fullTag = tagMatch[0];
@@ -222,6 +262,24 @@ export function tokenize(html: string): Token[] {
222
262
  });
223
263
 
224
264
  currentPos += fullTag.length;
265
+
266
+ if (!isClosing && !isSelfClosing && (RAW_TEXT_ELEMENTS.has(tagName) || RCDATA_ELEMENTS.has(tagName))) {
267
+ const closeTagPattern = new RegExp(`</${tagName}\\s*>`, 'i');
268
+ const restOfHtml = html.slice(currentPos);
269
+ const closeMatch = restOfHtml.match(closeTagPattern);
270
+
271
+ if (closeMatch && closeMatch.index !== undefined) {
272
+ const rawContent = restOfHtml.slice(0, closeMatch.index);
273
+ if (rawContent) {
274
+ tokens.push({
275
+ type: TokenType.TEXT,
276
+ value: RCDATA_ELEMENTS.has(tagName) ? decodeEntities(rawContent) : rawContent,
277
+ position: calculatePosition(html, currentPos)
278
+ });
279
+ }
280
+ currentPos += rawContent.length;
281
+ }
282
+ }
225
283
  } else {
226
284
  const textStart = currentPos;
227
285
  currentPos++;
@@ -257,8 +315,6 @@ export function tokenize(html: string): Token[] {
257
315
  }
258
316
  }
259
317
 
260
- tokens.sort((a, b) => a.position.offset - b.position.offset);
261
-
262
318
  tokens.push({
263
319
  type: TokenType.EOF,
264
320
  value: '',
@@ -60,7 +60,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
60
60
  });
61
61
  });
62
62
 
63
- it('should handle complex CDATA content', () => {
63
+ it('should handle complex CDATA content as bogus comment', () => {
64
64
  const complexContent = `
65
65
  function test() {
66
66
  return "<div>HTML inside JS</div>";
@@ -71,8 +71,8 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
71
71
  expect(tokens.length).toBeGreaterThan(0);
72
72
  const cdataToken = tokens[0]!;
73
73
 
74
- expect(cdataToken.type).toBe(TokenType.CDATA);
75
- expect(cdataToken.value).toBe(complexContent);
74
+ expect(cdataToken.type).toBe(TokenType.COMMENT);
75
+ expect(cdataToken.value).toBe('[CDATA[' + complexContent + ']]');
76
76
  });
77
77
 
78
78
  it('should handle performance with large documents', () => {
@@ -0,0 +1,105 @@
1
+ import { describe, it, expect } from 'bun:test';
2
+ import { parseHTML } from '../index';
3
+
4
+ describe('Custom Elements in <head>', () => {
5
+
6
+ it('should keep <meta-tags> custom element in head', () => {
7
+ const doc = parseHTML(
8
+ '<!DOCTYPE html><html><head><meta-tags></meta-tags></head><body></body></html>'
9
+ );
10
+
11
+ const metaTags = doc.head?.querySelector('meta-tags');
12
+ expect(metaTags).toBeTruthy();
13
+ expect(metaTags?.parentElement?.tagName).toBe('HEAD');
14
+ });
15
+
16
+ it('should keep <social-meta> custom element in head', () => {
17
+ const doc = parseHTML(
18
+ '<!DOCTYPE html><html><head><social-meta></social-meta></head><body></body></html>'
19
+ );
20
+
21
+ const socialMeta = doc.head?.querySelector('social-meta');
22
+ expect(socialMeta).toBeTruthy();
23
+ expect(socialMeta?.parentElement?.tagName).toBe('HEAD');
24
+ });
25
+
26
+ it('should keep any <custom-element> with hyphen in head', () => {
27
+ const doc = parseHTML(
28
+ '<!DOCTYPE html><html><head><my-component></my-component></head><body></body></html>'
29
+ );
30
+
31
+ const myComponent = doc.head?.querySelector('my-component');
32
+ expect(myComponent).toBeTruthy();
33
+ expect(myComponent?.parentElement?.tagName).toBe('HEAD');
34
+ });
35
+
36
+ it('should still eject non-custom elements like <div> to body', () => {
37
+ const doc = parseHTML(
38
+ '<!DOCTYPE html><html><head><div>test</div></head><body></body></html>'
39
+ );
40
+
41
+ const divInHead = doc.head?.querySelector('div');
42
+ const divInBody = doc.body?.querySelector('div');
43
+ expect(divInHead).toBeFalsy();
44
+ expect(divInBody).toBeTruthy();
45
+ });
46
+
47
+ it('should handle nested custom elements in head', () => {
48
+ const doc = parseHTML(
49
+ '<!DOCTYPE html><html><head><my-wrapper><inner-comp></inner-comp></my-wrapper></head><body></body></html>'
50
+ );
51
+
52
+ const myWrapper = doc.head?.querySelector('my-wrapper');
53
+ expect(myWrapper).toBeTruthy();
54
+ expect(myWrapper?.parentElement?.tagName).toBe('HEAD');
55
+
56
+ const innerComp = myWrapper?.querySelector('inner-comp');
57
+ expect(innerComp).toBeTruthy();
58
+ });
59
+
60
+ it('should keep custom elements with attributes in head', () => {
61
+ const doc = parseHTML(
62
+ '<!DOCTYPE html><html><head><seo-meta property="og:title" content="Test"></seo-meta></head><body></body></html>'
63
+ );
64
+
65
+ const seoMeta = doc.head?.querySelector('seo-meta');
66
+ expect(seoMeta).toBeTruthy();
67
+ expect(seoMeta?.getAttribute('property')).toBe('og:title');
68
+ expect(seoMeta?.getAttribute('content')).toBe('Test');
69
+ expect(seoMeta?.parentElement?.tagName).toBe('HEAD');
70
+ });
71
+
72
+ it('should keep self-closing custom elements in head', () => {
73
+ const doc = parseHTML(
74
+ '<!DOCTYPE html><html><head><custom-void /></head><body></body></html>'
75
+ );
76
+
77
+ const customVoid = doc.head?.querySelector('custom-void');
78
+ expect(customVoid).toBeTruthy();
79
+ expect(customVoid?.parentElement?.tagName).toBe('HEAD');
80
+ });
81
+
82
+ it('should handle custom elements mixed with standard head elements', () => {
83
+ const doc = parseHTML(
84
+ '<!DOCTYPE html><html><head><title>Test</title><meta-tags></meta-tags><link rel="stylesheet" href="style.css"></head><body></body></html>'
85
+ );
86
+
87
+ const title = doc.head?.querySelector('title');
88
+ const metaTags = doc.head?.querySelector('meta-tags');
89
+ const link = doc.head?.querySelector('link');
90
+
91
+ expect(title).toBeTruthy();
92
+ expect(metaTags).toBeTruthy();
93
+ expect(link).toBeTruthy();
94
+ });
95
+
96
+ it('should handle custom element containing text in head', () => {
97
+ const doc = parseHTML(
98
+ '<!DOCTYPE html><html><head><inline-script>console.log("test")</inline-script></head><body></body></html>'
99
+ );
100
+
101
+ const inlineScript = doc.head?.querySelector('inline-script');
102
+ expect(inlineScript).toBeTruthy();
103
+ expect(inlineScript?.parentElement?.tagName).toBe('HEAD');
104
+ });
105
+ });
@@ -27,7 +27,7 @@ describe('Tree Adapter Tests', () => {
27
27
  it('should serialize comment', () => {
28
28
  const doc = parseHTML('<div><!-- comment --></div>');
29
29
  const serialized = serializeToHtml5lib(doc);
30
- expect(serialized).toContain('<!-- -->');
30
+ expect(serialized).toContain('<!-- comment -->');
31
31
  });
32
32
 
33
33
  it('should serialize DOCTYPE', () => {
@@ -1,6 +1,10 @@
1
1
  // tests/helpers/tree-adapter.ts
2
2
 
3
- export function serializeToHtml5lib(doc: any): string {
3
+ export interface SerializeOptions {
4
+ skipImplicitDoctype?: boolean;
5
+ }
6
+
7
+ export function serializeToHtml5lib(doc: any, options: SerializeOptions = {}): string {
4
8
  const lines: string[] = [];
5
9
 
6
10
  function serialize(node: any, depth: number): void {
@@ -11,7 +15,17 @@ export function serializeToHtml5lib(doc: any): string {
11
15
  serialize(child, depth);
12
16
  }
13
17
  } else if (node.nodeType === 1) { // ELEMENT
14
- lines.push(`${indent}<${node.tagName.toLowerCase()}>`);
18
+ const tagName = node.tagName.toLowerCase();
19
+ const ns = node.namespaceURI;
20
+
21
+ let nsPrefix = '';
22
+ if (ns === 'http://www.w3.org/2000/svg') {
23
+ nsPrefix = ' svg';
24
+ } else if (ns === 'http://www.w3.org/1998/Math/MathML') {
25
+ nsPrefix = ' math';
26
+ }
27
+
28
+ lines.push(`${indent}<${tagName}${nsPrefix}>`);
15
29
 
16
30
  // Atributos en orden alfabético
17
31
  const attrs = Object.entries(node.attributes || {}).sort(([a], [b]) => a.localeCompare(b));
@@ -32,9 +46,12 @@ export function serializeToHtml5lib(doc: any): string {
32
46
  } else if (node.nodeType === 3) { // TEXT
33
47
  lines.push(`${indent}"${node.textContent}"`);
34
48
  } else if (node.nodeType === 8) { // COMMENT
35
- lines.push(`${indent}<!-- ${node.textContent} -->`);
49
+ const commentData = node.data || node.nodeValue || node.textContent || '';
50
+ lines.push(`${indent}<!-- ${commentData} -->`);
36
51
  } else if (node.nodeType === 10) { // DOCTYPE
37
- lines.push(`${indent}<!DOCTYPE ${node.name || 'html'}>`);
52
+ if (!options.skipImplicitDoctype) {
53
+ lines.push(`${indent}<!DOCTYPE ${node.name || 'html'}>`);
54
+ }
38
55
  }
39
56
  }
40
57
 
@@ -18,7 +18,8 @@ function parseToAST(html: string): ASTNode {
18
18
  if (htmlEl) {
19
19
  const bodyEl = htmlEl.children?.find(c => c.tagName === 'body');
20
20
  if (bodyEl && bodyEl.children) {
21
- return { type: ASTNodeType.Document, children: bodyEl.children };
21
+ const nonHtmlChildren = ast.children?.filter(c => c.tagName !== 'html' && c.type !== 'doctype') || [];
22
+ return { type: ASTNodeType.Document, children: [...nonHtmlChildren, ...bodyEl.children] };
22
23
  }
23
24
  }
24
25
  return ast;
@@ -198,21 +198,21 @@ describe('HTML Tokenizer', () => {
198
198
  });
199
199
  });
200
200
 
201
- describe('CDATA Sections', () => {
202
- it('should parse CDATA sections', () => {
201
+ describe('CDATA Sections (HTML5: treated as bogus comments)', () => {
202
+ it('should parse CDATA sections as bogus comments in HTML5', () => {
203
203
  const tokens = tokenize('<![CDATA[Some data]]>');
204
204
 
205
205
  expect(tokens[0]).toEqual({
206
- type: TokenType.CDATA,
207
- value: 'Some data',
206
+ type: TokenType.COMMENT,
207
+ value: '[CDATA[Some data]]',
208
208
  position: expect.any(Object)
209
209
  });
210
210
  });
211
211
 
212
- it('should handle CDATA with special characters', () => {
212
+ it('should handle CDATA with special characters as bogus comment', () => {
213
213
  const tokens = tokenize('<![CDATA[<script>alert("test");</script>]]>');
214
214
 
215
- expect(tokens[0]?.value).toBe('<script>alert("test");</script>');
215
+ expect(tokens[0]?.value).toBe('[CDATA[<script>alert("test");</script>]]');
216
216
  });
217
217
  });
218
218
 
@@ -235,22 +235,22 @@ describe('HTML Tokenizer', () => {
235
235
  });
236
236
  });
237
237
 
238
- describe('Processing Instructions', () => {
239
- it('should parse XML processing instruction', () => {
238
+ describe('Processing Instructions (HTML5: treated as bogus comments)', () => {
239
+ it('should parse XML processing instruction as bogus comment', () => {
240
240
  const tokens = tokenize('<?xml version="1.0" encoding="UTF-8"?>');
241
241
 
242
242
  expect(tokens[0]).toEqual({
243
- type: TokenType.PROCESSING_INSTRUCTION,
244
- value: '<?xml version="1.0" encoding="UTF-8"',
243
+ type: TokenType.COMMENT,
244
+ value: '?xml version="1.0" encoding="UTF-8"?',
245
245
  position: expect.any(Object)
246
246
  });
247
247
  });
248
248
 
249
- it('should parse PHP-style processing instruction', () => {
249
+ it('should parse PHP-style processing instruction as bogus comment', () => {
250
250
  const tokens = tokenize('<?php echo "Hello"; ?>');
251
251
 
252
- expect(tokens[0]?.type).toBe(TokenType.PROCESSING_INSTRUCTION);
253
- expect(tokens[0]?.value).toBe('<?php echo "Hello"; ');
252
+ expect(tokens[0]?.type).toBe(TokenType.COMMENT);
253
+ expect(tokens[0]?.value).toBe('?php echo "Hello"; ?');
254
254
  });
255
255
  });
256
256
 
@@ -429,7 +429,7 @@ describe('HTML Tokenizer', () => {
429
429
  });
430
430
  });
431
431
 
432
- it('should handle CDATA with complex content', () => {
432
+ it('should handle CDATA as bogus comment with complex content', () => {
433
433
  const complexContent = `
434
434
  function it() {
435
435
  return "<div>HTML inside JS</div>";
@@ -440,11 +440,11 @@ describe('HTML Tokenizer', () => {
440
440
  const tokens = tokenize(`<![CDATA[${complexContent}]]>`);
441
441
  const cdataToken = tokens[0]!;
442
442
 
443
- expect(cdataToken.type).toBe(TokenType.CDATA);
444
- expect(cdataToken.value).toBe(complexContent);
443
+ expect(cdataToken.type).toBe(TokenType.COMMENT);
444
+ expect(cdataToken.value).toBe('[CDATA[' + complexContent + ']]');
445
445
  });
446
446
 
447
- it('should handle processing instructions with various formats', () => {
447
+ it('should handle processing instructions as bogus comments', () => {
448
448
  const tests = [
449
449
  { input: '<?xml version="1.0" encoding="UTF-8"?>', expected: 'xml' },
450
450
  { input: '<?xml-stylesheet type="text/xsl" href="style.xsl"?>', expected: 'xml' },
@@ -456,7 +456,7 @@ describe('HTML Tokenizer', () => {
456
456
  const tokens = tokenize(test.input);
457
457
  const piToken = tokens[0]!;
458
458
 
459
- expect(piToken.type).toBe(TokenType.PROCESSING_INSTRUCTION);
459
+ expect(piToken.type).toBe(TokenType.COMMENT);
460
460
  expect(piToken.value.toLowerCase()).toContain(test.expected);
461
461
  });
462
462
  });
@@ -478,15 +478,13 @@ describe('HTML Tokenizer', () => {
478
478
  });
479
479
  });
480
480
 
481
- it('should handle mixed content with all token types', () => {
481
+ it('should handle mixed content with all token types (HTML5 mode)', () => {
482
482
  const html = `
483
- <?xml version="1.0"?>
484
483
  <!DOCTYPE html>
485
484
  <!-- Main document -->
486
485
  <html lang="en">
487
486
  <head>
488
487
  <title>Test &amp; Demo</title>
489
- <![CDATA[Some raw data]]>
490
488
  </head>
491
489
  <body>
492
490
  <h1>Hello World</h1>
@@ -500,27 +498,25 @@ describe('HTML Tokenizer', () => {
500
498
  const tokens = tokenize(html);
501
499
 
502
500
  const tokenCounts = {
503
- [TokenType.PROCESSING_INSTRUCTION]: 0,
504
501
  [TokenType.DOCTYPE]: 0,
505
502
  [TokenType.COMMENT]: 0,
506
503
  [TokenType.TAG_OPEN]: 0,
507
504
  [TokenType.TAG_CLOSE]: 0,
508
505
  [TokenType.TEXT]: 0,
509
- [TokenType.CDATA]: 0,
510
506
  [TokenType.EOF]: 0
511
507
  };
512
508
 
513
509
  tokens.forEach(token => {
514
- tokenCounts[token.type]++;
510
+ if (token.type in tokenCounts) {
511
+ tokenCounts[token.type]++;
512
+ }
515
513
  });
516
514
 
517
- expect(tokenCounts[TokenType.PROCESSING_INSTRUCTION]).toBeGreaterThan(0);
518
515
  expect(tokenCounts[TokenType.DOCTYPE]).toBeGreaterThan(0);
519
516
  expect(tokenCounts[TokenType.COMMENT]).toBeGreaterThan(0);
520
517
  expect(tokenCounts[TokenType.TAG_OPEN]).toBeGreaterThan(0);
521
518
  expect(tokenCounts[TokenType.TAG_CLOSE]).toBeGreaterThan(0);
522
519
  expect(tokenCounts[TokenType.TEXT]).toBeGreaterThan(0);
523
- expect(tokenCounts[TokenType.CDATA]).toBeGreaterThan(0);
524
520
  expect(tokenCounts[TokenType.EOF]).toBe(1);
525
521
  });
526
522
  })