@tkeron/html-parser 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/dom-simulator.ts +3 -1
- package/src/parser.ts +34 -2
- package/src/tokenizer.ts +131 -75
- package/tests/advanced.test.ts +3 -3
- package/tests/custom-elements-head.test.ts +105 -0
- package/tests/helpers/tree-adapter.test.ts +1 -1
- package/tests/helpers/tree-adapter.ts +21 -4
- package/tests/parser.test.ts +2 -1
- package/tests/tokenizer.test.ts +22 -26
- package/tests/tree-construction-html5test-com.test.ts +16 -8
- package/tests/custom-elements.test.ts +0 -755
package/package.json
CHANGED
package/src/dom-simulator.ts
CHANGED
|
@@ -28,7 +28,8 @@ export const enum NodeType {
|
|
|
28
28
|
|
|
29
29
|
export function createElement(
|
|
30
30
|
tagName: string,
|
|
31
|
-
attributes: Record<string, string> = {}
|
|
31
|
+
attributes: Record<string, string> = {},
|
|
32
|
+
namespaceURI?: string
|
|
32
33
|
): any {
|
|
33
34
|
const innerHTML = "";
|
|
34
35
|
const tagNameLower = tagName.toLowerCase();
|
|
@@ -46,6 +47,7 @@ export function createElement(
|
|
|
46
47
|
nodeName: tagName.toUpperCase(),
|
|
47
48
|
nodeValue: null,
|
|
48
49
|
tagName: tagName.toUpperCase(),
|
|
50
|
+
namespaceURI: namespaceURI || null,
|
|
49
51
|
attributes: { ...attributes },
|
|
50
52
|
childNodes: [],
|
|
51
53
|
children: [],
|
package/src/parser.ts
CHANGED
|
@@ -138,20 +138,35 @@ export function parse(tokens: Token[]): any {
|
|
|
138
138
|
appendChild(html, body);
|
|
139
139
|
|
|
140
140
|
const doctypes: any[] = [];
|
|
141
|
+
const commentsBeforeHtml: any[] = [];
|
|
142
|
+
const bodyContent: any[] = [];
|
|
141
143
|
const children = [...state.root.childNodes];
|
|
144
|
+
|
|
145
|
+
let foundElement = false;
|
|
142
146
|
for (const child of children) {
|
|
143
147
|
if (child.nodeType === 10) {
|
|
144
148
|
doctypes.push(child);
|
|
149
|
+
} else if (child.nodeType === 8 && !foundElement) {
|
|
150
|
+
commentsBeforeHtml.push(child);
|
|
145
151
|
} else {
|
|
146
|
-
|
|
152
|
+
if (child.nodeType === 1) foundElement = true;
|
|
153
|
+
bodyContent.push(child);
|
|
147
154
|
}
|
|
148
155
|
}
|
|
149
156
|
|
|
157
|
+
for (const content of bodyContent) {
|
|
158
|
+
appendChild(body, content);
|
|
159
|
+
}
|
|
160
|
+
|
|
150
161
|
state.root.childNodes = [];
|
|
151
162
|
for (const doctype of doctypes) {
|
|
152
163
|
doctype.parentNode = null;
|
|
153
164
|
appendChild(state.root, doctype);
|
|
154
165
|
}
|
|
166
|
+
for (const comment of commentsBeforeHtml) {
|
|
167
|
+
comment.parentNode = null;
|
|
168
|
+
appendChild(state.root, comment);
|
|
169
|
+
}
|
|
155
170
|
appendChild(state.root, html);
|
|
156
171
|
state.root.documentElement = html;
|
|
157
172
|
state.root.head = head;
|
|
@@ -364,6 +379,10 @@ function parseTokenInInHeadMode(state: ParserState, token: Token): void {
|
|
|
364
379
|
} else if (tagName === 'meta' || tagName === 'link' || tagName === 'base') {
|
|
365
380
|
parseOpenTag(state, token);
|
|
366
381
|
} else if (tagName === 'head') {
|
|
382
|
+
// Ignore duplicate <head> tags
|
|
383
|
+
} else if (tagName.includes('-')) {
|
|
384
|
+
// Custom elements (tags with hyphens) are valid in <head>
|
|
385
|
+
parseOpenTag(state, token);
|
|
367
386
|
} else {
|
|
368
387
|
state.stack.pop();
|
|
369
388
|
state.insertionMode = InsertionMode.AfterHead;
|
|
@@ -378,6 +397,9 @@ function parseTokenInInHeadMode(state: ParserState, token: Token): void {
|
|
|
378
397
|
if (currentTagName === tagName) {
|
|
379
398
|
state.stack.pop();
|
|
380
399
|
}
|
|
400
|
+
} else if (tagName.includes('-') && currentTagName === tagName) {
|
|
401
|
+
// Handle closing tags for custom elements in <head>
|
|
402
|
+
state.stack.pop();
|
|
381
403
|
}
|
|
382
404
|
} else if (token.type === TokenType.COMMENT) {
|
|
383
405
|
parseComment(state, token);
|
|
@@ -410,6 +432,9 @@ function parseTokenInAfterHeadMode(state: ParserState, token: Token): void {
|
|
|
410
432
|
}
|
|
411
433
|
}
|
|
412
434
|
|
|
435
|
+
const SVG_NAMESPACE = 'http://www.w3.org/2000/svg';
|
|
436
|
+
const MATHML_NAMESPACE = 'http://www.w3.org/1998/Math/MathML';
|
|
437
|
+
|
|
413
438
|
function parseTokenInInBodyMode(state: ParserState, token: Token): void {
|
|
414
439
|
if (token.type === TokenType.TAG_OPEN) {
|
|
415
440
|
const tagName = token.value.toLowerCase();
|
|
@@ -418,7 +443,14 @@ function parseTokenInInBodyMode(state: ParserState, token: Token): void {
|
|
|
418
443
|
|
|
419
444
|
const currentParent = getCurrentParent(state);
|
|
420
445
|
|
|
421
|
-
|
|
446
|
+
let namespaceURI: string | undefined;
|
|
447
|
+
if (tagName === 'svg') {
|
|
448
|
+
namespaceURI = SVG_NAMESPACE;
|
|
449
|
+
} else if (tagName === 'math') {
|
|
450
|
+
namespaceURI = MATHML_NAMESPACE;
|
|
451
|
+
}
|
|
452
|
+
|
|
453
|
+
const element = createElement(tagName, token.attributes || {}, namespaceURI);
|
|
422
454
|
|
|
423
455
|
appendChild(currentParent, element);
|
|
424
456
|
|
package/src/tokenizer.ts
CHANGED
|
@@ -93,16 +93,63 @@ function decodeEntities(text: string): string {
|
|
|
93
93
|
|
|
94
94
|
function parseAttributes(attributeString: string): Record<string, string> {
|
|
95
95
|
const attributes: Record<string, string> = {};
|
|
96
|
+
let i = 0;
|
|
96
97
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
while ((match = attrRegex.exec(attributeString)) !== null) {
|
|
101
|
-
const [, name, doubleQuoted, singleQuoted, unquoted] = match;
|
|
102
|
-
if (name) {
|
|
103
|
-
const value = doubleQuoted ?? singleQuoted ?? unquoted ?? '';
|
|
104
|
-
attributes[name.toLowerCase()] = decodeEntities(value);
|
|
98
|
+
while (i < attributeString.length) {
|
|
99
|
+
while (i < attributeString.length && /\s/.test(attributeString[i])) {
|
|
100
|
+
i++;
|
|
105
101
|
}
|
|
102
|
+
if (i >= attributeString.length || attributeString[i] === '/' || attributeString[i] === '>') {
|
|
103
|
+
break;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
let name = '';
|
|
107
|
+
while (i < attributeString.length && !/[\s=\/>]/.test(attributeString[i])) {
|
|
108
|
+
name += attributeString[i];
|
|
109
|
+
i++;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
if (!name) {
|
|
113
|
+
i++;
|
|
114
|
+
continue;
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
while (i < attributeString.length && /\s/.test(attributeString[i])) {
|
|
118
|
+
i++;
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
let value = '';
|
|
122
|
+
if (i < attributeString.length && attributeString[i] === '=') {
|
|
123
|
+
i++;
|
|
124
|
+
while (i < attributeString.length && /\s/.test(attributeString[i])) {
|
|
125
|
+
i++;
|
|
126
|
+
}
|
|
127
|
+
|
|
128
|
+
if (i < attributeString.length) {
|
|
129
|
+
if (attributeString[i] === '"') {
|
|
130
|
+
i++;
|
|
131
|
+
while (i < attributeString.length && attributeString[i] !== '"') {
|
|
132
|
+
value += attributeString[i];
|
|
133
|
+
i++;
|
|
134
|
+
}
|
|
135
|
+
i++;
|
|
136
|
+
} else if (attributeString[i] === "'") {
|
|
137
|
+
i++;
|
|
138
|
+
while (i < attributeString.length && attributeString[i] !== "'") {
|
|
139
|
+
value += attributeString[i];
|
|
140
|
+
i++;
|
|
141
|
+
}
|
|
142
|
+
i++;
|
|
143
|
+
} else {
|
|
144
|
+
while (i < attributeString.length && !/[\s>]/.test(attributeString[i])) {
|
|
145
|
+
value += attributeString[i];
|
|
146
|
+
i++;
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
|
|
152
|
+
attributes[name.toLowerCase()] = decodeEntities(value);
|
|
106
153
|
}
|
|
107
154
|
|
|
108
155
|
return attributes;
|
|
@@ -117,79 +164,72 @@ function calculatePosition(text: string, offset: number): Position {
|
|
|
117
164
|
};
|
|
118
165
|
}
|
|
119
166
|
|
|
167
|
+
const RAW_TEXT_ELEMENTS = new Set(['script', 'style', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript']);
|
|
168
|
+
const RCDATA_ELEMENTS = new Set(['textarea', 'title']);
|
|
169
|
+
|
|
120
170
|
export function tokenize(html: string): Token[] {
|
|
121
171
|
const tokens: Token[] = [];
|
|
122
|
-
let position = 0;
|
|
123
|
-
|
|
124
|
-
const specialCases = [
|
|
125
|
-
{
|
|
126
|
-
pattern: /<!DOCTYPE\s+[^>]*>/gi,
|
|
127
|
-
type: TokenType.DOCTYPE,
|
|
128
|
-
getValue: (match: string) => {
|
|
129
|
-
const doctypeMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
|
|
130
|
-
return doctypeMatch && doctypeMatch[1] ? doctypeMatch[1].toLowerCase() : match;
|
|
131
|
-
}
|
|
132
|
-
},
|
|
133
|
-
{
|
|
134
|
-
pattern: /<!--([\s\S]*?)(?:-->|$)/g,
|
|
135
|
-
type: TokenType.COMMENT,
|
|
136
|
-
getValue: (match: string) => match.slice(4, match.endsWith('-->') ? -3 : match.length)
|
|
137
|
-
},
|
|
138
|
-
{
|
|
139
|
-
pattern: /<!\[CDATA\[([\s\S]*?)\]\]>/g,
|
|
140
|
-
type: TokenType.CDATA,
|
|
141
|
-
getValue: (match: string) => match.slice(9, -3)
|
|
142
|
-
},
|
|
143
|
-
{
|
|
144
|
-
pattern: /<\?([^?]*(?:\?(?!>)[^?]*)*)\?>/g,
|
|
145
|
-
type: TokenType.PROCESSING_INSTRUCTION,
|
|
146
|
-
getValue: (match: string) => match.slice(0, -2)
|
|
147
|
-
}
|
|
148
|
-
];
|
|
149
|
-
|
|
150
|
-
const processedRanges: Array<[number, number]> = [];
|
|
151
|
-
|
|
152
|
-
for (const { pattern, type, getValue } of specialCases) {
|
|
153
|
-
const regex = new RegExp(pattern);
|
|
154
|
-
let match;
|
|
155
|
-
|
|
156
|
-
while ((match = regex.exec(html)) !== null) {
|
|
157
|
-
const start = match.index;
|
|
158
|
-
const end = start + match[0].length;
|
|
159
|
-
|
|
160
|
-
tokens.push({
|
|
161
|
-
type,
|
|
162
|
-
value: getValue(match[0]),
|
|
163
|
-
position: calculatePosition(html, start)
|
|
164
|
-
});
|
|
165
|
-
|
|
166
|
-
processedRanges.push([start, end]);
|
|
167
|
-
}
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
processedRanges.sort((a, b) => a[0] - b[0]);
|
|
171
|
-
|
|
172
172
|
let currentPos = 0;
|
|
173
173
|
|
|
174
174
|
while (currentPos < html.length) {
|
|
175
|
-
const inProcessedRange = processedRanges.some(([start, end]) =>
|
|
176
|
-
currentPos >= start && currentPos < end
|
|
177
|
-
);
|
|
178
|
-
|
|
179
|
-
if (inProcessedRange) {
|
|
180
|
-
const range = processedRanges.find(([start, end]) =>
|
|
181
|
-
currentPos >= start && currentPos < end
|
|
182
|
-
);
|
|
183
|
-
if (range) {
|
|
184
|
-
currentPos = range[1];
|
|
185
|
-
}
|
|
186
|
-
continue;
|
|
187
|
-
}
|
|
188
|
-
|
|
189
175
|
const char = html[currentPos];
|
|
190
176
|
|
|
191
177
|
if (char === '<') {
|
|
192
|
-
const
|
|
178
|
+
const remaining = html.slice(currentPos);
|
|
179
|
+
|
|
180
|
+
const doctypeMatch = remaining.match(/^<!DOCTYPE\s+[^>]*>/i);
|
|
181
|
+
if (doctypeMatch) {
|
|
182
|
+
const match = doctypeMatch[0];
|
|
183
|
+
const nameMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
|
|
184
|
+
tokens.push({
|
|
185
|
+
type: TokenType.DOCTYPE,
|
|
186
|
+
value: nameMatch && nameMatch[1] ? nameMatch[1].toLowerCase() : match,
|
|
187
|
+
position: calculatePosition(html, currentPos)
|
|
188
|
+
});
|
|
189
|
+
currentPos += match.length;
|
|
190
|
+
continue;
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
const commentMatch = remaining.match(/^<!--([\s\S]*?)(?:-->|$)/);
|
|
194
|
+
if (commentMatch) {
|
|
195
|
+
const match = commentMatch[0];
|
|
196
|
+
tokens.push({
|
|
197
|
+
type: TokenType.COMMENT,
|
|
198
|
+
value: match.slice(4, match.endsWith('-->') ? -3 : match.length),
|
|
199
|
+
position: calculatePosition(html, currentPos)
|
|
200
|
+
});
|
|
201
|
+
currentPos += match.length;
|
|
202
|
+
continue;
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
const cdataMatch = remaining.match(/^<!\[CDATA\[([\s\S]*?)\]\]>/);
|
|
206
|
+
if (cdataMatch) {
|
|
207
|
+
const content = cdataMatch[1];
|
|
208
|
+
tokens.push({
|
|
209
|
+
type: TokenType.COMMENT,
|
|
210
|
+
value: '[CDATA[' + content + ']]',
|
|
211
|
+
position: calculatePosition(html, currentPos)
|
|
212
|
+
});
|
|
213
|
+
currentPos += cdataMatch[0].length;
|
|
214
|
+
continue;
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
const piMatch = remaining.match(/^<\?([^>]*)/);
|
|
218
|
+
if (piMatch) {
|
|
219
|
+
let consumed = piMatch[0].length;
|
|
220
|
+
if (remaining[consumed] === '>') {
|
|
221
|
+
consumed++;
|
|
222
|
+
}
|
|
223
|
+
tokens.push({
|
|
224
|
+
type: TokenType.COMMENT,
|
|
225
|
+
value: '?' + piMatch[1],
|
|
226
|
+
position: calculatePosition(html, currentPos)
|
|
227
|
+
});
|
|
228
|
+
currentPos += consumed;
|
|
229
|
+
continue;
|
|
230
|
+
}
|
|
231
|
+
|
|
232
|
+
const tagMatch = remaining.match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
|
|
193
233
|
|
|
194
234
|
if (tagMatch) {
|
|
195
235
|
const fullTag = tagMatch[0];
|
|
@@ -222,6 +262,24 @@ export function tokenize(html: string): Token[] {
|
|
|
222
262
|
});
|
|
223
263
|
|
|
224
264
|
currentPos += fullTag.length;
|
|
265
|
+
|
|
266
|
+
if (!isClosing && !isSelfClosing && (RAW_TEXT_ELEMENTS.has(tagName) || RCDATA_ELEMENTS.has(tagName))) {
|
|
267
|
+
const closeTagPattern = new RegExp(`</${tagName}\\s*>`, 'i');
|
|
268
|
+
const restOfHtml = html.slice(currentPos);
|
|
269
|
+
const closeMatch = restOfHtml.match(closeTagPattern);
|
|
270
|
+
|
|
271
|
+
if (closeMatch && closeMatch.index !== undefined) {
|
|
272
|
+
const rawContent = restOfHtml.slice(0, closeMatch.index);
|
|
273
|
+
if (rawContent) {
|
|
274
|
+
tokens.push({
|
|
275
|
+
type: TokenType.TEXT,
|
|
276
|
+
value: RCDATA_ELEMENTS.has(tagName) ? decodeEntities(rawContent) : rawContent,
|
|
277
|
+
position: calculatePosition(html, currentPos)
|
|
278
|
+
});
|
|
279
|
+
}
|
|
280
|
+
currentPos += rawContent.length;
|
|
281
|
+
}
|
|
282
|
+
}
|
|
225
283
|
} else {
|
|
226
284
|
const textStart = currentPos;
|
|
227
285
|
currentPos++;
|
|
@@ -257,8 +315,6 @@ export function tokenize(html: string): Token[] {
|
|
|
257
315
|
}
|
|
258
316
|
}
|
|
259
317
|
|
|
260
|
-
tokens.sort((a, b) => a.position.offset - b.position.offset);
|
|
261
|
-
|
|
262
318
|
tokens.push({
|
|
263
319
|
type: TokenType.EOF,
|
|
264
320
|
value: '',
|
package/tests/advanced.test.ts
CHANGED
|
@@ -60,7 +60,7 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
60
60
|
});
|
|
61
61
|
});
|
|
62
62
|
|
|
63
|
-
it('should handle complex CDATA content', () => {
|
|
63
|
+
it('should handle complex CDATA content as bogus comment', () => {
|
|
64
64
|
const complexContent = `
|
|
65
65
|
function test() {
|
|
66
66
|
return "<div>HTML inside JS</div>";
|
|
@@ -71,8 +71,8 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
|
|
|
71
71
|
expect(tokens.length).toBeGreaterThan(0);
|
|
72
72
|
const cdataToken = tokens[0]!;
|
|
73
73
|
|
|
74
|
-
expect(cdataToken.type).toBe(TokenType.CDATA);
|
|
75
|
-
expect(cdataToken.value).toBe(complexContent);
|
|
74
|
+
expect(cdataToken.type).toBe(TokenType.COMMENT);
|
|
75
|
+
expect(cdataToken.value).toBe('[CDATA[' + complexContent + ']]');
|
|
76
76
|
});
|
|
77
77
|
|
|
78
78
|
it('should handle performance with large documents', () => {
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import { describe, it, expect } from 'bun:test';
|
|
2
|
+
import { parseHTML } from '../index';
|
|
3
|
+
|
|
4
|
+
describe('Custom Elements in <head>', () => {
|
|
5
|
+
|
|
6
|
+
it('should keep <meta-tags> custom element in head', () => {
|
|
7
|
+
const doc = parseHTML(
|
|
8
|
+
'<!DOCTYPE html><html><head><meta-tags></meta-tags></head><body></body></html>'
|
|
9
|
+
);
|
|
10
|
+
|
|
11
|
+
const metaTags = doc.head?.querySelector('meta-tags');
|
|
12
|
+
expect(metaTags).toBeTruthy();
|
|
13
|
+
expect(metaTags?.parentElement?.tagName).toBe('HEAD');
|
|
14
|
+
});
|
|
15
|
+
|
|
16
|
+
it('should keep <social-meta> custom element in head', () => {
|
|
17
|
+
const doc = parseHTML(
|
|
18
|
+
'<!DOCTYPE html><html><head><social-meta></social-meta></head><body></body></html>'
|
|
19
|
+
);
|
|
20
|
+
|
|
21
|
+
const socialMeta = doc.head?.querySelector('social-meta');
|
|
22
|
+
expect(socialMeta).toBeTruthy();
|
|
23
|
+
expect(socialMeta?.parentElement?.tagName).toBe('HEAD');
|
|
24
|
+
});
|
|
25
|
+
|
|
26
|
+
it('should keep any <custom-element> with hyphen in head', () => {
|
|
27
|
+
const doc = parseHTML(
|
|
28
|
+
'<!DOCTYPE html><html><head><my-component></my-component></head><body></body></html>'
|
|
29
|
+
);
|
|
30
|
+
|
|
31
|
+
const myComponent = doc.head?.querySelector('my-component');
|
|
32
|
+
expect(myComponent).toBeTruthy();
|
|
33
|
+
expect(myComponent?.parentElement?.tagName).toBe('HEAD');
|
|
34
|
+
});
|
|
35
|
+
|
|
36
|
+
it('should still eject non-custom elements like <div> to body', () => {
|
|
37
|
+
const doc = parseHTML(
|
|
38
|
+
'<!DOCTYPE html><html><head><div>test</div></head><body></body></html>'
|
|
39
|
+
);
|
|
40
|
+
|
|
41
|
+
const divInHead = doc.head?.querySelector('div');
|
|
42
|
+
const divInBody = doc.body?.querySelector('div');
|
|
43
|
+
expect(divInHead).toBeFalsy();
|
|
44
|
+
expect(divInBody).toBeTruthy();
|
|
45
|
+
});
|
|
46
|
+
|
|
47
|
+
it('should handle nested custom elements in head', () => {
|
|
48
|
+
const doc = parseHTML(
|
|
49
|
+
'<!DOCTYPE html><html><head><my-wrapper><inner-comp></inner-comp></my-wrapper></head><body></body></html>'
|
|
50
|
+
);
|
|
51
|
+
|
|
52
|
+
const myWrapper = doc.head?.querySelector('my-wrapper');
|
|
53
|
+
expect(myWrapper).toBeTruthy();
|
|
54
|
+
expect(myWrapper?.parentElement?.tagName).toBe('HEAD');
|
|
55
|
+
|
|
56
|
+
const innerComp = myWrapper?.querySelector('inner-comp');
|
|
57
|
+
expect(innerComp).toBeTruthy();
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
it('should keep custom elements with attributes in head', () => {
|
|
61
|
+
const doc = parseHTML(
|
|
62
|
+
'<!DOCTYPE html><html><head><seo-meta property="og:title" content="Test"></seo-meta></head><body></body></html>'
|
|
63
|
+
);
|
|
64
|
+
|
|
65
|
+
const seoMeta = doc.head?.querySelector('seo-meta');
|
|
66
|
+
expect(seoMeta).toBeTruthy();
|
|
67
|
+
expect(seoMeta?.getAttribute('property')).toBe('og:title');
|
|
68
|
+
expect(seoMeta?.getAttribute('content')).toBe('Test');
|
|
69
|
+
expect(seoMeta?.parentElement?.tagName).toBe('HEAD');
|
|
70
|
+
});
|
|
71
|
+
|
|
72
|
+
it('should keep self-closing custom elements in head', () => {
|
|
73
|
+
const doc = parseHTML(
|
|
74
|
+
'<!DOCTYPE html><html><head><custom-void /></head><body></body></html>'
|
|
75
|
+
);
|
|
76
|
+
|
|
77
|
+
const customVoid = doc.head?.querySelector('custom-void');
|
|
78
|
+
expect(customVoid).toBeTruthy();
|
|
79
|
+
expect(customVoid?.parentElement?.tagName).toBe('HEAD');
|
|
80
|
+
});
|
|
81
|
+
|
|
82
|
+
it('should handle custom elements mixed with standard head elements', () => {
|
|
83
|
+
const doc = parseHTML(
|
|
84
|
+
'<!DOCTYPE html><html><head><title>Test</title><meta-tags></meta-tags><link rel="stylesheet" href="style.css"></head><body></body></html>'
|
|
85
|
+
);
|
|
86
|
+
|
|
87
|
+
const title = doc.head?.querySelector('title');
|
|
88
|
+
const metaTags = doc.head?.querySelector('meta-tags');
|
|
89
|
+
const link = doc.head?.querySelector('link');
|
|
90
|
+
|
|
91
|
+
expect(title).toBeTruthy();
|
|
92
|
+
expect(metaTags).toBeTruthy();
|
|
93
|
+
expect(link).toBeTruthy();
|
|
94
|
+
});
|
|
95
|
+
|
|
96
|
+
it('should handle custom element containing text in head', () => {
|
|
97
|
+
const doc = parseHTML(
|
|
98
|
+
'<!DOCTYPE html><html><head><inline-script>console.log("test")</inline-script></head><body></body></html>'
|
|
99
|
+
);
|
|
100
|
+
|
|
101
|
+
const inlineScript = doc.head?.querySelector('inline-script');
|
|
102
|
+
expect(inlineScript).toBeTruthy();
|
|
103
|
+
expect(inlineScript?.parentElement?.tagName).toBe('HEAD');
|
|
104
|
+
});
|
|
105
|
+
});
|
|
@@ -27,7 +27,7 @@ describe('Tree Adapter Tests', () => {
|
|
|
27
27
|
it('should serialize comment', () => {
|
|
28
28
|
const doc = parseHTML('<div><!-- comment --></div>');
|
|
29
29
|
const serialized = serializeToHtml5lib(doc);
|
|
30
|
-
expect(serialized).toContain('<!-- -->');
|
|
30
|
+
expect(serialized).toContain('<!-- comment -->');
|
|
31
31
|
});
|
|
32
32
|
|
|
33
33
|
it('should serialize DOCTYPE', () => {
|
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
// tests/helpers/tree-adapter.ts
|
|
2
2
|
|
|
3
|
-
export function serializeToHtml5lib(doc: any): string {
|
|
3
|
+
export interface SerializeOptions {
|
|
4
|
+
skipImplicitDoctype?: boolean;
|
|
5
|
+
}
|
|
6
|
+
|
|
7
|
+
export function serializeToHtml5lib(doc: any, options: SerializeOptions = {}): string {
|
|
4
8
|
const lines: string[] = [];
|
|
5
9
|
|
|
6
10
|
function serialize(node: any, depth: number): void {
|
|
@@ -11,7 +15,17 @@ export function serializeToHtml5lib(doc: any): string {
|
|
|
11
15
|
serialize(child, depth);
|
|
12
16
|
}
|
|
13
17
|
} else if (node.nodeType === 1) { // ELEMENT
|
|
14
|
-
|
|
18
|
+
const tagName = node.tagName.toLowerCase();
|
|
19
|
+
const ns = node.namespaceURI;
|
|
20
|
+
|
|
21
|
+
let nsPrefix = '';
|
|
22
|
+
if (ns === 'http://www.w3.org/2000/svg') {
|
|
23
|
+
nsPrefix = ' svg';
|
|
24
|
+
} else if (ns === 'http://www.w3.org/1998/Math/MathML') {
|
|
25
|
+
nsPrefix = ' math';
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
lines.push(`${indent}<${tagName}${nsPrefix}>`);
|
|
15
29
|
|
|
16
30
|
// Atributos en orden alfabético
|
|
17
31
|
const attrs = Object.entries(node.attributes || {}).sort(([a], [b]) => a.localeCompare(b));
|
|
@@ -32,9 +46,12 @@ export function serializeToHtml5lib(doc: any): string {
|
|
|
32
46
|
} else if (node.nodeType === 3) { // TEXT
|
|
33
47
|
lines.push(`${indent}"${node.textContent}"`);
|
|
34
48
|
} else if (node.nodeType === 8) { // COMMENT
|
|
35
|
-
|
|
49
|
+
const commentData = node.data || node.nodeValue || node.textContent || '';
|
|
50
|
+
lines.push(`${indent}<!-- ${commentData} -->`);
|
|
36
51
|
} else if (node.nodeType === 10) { // DOCTYPE
|
|
37
|
-
|
|
52
|
+
if (!options.skipImplicitDoctype) {
|
|
53
|
+
lines.push(`${indent}<!DOCTYPE ${node.name || 'html'}>`);
|
|
54
|
+
}
|
|
38
55
|
}
|
|
39
56
|
}
|
|
40
57
|
|
package/tests/parser.test.ts
CHANGED
|
@@ -18,7 +18,8 @@ function parseToAST(html: string): ASTNode {
|
|
|
18
18
|
if (htmlEl) {
|
|
19
19
|
const bodyEl = htmlEl.children?.find(c => c.tagName === 'body');
|
|
20
20
|
if (bodyEl && bodyEl.children) {
|
|
21
|
-
|
|
21
|
+
const nonHtmlChildren = ast.children?.filter(c => c.tagName !== 'html' && c.type !== 'doctype') || [];
|
|
22
|
+
return { type: ASTNodeType.Document, children: [...nonHtmlChildren, ...bodyEl.children] };
|
|
22
23
|
}
|
|
23
24
|
}
|
|
24
25
|
return ast;
|
package/tests/tokenizer.test.ts
CHANGED
|
@@ -198,21 +198,21 @@ describe('HTML Tokenizer', () => {
|
|
|
198
198
|
});
|
|
199
199
|
});
|
|
200
200
|
|
|
201
|
-
describe('CDATA Sections', () => {
|
|
202
|
-
it('should parse CDATA sections', () => {
|
|
201
|
+
describe('CDATA Sections (HTML5: treated as bogus comments)', () => {
|
|
202
|
+
it('should parse CDATA sections as bogus comments in HTML5', () => {
|
|
203
203
|
const tokens = tokenize('<![CDATA[Some data]]>');
|
|
204
204
|
|
|
205
205
|
expect(tokens[0]).toEqual({
|
|
206
|
-
type: TokenType.CDATA,
|
|
207
|
-
value: 'Some data',
|
|
206
|
+
type: TokenType.COMMENT,
|
|
207
|
+
value: '[CDATA[Some data]]',
|
|
208
208
|
position: expect.any(Object)
|
|
209
209
|
});
|
|
210
210
|
});
|
|
211
211
|
|
|
212
|
-
it('should handle CDATA with special characters', () => {
|
|
212
|
+
it('should handle CDATA with special characters as bogus comment', () => {
|
|
213
213
|
const tokens = tokenize('<![CDATA[<script>alert("test");</script>]]>');
|
|
214
214
|
|
|
215
|
-
expect(tokens[0]?.value).toBe('<script>alert("test");</script>');
|
|
215
|
+
expect(tokens[0]?.value).toBe('[CDATA[<script>alert("test");</script>]]');
|
|
216
216
|
});
|
|
217
217
|
});
|
|
218
218
|
|
|
@@ -235,22 +235,22 @@ describe('HTML Tokenizer', () => {
|
|
|
235
235
|
});
|
|
236
236
|
});
|
|
237
237
|
|
|
238
|
-
describe('Processing Instructions', () => {
|
|
239
|
-
it('should parse XML processing instruction', () => {
|
|
238
|
+
describe('Processing Instructions (HTML5: treated as bogus comments)', () => {
|
|
239
|
+
it('should parse XML processing instruction as bogus comment', () => {
|
|
240
240
|
const tokens = tokenize('<?xml version="1.0" encoding="UTF-8"?>');
|
|
241
241
|
|
|
242
242
|
expect(tokens[0]).toEqual({
|
|
243
|
-
type: TokenType.PROCESSING_INSTRUCTION,
|
|
244
|
-
value: '<?xml version="1.0" encoding="UTF-8"',
|
|
243
|
+
type: TokenType.COMMENT,
|
|
244
|
+
value: '?xml version="1.0" encoding="UTF-8"?',
|
|
245
245
|
position: expect.any(Object)
|
|
246
246
|
});
|
|
247
247
|
});
|
|
248
248
|
|
|
249
|
-
it('should parse PHP-style processing instruction', () => {
|
|
249
|
+
it('should parse PHP-style processing instruction as bogus comment', () => {
|
|
250
250
|
const tokens = tokenize('<?php echo "Hello"; ?>');
|
|
251
251
|
|
|
252
|
-
expect(tokens[0]?.type).toBe(TokenType.PROCESSING_INSTRUCTION);
|
|
253
|
-
expect(tokens[0]?.value).toBe('<?php echo "Hello"; ');
|
|
252
|
+
expect(tokens[0]?.type).toBe(TokenType.COMMENT);
|
|
253
|
+
expect(tokens[0]?.value).toBe('?php echo "Hello"; ?');
|
|
254
254
|
});
|
|
255
255
|
});
|
|
256
256
|
|
|
@@ -429,7 +429,7 @@ describe('HTML Tokenizer', () => {
|
|
|
429
429
|
});
|
|
430
430
|
});
|
|
431
431
|
|
|
432
|
-
it('should handle CDATA with complex content', () => {
|
|
432
|
+
it('should handle CDATA as bogus comment with complex content', () => {
|
|
433
433
|
const complexContent = `
|
|
434
434
|
function it() {
|
|
435
435
|
return "<div>HTML inside JS</div>";
|
|
@@ -440,11 +440,11 @@ describe('HTML Tokenizer', () => {
|
|
|
440
440
|
const tokens = tokenize(`<![CDATA[${complexContent}]]>`);
|
|
441
441
|
const cdataToken = tokens[0]!;
|
|
442
442
|
|
|
443
|
-
expect(cdataToken.type).toBe(TokenType.CDATA);
|
|
444
|
-
expect(cdataToken.value).toBe(complexContent);
|
|
443
|
+
expect(cdataToken.type).toBe(TokenType.COMMENT);
|
|
444
|
+
expect(cdataToken.value).toBe('[CDATA[' + complexContent + ']]');
|
|
445
445
|
});
|
|
446
446
|
|
|
447
|
-
it('should handle processing instructions
|
|
447
|
+
it('should handle processing instructions as bogus comments', () => {
|
|
448
448
|
const tests = [
|
|
449
449
|
{ input: '<?xml version="1.0" encoding="UTF-8"?>', expected: 'xml' },
|
|
450
450
|
{ input: '<?xml-stylesheet type="text/xsl" href="style.xsl"?>', expected: 'xml' },
|
|
@@ -456,7 +456,7 @@ describe('HTML Tokenizer', () => {
|
|
|
456
456
|
const tokens = tokenize(test.input);
|
|
457
457
|
const piToken = tokens[0]!;
|
|
458
458
|
|
|
459
|
-
expect(piToken.type).toBe(TokenType.PROCESSING_INSTRUCTION);
|
|
459
|
+
expect(piToken.type).toBe(TokenType.COMMENT);
|
|
460
460
|
expect(piToken.value.toLowerCase()).toContain(test.expected);
|
|
461
461
|
});
|
|
462
462
|
});
|
|
@@ -478,15 +478,13 @@ describe('HTML Tokenizer', () => {
|
|
|
478
478
|
});
|
|
479
479
|
});
|
|
480
480
|
|
|
481
|
-
it('should handle mixed content with all token types', () => {
|
|
481
|
+
it('should handle mixed content with all token types (HTML5 mode)', () => {
|
|
482
482
|
const html = `
|
|
483
|
-
<?xml version="1.0"?>
|
|
484
483
|
<!DOCTYPE html>
|
|
485
484
|
<!-- Main document -->
|
|
486
485
|
<html lang="en">
|
|
487
486
|
<head>
|
|
488
487
|
<title>Test & Demo</title>
|
|
489
|
-
<![CDATA[Some raw data]]>
|
|
490
488
|
</head>
|
|
491
489
|
<body>
|
|
492
490
|
<h1>Hello World</h1>
|
|
@@ -500,27 +498,25 @@ describe('HTML Tokenizer', () => {
|
|
|
500
498
|
const tokens = tokenize(html);
|
|
501
499
|
|
|
502
500
|
const tokenCounts = {
|
|
503
|
-
[TokenType.PROCESSING_INSTRUCTION]: 0,
|
|
504
501
|
[TokenType.DOCTYPE]: 0,
|
|
505
502
|
[TokenType.COMMENT]: 0,
|
|
506
503
|
[TokenType.TAG_OPEN]: 0,
|
|
507
504
|
[TokenType.TAG_CLOSE]: 0,
|
|
508
505
|
[TokenType.TEXT]: 0,
|
|
509
|
-
[TokenType.CDATA]: 0,
|
|
510
506
|
[TokenType.EOF]: 0
|
|
511
507
|
};
|
|
512
508
|
|
|
513
509
|
tokens.forEach(token => {
|
|
514
|
-
|
|
510
|
+
if (token.type in tokenCounts) {
|
|
511
|
+
tokenCounts[token.type]++;
|
|
512
|
+
}
|
|
515
513
|
});
|
|
516
514
|
|
|
517
|
-
expect(tokenCounts[TokenType.PROCESSING_INSTRUCTION]).toBeGreaterThan(0);
|
|
518
515
|
expect(tokenCounts[TokenType.DOCTYPE]).toBeGreaterThan(0);
|
|
519
516
|
expect(tokenCounts[TokenType.COMMENT]).toBeGreaterThan(0);
|
|
520
517
|
expect(tokenCounts[TokenType.TAG_OPEN]).toBeGreaterThan(0);
|
|
521
518
|
expect(tokenCounts[TokenType.TAG_CLOSE]).toBeGreaterThan(0);
|
|
522
519
|
expect(tokenCounts[TokenType.TEXT]).toBeGreaterThan(0);
|
|
523
|
-
expect(tokenCounts[TokenType.CDATA]).toBeGreaterThan(0);
|
|
524
520
|
expect(tokenCounts[TokenType.EOF]).toBe(1);
|
|
525
521
|
});
|
|
526
522
|
})
|