@tkeron/html-parser 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm_deploy.yml +24 -0
- package/LICENSE +21 -0
- package/README.md +120 -0
- package/bun.lock +29 -0
- package/index.ts +18 -0
- package/package.json +25 -0
- package/src/css-selector.ts +172 -0
- package/src/dom-simulator.ts +592 -0
- package/src/dom-types.ts +78 -0
- package/src/parser.ts +355 -0
- package/src/tokenizer.ts +413 -0
- package/tests/advanced.test.ts +487 -0
- package/tests/api-integration.test.ts +114 -0
- package/tests/dom-extended.test.ts +173 -0
- package/tests/dom.test.ts +482 -0
- package/tests/google-dom.test.ts +118 -0
- package/tests/google-homepage.txt +13 -0
- package/tests/official/README.md +87 -0
- package/tests/official/acid/acid-tests.test.ts +309 -0
- package/tests/official/final-output/final-output.test.ts +361 -0
- package/tests/official/html5lib/tokenizer-utils.ts +204 -0
- package/tests/official/html5lib/tokenizer.test.ts +184 -0
- package/tests/official/html5lib/tree-construction-utils.ts +208 -0
- package/tests/official/html5lib/tree-construction.test.ts +250 -0
- package/tests/official/validator/validator-tests.test.ts +237 -0
- package/tests/official/validator-nu/validator-nu.test.ts +335 -0
- package/tests/official/whatwg/whatwg-tests.test.ts +205 -0
- package/tests/official/wpt/wpt-tests.test.ts +409 -0
- package/tests/parser.test.ts +642 -0
- package/tests/selectors.test.ts +65 -0
- package/tests/test-page-0.txt +362 -0
- package/tests/tokenizer.test.ts +666 -0
- package/tsconfig.json +25 -0
package/src/tokenizer.ts
ADDED
@@ -0,0 +1,413 @@
/**
 * HTML Tokenizer using Bun's HTMLRewriter for efficient HTML parsing
 * This tokenizer provides a stream-based approach to HTML parsing
 */

export enum TokenType {
  TAG_OPEN = 'TAG_OPEN',
  TAG_CLOSE = 'TAG_CLOSE',
  TEXT = 'TEXT',
  COMMENT = 'COMMENT',
  CDATA = 'CDATA',
  DOCTYPE = 'DOCTYPE',
  PROCESSING_INSTRUCTION = 'PROCESSING_INSTRUCTION',
  EOF = 'EOF'
}

export interface Position {
  line: number;
  column: number;
  offset: number;
}

export interface Token {
  type: TokenType;
  value: string;
  position: Position;
  attributes?: Record<string, string>;
  isSelfClosing?: boolean;
  isClosing?: boolean;
}
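
For orientation, a minimal sketch of a Token value as this tokenizer would produce it (illustrative literals, not part of the package source):

// Hypothetical example token (illustrative, not in the package):
const sampleToken: Token = {
  type: TokenType.TAG_OPEN,
  value: 'a',
  position: { line: 1, column: 0, offset: 0 },
  attributes: { href: 'https://example.com' },
  isSelfClosing: false
};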

// HTML entities mapping
const HTML_ENTITIES: Record<string, string> = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&apos;': "'",
  '&nbsp;': '\u00A0',
  '&copy;': '©',
  '&reg;': '®',
  '&trade;': '™',
  '&hellip;': '…',
  '&mdash;': '—',
  '&ndash;': '–',
  '&lsquo;': '\u2018',
  '&rsquo;': '\u2019',
  '&ldquo;': '\u201C',
  '&rdquo;': '\u201D',
  '&not;': '¬'
};

/**
 * Decode HTML entities in a string and handle null characters
 */
function decodeEntities(text: string): string {
  // First, replace null characters with the Unicode replacement character
  let result = text.replace(/\u0000/g, '\uFFFD');

  // Then decode HTML entities
  return result.replace(/&(?:#x([0-9a-fA-F]+);?|#([0-9]+);?|([a-zA-Z][a-zA-Z0-9]*);?)/g, (match, hex, decimal, named) => {
    if (hex) {
      return String.fromCharCode(parseInt(hex, 16));
    }
    if (decimal) {
      return String.fromCharCode(parseInt(decimal, 10));
    }
    if (named) {
      // First try with semicolon
      if (HTML_ENTITIES[`&${named};`]) {
        return HTML_ENTITIES[`&${named};`];
      }

      // For entities without semicolon, try to find the longest valid entity prefix
      if (!match.endsWith(';')) {
        for (let i = named.length; i > 0; i--) {
          const prefix = named.substring(0, i);
          if (HTML_ENTITIES[`&${prefix};`]) {
            const remainder = named.substring(i);
            return HTML_ENTITIES[`&${prefix};`] + remainder;
          }
        }
      }

      return match;
    }
    return match;
  });
}
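
A few expected input/output pairs for decodeEntities, derived from the logic above (a sketch for illustration; these calls are not in the package source):

// decodeEntities behavior sketch (illustrative):
decodeEntities('a &lt; b &amp;&amp; c'); // => 'a < b && c'
decodeEntities('&#65;&#x42;');           // => 'AB' (decimal and hex character references)
decodeEntities('&ampfoo');               // => '&foo' (longest valid prefix "amp" is decoded)
decodeEntities('x\u0000y');              // => 'x\uFFFDy' (null characters become U+FFFD)
decodeEntities('&unknown;');             // => '&unknown;' (unrecognized names pass through)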

/**
 * Parse attributes from a tag string
 */
function parseAttributes(attributeString: string): Record<string, string> {
  const attributes: Record<string, string> = {};

  // Regex to match attributes: name="value", name='value', name=value, or just name
  const attrRegex = /([a-zA-Z][a-zA-Z0-9\-_:]*)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;
  let match;

  while ((match = attrRegex.exec(attributeString)) !== null) {
    const [, name, doubleQuoted, singleQuoted, unquoted] = match;
    if (name) {
      const value = doubleQuoted ?? singleQuoted ?? unquoted ?? '';
      attributes[name.toLowerCase()] = decodeEntities(value);
    }
  }

  return attributes;
}
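
A usage sketch for parseAttributes, based on the regex above (illustrative, not part of the package source):

// parseAttributes behavior sketch (illustrative):
parseAttributes('id="main" class=\'nav\' data-x=1 disabled');
// => { id: 'main', class: 'nav', 'data-x': '1', disabled: '' }
// Names are lowercased, values run through decodeEntities, bare attributes get ''.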

/**
 * Calculate position in text
 */
function calculatePosition(text: string, offset: number): Position {
  const lines = text.slice(0, offset).split('\n');
  return {
    line: lines.length,
    column: lines[lines.length - 1]?.length ?? 0,
    offset
  };
}
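
A worked example for calculatePosition (illustrative, not part of the package source). Lines are 1-based; the column is the 0-based count of characters after the last newline before the offset:

// calculatePosition behavior sketch (illustrative):
calculatePosition('ab\ncd', 4);
// => { line: 2, column: 1, offset: 4 }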

/**
 * Tokenize HTML using a combination of HTMLRewriter and manual parsing
 * HTMLRewriter is great for structured HTML but we need manual parsing for edge cases
 */
export function tokenize(html: string): Token[] {
  const tokens: Token[] = [];
  let position = 0;

  // Handle special cases first (DOCTYPE, comments, CDATA, processing instructions)
  const specialCases = [
    // DOCTYPE
    {
      pattern: /<!DOCTYPE\s+[^>]*>/gi,
      type: TokenType.DOCTYPE,
      getValue: (match: string) => {
        // Extract just the doctype name (e.g., "html" from "<!DOCTYPE html>")
        const doctypeMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
        return doctypeMatch && doctypeMatch[1] ? doctypeMatch[1].toLowerCase() : match;
      }
    },
    // Comments (including unclosed ones)
    {
      pattern: /<!--([\s\S]*?)(?:-->|$)/g,
      type: TokenType.COMMENT,
      getValue: (match: string) => match.slice(4, match.endsWith('-->') ? -3 : match.length)
    },
    // CDATA
    {
      pattern: /<!\[CDATA\[([\s\S]*?)\]\]>/g,
      type: TokenType.CDATA,
      getValue: (match: string) => match.slice(9, -3)
    },
    // Processing Instructions
    {
      pattern: /<\?([^?]*(?:\?(?!>)[^?]*)*)\?>/g,
      type: TokenType.PROCESSING_INSTRUCTION,
      getValue: (match: string) => match.slice(0, -2) // Remove the ?> at the end
    }
  ];

  // Track processed ranges to avoid double processing
  const processedRanges: Array<[number, number]> = [];

  // Process special cases first
  for (const { pattern, type, getValue } of specialCases) {
    const regex = new RegExp(pattern);
    let match;

    while ((match = regex.exec(html)) !== null) {
      const start = match.index;
      const end = start + match[0].length;

      tokens.push({
        type,
        value: getValue(match[0]),
        position: calculatePosition(html, start)
      });

      processedRanges.push([start, end]);
    }
  }

  // Sort processed ranges by start position
  processedRanges.sort((a, b) => a[0] - b[0]);

  // Process remaining HTML with manual parsing
  let currentPos = 0;

  while (currentPos < html.length) {
    // Check if current position is in a processed range
    const inProcessedRange = processedRanges.some(([start, end]) =>
      currentPos >= start && currentPos < end
    );

    if (inProcessedRange) {
      // Skip to end of processed range
      const range = processedRanges.find(([start, end]) =>
        currentPos >= start && currentPos < end
      );
      if (range) {
        currentPos = range[1];
      }
      continue;
    }

    const char = html[currentPos];

    if (char === '<') {
      // Check if it's a tag
      const tagMatch = html.slice(currentPos).match(/^<\/?([a-zA-Z][a-zA-Z0-9]*)[^>]*>/);

      if (tagMatch) {
        const fullTag = tagMatch[0];
        const tagName = tagMatch[1]?.toLowerCase();

        if (!tagName) {
          currentPos++;
          continue;
        }

        const isClosing = fullTag.startsWith('</');
        const isSelfClosing = fullTag.endsWith('/>');

        // Parse attributes if it's an opening tag
        let attributes: Record<string, string> = {};
        if (!isClosing) {
          const attrMatch = fullTag.match(/^<[a-zA-Z][a-zA-Z0-9]*\s+([^>]*?)\/?>$/);
          if (attrMatch && attrMatch[1]) {
            attributes = parseAttributes(attrMatch[1]);
          }
        }

        tokens.push({
          type: isClosing ? TokenType.TAG_CLOSE : TokenType.TAG_OPEN,
          value: tagName,
          position: calculatePosition(html, currentPos),
          ...(isClosing ? { isClosing: true } : {
            attributes,
            isSelfClosing
          })
        });

        currentPos += fullTag.length;
      } else {
        // Not a valid tag, treat as text
        const textStart = currentPos;
        currentPos++;

        // Find the end of text (next '<' or end of string)
        while (currentPos < html.length && html[currentPos] !== '<') {
          currentPos++;
        }

        const textContent = html.slice(textStart, currentPos);
        if (textContent) { // Keep all text content, including whitespace-only
          tokens.push({
            type: TokenType.TEXT,
            value: decodeEntities(textContent),
            position: calculatePosition(html, textStart)
          });
        }
      }
    } else {
      // Text content
      const textStart = currentPos;

      // Find the end of text (next '<' or end of string)
      while (currentPos < html.length && html[currentPos] !== '<') {
        currentPos++;
      }

      const textContent = html.slice(textStart, currentPos);
      if (textContent) { // Keep all text content, including whitespace-only
        tokens.push({
          type: TokenType.TEXT,
          value: decodeEntities(textContent),
          position: calculatePosition(html, textStart)
        });
      }
    }
  }

  // Sort tokens by position
  tokens.sort((a, b) => a.position.offset - b.position.offset);

  // Add EOF token
  tokens.push({
    type: TokenType.EOF,
    value: '',
    position: calculatePosition(html, html.length)
  });

  return tokens;
}
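
A usage sketch for tokenize showing the token stream for a small fragment (illustrative, not part of the package source; positions omitted for brevity):

// tokenize behavior sketch (illustrative):
const toks = tokenize('<p class="x">Hi &amp; bye</p>');
// => TAG_OPEN  value 'p'   attributes { class: 'x' }  isSelfClosing false
//    TEXT      value 'Hi & bye'
//    TAG_CLOSE value 'p'   isClosing true
//    EOF       value ''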

/**
 * Enhanced tokenizer that uses HTMLRewriter for better performance on large HTML
 * This is more efficient for well-formed HTML documents
 */
export function tokenizeWithRewriter(html: string): Token[] {
  const tokens: Token[] = [];
  let textBuffer = '';
  let position = 0;

  // First pass: collect all tokens using HTMLRewriter
  const rewriter = new HTMLRewriter();

  // Handle all elements
  rewriter.on('*', {
    element(element) {
      // Flush any accumulated text
      if (textBuffer.trim()) {
        tokens.push({
          type: TokenType.TEXT,
          value: decodeEntities(textBuffer),
          position: calculatePosition(html, position - textBuffer.length)
        });
        textBuffer = '';
      }

      // Add opening tag
      const attributes: Record<string, string> = {};
      for (const [name, value] of element.attributes) {
        attributes[name] = value;
      }

      tokens.push({
        type: TokenType.TAG_OPEN,
        value: element.tagName.toLowerCase(),
        position: calculatePosition(html, position),
        attributes,
        isSelfClosing: element.selfClosing
      });

      // Handle self-closing tags
      if (!element.selfClosing) {
        // We'll add the closing tag in the end handler
        element.onEndTag((endTag) => {
          tokens.push({
            type: TokenType.TAG_CLOSE,
            value: endTag.name.toLowerCase(),
            position: calculatePosition(html, position),
            isClosing: true
          });
        });
      }
    },

    text(text) {
      textBuffer += text.text;
    },

    comments(comment) {
      tokens.push({
        type: TokenType.COMMENT,
        value: comment.text,
        position: calculatePosition(html, position)
      });
    }
  });

  try {
    // Transform the HTML (this triggers the rewriter)
    const response = new Response(html, {
      headers: { 'Content-Type': 'text/html' }
    });

    rewriter.transform(response);

    // Flush any remaining text
    if (textBuffer.trim()) {
      tokens.push({
        type: TokenType.TEXT,
        value: decodeEntities(textBuffer),
        position: calculatePosition(html, position - textBuffer.length)
      });
    }

  } catch (error) {
    // If HTMLRewriter fails, fall back to manual parsing
    console.warn('HTMLRewriter failed, falling back to manual parsing:', error);
    return tokenize(html);
  }

  // Sort tokens by position and add EOF
  tokens.sort((a, b) => a.position.offset - b.position.offset);
  tokens.push({
    type: TokenType.EOF,
    value: '',
    position: calculatePosition(html, html.length)
  });

  return tokens;
}
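
A minimal usage sketch for tokenizeWithRewriter, assuming the Bun runtime where HTMLRewriter is available as a global (illustrative, not part of the package source):

// tokenizeWithRewriter behavior sketch (illustrative):
const toks2 = tokenizeWithRewriter('<div id="a"><span>text</span></div>');
// On success: TAG_OPEN/TEXT/TAG_CLOSE tokens plus a trailing EOF token.
// If HTMLRewriter throws, the function logs a warning and falls back to tokenize().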

/**
 * Smart tokenizer that chooses the best method based on HTML content
 */
export function smartTokenize(html: string): Token[] {
  // Use HTMLRewriter for well-formed HTML, manual parsing for edge cases
  const hasSpecialContent = /<!DOCTYPE|<!--|\[CDATA\[|<\?/.test(html);

  if (hasSpecialContent || html.length < 1000) {
    // Use manual parsing for small HTML or HTML with special content
    return tokenize(html);
  } else {
    // Use HTMLRewriter for large, well-formed HTML
    return tokenizeWithRewriter(html);
  }
}
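
A dispatch sketch for smartTokenize (illustrative, not part of the package source; largeWellFormedHtml is a hypothetical placeholder string):

// smartTokenize dispatch sketch (illustrative):
smartTokenize('<!DOCTYPE html><p>hi</p>'); // special content  -> tokenize()
smartTokenize('<div>hi</div>');            // under 1000 chars -> tokenize()
smartTokenize(largeWellFormedHtml);        // otherwise        -> tokenizeWithRewriter()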