@tkeron/html-parser 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bun.lock +6 -9
- package/package.json +3 -3
- package/src/dom-simulator.ts +396 -51
- package/src/dom-types.ts +3 -0
- package/src/tokenizer.ts +5 -49
- package/tests/custom-elements.test.ts +745 -0
- package/tests/dom-adoption.test.ts +363 -0
- package/tests/dom-manipulation.test.ts +688 -0
- package/tests/dom-synchronization.test.ts +675 -0
package/src/tokenizer.ts
CHANGED
@@ -1,8 +1,3 @@
-/**
- * HTML Tokenizer using Bun's HTMLRewriter for efficient HTML parsing
- * This tokenizer provides a stream-based approach to HTML parsing
- */
-
 export enum TokenType {
   TAG_OPEN = 'TAG_OPEN',
   TAG_CLOSE = 'TAG_CLOSE',
@@ -29,7 +24,6 @@ export interface Token {
   isClosing?: boolean;
 }
 
-// HTML entities mapping
 const HTML_ENTITIES: Record<string, string> = {
   '&amp;': '&',
   '&lt;': '<',
@@ -54,10 +48,8 @@ const HTML_ENTITIES: Record<string, string> = {
  * Decode HTML entities in a string and handle null characters
  */
 function decodeEntities(text: string): string {
-  // First, replace null characters with the Unicode replacement character
   let result = text.replace(/\u0000/g, '\uFFFD');
 
-  // Then decode HTML entities
   return result.replace(/&(?:#x([0-9a-fA-F]+);?|#([0-9]+);?|([a-zA-Z][a-zA-Z0-9]*);?)/g, (match, hex, decimal, named) => {
     if (hex) {
       return String.fromCharCode(parseInt(hex, 16));
@@ -66,12 +58,10 @@ function decodeEntities(text: string): string {
       return String.fromCharCode(parseInt(decimal, 10));
     }
     if (named) {
-      // First try with semicolon
       if (HTML_ENTITIES[`&${named};`]) {
         return HTML_ENTITIES[`&${named};`];
       }
 
-      // For entities without semicolon, try to find the longest valid entity prefix
       if (!match.endsWith(';')) {
         for (let i = named.length; i > 0; i--) {
           const prefix = named.substring(0, i);
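The two hunks above only strip comments from decodeEntities; the logic is unchanged. For reference, the entity regex captures three alternatives (hex, decimal, and named references), each with an optional trailing semicolon, and that optional semicolon is what makes the longest-prefix fallback necessary. A standalone sketch of what it matches (decodeEntities itself is module-local, so this exercises the pattern directly):

const entityRe = /&(?:#x([0-9a-fA-F]+);?|#([0-9]+);?|([a-zA-Z][a-zA-Z0-9]*);?)/g;

for (const m of '&#x42; &#68; &amp &lt;'.matchAll(entityRe)) {
  console.log(m[0], '->', m[1] ?? m[2] ?? m[3]);
}
// "&#x42;" -> "42"  (hex; decodes to "B")
// "&#68;"  -> "68"  (decimal; decodes to "D")
// "&amp"   -> "amp" (no semicolon; resolved by the prefix loop above)
// "&lt;"   -> "lt"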
@@ -94,7 +84,6 @@ function decodeEntities(text: string): string {
 function parseAttributes(attributeString: string): Record<string, string> {
   const attributes: Record<string, string> = {};
 
-  // Regex to match attributes: name="value", name='value', name=value, or just name
   const attrRegex = /([a-zA-Z][a-zA-Z0-9\-_:]*)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;
   let match;
 
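parseAttributes keeps its regex, which handles double-quoted, single-quoted, unquoted, and bare (valueless) attributes. Exercised standalone:

const attrRegex = /([a-zA-Z][a-zA-Z0-9\-_:]*)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;

for (const m of `id="a" class='b c' disabled data-x=1`.matchAll(attrRegex)) {
  console.log(m[1], '=', m[2] ?? m[3] ?? m[4] ?? '(no value)');
}
// id = a;  class = b c;  disabled = (no value);  data-x = 1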
@@ -129,42 +118,34 @@ export function tokenize(html: string): Token[] {
   const tokens: Token[] = [];
   let position = 0;
 
-  // Handle special cases first (DOCTYPE, comments, CDATA, processing instructions)
   const specialCases = [
-    // DOCTYPE
     {
       pattern: /<!DOCTYPE\s+[^>]*>/gi,
       type: TokenType.DOCTYPE,
       getValue: (match: string) => {
-        // Extract just the doctype name (e.g., "html" from "<!DOCTYPE html>")
         const doctypeMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
         return doctypeMatch && doctypeMatch[1] ? doctypeMatch[1].toLowerCase() : match;
       }
     },
-    // Comments (including unclosed ones)
     {
       pattern: /<!--([\s\S]*?)(?:-->|$)/g,
       type: TokenType.COMMENT,
       getValue: (match: string) => match.slice(4, match.endsWith('-->') ? -3 : match.length)
     },
-    // CDATA
     {
       pattern: /<!\[CDATA\[([\s\S]*?)\]\]>/g,
       type: TokenType.CDATA,
       getValue: (match: string) => match.slice(9, -3)
     },
-    // Processing Instructions
     {
       pattern: /<\?([^?]*(?:\?(?!>)[^?]*)*)\?>/g,
       type: TokenType.PROCESSING_INSTRUCTION,
-      getValue: (match: string) => match.slice(0, -2)
+      getValue: (match: string) => match.slice(0, -2)
     }
   ];
 
-  // Track processed ranges to avoid double processing
   const processedRanges: Array<[number, number]> = [];
 
-  // Process special cases first
   for (const { pattern, type, getValue } of specialCases) {
     const regex = new RegExp(pattern);
     let match;
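A detail worth keeping in mind from the specialCases table above, now that its inline comments are gone: the comment pattern's (?:-->|$) alternative deliberately tolerates unclosed comments, and getValue strips the terminator only when it is present:

const commentRe = /<!--([\s\S]*?)(?:-->|$)/g;

console.log('<!-- ok --><!-- unclosed'.match(commentRe));
// ["<!-- ok -->", "<!-- unclosed"] (the second match runs to end of input)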
@@ -183,20 +164,16 @@ export function tokenize(html: string): Token[] {
     }
   }
 
-  // Sort processed ranges by start position
   processedRanges.sort((a, b) => a[0] - b[0]);
 
-  // Process remaining HTML with manual parsing
   let currentPos = 0;
 
   while (currentPos < html.length) {
-    // Check if current position is in a processed range
     const inProcessedRange = processedRanges.some(([start, end]) =>
       currentPos >= start && currentPos < end
     );
 
     if (inProcessedRange) {
-      // Skip to end of processed range
       const range = processedRanges.find(([start, end]) =>
         currentPos >= start && currentPos < end
       );
@@ -209,8 +186,7 @@ export function tokenize(html: string): Token[] {
     const char = html[currentPos];
 
     if (char === '<') {
-
-      const tagMatch = html.slice(currentPos).match(/^<\/?([a-zA-Z][a-zA-Z0-9]*)[^>]*>/);
+      const tagMatch = html.slice(currentPos).match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
 
       if (tagMatch) {
         const fullTag = tagMatch[0];
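This is the substantive change of the release (most of the other hunks just drop comments), and it lines up with the new tests/custom-elements.test.ts: the old name pattern [a-zA-Z][a-zA-Z0-9]* stopped capturing at the first character outside that set, so hyphenated custom-element names were truncated. Side by side:

const oldTag = /^<\/?([a-zA-Z][a-zA-Z0-9]*)[^>]*>/;
const newTag = /^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/;

console.log('<my-widget>'.match(oldTag)?.[1]); // "my"        (name cut at the hyphen)
console.log('<my-widget>'.match(newTag)?.[1]); // "my-widget" (full custom-element name)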
@@ -224,10 +200,9 @@ export function tokenize(html: string): Token[] {
         const isClosing = fullTag.startsWith('</');
         const isSelfClosing = fullTag.endsWith('/>');
 
-        // Parse attributes if it's an opening tag
         let attributes: Record<string, string> = {};
         if (!isClosing) {
-          const attrMatch = fullTag.match(/^<[a-zA-Z][
+          const attrMatch = fullTag.match(/^<[a-zA-Z][^\s/>]*\s+([^>]*?)\/?>$/);
           if (attrMatch && attrMatch[1]) {
             attributes = parseAttributes(attrMatch[1]);
           }
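The attribute-extraction regex gets the same relaxed name pattern (the removed line is cut off in this diff view), so attributes on hyphenated tags are now found as well; the capture is everything between the tag name and the optionally self-closing >:

const attrPart = /^<[a-zA-Z][^\s/>]*\s+([^>]*?)\/?>$/;

console.log('<my-widget data-x="1">'.match(attrPart)?.[1]); // 'data-x="1"'
console.log('<input type="text"/>'.match(attrPart)?.[1]);   // 'type="text"'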
@@ -245,17 +220,15 @@ export function tokenize(html: string): Token[] {
 
         currentPos += fullTag.length;
       } else {
-        // Not a valid tag, treat as text
         const textStart = currentPos;
         currentPos++;
 
-        // Find the end of text (next '<' or end of string)
         while (currentPos < html.length && html[currentPos] !== '<') {
           currentPos++;
         }
 
         const textContent = html.slice(textStart, currentPos);
-        if (textContent) {
+        if (textContent) {
           tokens.push({
             type: TokenType.TEXT,
             value: decodeEntities(textContent),
@@ -264,16 +237,14 @@ export function tokenize(html: string): Token[] {
         }
       }
     } else {
-      // Text content
       const textStart = currentPos;
 
-      // Find the end of text (next '<' or end of string)
       while (currentPos < html.length && html[currentPos] !== '<') {
         currentPos++;
       }
 
       const textContent = html.slice(textStart, currentPos);
-      if (textContent) {
+      if (textContent) {
         tokens.push({
           type: TokenType.TEXT,
           value: decodeEntities(textContent),
@@ -296,22 +267,15 @@ export function tokenize(html: string): Token[] {
   return tokens;
 }
 
-/**
- * Enhanced tokenizer that uses HTMLRewriter for better performance on large HTML
- * This is more efficient for well-formed HTML documents
- */
 export function tokenizeWithRewriter(html: string): Token[] {
   const tokens: Token[] = [];
   let textBuffer = '';
   let position = 0;
 
-  // First pass: collect all tokens using HTMLRewriter
   const rewriter = new HTMLRewriter();
 
-  // Handle all elements
   rewriter.on('*', {
     element(element) {
-      // Flush any accumulated text
       if (textBuffer.trim()) {
         tokens.push({
           type: TokenType.TEXT,
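For context, HTMLRewriter is Bun's built-in streaming HTML rewriter (the header comment that referenced it was removed in the first hunk). Its element callbacks fire as the transformed body is consumed. A minimal standalone illustration of the handler shape used here, independent of this package and assuming a Bun runtime:

const names: string[] = [];

const out = new HTMLRewriter()
  .on('*', { element(el) { names.push(el.tagName); } })
  .transform(new Response('<div><span></span></div>'));

await out.text(); // consuming the body drives the element callbacks
console.log(names); // ["div", "span"]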
@@ -335,9 +299,7 @@ export function tokenizeWithRewriter(html: string): Token[] {
         isSelfClosing: element.selfClosing
       });
 
-      // Handle self-closing tags
       if (!element.selfClosing) {
-        // We'll add the closing tag in the end handler
         element.onEndTag((endTag) => {
           tokens.push({
             type: TokenType.TAG_CLOSE,
@@ -396,18 +358,12 @@ export function tokenizeWithRewriter(html: string): Token[] {
   return tokens;
 }
 
-/**
- * Smart tokenizer that chooses the best method based on HTML content
- */
 export function smartTokenize(html: string): Token[] {
-  // Use HTMLRewriter for well-formed HTML, manual parsing for edge cases
   const hasSpecialContent = /<!DOCTYPE|<!--|\[CDATA\[|<\?/.test(html);
 
   if (hasSpecialContent || html.length < 1000) {
-    // Use manual parsing for small HTML or HTML with special content
     return tokenize(html);
   } else {
-    // Use HTMLRewriter for large, well-formed HTML
     return tokenizeWithRewriter(html);
   }
 }
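smartTokenize keeps the same dispatch rule, minus its comments: manual parsing for inputs under 1000 characters or containing DOCTYPE/comment/CDATA/processing-instruction syntax, HTMLRewriter for everything else. Illustrative usage (the relative import path is assumed from this diff's file layout):

import { smartTokenize } from './src/tokenizer';

smartTokenize('<p>hi</p>');                  // under 1000 chars: tokenize()
smartTokenize('<!DOCTYPE html><p>x</p>');    // special content: tokenize()
smartTokenize('<div>x</div>'.repeat(100));   // 1200 plain chars: tokenizeWithRewriter()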