@tkeron/html-parser 0.1.0

@@ -0,0 +1,413 @@
+ /**
+  * HTML tokenizer that combines a manual scanner with Bun's HTMLRewriter.
+  * The manual path handles edge cases; the HTMLRewriter path provides a
+  * stream-based approach for large, well-formed documents.
+  */
+
+ export enum TokenType {
+   TAG_OPEN = 'TAG_OPEN',
+   TAG_CLOSE = 'TAG_CLOSE',
+   TEXT = 'TEXT',
+   COMMENT = 'COMMENT',
+   CDATA = 'CDATA',
+   DOCTYPE = 'DOCTYPE',
+   PROCESSING_INSTRUCTION = 'PROCESSING_INSTRUCTION',
+   EOF = 'EOF'
+ }
+
+ export interface Position {
+   line: number;
+   column: number;
+   offset: number;
+ }
+
+ export interface Token {
+   type: TokenType;
+   value: string;
+   position: Position;
+   attributes?: Record<string, string>;
+   isSelfClosing?: boolean;
+   isClosing?: boolean;
+ }
+
+ // Subset of named HTML entities recognized by this tokenizer
+ const HTML_ENTITIES: Record<string, string> = {
+   '&amp;': '&',
+   '&lt;': '<',
+   '&gt;': '>',
+   '&quot;': '"',
+   '&apos;': "'",
+   '&nbsp;': '\u00A0',
+   '&copy;': '©',
+   '&reg;': '®',
+   '&trade;': '™',
+   '&hellip;': '…',
+   '&mdash;': '—',
+   '&ndash;': '–',
+   '&lsquo;': '\u2018',
+   '&rsquo;': '\u2019',
+   '&ldquo;': '\u201C',
+   '&rdquo;': '\u201D',
+   '&not;': '¬'
+ };
+
+ /**
+  * Decode HTML entities in a string and replace null characters with U+FFFD.
+  */
+ function decodeEntities(text: string): string {
+   // First, replace null characters with the Unicode replacement character
+   const result = text.replace(/\u0000/g, '\uFFFD');
+
+   // Then decode numeric (&#65; / &#x41;) and named (&amp;) references
+   return result.replace(/&(?:#x([0-9a-fA-F]+);?|#([0-9]+);?|([a-zA-Z][a-zA-Z0-9]*);?)/g, (match, hex, decimal, named) => {
+     if (hex) {
+       const cp = parseInt(hex, 16);
+       // fromCodePoint (not fromCharCode) so astral characters like &#x1F600;
+       // survive; out-of-range code points become U+FFFD
+       return cp <= 0x10FFFF ? String.fromCodePoint(cp) : '\uFFFD';
+     }
+     if (decimal) {
+       const cp = parseInt(decimal, 10);
+       return cp <= 0x10FFFF ? String.fromCodePoint(cp) : '\uFFFD';
+     }
+     if (named) {
+       // First try the full name with a semicolon
+       if (HTML_ENTITIES[`&${named};`]) {
+         return HTML_ENTITIES[`&${named};`];
+       }
+
+       // For references without a semicolon, find the longest valid entity prefix
+       if (!match.endsWith(';')) {
+         for (let i = named.length; i > 0; i--) {
+           const prefix = named.substring(0, i);
+           if (HTML_ENTITIES[`&${prefix};`]) {
+             const remainder = named.substring(i);
+             return HTML_ENTITIES[`&${prefix};`] + remainder;
+           }
+         }
+       }
+
+       return match;
+     }
+     return match;
+   });
+ }
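+
+ // For example (behavior traceable from the code above):
+ //   decodeEntities('fish &amp; chips')  -> 'fish & chips'
+ //   decodeEntities('&#x41;&#66;')       -> 'AB'
+ //   decodeEntities('&copyright')        -> '©right' (longest prefix '&copy;' wins)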
+
+ /**
+  * Parse attributes from a tag's attribute string.
+  * Names are lowercased; values are entity-decoded.
+  */
+ function parseAttributes(attributeString: string): Record<string, string> {
+   const attributes: Record<string, string> = {};
+
+   // Match attributes: name="value", name='value', name=value, or bare name
+   const attrRegex = /([a-zA-Z][a-zA-Z0-9\-_:]*)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;
+   let match;
+
+   while ((match = attrRegex.exec(attributeString)) !== null) {
+     const [, name, doubleQuoted, singleQuoted, unquoted] = match;
+     if (name) {
+       const value = doubleQuoted ?? singleQuoted ?? unquoted ?? '';
+       attributes[name.toLowerCase()] = decodeEntities(value);
+     }
+   }
+
+   return attributes;
+ }
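+
+ // For example:
+ //   parseAttributes('id="main" Class=foo disabled')
+ //   -> { id: 'main', class: 'foo', disabled: '' }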
+
+ /**
+  * Calculate the position of a character offset: 1-based line, 0-based column.
+  */
+ function calculatePosition(text: string, offset: number): Position {
+   const lines = text.slice(0, offset).split('\n');
+   return {
+     line: lines.length,
+     column: lines[lines.length - 1]?.length ?? 0,
+     offset
+   };
+ }
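+
+ // For example: calculatePosition('ab\ncd', 4) -> { line: 2, column: 1, offset: 4 }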
+
+ /**
+  * Tokenize HTML with a manual scanner.
+  * This path covers edge cases (DOCTYPE, comments, CDATA, processing
+  * instructions) that the HTMLRewriter-based path does not surface.
+  */
+ export function tokenize(html: string): Token[] {
+   const tokens: Token[] = [];
+
+   // Handle special cases first (DOCTYPE, comments, CDATA, processing instructions)
+   const specialCases = [
+     // DOCTYPE
+     {
+       pattern: /<!DOCTYPE\s+[^>]*>/gi,
+       type: TokenType.DOCTYPE,
+       getValue: (match: string) => {
+         // Extract just the doctype name (e.g., "html" from "<!DOCTYPE html>")
+         const doctypeMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
+         return doctypeMatch && doctypeMatch[1] ? doctypeMatch[1].toLowerCase() : match;
+       }
+     },
+     // Comments (including unclosed ones)
+     {
+       pattern: /<!--([\s\S]*?)(?:-->|$)/g,
+       type: TokenType.COMMENT,
+       getValue: (match: string) => match.slice(4, match.endsWith('-->') ? -3 : match.length)
+     },
+     // CDATA
+     {
+       pattern: /<!\[CDATA\[([\s\S]*?)\]\]>/g,
+       type: TokenType.CDATA,
+       getValue: (match: string) => match.slice(9, -3)
+     },
+     // Processing instructions; keep the leading <? but strip the trailing ?>
+     {
+       pattern: /<\?([^?]*(?:\?(?!>)[^?]*)*)\?>/g,
+       type: TokenType.PROCESSING_INSTRUCTION,
+       getValue: (match: string) => match.slice(0, -2)
+     }
+   ];
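+
+   // For example, '<!-- note -->' yields a COMMENT token with value ' note ',
+   // and '<!DOCTYPE html>' yields a DOCTYPE token with value 'html'.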
+
+   // Track processed ranges to avoid double processing
+   const processedRanges: Array<[number, number]> = [];
+
+   // Process special cases first
+   for (const { pattern, type, getValue } of specialCases) {
+     const regex = new RegExp(pattern);
+     let match;
+
+     while ((match = regex.exec(html)) !== null) {
+       const start = match.index;
+       const end = start + match[0].length;
+
+       tokens.push({
+         type,
+         value: getValue(match[0]),
+         position: calculatePosition(html, start)
+       });
+
+       processedRanges.push([start, end]);
+     }
+   }
+
+   // Sort processed ranges by start position
+   processedRanges.sort((a, b) => a[0] - b[0]);
+
+   // Process the remaining HTML with manual scanning
+   let currentPos = 0;
+
+   while (currentPos < html.length) {
+     // If the current position falls in a processed range, skip past it
+     const range = processedRanges.find(([start, end]) =>
+       currentPos >= start && currentPos < end
+     );
+     if (range) {
+       currentPos = range[1];
+       continue;
+     }
+
+     const char = html[currentPos];
+
+     if (char === '<') {
+       // Check if it's a tag (hyphen allowed so custom elements like <my-el> match)
+       const tagMatch = html.slice(currentPos).match(/^<\/?([a-zA-Z][a-zA-Z0-9-]*)[^>]*>/);
+
+       if (tagMatch) {
+         const fullTag = tagMatch[0];
+         const tagName = tagMatch[1]?.toLowerCase();
+
+         if (!tagName) {
+           currentPos++;
+           continue;
+         }
+
+         const isClosing = fullTag.startsWith('</');
+         const isSelfClosing = fullTag.endsWith('/>');
+
+         // Parse attributes if it's an opening tag
+         let attributes: Record<string, string> = {};
+         if (!isClosing) {
+           const attrMatch = fullTag.match(/^<[a-zA-Z][a-zA-Z0-9-]*\s+([^>]*?)\/?>$/);
+           if (attrMatch && attrMatch[1]) {
+             attributes = parseAttributes(attrMatch[1]);
+           }
+         }
+
+         tokens.push({
+           type: isClosing ? TokenType.TAG_CLOSE : TokenType.TAG_OPEN,
+           value: tagName,
+           position: calculatePosition(html, currentPos),
+           ...(isClosing ? { isClosing: true } : {
+             attributes,
+             isSelfClosing
+           })
+         });
+
+         currentPos += fullTag.length;
+       } else {
+         // Not a valid tag: treat the '<' as the start of text
+         const textStart = currentPos;
+         currentPos++;
+
+         // Find the end of the text (next '<' or end of string)
+         while (currentPos < html.length && html[currentPos] !== '<') {
+           currentPos++;
+         }
+
+         const textContent = html.slice(textStart, currentPos);
+         if (textContent) { // Keep all text content, including whitespace-only
+           tokens.push({
+             type: TokenType.TEXT,
+             value: decodeEntities(textContent),
+             position: calculatePosition(html, textStart)
+           });
+         }
+       }
+     } else {
+       // Text content
+       const textStart = currentPos;
+
+       // Find the end of the text (next '<' or end of string)
+       while (currentPos < html.length && html[currentPos] !== '<') {
+         currentPos++;
+       }
+
+       const textContent = html.slice(textStart, currentPos);
+       if (textContent) { // Keep all text content, including whitespace-only
+         tokens.push({
+           type: TokenType.TEXT,
+           value: decodeEntities(textContent),
+           position: calculatePosition(html, textStart)
+         });
+       }
+     }
+   }
+
+   // Sort tokens by position
+   tokens.sort((a, b) => a.position.offset - b.position.offset);
+
+   // Add EOF token
+   tokens.push({
+     type: TokenType.EOF,
+     value: '',
+     position: calculatePosition(html, html.length)
+   });
+
+   return tokens;
+ }
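+
+ // For example, tokenize('<p class="x">Hi &amp; bye</p>') produces:
+ //   TAG_OPEN  'p'  { class: 'x' }
+ //   TEXT      'Hi & bye'
+ //   TAG_CLOSE 'p'
+ //   EOF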
+
+ /**
+  * Enhanced tokenizer that uses HTMLRewriter for better performance on large,
+  * well-formed HTML documents. Requires a runtime that provides HTMLRewriter
+  * (e.g. Bun). HTMLRewriter does not expose source offsets, so token
+  * positions on this path are approximate.
+  */
+ export function tokenizeWithRewriter(html: string): Token[] {
+   const tokens: Token[] = [];
+   let textBuffer = '';
+   let position = 0; // HTMLRewriter reports no offsets, so this stays 0
+
+   // First pass: collect all tokens using HTMLRewriter
+   const rewriter = new HTMLRewriter();
+
+   // Handle all elements
+   rewriter.on('*', {
+     element(element) {
+       // Flush any accumulated text (clamp so the offset never goes negative)
+       if (textBuffer.trim()) {
+         tokens.push({
+           type: TokenType.TEXT,
+           value: decodeEntities(textBuffer),
+           position: calculatePosition(html, Math.max(0, position - textBuffer.length))
+         });
+         textBuffer = '';
+       }
+
+       // Add opening tag
+       const attributes: Record<string, string> = {};
+       for (const [name, value] of element.attributes) {
+         attributes[name] = value;
+       }
+
+       tokens.push({
+         type: TokenType.TAG_OPEN,
+         value: element.tagName.toLowerCase(),
+         position: calculatePosition(html, position),
+         attributes,
+         isSelfClosing: element.selfClosing
+       });
+
+       // Non-self-closing tags get their closing token in the end-tag handler
+       if (!element.selfClosing) {
+         element.onEndTag((endTag) => {
+           tokens.push({
+             type: TokenType.TAG_CLOSE,
+             value: endTag.name.toLowerCase(),
+             position: calculatePosition(html, position),
+             isClosing: true
+           });
+         });
+       }
+     },
+
+     text(text) {
+       textBuffer += text.text;
+     },
+
+     comments(comment) {
+       tokens.push({
+         type: TokenType.COMMENT,
+         value: comment.text,
+         position: calculatePosition(html, position)
+       });
+     }
+   });
+
+   try {
+     // Transform the HTML (this triggers the rewriter; the string body is
+     // fully buffered, so the handlers are expected to run during transform)
+     const response = new Response(html, {
+       headers: { 'Content-Type': 'text/html' }
+     });
+
+     rewriter.transform(response);
+
+     // Flush any remaining text
+     if (textBuffer.trim()) {
+       tokens.push({
+         type: TokenType.TEXT,
+         value: decodeEntities(textBuffer),
+         position: calculatePosition(html, Math.max(0, position - textBuffer.length))
+       });
+     }
+
+   } catch (error) {
+     // If HTMLRewriter fails, fall back to manual parsing
+     console.warn('HTMLRewriter failed, falling back to manual parsing:', error);
+     return tokenize(html);
+   }
+
+   // Sort tokens by position and add EOF
+   tokens.sort((a, b) => a.position.offset - b.position.offset);
+   tokens.push({
+     type: TokenType.EOF,
+     value: '',
+     position: calculatePosition(html, html.length)
+   });
+
+   return tokens;
+ }
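+
+ // Usage sketch (requires a runtime with HTMLRewriter, e.g. Bun):
+ //   const tokens = tokenizeWithRewriter('<ul><li>a</li></ul>');
+ // Positions on this path are approximate (no source offsets), so prefer
+ // tokenize() when exact line/column information matters.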
+
+ /**
+  * Smart tokenizer that chooses the best method based on the HTML content.
+  */
+ export function smartTokenize(html: string): Token[] {
+   // Use HTMLRewriter for well-formed HTML, manual parsing for edge cases
+   // (case-insensitive so lowercase '<!doctype html>' is caught too)
+   const hasSpecialContent = /<!DOCTYPE|<!--|\[CDATA\[|<\?/i.test(html);
+
+   if (hasSpecialContent || html.length < 1000) {
+     // Use manual parsing for small HTML or HTML with special content
+     return tokenize(html);
+   } else {
+     // Use HTMLRewriter for large, well-formed HTML
+     return tokenizeWithRewriter(html);
+   }
+ }
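+
+ // Usage sketch; assumes the package root re-exports this module (the import
+ // path below is illustrative, not taken from this package's docs):
+ //   import { smartTokenize, TokenType } from '@tkeron/html-parser';
+ //   const tokens = smartTokenize('<p>Hello &amp; goodbye</p>');
+ //   const texts = tokens
+ //     .filter(t => t.type === TokenType.TEXT)
+ //     .map(t => t.value); // ['Hello & goodbye']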