@tkeron/html-parser 0.1.1 → 0.1.3

This diff shows the published contents of two package versions as they appear in their public registry. It is provided for informational purposes only.
package/src/tokenizer.ts CHANGED
@@ -1,8 +1,3 @@
-/**
- * HTML Tokenizer using Bun's HTMLRewriter for efficient HTML parsing
- * This tokenizer provides a stream-based approach to HTML parsing
- */
-
 export enum TokenType {
   TAG_OPEN = 'TAG_OPEN',
   TAG_CLOSE = 'TAG_CLOSE',
@@ -29,7 +24,6 @@ export interface Token {
   isClosing?: boolean;
 }
 
-// HTML entities mapping
 const HTML_ENTITIES: Record<string, string> = {
   '&amp;': '&',
   '&lt;': '<',
@@ -54,10 +48,8 @@ const HTML_ENTITIES: Record<string, string> = {
  * Decode HTML entities in a string and handle null characters
  */
 function decodeEntities(text: string): string {
-  // First, replace null characters with the Unicode replacement character
   let result = text.replace(/\u0000/g, '\uFFFD');
 
-  // Then decode HTML entities
   return result.replace(/&(?:#x([0-9a-fA-F]+);?|#([0-9]+);?|([a-zA-Z][a-zA-Z0-9]*);?)/g, (match, hex, decimal, named) => {
     if (hex) {
       return String.fromCharCode(parseInt(hex, 16));
@@ -66,12 +58,10 @@ function decodeEntities(text: string): string {
       return String.fromCharCode(parseInt(decimal, 10));
     }
     if (named) {
-      // First try with semicolon
       if (HTML_ENTITIES[`&${named};`]) {
         return HTML_ENTITIES[`&${named};`];
       }
 
-      // For entities without semicolon, try to find the longest valid entity prefix
       if (!match.endsWith(';')) {
         for (let i = named.length; i > 0; i--) {
           const prefix = named.substring(0, i);
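The decoder handles numeric character references (decimal and hex, with or without the trailing semicolon) and named entities, and when a named entity lacks its semicolon it falls back to the longest prefix found in HTML_ENTITIES. A minimal sketch of the expected behavior (hypothetical inputs; the prefix-lookup tail of the loop is cut off by this hunk, so the last case is inferred):

decodeEntities('&#65;&#x42;');       // 'AB': decimal and hex numeric references
decodeEntities('5 &lt; 6 &amp; 4');  // '5 < 6 & 4'
decodeEntities('fish &ampchips');    // presumably 'fish &chips': longest prefix '&amp' matches
decodeEntities('a\u0000b');          // 'a\uFFFDb': null characters become U+FFFD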
@@ -94,7 +84,6 @@ function decodeEntities(text: string): string {
 function parseAttributes(attributeString: string): Record<string, string> {
   const attributes: Record<string, string> = {};
 
-  // Regex to match attributes: name="value", name='value', name=value, or just name
   const attrRegex = /([a-zA-Z][a-zA-Z0-9\-_:]*)\s*(?:=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+)))?/g;
   let match;
 
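The attribute regex accepts four shapes: double-quoted values, single-quoted values, unquoted values, and bare (valueless) names. A hedged sketch of the output (the loop that consumes the matches sits outside this hunk; bare attributes are assumed to map to an empty string):

parseAttributes(`id="main" class='box' data-x=1 hidden`);
// → { id: 'main', class: 'box', 'data-x': '1', hidden: '' }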
@@ -129,42 +118,34 @@ export function tokenize(html: string): Token[] {
   const tokens: Token[] = [];
   let position = 0;
 
-  // Handle special cases first (DOCTYPE, comments, CDATA, processing instructions)
   const specialCases = [
-    // DOCTYPE
     {
       pattern: /<!DOCTYPE\s+[^>]*>/gi,
       type: TokenType.DOCTYPE,
       getValue: (match: string) => {
-        // Extract just the doctype name (e.g., "html" from "<!DOCTYPE html>")
         const doctypeMatch = match.match(/<!DOCTYPE\s+([^\s>]+)/i);
         return doctypeMatch && doctypeMatch[1] ? doctypeMatch[1].toLowerCase() : match;
       }
     },
-    // Comments (including unclosed ones)
     {
       pattern: /<!--([\s\S]*?)(?:-->|$)/g,
       type: TokenType.COMMENT,
       getValue: (match: string) => match.slice(4, match.endsWith('-->') ? -3 : match.length)
     },
-    // CDATA
     {
       pattern: /<!\[CDATA\[([\s\S]*?)\]\]>/g,
       type: TokenType.CDATA,
       getValue: (match: string) => match.slice(9, -3)
     },
-    // Processing Instructions
     {
       pattern: /<\?([^?]*(?:\?(?!>)[^?]*)*)\?>/g,
       type: TokenType.PROCESSING_INSTRUCTION,
-      getValue: (match: string) => match.slice(0, -2) // Remove the ?> at the end
+      getValue: (match: string) => match.slice(0, -2)
     }
   ];
 
-  // Track processed ranges to avoid double processing
   const processedRanges: Array<[number, number]> = [];
 
-  // Process special cases first
   for (const { pattern, type, getValue } of specialCases) {
     const regex = new RegExp(pattern);
     let match;
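Each specialCases entry pairs a pattern with a getValue extractor, so DOCTYPEs, comments (including unclosed ones), CDATA sections, and processing instructions are lifted out before the character-by-character scan. A sketch of the resulting tokens (the exact token shape and ordering are inferred from the surrounding code):

tokenize('<!DOCTYPE html><!-- note -->');
// → [ { type: TokenType.DOCTYPE, value: 'html', ... },
//     { type: TokenType.COMMENT, value: ' note ', ... } ]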
@@ -183,20 +164,16 @@ export function tokenize(html: string): Token[] {
     }
   }
 
-  // Sort processed ranges by start position
   processedRanges.sort((a, b) => a[0] - b[0]);
 
-  // Process remaining HTML with manual parsing
   let currentPos = 0;
 
   while (currentPos < html.length) {
-    // Check if current position is in a processed range
     const inProcessedRange = processedRanges.some(([start, end]) =>
       currentPos >= start && currentPos < end
     );
 
     if (inProcessedRange) {
-      // Skip to end of processed range
       const range = processedRanges.find(([start, end]) =>
         currentPos >= start && currentPos < end
       );
@@ -209,8 +186,7 @@ export function tokenize(html: string): Token[] {
     const char = html[currentPos];
 
     if (char === '<') {
-      // Check if it's a tag
-      const tagMatch = html.slice(currentPos).match(/^<\/?([a-zA-Z][a-zA-Z0-9]*)[^>]*>/);
+      const tagMatch = html.slice(currentPos).match(/^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/);
 
       if (tagMatch) {
         const fullTag = tagMatch[0];
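The widened name pattern is the substantive change in this hunk: the old [a-zA-Z][a-zA-Z0-9]* stopped at the first hyphen or colon, while [a-zA-Z][^\s/>]* runs to the first whitespace, slash, or >. Comparing the two capture groups directly:

const oldRe = /^<\/?([a-zA-Z][a-zA-Z0-9]*)[^>]*>/;
const newRe = /^<\/?([a-zA-Z][^\s/>]*)([^>]*)>/;

oldRe.exec('<my-element>')?.[1]; // 'my': the hyphen cut the tag name short
newRe.exec('<my-element>')?.[1]; // 'my-element': custom-element names survive intact
newRe.exec('<svg:rect/>')?.[1];  // 'svg:rect': namespaced names do too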
@@ -224,10 +200,9 @@ export function tokenize(html: string): Token[] {
         const isClosing = fullTag.startsWith('</');
         const isSelfClosing = fullTag.endsWith('/>');
 
-        // Parse attributes if it's an opening tag
         let attributes: Record<string, string> = {};
         if (!isClosing) {
-          const attrMatch = fullTag.match(/^<[a-zA-Z][a-zA-Z0-9]*\s+([^>]*?)\/?>$/);
+          const attrMatch = fullTag.match(/^<[a-zA-Z][^\s/>]*\s+([^>]*?)\/?>$/);
           if (attrMatch && attrMatch[1]) {
             attributes = parseAttributes(attrMatch[1]);
           }
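The same widening is applied here to the attribute-extraction regex, so attributes on hyphenated tag names are no longer dropped:

'<my-widget value="1">'.match(/^<[a-zA-Z][^\s/>]*\s+([^>]*?)\/?>$/)?.[1];
// 'value="1"'; the previous [a-zA-Z0-9]* name pattern failed at the hyphen and returned null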
@@ -245,17 +220,15 @@ export function tokenize(html: string): Token[] {
 
         currentPos += fullTag.length;
       } else {
-        // Not a valid tag, treat as text
         const textStart = currentPos;
         currentPos++;
 
-        // Find the end of text (next '<' or end of string)
         while (currentPos < html.length && html[currentPos] !== '<') {
           currentPos++;
         }
 
        const textContent = html.slice(textStart, currentPos);
-        if (textContent) { // Keep all text content, including whitespace-only
+        if (textContent) {
          tokens.push({
            type: TokenType.TEXT,
            value: decodeEntities(textContent),
@@ -264,16 +237,14 @@ export function tokenize(html: string): Token[] {
         }
       }
     } else {
-      // Text content
       const textStart = currentPos;
 
-      // Find the end of text (next '<' or end of string)
       while (currentPos < html.length && html[currentPos] !== '<') {
         currentPos++;
       }
 
       const textContent = html.slice(textStart, currentPos);
-      if (textContent) { // Keep all text content, including whitespace-only
+      if (textContent) {
         tokens.push({
           type: TokenType.TEXT,
           value: decodeEntities(textContent),
@@ -296,22 +267,15 @@ export function tokenize(html: string): Token[] {
   return tokens;
 }
 
-/**
- * Enhanced tokenizer that uses HTMLRewriter for better performance on large HTML
- * This is more efficient for well-formed HTML documents
- */
 export function tokenizeWithRewriter(html: string): Token[] {
   const tokens: Token[] = [];
   let textBuffer = '';
   let position = 0;
 
-  // First pass: collect all tokens using HTMLRewriter
   const rewriter = new HTMLRewriter();
 
-  // Handle all elements
   rewriter.on('*', {
     element(element) {
-      // Flush any accumulated text
       if (textBuffer.trim()) {
         tokens.push({
           type: TokenType.TEXT,
@@ -335,9 +299,7 @@ export function tokenizeWithRewriter(html: string): Token[] {
           isSelfClosing: element.selfClosing
         });
 
-        // Handle self-closing tags
         if (!element.selfClosing) {
-          // We'll add the closing tag in the end handler
           element.onEndTag((endTag) => {
             tokens.push({
               type: TokenType.TAG_CLOSE,
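This hunk ends before the code that actually feeds html through the rewriter. A minimal sketch of the usual Bun HTMLRewriter pattern (not part of this diff): handlers fire as the response body is drained through transform.

const sketch = new HTMLRewriter().on('*', {
  element(el) { /* push TAG_OPEN, register el.onEndTag(...), etc. */ }
});
await sketch.transform(new Response('<p>hi</p>')).text(); // drain so the handlers run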
@@ -396,18 +358,12 @@ export function tokenizeWithRewriter(html: string): Token[] {
   return tokens;
 }
 
-/**
- * Smart tokenizer that chooses the best method based on HTML content
- */
 export function smartTokenize(html: string): Token[] {
-  // Use HTMLRewriter for well-formed HTML, manual parsing for edge cases
   const hasSpecialContent = /<!DOCTYPE|<!--|\[CDATA\[|<\?/.test(html);
 
   if (hasSpecialContent || html.length < 1000) {
-    // Use manual parsing for small HTML or HTML with special content
     return tokenize(html);
   } else {
-    // Use HTMLRewriter for large, well-formed HTML
     return tokenizeWithRewriter(html);
   }
 }
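smartTokenize therefore routes anything containing a DOCTYPE, comment, CDATA section, or processing instruction, or anything shorter than 1000 characters, through the manual tokenizer, and sends the rest through HTMLRewriter:

smartTokenize('<!DOCTYPE html><p>hi</p>'); // special content, so tokenize()
smartTokenize('<div>x</div>'.repeat(100)); // 1200 plain chars, so tokenizeWithRewriter()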