@tkeron/html-parser 0.1.4 → 0.1.7

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/src/tokenizer.ts CHANGED
@@ -44,9 +44,6 @@ const HTML_ENTITIES: Record<string, string> = {
44
44
  '&not;': '¬'
45
45
  };
46
46
 
47
- /**
48
- * Decode HTML entities in a string and handle null characters
49
- */
50
47
  function decodeEntities(text: string): string {
51
48
  let result = text.replace(/\u0000/g, '\uFFFD');
52
49
 
@@ -78,9 +75,6 @@ function decodeEntities(text: string): string {
78
75
  });
79
76
  }
80
77
 
81
- /**
82
- * Parse attributes from a tag string
83
- */
84
78
  function parseAttributes(attributeString: string): Record<string, string> {
85
79
  const attributes: Record<string, string> = {};
86
80
 
@@ -98,9 +92,6 @@ function parseAttributes(attributeString: string): Record<string, string> {
98
92
  return attributes;
99
93
  }
100
94
 
101
- /**
102
- * Calculate position in text
103
- */
104
95
  function calculatePosition(text: string, offset: number): Position {
105
96
  const lines = text.slice(0, offset).split('\n');
106
97
  return {
@@ -110,10 +101,6 @@ function calculatePosition(text: string, offset: number): Position {
110
101
  };
111
102
  }
112
103
 
113
- /**
114
- * Tokenize HTML using a combination of HTMLRewriter and manual parsing
115
- * HTMLRewriter is great for structured HTML but we need manual parsing for edge cases
116
- */
117
104
  export function tokenize(html: string): Token[] {
118
105
  const tokens: Token[] = [];
119
106
  let position = 0;
@@ -254,10 +241,8 @@ export function tokenize(html: string): Token[] {
254
241
  }
255
242
  }
256
243
 
257
- // Sort tokens by position
258
244
  tokens.sort((a, b) => a.position.offset - b.position.offset);
259
245
 
260
- // Add EOF token
261
246
  tokens.push({
262
247
  type: TokenType.EOF,
263
248
  value: '',
@@ -266,104 +251,3 @@ export function tokenize(html: string): Token[] {
266
251
 
267
252
  return tokens;
268
253
  }
269
-
270
- export function tokenizeWithRewriter(html: string): Token[] {
271
- const tokens: Token[] = [];
272
- let textBuffer = '';
273
- let position = 0;
274
-
275
- const rewriter = new HTMLRewriter();
276
-
277
- rewriter.on('*', {
278
- element(element) {
279
- if (textBuffer.trim()) {
280
- tokens.push({
281
- type: TokenType.TEXT,
282
- value: decodeEntities(textBuffer),
283
- position: calculatePosition(html, position - textBuffer.length)
284
- });
285
- textBuffer = '';
286
- }
287
-
288
- // Add opening tag
289
- const attributes: Record<string, string> = {};
290
- for (const [name, value] of element.attributes) {
291
- attributes[name] = value;
292
- }
293
-
294
- tokens.push({
295
- type: TokenType.TAG_OPEN,
296
- value: element.tagName.toLowerCase(),
297
- position: calculatePosition(html, position),
298
- attributes,
299
- isSelfClosing: element.selfClosing
300
- });
301
-
302
- if (!element.selfClosing) {
303
- element.onEndTag((endTag) => {
304
- tokens.push({
305
- type: TokenType.TAG_CLOSE,
306
- value: endTag.name.toLowerCase(),
307
- position: calculatePosition(html, position),
308
- isClosing: true
309
- });
310
- });
311
- }
312
- },
313
-
314
- text(text) {
315
- textBuffer += text.text;
316
- },
317
-
318
- comments(comment) {
319
- tokens.push({
320
- type: TokenType.COMMENT,
321
- value: comment.text,
322
- position: calculatePosition(html, position)
323
- });
324
- }
325
- });
326
-
327
- try {
328
- // Transform the HTML (this triggers the rewriter)
329
- const response = new Response(html, {
330
- headers: { 'Content-Type': 'text/html' }
331
- });
332
-
333
- rewriter.transform(response);
334
-
335
- // Flush any remaining text
336
- if (textBuffer.trim()) {
337
- tokens.push({
338
- type: TokenType.TEXT,
339
- value: decodeEntities(textBuffer),
340
- position: calculatePosition(html, position - textBuffer.length)
341
- });
342
- }
343
-
344
- } catch (error) {
345
- // If HTMLRewriter fails, fall back to manual parsing
346
- console.warn('HTMLRewriter failed, falling back to manual parsing:', error);
347
- return tokenize(html);
348
- }
349
-
350
- // Sort tokens by position and add EOF
351
- tokens.sort((a, b) => a.position.offset - b.position.offset);
352
- tokens.push({
353
- type: TokenType.EOF,
354
- value: '',
355
- position: calculatePosition(html, html.length)
356
- });
357
-
358
- return tokens;
359
- }
360
-
361
- export function smartTokenize(html: string): Token[] {
362
- const hasSpecialContent = /<!DOCTYPE|<!--|\[CDATA\[|<\?/.test(html);
363
-
364
- if (hasSpecialContent || html.length < 1000) {
365
- return tokenize(html);
366
- } else {
367
- return tokenizeWithRewriter(html);
368
- }
369
- }
@@ -28,14 +28,14 @@ describe('HTML Parser & Tokenizer - Advanced Tests', () => {
28
28
  });
29
29
 
30
30
  test('should handle unicode characters', () => {
31
- const tokens = tokenize('<div title="测试" data-emoji="🚀" class="café">');
31
+ const tokens = tokenize('<div title="测试" data-emoji="🚀" class="lorem">');
32
32
  expect(tokens.length).toBeGreaterThan(0);
33
33
  const tag = tokens[0]!;
34
34
 
35
35
  expect(tag.attributes).toEqual({
36
36
  title: '测试',
37
37
  'data-emoji': '🚀',
38
- class: 'café'
38
+ class: 'lorem'
39
39
  });
40
40
  });
41
41
 
@@ -11,17 +11,17 @@ describe("cloneNode functionality", () => {
11
11
 
12
12
  const cloned = original.cloneNode(true);
13
13
 
14
- // Verificar que el nodo clonado existe
14
+
15
15
  expect(cloned).toBeTruthy();
16
16
  expect(cloned.nodeName).toBe("DIV");
17
17
 
18
- // Verificar que los atributos se copian
18
+
19
19
  expect(cloned.getAttribute("id")).toBe("original");
20
20
 
21
- // Verificar que el contenido de texto se copia
21
+
22
22
  expect(cloned.textContent).toBe("Hello World");
23
23
 
24
- // Verificar que tiene los hijos correctos
24
+
25
25
  expect(cloned.childNodes.length).toBeGreaterThan(0);
26
26
  });
27
27
 
@@ -38,20 +38,20 @@ describe("cloneNode functionality", () => {
38
38
 
39
39
  const cloned = parent.cloneNode(true);
40
40
 
41
- // Verificar estructura básica
41
+
42
42
  expect(cloned.nodeName).toBe("DIV");
43
43
  expect(cloned.getAttribute("id")).toBe("parent");
44
44
 
45
- // Verificar que los hijos se clonaron
45
+
46
46
  expect(cloned.childNodes.length).toBeGreaterThan(0);
47
47
 
48
- // Verificar que el hijo div está presente
48
+
49
49
  const childDiv = cloned.querySelector(".child");
50
50
  expect(childDiv).toBeTruthy();
51
51
  expect(childDiv?.nodeName).toBe("DIV");
52
52
  expect(childDiv?.getAttribute("class")).toBe("child");
53
53
 
54
- // Verificar el nieto span
54
+
55
55
  const span = cloned.querySelector("span");
56
56
  expect(span).toBeTruthy();
57
57
  expect(span?.textContent).toBe("Nested Text");
@@ -70,7 +70,7 @@ describe("cloneNode functionality", () => {
70
70
 
71
71
  const cloned = list.cloneNode(true);
72
72
 
73
- // Verificar que todos los items se clonaron
73
+
74
74
  const items = cloned.querySelectorAll("li");
75
75
  expect(items.length).toBe(3);
76
76
  expect(items[0]?.textContent).toBe("Item 1");
@@ -89,18 +89,18 @@ describe("cloneNode functionality", () => {
89
89
  const doc = parseHTML(html);
90
90
  const container = doc.querySelector("#container")!;
91
91
 
92
- // Guardar innerHTML original
92
+
93
93
  const originalInnerHTML = container.innerHTML;
94
94
  expect(originalInnerHTML).toBeTruthy();
95
95
  expect(originalInnerHTML.length).toBeGreaterThan(0);
96
96
 
97
97
  const cloned = container.cloneNode(true);
98
98
 
99
- // Verificar que innerHTML del clon no esté vacío
99
+
100
100
  expect(cloned.innerHTML).toBeTruthy();
101
101
  expect(cloned.innerHTML.length).toBeGreaterThan(0);
102
102
 
103
- // Verificar que el contenido es similar
103
+
104
104
  expect(cloned.innerHTML).toContain("<h1>Title</h1>");
105
105
  expect(cloned.innerHTML).toContain("<p>Paragraph 1</p>");
106
106
  expect(cloned.innerHTML).toContain("<p>Paragraph 2</p>");
@@ -111,19 +111,19 @@ describe("cloneNode functionality", () => {
111
111
  const doc = parseHTML(html);
112
112
  const mixed = doc.querySelector("#mixed")!;
113
113
 
114
- // Verificar estructura original
114
+
115
115
  const originalChildCount = mixed.childNodes.length;
116
116
  expect(originalChildCount).toBeGreaterThan(0);
117
117
 
118
118
  const cloned = mixed.cloneNode(true);
119
119
 
120
- // Verificar que tiene la misma cantidad de hijos
120
+
121
121
  expect(cloned.childNodes.length).toBe(originalChildCount);
122
122
 
123
- // Verificar contenido completo
123
+
124
124
  expect(cloned.textContent).toBe("Text beforebold textText after");
125
125
 
126
- // Verificar que el elemento strong existe
126
+
127
127
  const strong = cloned.querySelector("strong");
128
128
  expect(strong).toBeTruthy();
129
129
  expect(strong?.textContent).toBe("bold text");
@@ -136,7 +136,7 @@ describe("cloneNode functionality", () => {
136
136
 
137
137
  const cloned = element.cloneNode(true);
138
138
 
139
- // Verificar todos los atributos
139
+
140
140
  expect(cloned.getAttribute("id")).toBe("attrs");
141
141
  expect(cloned.getAttribute("class")).toBe("test");
142
142
  expect(cloned.getAttribute("data-value")).toBe("123");
@@ -161,11 +161,11 @@ describe("cloneNode functionality", () => {
161
161
 
162
162
  const cloned = article.cloneNode(true);
163
163
 
164
- // Verificar estructura
164
+
165
165
  expect(cloned.nodeName).toBe("ARTICLE");
166
166
  expect(cloned.getAttribute("id")).toBe("article");
167
167
 
168
- // Verificar elementos
168
+
169
169
  expect(cloned.querySelector("h2")?.textContent).toBe("Article Title");
170
170
 
171
171
  const paragraphs = cloned.querySelectorAll("p");
@@ -173,12 +173,12 @@ describe("cloneNode functionality", () => {
173
173
  expect(paragraphs[0]?.textContent).toBe("First paragraph");
174
174
  expect(paragraphs[1]?.textContent).toBe("Last paragraph");
175
175
 
176
- // Verificar div anidado
176
+
177
177
  const highlight = cloned.querySelector(".highlight");
178
178
  expect(highlight).toBeTruthy();
179
179
  expect(highlight?.querySelector("span")?.textContent).toBe("Highlighted");
180
180
 
181
- // Verificar que se copió el comentario
181
+
182
182
  const hasComment = Array.from(cloned.childNodes).some(
183
183
  (node: any) => node.nodeType === NodeType.COMMENT_NODE
184
184
  );
@@ -203,7 +203,7 @@ describe("cloneNode functionality", () => {
203
203
 
204
204
  const cloned = section.cloneNode(true);
205
205
 
206
- // Verificar outerHTML
206
+
207
207
  expect(cloned.outerHTML).toBeTruthy();
208
208
  expect(cloned.outerHTML).toContain("section");
209
209
  expect(cloned.outerHTML).toContain("class=\"main\"");
@@ -220,7 +220,7 @@ describe("cloneNode functionality", () => {
220
220
 
221
221
  const cloned = parent.cloneNode(false);
222
222
 
223
- // Debe copiar el elemento pero no los hijos
223
+
224
224
  expect(cloned.nodeName).toBe("DIV");
225
225
  expect(cloned.getAttribute("id")).toBe("parent");
226
226
  expect(cloned.childNodes.length).toBe(0);
@@ -234,12 +234,12 @@ describe("cloneNode functionality", () => {
234
234
 
235
235
  const cloned = element.cloneNode(false);
236
236
 
237
- // Atributos deben copiarse
237
+
238
238
  expect(cloned.getAttribute("id")).toBe("test");
239
239
  expect(cloned.getAttribute("class")).toBe("container");
240
240
  expect(cloned.getAttribute("data-value")).toBe("123");
241
241
 
242
- // Hijos NO deben copiarse
242
+
243
243
  expect(cloned.childNodes.length).toBe(0);
244
244
  expect(cloned.innerHTML).toBe("");
245
245
  });
@@ -253,7 +253,7 @@ describe("cloneNode functionality", () => {
253
253
 
254
254
  const cloned = original.cloneNode(true);
255
255
 
256
- // Modificar el clon no debe afectar el original
256
+
257
257
  cloned.setAttribute("id", "cloned");
258
258
  cloned.setAttribute("data-modified", "true");
259
259
 
@@ -273,10 +273,10 @@ describe("cloneNode functionality", () => {
273
273
 
274
274
  expect(clonedChild).toBeTruthy();
275
275
 
276
- // Modificar hijo del clon
276
+
277
277
  clonedChild?.setAttribute("data-cloned", "yes");
278
278
 
279
- // El hijo original no debe verse afectado
279
+
280
280
  const originalChild = parent.querySelector("#child");
281
281
  expect(originalChild?.hasAttribute("data-cloned")).toBe(false);
282
282
  });
@@ -324,7 +324,7 @@ describe("cloneNode functionality", () => {
324
324
 
325
325
  const cloned = level1.cloneNode(true);
326
326
 
327
- // Verificar todos los niveles
327
+
328
328
  expect(cloned.querySelector("#level2")).toBeTruthy();
329
329
  expect(cloned.querySelector("#level3")).toBeTruthy();
330
330
  expect(cloned.querySelector("#level4")).toBeTruthy();
@@ -340,22 +340,22 @@ describe("cloneNode functionality", () => {
340
340
  const doc = parseHTML(html);
341
341
  const container = doc.querySelector("#container")!;
342
342
 
343
- // Acceder a innerHTML para asegurar que _internalInnerHTML esté establecido
343
+
344
344
  const originalInnerHTML = container.innerHTML;
345
345
  expect(originalInnerHTML).toBeTruthy();
346
346
 
347
347
  const cloned = container.cloneNode(true);
348
348
 
349
- // Verificar que innerHTML funciona en el clon
349
+
350
350
  const clonedInnerHTML = cloned.innerHTML;
351
351
  expect(clonedInnerHTML).toBeTruthy();
352
352
  expect(clonedInnerHTML.length).toBeGreaterThan(0);
353
353
 
354
- // Verificar que contiene el mismo contenido
354
+
355
355
  expect(clonedInnerHTML).toContain("<p>Paragraph 1</p>");
356
356
  expect(clonedInnerHTML).toContain("<p>Paragraph 2</p>");
357
357
 
358
- // Verificar que el accessor de innerHTML funciona correctamente
358
+
359
359
  expect(typeof cloned.innerHTML).toBe("string");
360
360
  });
361
361
 
@@ -369,10 +369,10 @@ describe("cloneNode functionality", () => {
369
369
 
370
370
  const cloned = parent.cloneNode(true);
371
371
 
372
- // Verificar que childNodes tiene la misma estructura
372
+
373
373
  expect(cloned.childNodes.length).toBe(originalChildCount);
374
374
 
375
- // Verificar que podemos acceder a cada hijo
375
+
376
376
  for (let i = 0; i < cloned.childNodes.length; i++) {
377
377
  expect(cloned.childNodes[i]).toBeTruthy();
378
378
  expect(cloned.childNodes[i].nodeType).toBeDefined();
@@ -386,12 +386,12 @@ describe("cloneNode functionality", () => {
386
386
 
387
387
  const cloned = container.cloneNode(true);
388
388
 
389
- // Verificar que el array children está correctamente poblado
389
+
390
390
  expect(cloned.children).toBeTruthy();
391
391
  expect(Array.isArray(cloned.children)).toBe(true);
392
392
  expect(cloned.children.length).toBe(3);
393
393
 
394
- // Verificar que todos son elementos
394
+
395
395
  for (const child of cloned.children) {
396
396
  expect(child.nodeType).toBe(NodeType.ELEMENT_NODE);
397
397
  }
@@ -404,12 +404,12 @@ describe("cloneNode functionality", () => {
404
404
 
405
405
  const cloned = list.cloneNode(true);
406
406
 
407
- // Verificar referencias firstChild y lastChild
407
+
408
408
  expect(cloned.firstChild).toBeTruthy();
409
409
  expect(cloned.lastChild).toBeTruthy();
410
410
 
411
- // En DOM real, firstChild puede ser un nodo de texto (whitespace)
412
- // pero debemos asegurar que existen
411
+
412
+
413
413
  expect(cloned.firstElementChild).toBeTruthy();
414
414
  expect(cloned.lastElementChild).toBeTruthy();
415
415
 
@@ -427,12 +427,12 @@ describe("cloneNode functionality", () => {
427
427
  const doc = parseHTML(html);
428
428
  const dynamic = doc.querySelector("#dynamic")!;
429
429
 
430
- // Modificar innerHTML antes de clonar
430
+
431
431
  dynamic.innerHTML = "<p>Dynamic content</p><span>More content</span>";
432
432
 
433
433
  const cloned = dynamic.cloneNode(true);
434
434
 
435
- // Verificar que el contenido modificado se clonó
435
+
436
436
  expect(cloned.querySelector("p")).toBeTruthy();
437
437
  expect(cloned.querySelector("p")?.textContent).toBe("Dynamic content");
438
438
  expect(cloned.querySelector("span")).toBeTruthy();
@@ -446,17 +446,17 @@ describe("cloneNode functionality", () => {
446
446
 
447
447
  const cloned = original.cloneNode(true);
448
448
 
449
- // Verificar contenido clonado
449
+
450
450
  expect(cloned.querySelector("p")?.textContent).toBe("Original");
451
451
 
452
- // Modificar innerHTML del clon
452
+
453
453
  cloned.innerHTML = "<span>Modified</span>";
454
454
 
455
- // Original no debe cambiar
455
+
456
456
  expect(original.querySelector("p")?.textContent).toBe("Original");
457
457
  expect(original.querySelector("span")).toBeNull();
458
458
 
459
- // Clon debe tener el nuevo contenido
459
+
460
460
  expect(cloned.querySelector("span")?.textContent).toBe("Modified");
461
461
  expect(cloned.querySelector("p")).toBeNull();
462
462
  });
@@ -488,13 +488,13 @@ describe("cloneNode functionality", () => {
488
488
 
489
489
  const cloned = card.cloneNode(true);
490
490
 
491
- // Verificar estructura completa
491
+
492
492
  expect(cloned.getAttribute("data-id")).toBe("123");
493
493
  expect(cloned.querySelector(".card-header")).toBeTruthy();
494
494
  expect(cloned.querySelector(".card-body")).toBeTruthy();
495
495
  expect(cloned.querySelector(".card-footer")).toBeTruthy();
496
496
 
497
- // Verificar contenido específico
497
+
498
498
  expect(cloned.querySelector(".card-title")?.textContent).toBe("Card Title");
499
499
  expect(cloned.querySelector("strong")?.textContent).toBe("bold");
500
500
 
@@ -502,7 +502,7 @@ describe("cloneNode functionality", () => {
502
502
  expect(items.length).toBe(2);
503
503
 
504
504
  const buttons = cloned.querySelectorAll("button");
505
- expect(buttons.length).toBe(3); // close, save, cancel
505
+ expect(buttons.length).toBe(3);
506
506
  });
507
507
 
508
508
  it("should clone a form with various input types", () => {
@@ -522,7 +522,7 @@ describe("cloneNode functionality", () => {
522
522
 
523
523
  const cloned = form.cloneNode(true);
524
524
 
525
- // Verificar que todos los inputs se clonaron
525
+
526
526
  const textInput = cloned.querySelector('[name="username"]');
527
527
  expect(textInput).toBeTruthy();
528
528
  expect(textInput?.getAttribute("value")).toBe("john");
@@ -210,7 +210,7 @@ describe('Custom Elements Support', () => {
210
210
  const ast = parse(tokens);
211
211
 
212
212
  const element = ast.children![0]!;
213
- // nodeName should also be uppercase
213
+
214
214
  if (element.nodeName) {
215
215
  expect(element.nodeName.toUpperCase()).toBe('MY-COMP');
216
216
  }
@@ -382,11 +382,11 @@ describe('Custom Elements Support', () => {
382
382
  const tokens = tokenize(html);
383
383
  const ast = parse(tokens);
384
384
 
385
- // Find first element (skip whitespace text nodes)
385
+
386
386
  const userProfile = ast.children!.find(node => node.type === ASTNodeType.ELEMENT)!;
387
387
  expect(userProfile.tagName).toBe('user-profile');
388
388
 
389
- // Should have proper nesting
389
+
390
390
  expect(userProfile.children).toBeDefined();
391
391
  expect(userProfile.children!.length).toBeGreaterThan(0);
392
392
  });
@@ -412,7 +412,7 @@ describe('Custom Elements Support', () => {
412
412
  const tokens = tokenize(html);
413
413
  const ast = parse(tokens);
414
414
 
415
- // Find first element (skip whitespace text nodes)
415
+
416
416
  const appRoot = ast.children!.find(node => node.type === ASTNodeType.ELEMENT)!;
417
417
  expect(appRoot.tagName).toBe('app-root');
418
418
  });
@@ -471,12 +471,12 @@ describe('Custom Elements Support', () => {
471
471
  test('tokenizer should capture full custom element name', () => {
472
472
  const tokens = tokenize('<my-component-123></my-component-123>');
473
473
 
474
- // Find the opening tag token
474
+
475
475
  const openTag = tokens.find(t => t.type === 'TAG_OPEN');
476
476
  expect(openTag).toBeDefined();
477
477
  expect(openTag!.value).toBe('my-component-123');
478
478
 
479
- // Find the closing tag token
479
+
480
480
  const closeTag = tokens.find(t => t.type === 'TAG_CLOSE');
481
481
  expect(closeTag).toBeDefined();
482
482
  expect(closeTag!.value).toBe('my-component-123');
@@ -642,7 +642,7 @@ describe('Custom Elements Support', () => {
642
642
  const tokens = tokenize(html);
643
643
  const ast = parse(tokens);
644
644
 
645
- // Should have comment, element, comment
645
+
646
646
  const myComp = ast.children!.find(node => node.type === ASTNodeType.ELEMENT)!;
647
647
  expect(myComp.tagName).toBe('my-comp');
648
648
  });
@@ -700,7 +700,7 @@ describe('Custom Elements Support', () => {
700
700
  const tokens = tokenize('<table><tr><td><my-cell>content</my-cell></td></tr></table>');
701
701
  const ast = parse(tokens);
702
702
 
703
- // Find the custom element
703
+
704
704
  const table = ast.children![0]!;
705
705
  expect(table.tagName).toBe('table');
706
706
  });