@tkeron/html-parser 0.1.5 → 0.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,7 +11,7 @@ A fast and lightweight HTML parser for Bun that converts HTML strings into DOM D
11
11
  - 🪶 **Lightweight**: Minimal dependencies, native implementation
12
12
  - 🌐 **Standards Compliant**: Returns standard DOM Document objects
13
13
  - 🔧 **TypeScript Support**: Full TypeScript definitions included
14
- - ✅ **Well Tested**: Comprehensive unit test suite (181/181 passing)
14
+ - ✅ **Well Tested**: Comprehensive unit test suite (569 tests passing)
15
15
  - 🔄 **100% Compatible**: Drop-in replacement, same API
16
16
 
17
17
  ## Installation
package/bun.lock CHANGED
@@ -5,7 +5,7 @@
5
5
  "": {
6
6
  "name": "@tkeron/html-parser",
7
7
  "devDependencies": {
8
- "@types/bun": "^1.3.4",
8
+ "@types/bun": "^1.3.6",
9
9
  },
10
10
  "peerDependencies": {
11
11
  "typescript": "^5.9.3",
@@ -13,11 +13,11 @@
13
13
  },
14
14
  },
15
15
  "packages": {
16
- "@types/bun": ["@types/bun@1.3.4", "", { "dependencies": { "bun-types": "1.3.4" } }, "sha512-EEPTKXHP+zKGPkhRLv+HI0UEX8/o+65hqARxLy8Ov5rIxMBPNTjeZww00CIihrIQGEQBYg+0roO5qOnS/7boGA=="],
16
+ "@types/bun": ["@types/bun@1.3.6", "", { "dependencies": { "bun-types": "1.3.6" } }, "sha512-uWCv6FO/8LcpREhenN1d1b6fcspAB+cefwD7uti8C8VffIv0Um08TKMn98FynpTiU38+y2dUO55T11NgDt8VAA=="],
17
17
 
18
18
  "@types/node": ["@types/node@25.0.3", "", { "dependencies": { "undici-types": "~7.16.0" } }, "sha512-W609buLVRVmeW693xKfzHeIV6nJGGz98uCPfeXI1ELMLXVeKYZ9m15fAMSaUPBHYLGFsVRcMmSCksQOrZV9BYA=="],
19
19
 
20
- "bun-types": ["bun-types@1.3.4", "", { "dependencies": { "@types/node": "*" } }, "sha512-5ua817+BZPZOlNaRgGBpZJOSAQ9RQ17pkwPD0yR7CfJg+r8DgIILByFifDTa+IPDDxzf5VNhtNlcKqFzDgJvlQ=="],
20
+ "bun-types": ["bun-types@1.3.6", "", { "dependencies": { "@types/node": "*" } }, "sha512-OlFwHcnNV99r//9v5IIOgQ9Uk37gZqrNMCcqEaExdkVq3Avwqok1bJFmvGMCkCE0FqzdY8VMOZpfpR3lwI+CsQ=="],
21
21
 
22
22
  "typescript": ["typescript@5.9.3", "", { "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" } }, "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw=="],
23
23
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@tkeron/html-parser",
3
- "version": "0.1.5",
3
+ "version": "0.1.7",
4
4
  "description": "A fast and lightweight HTML parser for Bun",
5
5
  "main": "index.js",
6
6
  "module": "index.ts",
@@ -8,16 +8,17 @@
8
8
  "author": "tkeron",
9
9
  "license": "MIT",
10
10
  "devDependencies": {
11
- "@types/bun": "^1.3.4"
11
+ "@types/bun": "^1.3.6"
12
12
  },
13
13
  "peerDependencies": {
14
14
  "typescript": "^5.9.3"
15
15
  },
16
16
  "keywords": [
17
- "cli",
18
- "commands",
19
- "command-line",
20
- "arguments"
17
+ "html",
18
+ "parser",
19
+ "dom",
20
+ "bun",
21
+ "tokenizer"
21
22
  ],
22
23
  "repository": {
23
24
  "url": "git@github.com:tkeron/html-parser.git"
@@ -14,33 +14,47 @@ function parseSelector(selector: string): SelectorGroup[] {
14
14
 
15
15
  return parts.map((part) => {
16
16
  const trimmed = part.trim();
17
- let tokens: SelectorToken[];
18
-
19
- if (trimmed.startsWith("#")) {
20
- tokens = [{ type: "id", value: trimmed.slice(1) }];
21
- } else if (trimmed.startsWith(".")) {
22
- tokens = [{ type: "class", value: trimmed.slice(1) }];
23
- } else if (trimmed.includes("[") && trimmed.includes("]")) {
24
- const attributeMatch = trimmed.match(/^([^[\]]*)\[([^=\]]+)(?:=["']?([^"'\]]*?)["']?)?\]$/);
25
- if (attributeMatch) {
26
- const [, tagName, attrName, attrValue] = attributeMatch;
27
- tokens = [];
28
-
29
- if (tagName && tagName.trim()) {
30
- tokens.push({ type: "tag", value: tagName.trim().toLowerCase() });
31
- }
32
-
33
- tokens.push({
34
- type: "attribute",
35
- value: (attrName || "").trim(),
36
- attributeName: (attrName || "").trim(),
37
- attributeValue: attrValue ? attrValue.trim() : undefined
38
- });
39
- } else {
40
- tokens = [{ type: "tag", value: trimmed.toLowerCase() }];
41
- }
42
- } else {
43
- tokens = [{ type: "tag", value: trimmed.toLowerCase() }];
17
+ let tokens: SelectorToken[] = [];
18
+
19
+ // Handle universal selector
20
+ if (trimmed === '*') {
21
+ // Match any element - we'll handle this specially
22
+ return { tokens: [] };
23
+ }
24
+
25
+ // Parse complex selectors like p#intro.first or .foo.bar.baz
26
+ let remaining = trimmed;
27
+
28
+ // Extract tag name first if present
29
+ const tagMatch = remaining.match(/^([a-zA-Z][a-zA-Z0-9]*)/);
30
+ if (tagMatch) {
31
+ tokens.push({ type: "tag", value: tagMatch[1].toLowerCase() });
32
+ remaining = remaining.slice(tagMatch[1].length);
33
+ }
34
+
35
+ // Extract all IDs (HTML5 allows IDs starting with digits)
36
+ const idMatches = remaining.matchAll(/#([a-zA-Z0-9][a-zA-Z0-9_-]*)/g);
37
+ for (const match of idMatches) {
38
+ tokens.push({ type: "id", value: match[1] });
39
+ }
40
+ remaining = remaining.replace(/#[a-zA-Z0-9][a-zA-Z0-9_-]*/g, '');
41
+
42
+ // Extract all classes
43
+ const classMatches = remaining.matchAll(/\.([a-zA-Z][a-zA-Z0-9_-]*)/g);
44
+ for (const match of classMatches) {
45
+ tokens.push({ type: "class", value: match[1] });
46
+ }
47
+ remaining = remaining.replace(/\.[a-zA-Z][a-zA-Z0-9_-]*/g, '');
48
+
49
+ // Extract attributes
50
+ const attrMatches = remaining.matchAll(/\[([^=\]]+)(?:=["']?([^"'\]]*?)["']?)?\]/g);
51
+ for (const match of attrMatches) {
52
+ tokens.push({
53
+ type: "attribute",
54
+ value: match[1].trim(),
55
+ attributeName: match[1].trim(),
56
+ attributeValue: match[2] ? match[2].trim() : undefined
57
+ });
44
58
  }
45
59
 
46
60
  return { tokens };
@@ -74,6 +88,10 @@ function matchesToken(element: any, token: SelectorToken): boolean {
74
88
  }
75
89
 
76
90
  function matchesSelector(element: any, tokens: SelectorToken[]): boolean {
91
+ // Universal selector - matches any element
92
+ if (tokens.length === 0) {
93
+ return true;
94
+ }
77
95
  return tokens.every((token) => matchesToken(element, token));
78
96
  }
79
97
 
@@ -6,6 +6,11 @@ import {
6
6
  querySelectorAll as querySelectorAllFunction,
7
7
  } from "./css-selector.js";
8
8
 
9
+ const VOID_ELEMENTS = new Set([
10
+ 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
11
+ 'link', 'meta', 'param', 'source', 'track', 'wbr'
12
+ ]);
13
+
9
14
  export const enum NodeType {
10
15
  ELEMENT_NODE = 1,
11
16
  TEXT_NODE = 3,
@@ -22,9 +27,13 @@ export function createElement(
22
27
  ): any {
23
28
  const innerHTML = "";
24
29
  const tagNameLower = tagName.toLowerCase();
25
- const initialOuterHTML = `<${tagNameLower}${Object.entries(attributes)
30
+ const isVoid = VOID_ELEMENTS.has(tagNameLower);
31
+ const attrsStr = Object.entries(attributes)
26
32
  .map(([k, v]) => ` ${k}="${v}"`)
27
- .join("")}></${tagNameLower}>`;
33
+ .join("");
34
+ const initialOuterHTML = isVoid
35
+ ? `<${tagNameLower}${attrsStr}>`
36
+ : `<${tagNameLower}${attrsStr}></${tagNameLower}>`;
28
37
  const textContent = "";
29
38
 
30
39
  const element: any = {
@@ -54,6 +63,18 @@ export function createElement(
54
63
  return child;
55
64
  },
56
65
 
66
+ prepend(...nodes: any[]): void {
67
+ prepend(element, ...nodes);
68
+ },
69
+
70
+ append(...nodes: any[]): void {
71
+ append(element, ...nodes);
72
+ },
73
+
74
+ remove(): void {
75
+ remove(element);
76
+ },
77
+
57
78
  removeChild(child: any): any {
58
79
  return removeChild(element, child);
59
80
  },
@@ -96,6 +117,10 @@ export function createElement(
96
117
  return querySelectorAllFunction(element, selector);
97
118
  },
98
119
 
120
+ matches(selector: string): boolean {
121
+ return matches(element, selector);
122
+ },
123
+
99
124
  cloneNode(deep: boolean = false): any {
100
125
  return cloneNode(element, deep);
101
126
  },
@@ -172,6 +197,10 @@ export function createTextNode(content: string): any {
172
197
  lastChild: null,
173
198
  nextSibling: null,
174
199
  previousSibling: null,
200
+
201
+ remove(): void {
202
+ remove(textNode);
203
+ },
175
204
  };
176
205
  return textNode;
177
206
  }
@@ -189,6 +218,10 @@ export function createComment(content: string): any {
189
218
  lastChild: null,
190
219
  nextSibling: null,
191
220
  previousSibling: null,
221
+
222
+ remove(): void {
223
+ remove(commentNode);
224
+ },
192
225
  };
193
226
  return commentNode;
194
227
  }
@@ -222,6 +255,14 @@ export function createDocument(): any {
222
255
  return child;
223
256
  },
224
257
 
258
+ prepend(...nodes: any[]): void {
259
+ prepend(document, ...nodes);
260
+ },
261
+
262
+ append(...nodes: any[]): void {
263
+ append(document, ...nodes);
264
+ },
265
+
225
266
  removeChild(child: any): any {
226
267
  return removeChild(document, child);
227
268
  },
@@ -395,6 +436,83 @@ function appendChild(parent: any, child: any): void {
395
436
  }
396
437
  }
397
438
 
439
+ function prepend(parent: any, ...nodes: any[]): void {
440
+ if (nodes.length === 0) return;
441
+
442
+ for (let i = nodes.length - 1; i >= 0; i--) {
443
+ const node = nodes[i];
444
+ let childNode: any;
445
+
446
+ if (typeof node === 'string') {
447
+ childNode = createTextNode(node);
448
+ } else {
449
+ childNode = node;
450
+ }
451
+
452
+ if (parent.firstChild) {
453
+ insertBefore(parent, childNode, parent.firstChild);
454
+ } else {
455
+ appendChild(parent, childNode);
456
+ }
457
+ }
458
+ }
459
+
460
+ function append(parent: any, ...nodes: any[]): void {
461
+ if (nodes.length === 0) return;
462
+
463
+ for (const node of nodes) {
464
+ let childNode: any;
465
+
466
+ if (typeof node === 'string') {
467
+ childNode = createTextNode(node);
468
+ } else {
469
+ childNode = node;
470
+ }
471
+
472
+ appendChild(parent, childNode);
473
+ }
474
+ }
475
+
476
+ function remove(node: any): void {
477
+ if (node.parentNode) {
478
+ removeChild(node.parentNode, node);
479
+ }
480
+ }
481
+
482
+ function matches(element: any, selector: string): boolean {
483
+ if (!selector || element.nodeType !== NodeType.ELEMENT_NODE) {
484
+ return false;
485
+ }
486
+
487
+ try {
488
+ // Para selectores complejos con descendientes, necesitamos buscar desde un ancestro
489
+ if (selector.includes(' ') || selector.includes('>')) {
490
+ // Buscar desde la raíz del documento
491
+ let root = element;
492
+ while (root.parentNode) {
493
+ root = root.parentNode;
494
+ }
495
+ const results = querySelectorAllFunction(root, selector);
496
+ return results.includes(element);
497
+ }
498
+
499
+ // Para selectores simples, usar el padre o crear uno temporal
500
+ const parent = element.parentNode || createTempParent(element);
501
+ const results = querySelectorAllFunction(parent, selector);
502
+ return results.includes(element);
503
+ } catch (error) {
504
+ return false;
505
+ }
506
+ }
507
+
508
+ function createTempParent(element: any): any {
509
+ const temp = createElement('div');
510
+ temp.childNodes.push(element);
511
+ temp.children.push(element);
512
+ element._tempParent = temp;
513
+ return temp;
514
+ }
515
+
398
516
  function removeChild(parent: any, child: any): any {
399
517
  const index = parent.childNodes.indexOf(child);
400
518
  if (index === -1) {
@@ -733,9 +851,10 @@ function updateElementContent(element: any): void {
733
851
  .map(([k, v]) => ` ${k}="${v}"`)
734
852
  .join("");
735
853
  const tagNameLower = element.tagName.toLowerCase();
854
+ const isVoid = VOID_ELEMENTS.has(tagNameLower);
736
855
 
737
856
  Object.defineProperty(element, "_internalOuterHTML", {
738
- value: `<${tagNameLower}${attrs}>${innerHTML}</${tagNameLower}>`,
857
+ value: isVoid ? `<${tagNameLower}${attrs}>` : `<${tagNameLower}${attrs}>${innerHTML}</${tagNameLower}>`,
739
858
  writable: true,
740
859
  enumerable: false,
741
860
  configurable: true,
@@ -830,9 +949,10 @@ export function setInnerHTML(element: any, html: string): void {
830
949
  .map(([k, v]) => ` ${k}="${v}"`)
831
950
  .join("");
832
951
  const tagNameLower = element.tagName.toLowerCase();
952
+ const isVoid = VOID_ELEMENTS.has(tagNameLower);
833
953
 
834
954
  Object.defineProperty(element, "_internalOuterHTML", {
835
- value: `<${tagNameLower}${attrs}>${actualInnerHTML}</${tagNameLower}>`,
955
+ value: isVoid ? `<${tagNameLower}${attrs}>` : `<${tagNameLower}${attrs}>${actualInnerHTML}</${tagNameLower}>`,
836
956
  writable: true,
837
957
  enumerable: false,
838
958
  configurable: true,
package/src/parser.ts CHANGED
@@ -314,42 +314,3 @@ function shouldSkipWhitespace(parent: ASTNode): boolean {
314
314
 
315
315
  return parent.tagName ? skipWhitespaceIn.has(parent.tagName) : false;
316
316
  }
317
-
318
- export function traverseAST(node: ASTNode, callback: (node: ASTNode) => void): void {
319
- callback(node);
320
-
321
- if (node.children) {
322
- for (const child of node.children) {
323
- traverseAST(child, callback);
324
- }
325
- }
326
- }
327
-
328
- export function findNodesByTagName(root: ASTNode, tagName: string): ASTNode[] {
329
- const results: ASTNode[] = [];
330
-
331
- traverseAST(root, (node) => {
332
- if (node.type === ASTNodeType.ELEMENT && node.tagName === tagName.toLowerCase()) {
333
- results.push(node);
334
- }
335
- });
336
-
337
- return results;
338
- }
339
-
340
- export function findNodesByAttribute(root: ASTNode, attrName: string, attrValue?: string): ASTNode[] {
341
- const results: ASTNode[] = [];
342
-
343
- traverseAST(root, (node) => {
344
- if (node.type === ASTNodeType.ELEMENT && node.attributes) {
345
- const hasAttr = attrName in node.attributes;
346
- const valueMatches = attrValue === undefined || node.attributes[attrName] === attrValue;
347
-
348
- if (hasAttr && valueMatches) {
349
- results.push(node);
350
- }
351
- }
352
- });
353
-
354
- return results;
355
- }
package/src/tokenizer.ts CHANGED
@@ -251,99 +251,3 @@ export function tokenize(html: string): Token[] {
251
251
 
252
252
  return tokens;
253
253
  }
254
-
255
- export function tokenizeWithRewriter(html: string): Token[] {
256
- const tokens: Token[] = [];
257
- let textBuffer = '';
258
- let position = 0;
259
-
260
- const rewriter = new HTMLRewriter();
261
-
262
- rewriter.on('*', {
263
- element(element) {
264
- if (textBuffer.trim()) {
265
- tokens.push({
266
- type: TokenType.TEXT,
267
- value: decodeEntities(textBuffer),
268
- position: calculatePosition(html, position - textBuffer.length)
269
- });
270
- textBuffer = '';
271
- }
272
-
273
- const attributes: Record<string, string> = {};
274
- for (const [name, value] of element.attributes) {
275
- attributes[name] = value;
276
- }
277
-
278
- tokens.push({
279
- type: TokenType.TAG_OPEN,
280
- value: element.tagName.toLowerCase(),
281
- position: calculatePosition(html, position),
282
- attributes,
283
- isSelfClosing: element.selfClosing
284
- });
285
-
286
- if (!element.selfClosing) {
287
- element.onEndTag((endTag) => {
288
- tokens.push({
289
- type: TokenType.TAG_CLOSE,
290
- value: endTag.name.toLowerCase(),
291
- position: calculatePosition(html, position),
292
- isClosing: true
293
- });
294
- });
295
- }
296
- },
297
-
298
- text(text) {
299
- textBuffer += text.text;
300
- },
301
-
302
- comments(comment) {
303
- tokens.push({
304
- type: TokenType.COMMENT,
305
- value: comment.text,
306
- position: calculatePosition(html, position)
307
- });
308
- }
309
- });
310
-
311
- try {
312
- const response = new Response(html, {
313
- headers: { 'Content-Type': 'text/html' }
314
- });
315
-
316
- rewriter.transform(response);
317
-
318
- if (textBuffer.trim()) {
319
- tokens.push({
320
- type: TokenType.TEXT,
321
- value: decodeEntities(textBuffer),
322
- position: calculatePosition(html, position - textBuffer.length)
323
- });
324
- }
325
-
326
- } catch (error) {
327
- console.warn('HTMLRewriter failed, falling back to manual parsing:', error);
328
- return tokenize(html);
329
- }
330
-
331
- tokens.sort((a, b) => a.position.offset - b.position.offset);
332
- tokens.push({
333
- type: TokenType.EOF,
334
- value: '',
335
- position: calculatePosition(html, html.length)
336
- });
337
-
338
- return tokens;
339
- }
340
-
341
- export function smartTokenize(html: string): Token[] {
342
- const hasSpecialContent = /<!DOCTYPE|<!--|\[CDATA\[|<\?/.test(html);
343
-
344
- if (hasSpecialContent || html.length < 1000) {
345
- return tokenize(html);
346
- } else {
347
- return tokenizeWithRewriter(html);
348
- }
349
- }