@tkeron/html-parser 1.1.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131) hide show
  1. package/.github/workflows/npm_deploy.yml +14 -4
  2. package/README.md +6 -6
  3. package/bun.lock +6 -8
  4. package/check-versions.ts +147 -0
  5. package/index.ts +4 -8
  6. package/package.json +5 -6
  7. package/src/dom-simulator/append-child.ts +130 -0
  8. package/src/dom-simulator/append.ts +18 -0
  9. package/src/dom-simulator/attributes.ts +23 -0
  10. package/src/dom-simulator/clone-node.ts +51 -0
  11. package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
  12. package/src/dom-simulator/create-cdata.ts +18 -0
  13. package/src/dom-simulator/create-comment.ts +23 -0
  14. package/src/dom-simulator/create-doctype.ts +24 -0
  15. package/src/dom-simulator/create-document.ts +81 -0
  16. package/src/dom-simulator/create-element.ts +195 -0
  17. package/src/dom-simulator/create-processing-instruction.ts +19 -0
  18. package/src/dom-simulator/create-temp-parent.ts +9 -0
  19. package/src/dom-simulator/create-text-node.ts +23 -0
  20. package/src/dom-simulator/escape-text-content.ts +6 -0
  21. package/src/dom-simulator/find-special-elements.ts +14 -0
  22. package/src/dom-simulator/get-text-content.ts +18 -0
  23. package/src/dom-simulator/index.ts +36 -0
  24. package/src/dom-simulator/inner-outer-html.ts +182 -0
  25. package/src/dom-simulator/insert-after.ts +20 -0
  26. package/src/dom-simulator/insert-before.ts +108 -0
  27. package/src/dom-simulator/matches.ts +26 -0
  28. package/src/dom-simulator/node-types.ts +26 -0
  29. package/src/dom-simulator/prepend.ts +24 -0
  30. package/src/dom-simulator/remove-child.ts +68 -0
  31. package/src/dom-simulator/remove.ts +7 -0
  32. package/src/dom-simulator/replace-child.ts +152 -0
  33. package/src/dom-simulator/set-text-content.ts +33 -0
  34. package/src/dom-simulator/update-element-content.ts +56 -0
  35. package/src/dom-simulator.ts +12 -1126
  36. package/src/encoding/constants.ts +8 -0
  37. package/src/encoding/detect-encoding.ts +21 -0
  38. package/src/encoding/index.ts +1 -0
  39. package/src/encoding/normalize-encoding.ts +6 -0
  40. package/src/html-entities.ts +2127 -0
  41. package/src/index.ts +5 -5
  42. package/src/parser/adoption-agency-helpers.ts +145 -0
  43. package/src/parser/constants.ts +137 -0
  44. package/src/parser/dom-to-ast.ts +79 -0
  45. package/src/parser/index.ts +9 -0
  46. package/src/parser/parse.ts +772 -0
  47. package/src/parser/types.ts +56 -0
  48. package/src/selectors/find-elements-descendant.ts +47 -0
  49. package/src/selectors/index.ts +2 -0
  50. package/src/selectors/matches-selector.ts +12 -0
  51. package/src/selectors/matches-token.ts +27 -0
  52. package/src/selectors/parse-selector.ts +48 -0
  53. package/src/selectors/query-selector-all.ts +43 -0
  54. package/src/selectors/query-selector.ts +6 -0
  55. package/src/selectors/types.ts +10 -0
  56. package/src/serializer/attributes.ts +74 -0
  57. package/src/serializer/escape.ts +13 -0
  58. package/src/serializer/index.ts +1 -0
  59. package/src/serializer/serialize-tokens.ts +511 -0
  60. package/src/tokenizer/calculate-position.ts +10 -0
  61. package/src/tokenizer/constants.ts +11 -0
  62. package/src/tokenizer/decode-entities.ts +64 -0
  63. package/src/tokenizer/index.ts +2 -0
  64. package/src/tokenizer/parse-attributes.ts +74 -0
  65. package/src/tokenizer/tokenize.ts +165 -0
  66. package/src/tokenizer/types.ts +25 -0
  67. package/tests/adoption-agency-helpers.test.ts +304 -0
  68. package/tests/advanced.test.ts +242 -221
  69. package/tests/cloneNode.test.ts +19 -66
  70. package/tests/custom-elements-head.test.ts +54 -55
  71. package/tests/dom-extended.test.ts +77 -64
  72. package/tests/dom-manipulation.test.ts +51 -24
  73. package/tests/dom.test.ts +15 -13
  74. package/tests/encoding/detect-encoding.test.ts +33 -0
  75. package/tests/google-dom.test.ts +2 -2
  76. package/tests/helpers/tokenizer-adapter.test.ts +29 -43
  77. package/tests/helpers/tokenizer-adapter.ts +36 -33
  78. package/tests/helpers/tree-adapter.test.ts +20 -20
  79. package/tests/helpers/tree-adapter.ts +34 -24
  80. package/tests/html-entities-text.test.ts +6 -2
  81. package/tests/innerhtml-void-elements.test.ts +52 -36
  82. package/tests/outerHTML-replacement.test.ts +37 -65
  83. package/tests/parser/dom-to-ast.test.ts +109 -0
  84. package/tests/parser/parse.test.ts +139 -0
  85. package/tests/parser.test.ts +281 -217
  86. package/tests/selectors/query-selector-all.test.ts +39 -0
  87. package/tests/selectors/query-selector.test.ts +42 -0
  88. package/tests/serializer/attributes.test.ts +132 -0
  89. package/tests/serializer/escape.test.ts +51 -0
  90. package/tests/serializer/serialize-tokens.test.ts +80 -0
  91. package/tests/serializer-core.test.ts +6 -6
  92. package/tests/serializer-injectmeta.test.ts +6 -6
  93. package/tests/serializer-optionaltags.test.ts +9 -6
  94. package/tests/serializer-options.test.ts +6 -6
  95. package/tests/serializer-whitespace.test.ts +6 -6
  96. package/tests/tokenizer/calculate-position.test.ts +34 -0
  97. package/tests/tokenizer/decode-entities.test.ts +31 -0
  98. package/tests/tokenizer/parse-attributes.test.ts +44 -0
  99. package/tests/tokenizer/tokenize.test.ts +757 -0
  100. package/tests/tokenizer-namedEntities.test.ts +10 -7
  101. package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
  102. package/tests/tokenizer.test.ts +268 -256
  103. package/tests/tree-construction-adoption01.test.ts +25 -16
  104. package/tests/tree-construction-adoption02.test.ts +30 -19
  105. package/tests/tree-construction-domjs-unsafe.test.ts +6 -4
  106. package/tests/tree-construction-entities02.test.ts +18 -16
  107. package/tests/tree-construction-html5test-com.test.ts +16 -10
  108. package/tests/tree-construction-math.test.ts +11 -9
  109. package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
  110. package/tests/tree-construction-noscript01.test.ts +11 -9
  111. package/tests/tree-construction-ruby.test.ts +6 -4
  112. package/tests/tree-construction-scriptdata01.test.ts +6 -4
  113. package/tests/tree-construction-svg.test.ts +6 -4
  114. package/tests/tree-construction-template.test.ts +6 -4
  115. package/tests/tree-construction-tests10.test.ts +6 -4
  116. package/tests/tree-construction-tests11.test.ts +6 -4
  117. package/tests/tree-construction-tests20.test.ts +7 -4
  118. package/tests/tree-construction-tests21.test.ts +7 -4
  119. package/tests/tree-construction-tests23.test.ts +7 -4
  120. package/tests/tree-construction-tests24.test.ts +7 -4
  121. package/tests/tree-construction-tests5.test.ts +6 -5
  122. package/tests/tree-construction-tests6.test.ts +6 -5
  123. package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
  124. package/tests/void-elements.test.ts +85 -40
  125. package/tsconfig.json +1 -1
  126. package/src/css-selector.ts +0 -185
  127. package/src/encoding.ts +0 -39
  128. package/src/parser.ts +0 -682
  129. package/src/serializer.ts +0 -450
  130. package/src/tokenizer.ts +0 -325
  131. package/tests/selectors.test.ts +0 -128
@@ -2,15 +2,16 @@ import { readFileSync } from "fs";
2
2
  import { parse } from "../src/index.ts";
3
3
 
4
4
  describe("Tree Construction Tests6 Tests", () => {
5
- const content = readFileSync("tests/html5lib-data/tree-construction/tests6.dat", "utf8");
5
+ const content = readFileSync(
6
+ "tests/html5lib-data/tree-construction/tests6.dat",
7
+ "utf8",
8
+ );
6
9
  const sections = content.split("#data\n");
7
10
 
8
11
  for (let i = 1; i < sections.length; i++) {
9
12
  const section = sections[i];
10
- const [dataPart, documentPart] = section.split("#document\n");
13
+ const [dataPart] = section.split("#document\n");
11
14
  const data = dataPart.trim();
12
- const expectedDocument = documentPart ? documentPart.split("#errors\n")[0].trim() : "";
13
- const errors = documentPart && documentPart.includes("#errors\n") ? documentPart.split("#errors\n")[1].trim() : "";
14
15
 
15
16
  it(`Tests6 test ${i}`, () => {
16
17
  const doc = parse(data);
@@ -18,4 +19,4 @@ describe("Tree Construction Tests6 Tests", () => {
18
19
  // TODO: Implement DOM serialization and comparison
19
20
  });
20
21
  }
21
- });
22
+ });
@@ -2,15 +2,16 @@ import { readFileSync } from "fs";
2
2
  import { parse } from "../src/index.ts";
3
3
 
4
4
  describe("Tree Construction Tests_innerHTML_1 Tests", () => {
5
- const content = readFileSync("tests/html5lib-data/tree-construction/tests_innerHTML_1.dat", "utf8");
5
+ const content = readFileSync(
6
+ "tests/html5lib-data/tree-construction/tests_innerHTML_1.dat",
7
+ "utf8",
8
+ );
6
9
  const sections = content.split("#data\n");
7
10
 
8
11
  for (let i = 1; i < sections.length; i++) {
9
12
  const section = sections[i];
10
- const [dataPart, documentPart] = section.split("#document\n");
13
+ const [dataPart] = section.split("#document\n");
11
14
  const data = dataPart.trim();
12
- const expectedDocument = documentPart ? documentPart.split("#errors\n")[0].trim() : "";
13
- const errors = documentPart && documentPart.includes("#errors\n") ? documentPart.split("#errors\n")[1].trim() : "";
14
15
 
15
16
  it(`Tests_innerHTML_1 test ${i}`, () => {
16
17
  const doc = parse(data);
@@ -18,4 +19,4 @@ describe("Tree Construction Tests_innerHTML_1 Tests", () => {
18
19
  // TODO: Implement DOM serialization and comparison
19
20
  });
20
21
  }
21
- });
22
+ });
@@ -3,10 +3,10 @@ import { parseHTML } from "../index";
3
3
 
4
4
  /**
5
5
  * Test suite for HTML void elements serialization
6
- *
6
+ *
7
7
  * Void elements should NOT have closing tags according to HTML spec:
8
8
  * https://html.spec.whatwg.org/multipage/syntax.html#void-elements
9
- *
9
+ *
10
10
  * List: area, base, br, col, embed, hr, img, input, link, meta, source, track, wbr
11
11
  */
12
12
 
@@ -52,73 +52,101 @@ describe("Void Elements - outerHTML serialization", () => {
52
52
 
53
53
  describe("Individual void elements with attributes", () => {
54
54
  it("should serialize <img> with attributes without closing tag", () => {
55
- const doc = parseHTML('<html><body><img src="test.jpg" alt="test image"></body></html>');
55
+ const doc = parseHTML(
56
+ '<html><body><img src="test.jpg" alt="test image"></body></html>',
57
+ );
56
58
  const img = doc.querySelector("img");
57
59
  expect(img).not.toBeNull();
58
60
  expect(img!.outerHTML).toBe('<img src="test.jpg" alt="test image">');
59
61
  });
60
62
 
61
63
  it("should serialize <input> with type attribute without closing tag", () => {
62
- const doc = parseHTML('<html><body><input type="text" name="username"></body></html>');
64
+ const doc = parseHTML(
65
+ '<html><body><input type="text" name="username"></body></html>',
66
+ );
63
67
  const input = doc.querySelector("input");
64
68
  expect(input).not.toBeNull();
65
69
  expect(input!.outerHTML).toBe('<input type="text" name="username">');
66
70
  });
67
71
 
68
72
  it("should serialize <meta> with attributes without closing tag", () => {
69
- const doc = parseHTML('<html><head><meta charset="utf-8"></head><body></body></html>');
73
+ const doc = parseHTML(
74
+ '<html><head><meta charset="utf-8"></head><body></body></html>',
75
+ );
70
76
  const meta = doc.querySelector("meta");
71
77
  expect(meta).not.toBeNull();
72
78
  expect(meta!.outerHTML).toBe('<meta charset="utf-8">');
73
79
  });
74
80
 
75
81
  it("should serialize <link> with attributes without closing tag", () => {
76
- const doc = parseHTML('<html><head><link rel="stylesheet" href="style.css"></head><body></body></html>');
82
+ const doc = parseHTML(
83
+ '<html><head><link rel="stylesheet" href="style.css"></head><body></body></html>',
84
+ );
77
85
  const link = doc.querySelector("link");
78
86
  expect(link).not.toBeNull();
79
87
  expect(link!.outerHTML).toBe('<link rel="stylesheet" href="style.css">');
80
88
  });
81
89
 
82
90
  it("should serialize <base> with href without closing tag", () => {
83
- const doc = parseHTML('<html><head><base href="https://example.com/"></head><body></body></html>');
91
+ const doc = parseHTML(
92
+ '<html><head><base href="https://example.com/"></head><body></body></html>',
93
+ );
84
94
  const base = doc.querySelector("base");
85
95
  expect(base).not.toBeNull();
86
96
  expect(base!.outerHTML).toBe('<base href="https://example.com/">');
87
97
  });
88
98
 
89
99
  it("should serialize <col> with attributes without closing tag", () => {
90
- const doc = parseHTML('<html><body><table><colgroup><col span="2" style="background:red"></colgroup></table></body></html>');
100
+ const doc = parseHTML(
101
+ '<html><body><table><colgroup><col span="2" style="background:red"></colgroup></table></body></html>',
102
+ );
91
103
  const col = doc.querySelector("col");
92
104
  expect(col).not.toBeNull();
93
105
  expect(col!.outerHTML).toBe('<col span="2" style="background:red">');
94
106
  });
95
107
 
96
108
  it("should serialize <embed> with attributes without closing tag", () => {
97
- const doc = parseHTML('<html><body><embed src="video.swf" type="application/x-shockwave-flash"></body></html>');
109
+ const doc = parseHTML(
110
+ '<html><body><embed src="video.swf" type="application/x-shockwave-flash"></body></html>',
111
+ );
98
112
  const embed = doc.querySelector("embed");
99
113
  expect(embed).not.toBeNull();
100
- expect(embed!.outerHTML).toBe('<embed src="video.swf" type="application/x-shockwave-flash">');
114
+ expect(embed!.outerHTML).toBe(
115
+ '<embed src="video.swf" type="application/x-shockwave-flash">',
116
+ );
101
117
  });
102
118
 
103
119
  it("should serialize <source> with attributes without closing tag", () => {
104
- const doc = parseHTML('<html><body><video><source src="video.mp4" type="video/mp4"></video></body></html>');
120
+ const doc = parseHTML(
121
+ '<html><body><video><source src="video.mp4" type="video/mp4"></video></body></html>',
122
+ );
105
123
  const source = doc.querySelector("source");
106
124
  expect(source).not.toBeNull();
107
- expect(source!.outerHTML).toBe('<source src="video.mp4" type="video/mp4">');
125
+ expect(source!.outerHTML).toBe(
126
+ '<source src="video.mp4" type="video/mp4">',
127
+ );
108
128
  });
109
129
 
110
130
  it("should serialize <track> with attributes without closing tag", () => {
111
- const doc = parseHTML('<html><body><video><track kind="subtitles" src="subs.vtt" srclang="en"></video></body></html>');
131
+ const doc = parseHTML(
132
+ '<html><body><video><track kind="subtitles" src="subs.vtt" srclang="en"></video></body></html>',
133
+ );
112
134
  const track = doc.querySelector("track");
113
135
  expect(track).not.toBeNull();
114
- expect(track!.outerHTML).toBe('<track kind="subtitles" src="subs.vtt" srclang="en">');
136
+ expect(track!.outerHTML).toBe(
137
+ '<track kind="subtitles" src="subs.vtt" srclang="en">',
138
+ );
115
139
  });
116
140
 
117
141
  it("should serialize <area> with attributes without closing tag", () => {
118
- const doc = parseHTML('<html><body><map name="test"><area shape="rect" coords="0,0,100,100" href="link.html"></map></body></html>');
142
+ const doc = parseHTML(
143
+ '<html><body><map name="test"><area shape="rect" coords="0,0,100,100" href="link.html"></map></body></html>',
144
+ );
119
145
  const area = doc.querySelector("area");
120
146
  expect(area).not.toBeNull();
121
- expect(area!.outerHTML).toBe('<area shape="rect" coords="0,0,100,100" href="link.html">');
147
+ expect(area!.outerHTML).toBe(
148
+ '<area shape="rect" coords="0,0,100,100" href="link.html">',
149
+ );
122
150
  });
123
151
  });
124
152
 
@@ -136,22 +164,25 @@ describe("Void Elements - outerHTML serialization", () => {
136
164
 
137
165
  describe("Multiple void elements in same document", () => {
138
166
  it("should serialize multiple void elements correctly", () => {
139
- const doc = parseHTML('<html><body><img src="test.jpg"><br><input type="text"></body></html>');
140
-
167
+ const doc = parseHTML(
168
+ '<html><body><img src="test.jpg"><br><input type="text"></body></html>',
169
+ );
170
+
141
171
  const img = doc.querySelector("img");
142
172
  const br = doc.querySelector("br");
143
173
  const input = doc.querySelector("input");
144
-
174
+
145
175
  expect(img!.outerHTML).toBe('<img src="test.jpg">');
146
176
  expect(br!.outerHTML).toBe("<br>");
147
177
  expect(input!.outerHTML).toBe('<input type="text">');
148
178
  });
149
179
 
150
180
  it("should serialize document with multiple void elements without closing tags", () => {
151
- const html = '<html><body><img src="test.jpg"><br><input type="text"></body></html>';
181
+ const html =
182
+ '<html><body><img src="test.jpg"><br><input type="text"></body></html>';
152
183
  const doc = parseHTML(html);
153
184
  const outerHTML = doc.documentElement.outerHTML;
154
-
185
+
155
186
  expect(outerHTML).not.toContain("</img>");
156
187
  expect(outerHTML).not.toContain("</br>");
157
188
  expect(outerHTML).not.toContain("</input>");
@@ -170,11 +201,11 @@ describe("Void Elements - outerHTML serialization", () => {
170
201
  <body></body>
171
202
  </html>`;
172
203
  const doc = parseHTML(html);
173
-
204
+
174
205
  const metas = doc.querySelectorAll("meta");
175
206
  const link = doc.querySelector("link");
176
207
  const base = doc.querySelector("base");
177
-
208
+
178
209
  metas.forEach((meta: any) => {
179
210
  expect(meta.outerHTML).not.toContain("</meta>");
180
211
  });
@@ -210,7 +241,9 @@ describe("Void Elements - outerHTML serialization", () => {
210
241
  const meta = doc.createElement("meta");
211
242
  meta.setAttribute("name", "description");
212
243
  meta.setAttribute("content", "Test page");
213
- expect(meta.outerHTML).toBe('<meta name="description" content="Test page">');
244
+ expect(meta.outerHTML).toBe(
245
+ '<meta name="description" content="Test page">',
246
+ );
214
247
  });
215
248
 
216
249
  it("should serialize dynamically created <hr> without closing tag", () => {
@@ -285,14 +318,18 @@ describe("Void Elements - outerHTML serialization", () => {
285
318
  });
286
319
 
287
320
  it("should serialize <style> with closing tag", () => {
288
- const doc = parseHTML("<html><head><style></style></head><body></body></html>");
321
+ const doc = parseHTML(
322
+ "<html><head><style></style></head><body></body></html>",
323
+ );
289
324
  const style = doc.querySelector("style");
290
325
  expect(style).not.toBeNull();
291
326
  expect(style!.outerHTML).toBe("<style></style>");
292
327
  });
293
328
 
294
329
  it("should serialize <iframe> with closing tag", () => {
295
- const doc = parseHTML('<html><body><iframe src="page.html"></iframe></body></html>');
330
+ const doc = parseHTML(
331
+ '<html><body><iframe src="page.html"></iframe></body></html>',
332
+ );
296
333
  const iframe = doc.querySelector("iframe");
297
334
  expect(iframe).not.toBeNull();
298
335
  expect(iframe!.outerHTML).toBe('<iframe src="page.html"></iframe>');
@@ -336,7 +373,7 @@ describe("Void Elements - outerHTML serialization", () => {
336
373
  });
337
374
 
338
375
  it("should not include innerHTML content in void element", () => {
339
- const doc = parseHTML("<html><body><img src=\"test.jpg\"></body></html>");
376
+ const doc = parseHTML('<html><body><img src="test.jpg"></body></html>');
340
377
  const img = doc.querySelector("img");
341
378
  expect(img).not.toBeNull();
342
379
  expect(img!.innerHTML).toBe("");
@@ -357,11 +394,11 @@ describe("Void Elements - outerHTML serialization", () => {
357
394
  </form>
358
395
  </div>
359
396
  </body></html>`;
360
-
397
+
361
398
  const doc = parseHTML(html);
362
399
  const inputs = doc.querySelectorAll("input");
363
400
  const br = doc.querySelector("br");
364
-
401
+
365
402
  expect(inputs.length).toBe(2);
366
403
  inputs.forEach((input: any) => {
367
404
  expect(input.outerHTML).not.toContain("</input>");
@@ -379,11 +416,11 @@ describe("Void Elements - outerHTML serialization", () => {
379
416
  <tr><td><img src="icon.png"></td></tr>
380
417
  </table>
381
418
  </body></html>`;
382
-
419
+
383
420
  const doc = parseHTML(html);
384
421
  const cols = doc.querySelectorAll("col");
385
422
  const img = doc.querySelector("img");
386
-
423
+
387
424
  expect(cols.length).toBe(2);
388
425
  cols.forEach((col: any) => {
389
426
  expect(col.outerHTML).not.toContain("</col>");
@@ -394,24 +431,30 @@ describe("Void Elements - outerHTML serialization", () => {
394
431
 
395
432
  describe("Edge cases", () => {
396
433
  it("should handle void element with boolean attributes", () => {
397
- const doc = parseHTML('<html><body><input type="checkbox" checked disabled></body></html>');
434
+ const doc = parseHTML(
435
+ '<html><body><input type="checkbox" checked disabled></body></html>',
436
+ );
398
437
  const input = doc.querySelector("input");
399
438
  expect(input).not.toBeNull();
400
439
  expect(input!.outerHTML).not.toContain("</input>");
401
440
  });
402
441
 
403
442
  it("should handle void element with empty attribute value", () => {
404
- const doc = parseHTML('<html><body><input type="text" value=""></body></html>');
443
+ const doc = parseHTML(
444
+ '<html><body><input type="text" value=""></body></html>',
445
+ );
405
446
  const input = doc.querySelector("input");
406
447
  expect(input).not.toBeNull();
407
448
  expect(input!.outerHTML).not.toContain("</input>");
408
449
  });
409
450
 
410
451
  it("should handle uppercase void element tag names", () => {
411
- const doc = parseHTML("<html><body><BR><IMG SRC=\"test.jpg\"></body></html>");
452
+ const doc = parseHTML(
453
+ '<html><body><BR><IMG SRC="test.jpg"></body></html>',
454
+ );
412
455
  const br = doc.querySelector("br");
413
456
  const img = doc.querySelector("img");
414
-
457
+
415
458
  expect(br).not.toBeNull();
416
459
  expect(img).not.toBeNull();
417
460
  expect(br!.outerHTML).not.toContain("</br>");
@@ -421,10 +464,12 @@ describe("Void Elements - outerHTML serialization", () => {
421
464
  });
422
465
 
423
466
  it("should handle mixed case void element tag names", () => {
424
- const doc = parseHTML("<html><body><Br><ImG src=\"test.jpg\"></body></html>");
467
+ const doc = parseHTML(
468
+ '<html><body><Br><ImG src="test.jpg"></body></html>',
469
+ );
425
470
  const br = doc.querySelector("br");
426
471
  const img = doc.querySelector("img");
427
-
472
+
428
473
  expect(br).not.toBeNull();
429
474
  expect(img).not.toBeNull();
430
475
  expect(br!.outerHTML.toLowerCase()).not.toContain("</br>");
@@ -449,10 +494,10 @@ describe("Void Elements - outerHTML serialization", () => {
449
494
  </form>
450
495
  </body>
451
496
  </html>`;
452
-
497
+
453
498
  const doc = parseHTML(html);
454
499
  const fullHTML = doc.documentElement.outerHTML;
455
-
500
+
456
501
  // Check no void elements have closing tags
457
502
  expect(fullHTML).not.toContain("</meta>");
458
503
  expect(fullHTML).not.toContain("</link>");
@@ -460,7 +505,7 @@ describe("Void Elements - outerHTML serialization", () => {
460
505
  expect(fullHTML).not.toContain("</hr>");
461
506
  expect(fullHTML).not.toContain("</input>");
462
507
  expect(fullHTML).not.toContain("</br>");
463
-
508
+
464
509
  // Check non-void elements still have closing tags
465
510
  expect(fullHTML).toContain("</head>");
466
511
  expect(fullHTML).toContain("</body>");
package/tsconfig.json CHANGED
@@ -18,7 +18,7 @@
18
18
  "noUncheckedIndexedAccess": true,
19
19
  "noImplicitOverride": true,
20
20
 
21
- "noUnusedLocals": false,
21
+ "noUnusedLocals": true,
22
22
  "noUnusedParameters": false,
23
23
  "noPropertyAccessFromIndexSignature": false
24
24
  }
@@ -1,185 +0,0 @@
1
- interface SelectorToken {
2
- type: "tag" | "class" | "id" | "attribute";
3
- value: string;
4
- attributeName?: string;
5
- attributeValue?: string;
6
- }
7
-
8
- interface SelectorGroup {
9
- tokens: SelectorToken[];
10
- }
11
-
12
- function parseSelector(selector: string): SelectorGroup[] {
13
- const parts = selector.trim().split(/\s+/);
14
-
15
- return parts.map((part) => {
16
- const trimmed = part.trim();
17
- let tokens: SelectorToken[] = [];
18
-
19
- // Handle universal selector
20
- if (trimmed === '*') {
21
- // Match any element - we'll handle this specially
22
- return { tokens: [] };
23
- }
24
-
25
- // Parse complex selectors like p#intro.first or .foo.bar.baz
26
- let remaining = trimmed;
27
-
28
- // Extract tag name first if present
29
- const tagMatch = remaining.match(/^([a-zA-Z][a-zA-Z0-9-]*)/);
30
- if (tagMatch) {
31
- tokens.push({ type: "tag", value: tagMatch[1].toLowerCase() });
32
- remaining = remaining.slice(tagMatch[1].length);
33
- }
34
-
35
- // Extract all IDs (HTML5 allows IDs starting with digits)
36
- const idMatches = remaining.matchAll(/#([a-zA-Z0-9][a-zA-Z0-9_-]*)/g);
37
- for (const match of idMatches) {
38
- tokens.push({ type: "id", value: match[1] });
39
- }
40
- remaining = remaining.replace(/#[a-zA-Z0-9][a-zA-Z0-9_-]*/g, '');
41
-
42
- // Extract all classes
43
- const classMatches = remaining.matchAll(/\.([a-zA-Z][a-zA-Z0-9_-]*)/g);
44
- for (const match of classMatches) {
45
- tokens.push({ type: "class", value: match[1] });
46
- }
47
- remaining = remaining.replace(/\.[a-zA-Z][a-zA-Z0-9_-]*/g, '');
48
-
49
- // Extract attributes
50
- const attrMatches = remaining.matchAll(/\[([^=\]]+)(?:=["']?([^"'\]]*?)["']?)?\]/g);
51
- for (const match of attrMatches) {
52
- tokens.push({
53
- type: "attribute",
54
- value: match[1].trim(),
55
- attributeName: match[1].trim(),
56
- attributeValue: match[2] ? match[2].trim() : undefined
57
- });
58
- }
59
-
60
- return { tokens };
61
- });
62
- }
63
-
64
- function matchesToken(element: any, token: SelectorToken): boolean {
65
- if (!element || !element.tagName) {
66
- return false;
67
- }
68
-
69
- switch (token.type) {
70
- case "tag":
71
- return element.tagName.toLowerCase() === token.value;
72
- case "class":
73
- const classAttr =
74
- element.attributes?.class || element.attributes?.className || "";
75
- const classes = classAttr.split(/\s+/).filter(Boolean);
76
- return classes.includes(token.value);
77
- case "id":
78
- return element.attributes?.id === token.value;
79
- case "attribute":
80
- const attrValue = element.attributes?.[token.attributeName || ""];
81
- if (token.attributeValue === undefined) {
82
- return attrValue !== undefined;
83
- }
84
- return attrValue === token.attributeValue;
85
- default:
86
- return false;
87
- }
88
- }
89
-
90
- function matchesSelector(element: any, tokens: SelectorToken[]): boolean {
91
- // Universal selector - matches any element
92
- if (tokens.length === 0) {
93
- return true;
94
- }
95
- return tokens.every((token) => matchesToken(element, token));
96
- }
97
-
98
- function findElementsDescendant(
99
- node: any,
100
- selectorGroups: SelectorGroup[],
101
- groupIndex: number,
102
- results: any[]
103
- ): void {
104
- if (groupIndex >= selectorGroups.length) {
105
- return;
106
- }
107
-
108
- const currentGroup = selectorGroups[groupIndex];
109
- if (!currentGroup) {
110
- return;
111
- }
112
-
113
- const isLastGroup = groupIndex === selectorGroups.length - 1;
114
-
115
- for (const child of node.childNodes || []) {
116
- if (child.nodeType === 1) {
117
- const element = child;
118
-
119
- if (matchesSelector(element, currentGroup.tokens)) {
120
- if (isLastGroup) {
121
- results.push(element);
122
- } else {
123
- findElementsDescendant(
124
- element,
125
- selectorGroups,
126
- groupIndex + 1,
127
- results
128
- );
129
- }
130
- }
131
- }
132
-
133
- const shouldContinueSearching =
134
- !isLastGroup ||
135
- child.nodeType !== 1 ||
136
- !matchesSelector(child, currentGroup.tokens);
137
- if (shouldContinueSearching) {
138
- findElementsDescendant(child, selectorGroups, groupIndex, results);
139
- }
140
- }
141
- }
142
-
143
- function findElements(
144
- node: any,
145
- selectorGroups: SelectorGroup[],
146
- results: any[]
147
- ): void {
148
- if (selectorGroups.length === 1) {
149
- const firstGroup = selectorGroups[0];
150
- if (firstGroup) {
151
- const tokens = firstGroup.tokens;
152
- findElementsSimple(node, tokens, results);
153
- }
154
- } else {
155
- findElementsDescendant(node, selectorGroups, 0, results);
156
- }
157
- }
158
-
159
- function findElementsSimple(
160
- node: any,
161
- tokens: SelectorToken[],
162
- results: any[]
163
- ): void {
164
- if (node.nodeType === 1) {
165
- const element = node;
166
- if (matchesSelector(element, tokens)) {
167
- results.push(element);
168
- }
169
- }
170
- for (const child of node.childNodes || []) {
171
- findElementsSimple(child, tokens, results);
172
- }
173
- }
174
-
175
- export function querySelectorAll(root: any, selector: string): any[] {
176
- const selectorGroups = parseSelector(selector);
177
- const results: any[] = [];
178
- findElements(root, selectorGroups, results);
179
- return results;
180
- }
181
-
182
- export function querySelector(root: any, selector: string): any | null {
183
- const results = querySelectorAll(root, selector);
184
- return results[0] || null;
185
- }
package/src/encoding.ts DELETED
@@ -1,39 +0,0 @@
1
- /**
2
- * Detects the character encoding of an HTML document.
3
- * Based on HTML5 specification for encoding detection.
4
- */
5
-
6
- const encodingAliases: Record<string, string> = {
7
- 'iso-8859-1': 'windows-1252',
8
- 'iso8859-1': 'windows-1252',
9
- 'iso-8859-2': 'iso-8859-2',
10
- 'iso8859-2': 'iso-8859-2',
11
- 'utf-8': 'utf-8',
12
- 'utf8': 'utf-8',
13
- // Add more as needed
14
- };
15
-
16
- function normalizeEncoding(name: string): string | null {
17
- const lower = name.toLowerCase().replace(/[^a-z0-9-]/g, '');
18
- return encodingAliases[lower] || lower;
19
- }
20
-
21
- export function detectEncoding(html: string): string | null {
22
- // Limit to first 1024 characters for performance
23
- const prefix = html.substring(0, 1024);
24
-
25
- // Look for <meta charset="...">
26
- const charsetMatch = prefix.match(/<meta[^>]*charset\s*=\s*["']?([^"'\s>]+)["']?/i);
27
- if (charsetMatch) {
28
- return normalizeEncoding(charsetMatch[1]);
29
- }
30
-
31
- // Look for <meta http-equiv="Content-Type" content="text/html; charset=...">
32
- const contentTypeMatch = prefix.match(/<meta[^>]*http-equiv\s*=\s*["']?\s*content-type\s*["']?[^>]*content\s*=\s*["']?\s*text\/html;\s*charset\s*=\s*([^"'\s>]+)["']?/i);
33
- if (contentTypeMatch) {
34
- return normalizeEncoding(contentTypeMatch[1]);
35
- }
36
-
37
- // Default to Windows-1252 if no encoding found (as per HTML5 spec)
38
- return 'windows-1252';
39
- }