@tkeron/html-parser 1.1.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm_deploy.yml +14 -4
- package/README.md +6 -6
- package/bun.lock +6 -8
- package/check-versions.ts +147 -0
- package/index.ts +4 -8
- package/package.json +5 -6
- package/src/dom-simulator/append-child.ts +130 -0
- package/src/dom-simulator/append.ts +18 -0
- package/src/dom-simulator/attributes.ts +23 -0
- package/src/dom-simulator/clone-node.ts +51 -0
- package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
- package/src/dom-simulator/create-cdata.ts +18 -0
- package/src/dom-simulator/create-comment.ts +23 -0
- package/src/dom-simulator/create-doctype.ts +24 -0
- package/src/dom-simulator/create-document.ts +81 -0
- package/src/dom-simulator/create-element.ts +195 -0
- package/src/dom-simulator/create-processing-instruction.ts +19 -0
- package/src/dom-simulator/create-temp-parent.ts +9 -0
- package/src/dom-simulator/create-text-node.ts +23 -0
- package/src/dom-simulator/escape-text-content.ts +6 -0
- package/src/dom-simulator/find-special-elements.ts +14 -0
- package/src/dom-simulator/get-text-content.ts +18 -0
- package/src/dom-simulator/index.ts +36 -0
- package/src/dom-simulator/inner-outer-html.ts +182 -0
- package/src/dom-simulator/insert-after.ts +20 -0
- package/src/dom-simulator/insert-before.ts +108 -0
- package/src/dom-simulator/matches.ts +26 -0
- package/src/dom-simulator/node-types.ts +26 -0
- package/src/dom-simulator/prepend.ts +24 -0
- package/src/dom-simulator/remove-child.ts +68 -0
- package/src/dom-simulator/remove.ts +7 -0
- package/src/dom-simulator/replace-child.ts +152 -0
- package/src/dom-simulator/set-text-content.ts +33 -0
- package/src/dom-simulator/update-element-content.ts +56 -0
- package/src/dom-simulator.ts +12 -1126
- package/src/encoding/constants.ts +8 -0
- package/src/encoding/detect-encoding.ts +21 -0
- package/src/encoding/index.ts +1 -0
- package/src/encoding/normalize-encoding.ts +6 -0
- package/src/html-entities.ts +2127 -0
- package/src/index.ts +5 -5
- package/src/parser/adoption-agency-helpers.ts +145 -0
- package/src/parser/constants.ts +137 -0
- package/src/parser/dom-to-ast.ts +79 -0
- package/src/parser/index.ts +9 -0
- package/src/parser/parse.ts +772 -0
- package/src/parser/types.ts +56 -0
- package/src/selectors/find-elements-descendant.ts +47 -0
- package/src/selectors/index.ts +2 -0
- package/src/selectors/matches-selector.ts +12 -0
- package/src/selectors/matches-token.ts +27 -0
- package/src/selectors/parse-selector.ts +48 -0
- package/src/selectors/query-selector-all.ts +43 -0
- package/src/selectors/query-selector.ts +6 -0
- package/src/selectors/types.ts +10 -0
- package/src/serializer/attributes.ts +74 -0
- package/src/serializer/escape.ts +13 -0
- package/src/serializer/index.ts +1 -0
- package/src/serializer/serialize-tokens.ts +511 -0
- package/src/tokenizer/calculate-position.ts +10 -0
- package/src/tokenizer/constants.ts +11 -0
- package/src/tokenizer/decode-entities.ts +64 -0
- package/src/tokenizer/index.ts +2 -0
- package/src/tokenizer/parse-attributes.ts +74 -0
- package/src/tokenizer/tokenize.ts +165 -0
- package/src/tokenizer/types.ts +25 -0
- package/tests/adoption-agency-helpers.test.ts +304 -0
- package/tests/advanced.test.ts +242 -221
- package/tests/cloneNode.test.ts +19 -66
- package/tests/custom-elements-head.test.ts +54 -55
- package/tests/dom-extended.test.ts +77 -64
- package/tests/dom-manipulation.test.ts +51 -24
- package/tests/dom.test.ts +15 -13
- package/tests/encoding/detect-encoding.test.ts +33 -0
- package/tests/google-dom.test.ts +2 -2
- package/tests/helpers/tokenizer-adapter.test.ts +29 -43
- package/tests/helpers/tokenizer-adapter.ts +36 -33
- package/tests/helpers/tree-adapter.test.ts +20 -20
- package/tests/helpers/tree-adapter.ts +34 -24
- package/tests/html-entities-text.test.ts +6 -2
- package/tests/innerhtml-void-elements.test.ts +52 -36
- package/tests/outerHTML-replacement.test.ts +37 -65
- package/tests/parser/dom-to-ast.test.ts +109 -0
- package/tests/parser/parse.test.ts +139 -0
- package/tests/parser.test.ts +281 -217
- package/tests/selectors/query-selector-all.test.ts +39 -0
- package/tests/selectors/query-selector.test.ts +42 -0
- package/tests/serializer/attributes.test.ts +132 -0
- package/tests/serializer/escape.test.ts +51 -0
- package/tests/serializer/serialize-tokens.test.ts +80 -0
- package/tests/serializer-core.test.ts +6 -6
- package/tests/serializer-injectmeta.test.ts +6 -6
- package/tests/serializer-optionaltags.test.ts +9 -6
- package/tests/serializer-options.test.ts +6 -6
- package/tests/serializer-whitespace.test.ts +6 -6
- package/tests/tokenizer/calculate-position.test.ts +34 -0
- package/tests/tokenizer/decode-entities.test.ts +31 -0
- package/tests/tokenizer/parse-attributes.test.ts +44 -0
- package/tests/tokenizer/tokenize.test.ts +757 -0
- package/tests/tokenizer-namedEntities.test.ts +10 -7
- package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
- package/tests/tokenizer.test.ts +268 -256
- package/tests/tree-construction-adoption01.test.ts +25 -16
- package/tests/tree-construction-adoption02.test.ts +30 -19
- package/tests/tree-construction-domjs-unsafe.test.ts +6 -4
- package/tests/tree-construction-entities02.test.ts +18 -16
- package/tests/tree-construction-html5test-com.test.ts +16 -10
- package/tests/tree-construction-math.test.ts +11 -9
- package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
- package/tests/tree-construction-noscript01.test.ts +11 -9
- package/tests/tree-construction-ruby.test.ts +6 -4
- package/tests/tree-construction-scriptdata01.test.ts +6 -4
- package/tests/tree-construction-svg.test.ts +6 -4
- package/tests/tree-construction-template.test.ts +6 -4
- package/tests/tree-construction-tests10.test.ts +6 -4
- package/tests/tree-construction-tests11.test.ts +6 -4
- package/tests/tree-construction-tests20.test.ts +7 -4
- package/tests/tree-construction-tests21.test.ts +7 -4
- package/tests/tree-construction-tests23.test.ts +7 -4
- package/tests/tree-construction-tests24.test.ts +7 -4
- package/tests/tree-construction-tests5.test.ts +6 -5
- package/tests/tree-construction-tests6.test.ts +6 -5
- package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
- package/tests/void-elements.test.ts +85 -40
- package/tsconfig.json +1 -1
- package/src/css-selector.ts +0 -185
- package/src/encoding.ts +0 -39
- package/src/parser.ts +0 -682
- package/src/serializer.ts +0 -450
- package/src/tokenizer.ts +0 -325
- package/tests/selectors.test.ts +0 -128
|
@@ -2,15 +2,16 @@ import { readFileSync } from "fs";
|
|
|
2
2
|
import { parse } from "../src/index.ts";
|
|
3
3
|
|
|
4
4
|
describe("Tree Construction Tests6 Tests", () => {
|
|
5
|
-
const content = readFileSync(
|
|
5
|
+
const content = readFileSync(
|
|
6
|
+
"tests/html5lib-data/tree-construction/tests6.dat",
|
|
7
|
+
"utf8",
|
|
8
|
+
);
|
|
6
9
|
const sections = content.split("#data\n");
|
|
7
10
|
|
|
8
11
|
for (let i = 1; i < sections.length; i++) {
|
|
9
12
|
const section = sections[i];
|
|
10
|
-
const [dataPart, documentPart] = section.split("#document\n");
|
|
13
|
+
const [dataPart] = section.split("#document\n");
|
|
11
14
|
const data = dataPart.trim();
|
|
12
|
-
const expectedDocument = documentPart ? documentPart.split("#errors\n")[0].trim() : "";
|
|
13
|
-
const errors = documentPart && documentPart.includes("#errors\n") ? documentPart.split("#errors\n")[1].trim() : "";
|
|
14
15
|
|
|
15
16
|
it(`Tests6 test ${i}`, () => {
|
|
16
17
|
const doc = parse(data);
|
|
@@ -18,4 +19,4 @@ describe("Tree Construction Tests6 Tests", () => {
|
|
|
18
19
|
// TODO: Implement DOM serialization and comparison
|
|
19
20
|
});
|
|
20
21
|
}
|
|
21
|
-
});
|
|
22
|
+
});
|
|
@@ -2,15 +2,16 @@ import { readFileSync } from "fs";
|
|
|
2
2
|
import { parse } from "../src/index.ts";
|
|
3
3
|
|
|
4
4
|
describe("Tree Construction Tests_innerHTML_1 Tests", () => {
|
|
5
|
-
const content = readFileSync(
|
|
5
|
+
const content = readFileSync(
|
|
6
|
+
"tests/html5lib-data/tree-construction/tests_innerHTML_1.dat",
|
|
7
|
+
"utf8",
|
|
8
|
+
);
|
|
6
9
|
const sections = content.split("#data\n");
|
|
7
10
|
|
|
8
11
|
for (let i = 1; i < sections.length; i++) {
|
|
9
12
|
const section = sections[i];
|
|
10
|
-
const [dataPart, documentPart] = section.split("#document\n");
|
|
13
|
+
const [dataPart] = section.split("#document\n");
|
|
11
14
|
const data = dataPart.trim();
|
|
12
|
-
const expectedDocument = documentPart ? documentPart.split("#errors\n")[0].trim() : "";
|
|
13
|
-
const errors = documentPart && documentPart.includes("#errors\n") ? documentPart.split("#errors\n")[1].trim() : "";
|
|
14
15
|
|
|
15
16
|
it(`Tests_innerHTML_1 test ${i}`, () => {
|
|
16
17
|
const doc = parse(data);
|
|
@@ -18,4 +19,4 @@ describe("Tree Construction Tests_innerHTML_1 Tests", () => {
|
|
|
18
19
|
// TODO: Implement DOM serialization and comparison
|
|
19
20
|
});
|
|
20
21
|
}
|
|
21
|
-
});
|
|
22
|
+
});
|
|
@@ -3,10 +3,10 @@ import { parseHTML } from "../index";
|
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* Test suite for HTML void elements serialization
|
|
6
|
-
*
|
|
6
|
+
*
|
|
7
7
|
* Void elements should NOT have closing tags according to HTML spec:
|
|
8
8
|
* https://html.spec.whatwg.org/multipage/syntax.html#void-elements
|
|
9
|
-
*
|
|
9
|
+
*
|
|
10
10
|
* List: area, base, br, col, embed, hr, img, input, link, meta, source, track, wbr
|
|
11
11
|
*/
|
|
12
12
|
|
|
@@ -52,73 +52,101 @@ describe("Void Elements - outerHTML serialization", () => {
|
|
|
52
52
|
|
|
53
53
|
describe("Individual void elements with attributes", () => {
|
|
54
54
|
it("should serialize <img> with attributes without closing tag", () => {
|
|
55
|
-
const doc = parseHTML(
|
|
55
|
+
const doc = parseHTML(
|
|
56
|
+
'<html><body><img src="test.jpg" alt="test image"></body></html>',
|
|
57
|
+
);
|
|
56
58
|
const img = doc.querySelector("img");
|
|
57
59
|
expect(img).not.toBeNull();
|
|
58
60
|
expect(img!.outerHTML).toBe('<img src="test.jpg" alt="test image">');
|
|
59
61
|
});
|
|
60
62
|
|
|
61
63
|
it("should serialize <input> with type attribute without closing tag", () => {
|
|
62
|
-
const doc = parseHTML(
|
|
64
|
+
const doc = parseHTML(
|
|
65
|
+
'<html><body><input type="text" name="username"></body></html>',
|
|
66
|
+
);
|
|
63
67
|
const input = doc.querySelector("input");
|
|
64
68
|
expect(input).not.toBeNull();
|
|
65
69
|
expect(input!.outerHTML).toBe('<input type="text" name="username">');
|
|
66
70
|
});
|
|
67
71
|
|
|
68
72
|
it("should serialize <meta> with attributes without closing tag", () => {
|
|
69
|
-
const doc = parseHTML(
|
|
73
|
+
const doc = parseHTML(
|
|
74
|
+
'<html><head><meta charset="utf-8"></head><body></body></html>',
|
|
75
|
+
);
|
|
70
76
|
const meta = doc.querySelector("meta");
|
|
71
77
|
expect(meta).not.toBeNull();
|
|
72
78
|
expect(meta!.outerHTML).toBe('<meta charset="utf-8">');
|
|
73
79
|
});
|
|
74
80
|
|
|
75
81
|
it("should serialize <link> with attributes without closing tag", () => {
|
|
76
|
-
const doc = parseHTML(
|
|
82
|
+
const doc = parseHTML(
|
|
83
|
+
'<html><head><link rel="stylesheet" href="style.css"></head><body></body></html>',
|
|
84
|
+
);
|
|
77
85
|
const link = doc.querySelector("link");
|
|
78
86
|
expect(link).not.toBeNull();
|
|
79
87
|
expect(link!.outerHTML).toBe('<link rel="stylesheet" href="style.css">');
|
|
80
88
|
});
|
|
81
89
|
|
|
82
90
|
it("should serialize <base> with href without closing tag", () => {
|
|
83
|
-
const doc = parseHTML(
|
|
91
|
+
const doc = parseHTML(
|
|
92
|
+
'<html><head><base href="https://example.com/"></head><body></body></html>',
|
|
93
|
+
);
|
|
84
94
|
const base = doc.querySelector("base");
|
|
85
95
|
expect(base).not.toBeNull();
|
|
86
96
|
expect(base!.outerHTML).toBe('<base href="https://example.com/">');
|
|
87
97
|
});
|
|
88
98
|
|
|
89
99
|
it("should serialize <col> with attributes without closing tag", () => {
|
|
90
|
-
const doc = parseHTML(
|
|
100
|
+
const doc = parseHTML(
|
|
101
|
+
'<html><body><table><colgroup><col span="2" style="background:red"></colgroup></table></body></html>',
|
|
102
|
+
);
|
|
91
103
|
const col = doc.querySelector("col");
|
|
92
104
|
expect(col).not.toBeNull();
|
|
93
105
|
expect(col!.outerHTML).toBe('<col span="2" style="background:red">');
|
|
94
106
|
});
|
|
95
107
|
|
|
96
108
|
it("should serialize <embed> with attributes without closing tag", () => {
|
|
97
|
-
const doc = parseHTML(
|
|
109
|
+
const doc = parseHTML(
|
|
110
|
+
'<html><body><embed src="video.swf" type="application/x-shockwave-flash"></body></html>',
|
|
111
|
+
);
|
|
98
112
|
const embed = doc.querySelector("embed");
|
|
99
113
|
expect(embed).not.toBeNull();
|
|
100
|
-
expect(embed!.outerHTML).toBe(
|
|
114
|
+
expect(embed!.outerHTML).toBe(
|
|
115
|
+
'<embed src="video.swf" type="application/x-shockwave-flash">',
|
|
116
|
+
);
|
|
101
117
|
});
|
|
102
118
|
|
|
103
119
|
it("should serialize <source> with attributes without closing tag", () => {
|
|
104
|
-
const doc = parseHTML(
|
|
120
|
+
const doc = parseHTML(
|
|
121
|
+
'<html><body><video><source src="video.mp4" type="video/mp4"></video></body></html>',
|
|
122
|
+
);
|
|
105
123
|
const source = doc.querySelector("source");
|
|
106
124
|
expect(source).not.toBeNull();
|
|
107
|
-
expect(source!.outerHTML).toBe(
|
|
125
|
+
expect(source!.outerHTML).toBe(
|
|
126
|
+
'<source src="video.mp4" type="video/mp4">',
|
|
127
|
+
);
|
|
108
128
|
});
|
|
109
129
|
|
|
110
130
|
it("should serialize <track> with attributes without closing tag", () => {
|
|
111
|
-
const doc = parseHTML(
|
|
131
|
+
const doc = parseHTML(
|
|
132
|
+
'<html><body><video><track kind="subtitles" src="subs.vtt" srclang="en"></video></body></html>',
|
|
133
|
+
);
|
|
112
134
|
const track = doc.querySelector("track");
|
|
113
135
|
expect(track).not.toBeNull();
|
|
114
|
-
expect(track!.outerHTML).toBe(
|
|
136
|
+
expect(track!.outerHTML).toBe(
|
|
137
|
+
'<track kind="subtitles" src="subs.vtt" srclang="en">',
|
|
138
|
+
);
|
|
115
139
|
});
|
|
116
140
|
|
|
117
141
|
it("should serialize <area> with attributes without closing tag", () => {
|
|
118
|
-
const doc = parseHTML(
|
|
142
|
+
const doc = parseHTML(
|
|
143
|
+
'<html><body><map name="test"><area shape="rect" coords="0,0,100,100" href="link.html"></map></body></html>',
|
|
144
|
+
);
|
|
119
145
|
const area = doc.querySelector("area");
|
|
120
146
|
expect(area).not.toBeNull();
|
|
121
|
-
expect(area!.outerHTML).toBe(
|
|
147
|
+
expect(area!.outerHTML).toBe(
|
|
148
|
+
'<area shape="rect" coords="0,0,100,100" href="link.html">',
|
|
149
|
+
);
|
|
122
150
|
});
|
|
123
151
|
});
|
|
124
152
|
|
|
@@ -136,22 +164,25 @@ describe("Void Elements - outerHTML serialization", () => {
|
|
|
136
164
|
|
|
137
165
|
describe("Multiple void elements in same document", () => {
|
|
138
166
|
it("should serialize multiple void elements correctly", () => {
|
|
139
|
-
const doc = parseHTML(
|
|
140
|
-
|
|
167
|
+
const doc = parseHTML(
|
|
168
|
+
'<html><body><img src="test.jpg"><br><input type="text"></body></html>',
|
|
169
|
+
);
|
|
170
|
+
|
|
141
171
|
const img = doc.querySelector("img");
|
|
142
172
|
const br = doc.querySelector("br");
|
|
143
173
|
const input = doc.querySelector("input");
|
|
144
|
-
|
|
174
|
+
|
|
145
175
|
expect(img!.outerHTML).toBe('<img src="test.jpg">');
|
|
146
176
|
expect(br!.outerHTML).toBe("<br>");
|
|
147
177
|
expect(input!.outerHTML).toBe('<input type="text">');
|
|
148
178
|
});
|
|
149
179
|
|
|
150
180
|
it("should serialize document with multiple void elements without closing tags", () => {
|
|
151
|
-
const html =
|
|
181
|
+
const html =
|
|
182
|
+
'<html><body><img src="test.jpg"><br><input type="text"></body></html>';
|
|
152
183
|
const doc = parseHTML(html);
|
|
153
184
|
const outerHTML = doc.documentElement.outerHTML;
|
|
154
|
-
|
|
185
|
+
|
|
155
186
|
expect(outerHTML).not.toContain("</img>");
|
|
156
187
|
expect(outerHTML).not.toContain("</br>");
|
|
157
188
|
expect(outerHTML).not.toContain("</input>");
|
|
@@ -170,11 +201,11 @@ describe("Void Elements - outerHTML serialization", () => {
|
|
|
170
201
|
<body></body>
|
|
171
202
|
</html>`;
|
|
172
203
|
const doc = parseHTML(html);
|
|
173
|
-
|
|
204
|
+
|
|
174
205
|
const metas = doc.querySelectorAll("meta");
|
|
175
206
|
const link = doc.querySelector("link");
|
|
176
207
|
const base = doc.querySelector("base");
|
|
177
|
-
|
|
208
|
+
|
|
178
209
|
metas.forEach((meta: any) => {
|
|
179
210
|
expect(meta.outerHTML).not.toContain("</meta>");
|
|
180
211
|
});
|
|
@@ -210,7 +241,9 @@ describe("Void Elements - outerHTML serialization", () => {
|
|
|
210
241
|
const meta = doc.createElement("meta");
|
|
211
242
|
meta.setAttribute("name", "description");
|
|
212
243
|
meta.setAttribute("content", "Test page");
|
|
213
|
-
expect(meta.outerHTML).toBe(
|
|
244
|
+
expect(meta.outerHTML).toBe(
|
|
245
|
+
'<meta name="description" content="Test page">',
|
|
246
|
+
);
|
|
214
247
|
});
|
|
215
248
|
|
|
216
249
|
it("should serialize dynamically created <hr> without closing tag", () => {
|
|
@@ -285,14 +318,18 @@ describe("Void Elements - outerHTML serialization", () => {
|
|
|
285
318
|
});
|
|
286
319
|
|
|
287
320
|
it("should serialize <style> with closing tag", () => {
|
|
288
|
-
const doc = parseHTML(
|
|
321
|
+
const doc = parseHTML(
|
|
322
|
+
"<html><head><style></style></head><body></body></html>",
|
|
323
|
+
);
|
|
289
324
|
const style = doc.querySelector("style");
|
|
290
325
|
expect(style).not.toBeNull();
|
|
291
326
|
expect(style!.outerHTML).toBe("<style></style>");
|
|
292
327
|
});
|
|
293
328
|
|
|
294
329
|
it("should serialize <iframe> with closing tag", () => {
|
|
295
|
-
const doc = parseHTML(
|
|
330
|
+
const doc = parseHTML(
|
|
331
|
+
'<html><body><iframe src="page.html"></iframe></body></html>',
|
|
332
|
+
);
|
|
296
333
|
const iframe = doc.querySelector("iframe");
|
|
297
334
|
expect(iframe).not.toBeNull();
|
|
298
335
|
expect(iframe!.outerHTML).toBe('<iframe src="page.html"></iframe>');
|
|
@@ -336,7 +373,7 @@ describe("Void Elements - outerHTML serialization", () => {
|
|
|
336
373
|
});
|
|
337
374
|
|
|
338
375
|
it("should not include innerHTML content in void element", () => {
|
|
339
|
-
const doc = parseHTML(
|
|
376
|
+
const doc = parseHTML('<html><body><img src="test.jpg"></body></html>');
|
|
340
377
|
const img = doc.querySelector("img");
|
|
341
378
|
expect(img).not.toBeNull();
|
|
342
379
|
expect(img!.innerHTML).toBe("");
|
|
@@ -357,11 +394,11 @@ describe("Void Elements - outerHTML serialization", () => {
|
|
|
357
394
|
</form>
|
|
358
395
|
</div>
|
|
359
396
|
</body></html>`;
|
|
360
|
-
|
|
397
|
+
|
|
361
398
|
const doc = parseHTML(html);
|
|
362
399
|
const inputs = doc.querySelectorAll("input");
|
|
363
400
|
const br = doc.querySelector("br");
|
|
364
|
-
|
|
401
|
+
|
|
365
402
|
expect(inputs.length).toBe(2);
|
|
366
403
|
inputs.forEach((input: any) => {
|
|
367
404
|
expect(input.outerHTML).not.toContain("</input>");
|
|
@@ -379,11 +416,11 @@ describe("Void Elements - outerHTML serialization", () => {
|
|
|
379
416
|
<tr><td><img src="icon.png"></td></tr>
|
|
380
417
|
</table>
|
|
381
418
|
</body></html>`;
|
|
382
|
-
|
|
419
|
+
|
|
383
420
|
const doc = parseHTML(html);
|
|
384
421
|
const cols = doc.querySelectorAll("col");
|
|
385
422
|
const img = doc.querySelector("img");
|
|
386
|
-
|
|
423
|
+
|
|
387
424
|
expect(cols.length).toBe(2);
|
|
388
425
|
cols.forEach((col: any) => {
|
|
389
426
|
expect(col.outerHTML).not.toContain("</col>");
|
|
@@ -394,24 +431,30 @@ describe("Void Elements - outerHTML serialization", () => {
|
|
|
394
431
|
|
|
395
432
|
describe("Edge cases", () => {
|
|
396
433
|
it("should handle void element with boolean attributes", () => {
|
|
397
|
-
const doc = parseHTML(
|
|
434
|
+
const doc = parseHTML(
|
|
435
|
+
'<html><body><input type="checkbox" checked disabled></body></html>',
|
|
436
|
+
);
|
|
398
437
|
const input = doc.querySelector("input");
|
|
399
438
|
expect(input).not.toBeNull();
|
|
400
439
|
expect(input!.outerHTML).not.toContain("</input>");
|
|
401
440
|
});
|
|
402
441
|
|
|
403
442
|
it("should handle void element with empty attribute value", () => {
|
|
404
|
-
const doc = parseHTML(
|
|
443
|
+
const doc = parseHTML(
|
|
444
|
+
'<html><body><input type="text" value=""></body></html>',
|
|
445
|
+
);
|
|
405
446
|
const input = doc.querySelector("input");
|
|
406
447
|
expect(input).not.toBeNull();
|
|
407
448
|
expect(input!.outerHTML).not.toContain("</input>");
|
|
408
449
|
});
|
|
409
450
|
|
|
410
451
|
it("should handle uppercase void element tag names", () => {
|
|
411
|
-
const doc = parseHTML(
|
|
452
|
+
const doc = parseHTML(
|
|
453
|
+
'<html><body><BR><IMG SRC="test.jpg"></body></html>',
|
|
454
|
+
);
|
|
412
455
|
const br = doc.querySelector("br");
|
|
413
456
|
const img = doc.querySelector("img");
|
|
414
|
-
|
|
457
|
+
|
|
415
458
|
expect(br).not.toBeNull();
|
|
416
459
|
expect(img).not.toBeNull();
|
|
417
460
|
expect(br!.outerHTML).not.toContain("</br>");
|
|
@@ -421,10 +464,12 @@ describe("Void Elements - outerHTML serialization", () => {
|
|
|
421
464
|
});
|
|
422
465
|
|
|
423
466
|
it("should handle mixed case void element tag names", () => {
|
|
424
|
-
const doc = parseHTML(
|
|
467
|
+
const doc = parseHTML(
|
|
468
|
+
'<html><body><Br><ImG src="test.jpg"></body></html>',
|
|
469
|
+
);
|
|
425
470
|
const br = doc.querySelector("br");
|
|
426
471
|
const img = doc.querySelector("img");
|
|
427
|
-
|
|
472
|
+
|
|
428
473
|
expect(br).not.toBeNull();
|
|
429
474
|
expect(img).not.toBeNull();
|
|
430
475
|
expect(br!.outerHTML.toLowerCase()).not.toContain("</br>");
|
|
@@ -449,10 +494,10 @@ describe("Void Elements - outerHTML serialization", () => {
|
|
|
449
494
|
</form>
|
|
450
495
|
</body>
|
|
451
496
|
</html>`;
|
|
452
|
-
|
|
497
|
+
|
|
453
498
|
const doc = parseHTML(html);
|
|
454
499
|
const fullHTML = doc.documentElement.outerHTML;
|
|
455
|
-
|
|
500
|
+
|
|
456
501
|
// Check no void elements have closing tags
|
|
457
502
|
expect(fullHTML).not.toContain("</meta>");
|
|
458
503
|
expect(fullHTML).not.toContain("</link>");
|
|
@@ -460,7 +505,7 @@ describe("Void Elements - outerHTML serialization", () => {
|
|
|
460
505
|
expect(fullHTML).not.toContain("</hr>");
|
|
461
506
|
expect(fullHTML).not.toContain("</input>");
|
|
462
507
|
expect(fullHTML).not.toContain("</br>");
|
|
463
|
-
|
|
508
|
+
|
|
464
509
|
// Check non-void elements still have closing tags
|
|
465
510
|
expect(fullHTML).toContain("</head>");
|
|
466
511
|
expect(fullHTML).toContain("</body>");
|
package/tsconfig.json
CHANGED
package/src/css-selector.ts
DELETED
|
@@ -1,185 +0,0 @@
|
|
|
1
|
-
interface SelectorToken {
|
|
2
|
-
type: "tag" | "class" | "id" | "attribute";
|
|
3
|
-
value: string;
|
|
4
|
-
attributeName?: string;
|
|
5
|
-
attributeValue?: string;
|
|
6
|
-
}
|
|
7
|
-
|
|
8
|
-
interface SelectorGroup {
|
|
9
|
-
tokens: SelectorToken[];
|
|
10
|
-
}
|
|
11
|
-
|
|
12
|
-
function parseSelector(selector: string): SelectorGroup[] {
|
|
13
|
-
const parts = selector.trim().split(/\s+/);
|
|
14
|
-
|
|
15
|
-
return parts.map((part) => {
|
|
16
|
-
const trimmed = part.trim();
|
|
17
|
-
let tokens: SelectorToken[] = [];
|
|
18
|
-
|
|
19
|
-
// Handle universal selector
|
|
20
|
-
if (trimmed === '*') {
|
|
21
|
-
// Match any element - we'll handle this specially
|
|
22
|
-
return { tokens: [] };
|
|
23
|
-
}
|
|
24
|
-
|
|
25
|
-
// Parse complex selectors like p#intro.first or .foo.bar.baz
|
|
26
|
-
let remaining = trimmed;
|
|
27
|
-
|
|
28
|
-
// Extract tag name first if present
|
|
29
|
-
const tagMatch = remaining.match(/^([a-zA-Z][a-zA-Z0-9-]*)/);
|
|
30
|
-
if (tagMatch) {
|
|
31
|
-
tokens.push({ type: "tag", value: tagMatch[1].toLowerCase() });
|
|
32
|
-
remaining = remaining.slice(tagMatch[1].length);
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
// Extract all IDs (HTML5 allows IDs starting with digits)
|
|
36
|
-
const idMatches = remaining.matchAll(/#([a-zA-Z0-9][a-zA-Z0-9_-]*)/g);
|
|
37
|
-
for (const match of idMatches) {
|
|
38
|
-
tokens.push({ type: "id", value: match[1] });
|
|
39
|
-
}
|
|
40
|
-
remaining = remaining.replace(/#[a-zA-Z0-9][a-zA-Z0-9_-]*/g, '');
|
|
41
|
-
|
|
42
|
-
// Extract all classes
|
|
43
|
-
const classMatches = remaining.matchAll(/\.([a-zA-Z][a-zA-Z0-9_-]*)/g);
|
|
44
|
-
for (const match of classMatches) {
|
|
45
|
-
tokens.push({ type: "class", value: match[1] });
|
|
46
|
-
}
|
|
47
|
-
remaining = remaining.replace(/\.[a-zA-Z][a-zA-Z0-9_-]*/g, '');
|
|
48
|
-
|
|
49
|
-
// Extract attributes
|
|
50
|
-
const attrMatches = remaining.matchAll(/\[([^=\]]+)(?:=["']?([^"'\]]*?)["']?)?\]/g);
|
|
51
|
-
for (const match of attrMatches) {
|
|
52
|
-
tokens.push({
|
|
53
|
-
type: "attribute",
|
|
54
|
-
value: match[1].trim(),
|
|
55
|
-
attributeName: match[1].trim(),
|
|
56
|
-
attributeValue: match[2] ? match[2].trim() : undefined
|
|
57
|
-
});
|
|
58
|
-
}
|
|
59
|
-
|
|
60
|
-
return { tokens };
|
|
61
|
-
});
|
|
62
|
-
}
|
|
63
|
-
|
|
64
|
-
function matchesToken(element: any, token: SelectorToken): boolean {
|
|
65
|
-
if (!element || !element.tagName) {
|
|
66
|
-
return false;
|
|
67
|
-
}
|
|
68
|
-
|
|
69
|
-
switch (token.type) {
|
|
70
|
-
case "tag":
|
|
71
|
-
return element.tagName.toLowerCase() === token.value;
|
|
72
|
-
case "class":
|
|
73
|
-
const classAttr =
|
|
74
|
-
element.attributes?.class || element.attributes?.className || "";
|
|
75
|
-
const classes = classAttr.split(/\s+/).filter(Boolean);
|
|
76
|
-
return classes.includes(token.value);
|
|
77
|
-
case "id":
|
|
78
|
-
return element.attributes?.id === token.value;
|
|
79
|
-
case "attribute":
|
|
80
|
-
const attrValue = element.attributes?.[token.attributeName || ""];
|
|
81
|
-
if (token.attributeValue === undefined) {
|
|
82
|
-
return attrValue !== undefined;
|
|
83
|
-
}
|
|
84
|
-
return attrValue === token.attributeValue;
|
|
85
|
-
default:
|
|
86
|
-
return false;
|
|
87
|
-
}
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
function matchesSelector(element: any, tokens: SelectorToken[]): boolean {
|
|
91
|
-
// Universal selector - matches any element
|
|
92
|
-
if (tokens.length === 0) {
|
|
93
|
-
return true;
|
|
94
|
-
}
|
|
95
|
-
return tokens.every((token) => matchesToken(element, token));
|
|
96
|
-
}
|
|
97
|
-
|
|
98
|
-
function findElementsDescendant(
|
|
99
|
-
node: any,
|
|
100
|
-
selectorGroups: SelectorGroup[],
|
|
101
|
-
groupIndex: number,
|
|
102
|
-
results: any[]
|
|
103
|
-
): void {
|
|
104
|
-
if (groupIndex >= selectorGroups.length) {
|
|
105
|
-
return;
|
|
106
|
-
}
|
|
107
|
-
|
|
108
|
-
const currentGroup = selectorGroups[groupIndex];
|
|
109
|
-
if (!currentGroup) {
|
|
110
|
-
return;
|
|
111
|
-
}
|
|
112
|
-
|
|
113
|
-
const isLastGroup = groupIndex === selectorGroups.length - 1;
|
|
114
|
-
|
|
115
|
-
for (const child of node.childNodes || []) {
|
|
116
|
-
if (child.nodeType === 1) {
|
|
117
|
-
const element = child;
|
|
118
|
-
|
|
119
|
-
if (matchesSelector(element, currentGroup.tokens)) {
|
|
120
|
-
if (isLastGroup) {
|
|
121
|
-
results.push(element);
|
|
122
|
-
} else {
|
|
123
|
-
findElementsDescendant(
|
|
124
|
-
element,
|
|
125
|
-
selectorGroups,
|
|
126
|
-
groupIndex + 1,
|
|
127
|
-
results
|
|
128
|
-
);
|
|
129
|
-
}
|
|
130
|
-
}
|
|
131
|
-
}
|
|
132
|
-
|
|
133
|
-
const shouldContinueSearching =
|
|
134
|
-
!isLastGroup ||
|
|
135
|
-
child.nodeType !== 1 ||
|
|
136
|
-
!matchesSelector(child, currentGroup.tokens);
|
|
137
|
-
if (shouldContinueSearching) {
|
|
138
|
-
findElementsDescendant(child, selectorGroups, groupIndex, results);
|
|
139
|
-
}
|
|
140
|
-
}
|
|
141
|
-
}
|
|
142
|
-
|
|
143
|
-
function findElements(
|
|
144
|
-
node: any,
|
|
145
|
-
selectorGroups: SelectorGroup[],
|
|
146
|
-
results: any[]
|
|
147
|
-
): void {
|
|
148
|
-
if (selectorGroups.length === 1) {
|
|
149
|
-
const firstGroup = selectorGroups[0];
|
|
150
|
-
if (firstGroup) {
|
|
151
|
-
const tokens = firstGroup.tokens;
|
|
152
|
-
findElementsSimple(node, tokens, results);
|
|
153
|
-
}
|
|
154
|
-
} else {
|
|
155
|
-
findElementsDescendant(node, selectorGroups, 0, results);
|
|
156
|
-
}
|
|
157
|
-
}
|
|
158
|
-
|
|
159
|
-
function findElementsSimple(
|
|
160
|
-
node: any,
|
|
161
|
-
tokens: SelectorToken[],
|
|
162
|
-
results: any[]
|
|
163
|
-
): void {
|
|
164
|
-
if (node.nodeType === 1) {
|
|
165
|
-
const element = node;
|
|
166
|
-
if (matchesSelector(element, tokens)) {
|
|
167
|
-
results.push(element);
|
|
168
|
-
}
|
|
169
|
-
}
|
|
170
|
-
for (const child of node.childNodes || []) {
|
|
171
|
-
findElementsSimple(child, tokens, results);
|
|
172
|
-
}
|
|
173
|
-
}
|
|
174
|
-
|
|
175
|
-
export function querySelectorAll(root: any, selector: string): any[] {
|
|
176
|
-
const selectorGroups = parseSelector(selector);
|
|
177
|
-
const results: any[] = [];
|
|
178
|
-
findElements(root, selectorGroups, results);
|
|
179
|
-
return results;
|
|
180
|
-
}
|
|
181
|
-
|
|
182
|
-
export function querySelector(root: any, selector: string): any | null {
|
|
183
|
-
const results = querySelectorAll(root, selector);
|
|
184
|
-
return results[0] || null;
|
|
185
|
-
}
|
package/src/encoding.ts
DELETED
|
@@ -1,39 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Detects the character encoding of an HTML document.
|
|
3
|
-
* Based on HTML5 specification for encoding detection.
|
|
4
|
-
*/
|
|
5
|
-
|
|
6
|
-
const encodingAliases: Record<string, string> = {
|
|
7
|
-
'iso-8859-1': 'windows-1252',
|
|
8
|
-
'iso8859-1': 'windows-1252',
|
|
9
|
-
'iso-8859-2': 'iso-8859-2',
|
|
10
|
-
'iso8859-2': 'iso-8859-2',
|
|
11
|
-
'utf-8': 'utf-8',
|
|
12
|
-
'utf8': 'utf-8',
|
|
13
|
-
// Add more as needed
|
|
14
|
-
};
|
|
15
|
-
|
|
16
|
-
function normalizeEncoding(name: string): string | null {
|
|
17
|
-
const lower = name.toLowerCase().replace(/[^a-z0-9-]/g, '');
|
|
18
|
-
return encodingAliases[lower] || lower;
|
|
19
|
-
}
|
|
20
|
-
|
|
21
|
-
export function detectEncoding(html: string): string | null {
|
|
22
|
-
// Limit to first 1024 characters for performance
|
|
23
|
-
const prefix = html.substring(0, 1024);
|
|
24
|
-
|
|
25
|
-
// Look for <meta charset="...">
|
|
26
|
-
const charsetMatch = prefix.match(/<meta[^>]*charset\s*=\s*["']?([^"'\s>]+)["']?/i);
|
|
27
|
-
if (charsetMatch) {
|
|
28
|
-
return normalizeEncoding(charsetMatch[1]);
|
|
29
|
-
}
|
|
30
|
-
|
|
31
|
-
// Look for <meta http-equiv="Content-Type" content="text/html; charset=...">
|
|
32
|
-
const contentTypeMatch = prefix.match(/<meta[^>]*http-equiv\s*=\s*["']?\s*content-type\s*["']?[^>]*content\s*=\s*["']?\s*text\/html;\s*charset\s*=\s*([^"'\s>]+)["']?/i);
|
|
33
|
-
if (contentTypeMatch) {
|
|
34
|
-
return normalizeEncoding(contentTypeMatch[1]);
|
|
35
|
-
}
|
|
36
|
-
|
|
37
|
-
// Default to Windows-1252 if no encoding found (as per HTML5 spec)
|
|
38
|
-
return 'windows-1252';
|
|
39
|
-
}
|