@tkeron/html-parser 1.1.1 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm_deploy.yml +14 -4
- package/README.md +6 -6
- package/bun.lock +6 -8
- package/check-versions.ts +147 -0
- package/index.ts +4 -8
- package/package.json +5 -6
- package/src/dom-simulator/append-child.ts +130 -0
- package/src/dom-simulator/append.ts +18 -0
- package/src/dom-simulator/attributes.ts +23 -0
- package/src/dom-simulator/clone-node.ts +51 -0
- package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
- package/src/dom-simulator/create-cdata.ts +18 -0
- package/src/dom-simulator/create-comment.ts +23 -0
- package/src/dom-simulator/create-doctype.ts +24 -0
- package/src/dom-simulator/create-document.ts +81 -0
- package/src/dom-simulator/create-element.ts +195 -0
- package/src/dom-simulator/create-processing-instruction.ts +19 -0
- package/src/dom-simulator/create-temp-parent.ts +9 -0
- package/src/dom-simulator/create-text-node.ts +23 -0
- package/src/dom-simulator/escape-text-content.ts +6 -0
- package/src/dom-simulator/find-special-elements.ts +14 -0
- package/src/dom-simulator/get-text-content.ts +18 -0
- package/src/dom-simulator/index.ts +36 -0
- package/src/dom-simulator/inner-outer-html.ts +182 -0
- package/src/dom-simulator/insert-after.ts +20 -0
- package/src/dom-simulator/insert-before.ts +108 -0
- package/src/dom-simulator/matches.ts +26 -0
- package/src/dom-simulator/node-types.ts +26 -0
- package/src/dom-simulator/prepend.ts +24 -0
- package/src/dom-simulator/remove-child.ts +68 -0
- package/src/dom-simulator/remove.ts +7 -0
- package/src/dom-simulator/replace-child.ts +152 -0
- package/src/dom-simulator/set-text-content.ts +33 -0
- package/src/dom-simulator/update-element-content.ts +56 -0
- package/src/dom-simulator.ts +12 -1126
- package/src/encoding/constants.ts +8 -0
- package/src/encoding/detect-encoding.ts +21 -0
- package/src/encoding/index.ts +1 -0
- package/src/encoding/normalize-encoding.ts +6 -0
- package/src/html-entities.ts +2127 -0
- package/src/index.ts +5 -5
- package/src/parser/adoption-agency-helpers.ts +145 -0
- package/src/parser/constants.ts +137 -0
- package/src/parser/dom-to-ast.ts +79 -0
- package/src/parser/index.ts +9 -0
- package/src/parser/parse.ts +772 -0
- package/src/parser/types.ts +56 -0
- package/src/selectors/find-elements-descendant.ts +47 -0
- package/src/selectors/index.ts +2 -0
- package/src/selectors/matches-selector.ts +12 -0
- package/src/selectors/matches-token.ts +27 -0
- package/src/selectors/parse-selector.ts +48 -0
- package/src/selectors/query-selector-all.ts +43 -0
- package/src/selectors/query-selector.ts +6 -0
- package/src/selectors/types.ts +10 -0
- package/src/serializer/attributes.ts +74 -0
- package/src/serializer/escape.ts +13 -0
- package/src/serializer/index.ts +1 -0
- package/src/serializer/serialize-tokens.ts +511 -0
- package/src/tokenizer/calculate-position.ts +10 -0
- package/src/tokenizer/constants.ts +11 -0
- package/src/tokenizer/decode-entities.ts +64 -0
- package/src/tokenizer/index.ts +2 -0
- package/src/tokenizer/parse-attributes.ts +74 -0
- package/src/tokenizer/tokenize.ts +165 -0
- package/src/tokenizer/types.ts +25 -0
- package/tests/adoption-agency-helpers.test.ts +304 -0
- package/tests/advanced.test.ts +242 -221
- package/tests/cloneNode.test.ts +19 -66
- package/tests/custom-elements-head.test.ts +54 -55
- package/tests/dom-extended.test.ts +77 -64
- package/tests/dom-manipulation.test.ts +51 -24
- package/tests/dom.test.ts +15 -13
- package/tests/edge-cases.test.ts +300 -174
- package/tests/encoding/detect-encoding.test.ts +33 -0
- package/tests/google-dom.test.ts +2 -2
- package/tests/helpers/tokenizer-adapter.test.ts +29 -43
- package/tests/helpers/tokenizer-adapter.ts +36 -33
- package/tests/helpers/tree-adapter.test.ts +20 -20
- package/tests/helpers/tree-adapter.ts +34 -24
- package/tests/html-entities-text.test.ts +71 -0
- package/tests/innerhtml-void-elements.test.ts +52 -36
- package/tests/outerHTML-replacement.test.ts +37 -65
- package/tests/parser/dom-to-ast.test.ts +109 -0
- package/tests/parser/parse.test.ts +139 -0
- package/tests/parser.test.ts +281 -217
- package/tests/selectors/query-selector-all.test.ts +39 -0
- package/tests/selectors/query-selector.test.ts +42 -0
- package/tests/serializer/attributes.test.ts +132 -0
- package/tests/serializer/escape.test.ts +51 -0
- package/tests/serializer/serialize-tokens.test.ts +80 -0
- package/tests/serializer-core.test.ts +6 -6
- package/tests/serializer-injectmeta.test.ts +6 -6
- package/tests/serializer-optionaltags.test.ts +9 -6
- package/tests/serializer-options.test.ts +6 -6
- package/tests/serializer-whitespace.test.ts +6 -6
- package/tests/tokenizer/calculate-position.test.ts +34 -0
- package/tests/tokenizer/decode-entities.test.ts +31 -0
- package/tests/tokenizer/parse-attributes.test.ts +44 -0
- package/tests/tokenizer/tokenize.test.ts +757 -0
- package/tests/tokenizer-namedEntities.test.ts +10 -7
- package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
- package/tests/tokenizer.test.ts +268 -256
- package/tests/tree-construction-adoption01.test.ts +25 -16
- package/tests/tree-construction-adoption02.test.ts +30 -19
- package/tests/tree-construction-domjs-unsafe.test.ts +7 -5
- package/tests/tree-construction-entities02.test.ts +18 -16
- package/tests/tree-construction-html5test-com.test.ts +16 -10
- package/tests/tree-construction-math.test.ts +11 -9
- package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
- package/tests/tree-construction-noscript01.test.ts +11 -9
- package/tests/tree-construction-ruby.test.ts +6 -4
- package/tests/tree-construction-scriptdata01.test.ts +6 -4
- package/tests/tree-construction-svg.test.ts +6 -4
- package/tests/tree-construction-template.test.ts +6 -4
- package/tests/tree-construction-tests10.test.ts +6 -4
- package/tests/tree-construction-tests11.test.ts +6 -4
- package/tests/tree-construction-tests20.test.ts +7 -4
- package/tests/tree-construction-tests21.test.ts +7 -4
- package/tests/tree-construction-tests23.test.ts +7 -4
- package/tests/tree-construction-tests24.test.ts +7 -4
- package/tests/tree-construction-tests5.test.ts +6 -5
- package/tests/tree-construction-tests6.test.ts +6 -5
- package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
- package/tests/void-elements.test.ts +85 -40
- package/tsconfig.json +1 -1
- package/src/css-selector.ts +0 -185
- package/src/encoding.ts +0 -39
- package/src/parser.ts +0 -682
- package/src/serializer.ts +0 -450
- package/src/tokenizer.ts +0 -325
- package/tests/selectors.test.ts +0 -128
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { expect, it } from "bun:test";
|
|
2
|
+
import { detectEncoding } from "../../src/encoding/index.ts";
|
|
3
|
+
|
|
4
|
+
it("should detect charset from meta tag", () => {
|
|
5
|
+
const html = '<html><head><meta charset="utf-8"></head></html>';
|
|
6
|
+
expect(detectEncoding(html)).toBe("utf-8");
|
|
7
|
+
});
|
|
8
|
+
|
|
9
|
+
it("should detect charset from meta tag with single quotes", () => {
|
|
10
|
+
const html = "<html><head><meta charset='iso-8859-1'></head></html>";
|
|
11
|
+
expect(detectEncoding(html)).toBe("windows-1252");
|
|
12
|
+
});
|
|
13
|
+
|
|
14
|
+
it("should detect charset from content-type meta", () => {
|
|
15
|
+
const html =
|
|
16
|
+
'<html><head><meta http-equiv="content-type" content="text/html; charset=utf-8"></head></html>';
|
|
17
|
+
expect(detectEncoding(html)).toBe("utf-8");
|
|
18
|
+
});
|
|
19
|
+
|
|
20
|
+
it("should return windows-1252 as default", () => {
|
|
21
|
+
const html = "<html><head></head></html>";
|
|
22
|
+
expect(detectEncoding(html)).toBe("windows-1252");
|
|
23
|
+
});
|
|
24
|
+
|
|
25
|
+
it("should normalize encoding aliases", () => {
|
|
26
|
+
const html = '<html><head><meta charset="UTF-8"></head></html>';
|
|
27
|
+
expect(detectEncoding(html)).toBe("utf-8");
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
it("should handle case insensitive charset", () => {
|
|
31
|
+
const html = '<html><head><meta CHARSET="utf-8"></head></html>';
|
|
32
|
+
expect(detectEncoding(html)).toBe("utf-8");
|
|
33
|
+
});
|
package/tests/google-dom.test.ts
CHANGED
|
@@ -100,8 +100,8 @@ describe("Google DOM Parsing Test", () => {
|
|
|
100
100
|
for (let i = 0; i < Math.min(navLinks.length, 5); i++) {
|
|
101
101
|
const link = navLinks[i];
|
|
102
102
|
if (link) {
|
|
103
|
-
|
|
104
|
-
|
|
103
|
+
link.getAttribute("href");
|
|
104
|
+
link.textContent?.trim();
|
|
105
105
|
}
|
|
106
106
|
}
|
|
107
107
|
}
|
|
@@ -1,70 +1,56 @@
|
|
|
1
|
-
import { tokenize } from
|
|
2
|
-
import { adaptTokens
|
|
1
|
+
import { tokenize } from "../../src/tokenizer/index.js";
|
|
2
|
+
import { adaptTokens } from "./tokenizer-adapter.ts";
|
|
3
3
|
|
|
4
|
-
describe(
|
|
5
|
-
it(
|
|
6
|
-
const tokens = tokenize(
|
|
4
|
+
describe("Tokenizer Adapter Tests", () => {
|
|
5
|
+
it("should adapt simple start tag", () => {
|
|
6
|
+
const tokens = tokenize("<div>");
|
|
7
7
|
const adapted = adaptTokens(tokens);
|
|
8
|
-
expect(adapted).toEqual([
|
|
9
|
-
['StartTag', 'div', {}]
|
|
10
|
-
]);
|
|
8
|
+
expect(adapted).toEqual([["StartTag", "div", {}]]);
|
|
11
9
|
});
|
|
12
10
|
|
|
13
|
-
it(
|
|
11
|
+
it("should adapt start tag with attributes", () => {
|
|
14
12
|
const tokens = tokenize('<div class="foo" id="bar">');
|
|
15
13
|
const adapted = adaptTokens(tokens);
|
|
16
|
-
expect(adapted).toEqual([
|
|
17
|
-
['StartTag', 'div', { class: 'foo', id: 'bar' }]
|
|
18
|
-
]);
|
|
14
|
+
expect(adapted).toEqual([["StartTag", "div", { class: "foo", id: "bar" }]]);
|
|
19
15
|
});
|
|
20
16
|
|
|
21
|
-
it(
|
|
22
|
-
const tokens = tokenize(
|
|
17
|
+
it("should adapt self-closing tag", () => {
|
|
18
|
+
const tokens = tokenize("<br/>");
|
|
23
19
|
const adapted = adaptTokens(tokens);
|
|
24
|
-
expect(adapted).toEqual([
|
|
25
|
-
['StartTag', 'br', {}, true]
|
|
26
|
-
]);
|
|
20
|
+
expect(adapted).toEqual([["StartTag", "br", {}, true]]);
|
|
27
21
|
});
|
|
28
22
|
|
|
29
|
-
it(
|
|
30
|
-
const tokens = tokenize(
|
|
23
|
+
it("should adapt end tag", () => {
|
|
24
|
+
const tokens = tokenize("</div>");
|
|
31
25
|
const adapted = adaptTokens(tokens);
|
|
32
|
-
expect(adapted).toEqual([
|
|
33
|
-
['EndTag', 'div']
|
|
34
|
-
]);
|
|
26
|
+
expect(adapted).toEqual([["EndTag", "div"]]);
|
|
35
27
|
});
|
|
36
28
|
|
|
37
|
-
it(
|
|
38
|
-
const tokens = tokenize(
|
|
29
|
+
it("should adapt text", () => {
|
|
30
|
+
const tokens = tokenize("hello world");
|
|
39
31
|
const adapted = adaptTokens(tokens);
|
|
40
|
-
expect(adapted).toEqual([
|
|
41
|
-
['Character', 'hello world']
|
|
42
|
-
]);
|
|
32
|
+
expect(adapted).toEqual([["Character", "hello world"]]);
|
|
43
33
|
});
|
|
44
34
|
|
|
45
|
-
it(
|
|
46
|
-
const tokens = tokenize(
|
|
35
|
+
it("should adapt comment", () => {
|
|
36
|
+
const tokens = tokenize("<!-- comment -->");
|
|
47
37
|
const adapted = adaptTokens(tokens);
|
|
48
|
-
expect(adapted).toEqual([
|
|
49
|
-
['Comment', ' comment ']
|
|
50
|
-
]);
|
|
38
|
+
expect(adapted).toEqual([["Comment", " comment "]]);
|
|
51
39
|
});
|
|
52
40
|
|
|
53
|
-
it(
|
|
54
|
-
const tokens = tokenize(
|
|
41
|
+
it("should adapt DOCTYPE", () => {
|
|
42
|
+
const tokens = tokenize("<!DOCTYPE html>");
|
|
55
43
|
const adapted = adaptTokens(tokens);
|
|
56
|
-
expect(adapted).toEqual([
|
|
57
|
-
['DOCTYPE', 'html', null, null, true]
|
|
58
|
-
]);
|
|
44
|
+
expect(adapted).toEqual([["DOCTYPE", "html", null, null, true]]);
|
|
59
45
|
});
|
|
60
46
|
|
|
61
|
-
it(
|
|
62
|
-
const tokens = tokenize(
|
|
47
|
+
it("should adapt mixed content", () => {
|
|
48
|
+
const tokens = tokenize("<div>hello</div>");
|
|
63
49
|
const adapted = adaptTokens(tokens);
|
|
64
50
|
expect(adapted).toEqual([
|
|
65
|
-
[
|
|
66
|
-
[
|
|
67
|
-
[
|
|
51
|
+
["StartTag", "div", {}],
|
|
52
|
+
["Character", "hello"],
|
|
53
|
+
["EndTag", "div"],
|
|
68
54
|
]);
|
|
69
55
|
});
|
|
70
|
-
});
|
|
56
|
+
});
|
|
@@ -1,65 +1,68 @@
|
|
|
1
1
|
// tests/helpers/tokenizer-adapter.ts
|
|
2
2
|
|
|
3
|
-
import type { Token } from
|
|
3
|
+
import type { Token } from "../../src/tokenizer/index.js";
|
|
4
4
|
|
|
5
|
-
export type Html5libToken =
|
|
6
|
-
| [
|
|
7
|
-
| [
|
|
8
|
-
| [
|
|
9
|
-
| [
|
|
10
|
-
| [
|
|
11
|
-
| [
|
|
5
|
+
export type Html5libToken =
|
|
6
|
+
| ["StartTag", string, Record<string, string>]
|
|
7
|
+
| ["StartTag", string, Record<string, string>, boolean] // con self-closing flag
|
|
8
|
+
| ["EndTag", string]
|
|
9
|
+
| ["Character", string]
|
|
10
|
+
| ["Comment", string]
|
|
11
|
+
| ["DOCTYPE", string, string | null, string | null, boolean];
|
|
12
12
|
|
|
13
13
|
export function adaptTokens(tokens: Token[]): Html5libToken[] {
|
|
14
14
|
const result: Html5libToken[] = [];
|
|
15
|
-
|
|
15
|
+
|
|
16
16
|
for (const token of tokens) {
|
|
17
|
-
if (token.type ===
|
|
18
|
-
|
|
17
|
+
if (token.type === "EOF") continue;
|
|
18
|
+
|
|
19
19
|
switch (token.type) {
|
|
20
|
-
case
|
|
20
|
+
case "TAG_OPEN":
|
|
21
21
|
if (token.isClosing) {
|
|
22
|
-
result.push([
|
|
22
|
+
result.push(["EndTag", token.value]);
|
|
23
23
|
} else {
|
|
24
24
|
const attrs = token.attributes || {};
|
|
25
25
|
if (token.isSelfClosing) {
|
|
26
|
-
result.push([
|
|
26
|
+
result.push(["StartTag", token.value, attrs, true]);
|
|
27
27
|
} else {
|
|
28
|
-
result.push([
|
|
28
|
+
result.push(["StartTag", token.value, attrs]);
|
|
29
29
|
}
|
|
30
30
|
}
|
|
31
31
|
break;
|
|
32
|
-
|
|
33
|
-
case
|
|
34
|
-
result.push([
|
|
32
|
+
|
|
33
|
+
case "TAG_CLOSE":
|
|
34
|
+
result.push(["EndTag", token.value]);
|
|
35
35
|
break;
|
|
36
|
-
|
|
37
|
-
case
|
|
38
|
-
result.push([
|
|
36
|
+
|
|
37
|
+
case "TEXT":
|
|
38
|
+
result.push(["Character", token.value]);
|
|
39
39
|
break;
|
|
40
|
-
|
|
41
|
-
case
|
|
42
|
-
result.push([
|
|
40
|
+
|
|
41
|
+
case "COMMENT":
|
|
42
|
+
result.push(["Comment", token.value]);
|
|
43
43
|
break;
|
|
44
|
-
|
|
45
|
-
case
|
|
44
|
+
|
|
45
|
+
case "DOCTYPE":
|
|
46
46
|
// Parsear DOCTYPE para extraer name, publicId, systemId
|
|
47
|
-
result.push([
|
|
47
|
+
result.push(["DOCTYPE", token.value, null, null, true]);
|
|
48
48
|
break;
|
|
49
|
-
|
|
50
|
-
case
|
|
51
|
-
result.push([
|
|
49
|
+
|
|
50
|
+
case "CDATA":
|
|
51
|
+
result.push(["Character", token.value]);
|
|
52
52
|
break;
|
|
53
53
|
}
|
|
54
54
|
}
|
|
55
|
-
|
|
55
|
+
|
|
56
56
|
return result;
|
|
57
57
|
}
|
|
58
58
|
|
|
59
59
|
// Función para comparar tokens, manejando casos especiales
|
|
60
|
-
export function compareTokens(
|
|
60
|
+
export function compareTokens(
|
|
61
|
+
actual: Html5libToken[],
|
|
62
|
+
expected: any[],
|
|
63
|
+
): boolean {
|
|
61
64
|
// Implementar comparación flexible
|
|
62
65
|
// - Coalescer Characters consecutivos
|
|
63
66
|
// - Ignorar diferencias de whitespace en algunos casos
|
|
64
67
|
return JSON.stringify(actual) === JSON.stringify(expected);
|
|
65
|
-
}
|
|
68
|
+
}
|
|
@@ -1,39 +1,39 @@
|
|
|
1
|
-
import { parseHTML } from
|
|
2
|
-
import { serializeToHtml5lib } from
|
|
1
|
+
import { parseHTML } from "../../index.ts";
|
|
2
|
+
import { serializeToHtml5lib } from "./tree-adapter.ts";
|
|
3
3
|
|
|
4
|
-
describe(
|
|
5
|
-
it(
|
|
6
|
-
const doc = parseHTML(
|
|
4
|
+
describe("Tree Adapter Tests", () => {
|
|
5
|
+
it("should serialize simple element", () => {
|
|
6
|
+
const doc = parseHTML("<div></div>");
|
|
7
7
|
const serialized = serializeToHtml5lib(doc);
|
|
8
|
-
expect(serialized).toContain(
|
|
9
|
-
expect(serialized).toContain(
|
|
10
|
-
expect(serialized).toContain(
|
|
8
|
+
expect(serialized).toContain("| <html>");
|
|
9
|
+
expect(serialized).toContain("| <body>");
|
|
10
|
+
expect(serialized).toContain("| <div>");
|
|
11
11
|
});
|
|
12
12
|
|
|
13
|
-
it(
|
|
13
|
+
it("should serialize element with attributes", () => {
|
|
14
14
|
const doc = parseHTML('<div class="foo" id="bar"></div>');
|
|
15
15
|
const serialized = serializeToHtml5lib(doc);
|
|
16
|
-
expect(serialized).toContain(
|
|
16
|
+
expect(serialized).toContain("<div>");
|
|
17
17
|
expect(serialized).toContain('class="foo"');
|
|
18
18
|
expect(serialized).toContain('id="bar"');
|
|
19
19
|
});
|
|
20
20
|
|
|
21
|
-
it(
|
|
22
|
-
const doc = parseHTML(
|
|
21
|
+
it("should serialize text content", () => {
|
|
22
|
+
const doc = parseHTML("<div>hello</div>");
|
|
23
23
|
const serialized = serializeToHtml5lib(doc);
|
|
24
24
|
expect(serialized).toContain('"hello"');
|
|
25
25
|
});
|
|
26
26
|
|
|
27
|
-
it(
|
|
28
|
-
const doc = parseHTML(
|
|
27
|
+
it("should serialize comment", () => {
|
|
28
|
+
const doc = parseHTML("<div><!-- comment --></div>");
|
|
29
29
|
const serialized = serializeToHtml5lib(doc);
|
|
30
|
-
expect(serialized).toContain(
|
|
30
|
+
expect(serialized).toContain("<!-- comment -->");
|
|
31
31
|
});
|
|
32
32
|
|
|
33
|
-
it(
|
|
34
|
-
const doc = parseHTML(
|
|
33
|
+
it("should serialize DOCTYPE", () => {
|
|
34
|
+
const doc = parseHTML("<!DOCTYPE html><div></div>");
|
|
35
35
|
const serialized = serializeToHtml5lib(doc);
|
|
36
|
-
expect(serialized).toContain(
|
|
37
|
-
expect(serialized).toContain(
|
|
36
|
+
expect(serialized).toContain("<!DOCTYPE html>");
|
|
37
|
+
expect(serialized).toContain("<div>");
|
|
38
38
|
});
|
|
39
|
-
});
|
|
39
|
+
});
|
|
@@ -4,57 +4,67 @@ export interface SerializeOptions {
|
|
|
4
4
|
skipImplicitDoctype?: boolean;
|
|
5
5
|
}
|
|
6
6
|
|
|
7
|
-
export function serializeToHtml5lib(
|
|
7
|
+
export function serializeToHtml5lib(
|
|
8
|
+
doc: any,
|
|
9
|
+
options: SerializeOptions = {},
|
|
10
|
+
): string {
|
|
8
11
|
const lines: string[] = [];
|
|
9
12
|
|
|
10
13
|
function serialize(node: any, depth: number): void {
|
|
11
|
-
const indent =
|
|
14
|
+
const indent = "| " + " ".repeat(depth);
|
|
12
15
|
|
|
13
|
-
if (node.nodeType === 9) {
|
|
16
|
+
if (node.nodeType === 9) {
|
|
17
|
+
// DOCUMENT
|
|
14
18
|
for (const child of node.childNodes || []) {
|
|
15
19
|
serialize(child, depth);
|
|
16
20
|
}
|
|
17
|
-
} else if (node.nodeType === 1) {
|
|
21
|
+
} else if (node.nodeType === 1) {
|
|
22
|
+
// ELEMENT
|
|
18
23
|
const tagName = node.tagName.toLowerCase();
|
|
19
24
|
const ns = node.namespaceURI;
|
|
20
|
-
|
|
21
|
-
let nsPrefix =
|
|
22
|
-
if (ns ===
|
|
23
|
-
nsPrefix =
|
|
24
|
-
} else if (ns ===
|
|
25
|
-
nsPrefix =
|
|
25
|
+
|
|
26
|
+
let nsPrefix = "";
|
|
27
|
+
if (ns === "http://www.w3.org/2000/svg") {
|
|
28
|
+
nsPrefix = " svg";
|
|
29
|
+
} else if (ns === "http://www.w3.org/1998/Math/MathML") {
|
|
30
|
+
nsPrefix = " math";
|
|
26
31
|
}
|
|
27
|
-
|
|
32
|
+
|
|
28
33
|
lines.push(`${indent}<${tagName}${nsPrefix}>`);
|
|
29
|
-
|
|
34
|
+
|
|
30
35
|
// Atributos en orden alfabético
|
|
31
|
-
const attrs = Object.entries(node.attributes || {}).sort(([a], [b]) =>
|
|
36
|
+
const attrs = Object.entries(node.attributes || {}).sort(([a], [b]) =>
|
|
37
|
+
a.localeCompare(b),
|
|
38
|
+
);
|
|
32
39
|
for (const [name, value] of attrs) {
|
|
33
40
|
lines.push(`${indent} ${name}="${value}"`);
|
|
34
41
|
}
|
|
35
|
-
|
|
42
|
+
|
|
36
43
|
// Template special case
|
|
37
|
-
if (node.tagName.toLowerCase() ===
|
|
44
|
+
if (node.tagName.toLowerCase() === "template" && node.content) {
|
|
38
45
|
lines.push(`${indent} content`);
|
|
39
46
|
serialize(node.content, depth + 2);
|
|
40
47
|
}
|
|
41
|
-
|
|
48
|
+
|
|
42
49
|
// Children
|
|
43
50
|
for (const child of node.childNodes || []) {
|
|
44
51
|
serialize(child, depth + 1);
|
|
45
52
|
}
|
|
46
|
-
} else if (node.nodeType === 3) {
|
|
53
|
+
} else if (node.nodeType === 3) {
|
|
54
|
+
// TEXT
|
|
47
55
|
lines.push(`${indent}"${node.textContent}"`);
|
|
48
|
-
} else if (node.nodeType === 8) {
|
|
49
|
-
|
|
56
|
+
} else if (node.nodeType === 8) {
|
|
57
|
+
// COMMENT
|
|
58
|
+
const commentData = node.data || node.nodeValue || node.textContent || "";
|
|
50
59
|
lines.push(`${indent}<!-- ${commentData} -->`);
|
|
51
|
-
} else if (node.nodeType === 10) {
|
|
60
|
+
} else if (node.nodeType === 10) {
|
|
61
|
+
// DOCTYPE
|
|
52
62
|
if (!options.skipImplicitDoctype) {
|
|
53
|
-
lines.push(`${indent}<!DOCTYPE ${node.name ||
|
|
63
|
+
lines.push(`${indent}<!DOCTYPE ${node.name || "html"}>`);
|
|
54
64
|
}
|
|
55
65
|
}
|
|
56
66
|
}
|
|
57
|
-
|
|
67
|
+
|
|
58
68
|
serialize(doc, 0);
|
|
59
|
-
return lines.join(
|
|
60
|
-
}
|
|
69
|
+
return lines.join("\n") + "\n";
|
|
70
|
+
}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import { describe, it, expect } from "bun:test";
|
|
2
|
+
import { parseHTML } from "../src/index";
|
|
3
|
+
|
|
4
|
+
describe("HTML entities in text content", () => {
|
|
5
|
+
it("should preserve < and > entities when serializing innerHTML", () => {
|
|
6
|
+
const doc = parseHTML("<p>&lt;div&gt;</p>");
|
|
7
|
+
const p = doc.querySelector("p");
|
|
8
|
+
expect(p.innerHTML).toBe("&lt;div&gt;");
|
|
9
|
+
});
|
|
10
|
+
|
|
11
|
+
it("should preserve < and > in code elements", () => {
|
|
12
|
+
const doc = parseHTML(
|
|
13
|
+
"<code>&lt;script&gt;alert('xss')&lt;/script&gt;</code>",
|
|
14
|
+
);
|
|
15
|
+
const code = doc.querySelector("code");
|
|
16
|
+
expect(code.innerHTML).toBe("&lt;script&gt;alert('xss')&lt;/script&gt;");
|
|
17
|
+
});
|
|
18
|
+
|
|
19
|
+
it("should preserve & entity when serializing innerHTML", () => {
|
|
20
|
+
const doc = parseHTML("<span>foo &amp; bar</span>");
|
|
21
|
+
const span = doc.querySelector("span");
|
|
22
|
+
expect(span.innerHTML).toBe("foo &amp; bar");
|
|
23
|
+
});
|
|
24
|
+
|
|
25
|
+
it("should preserve mixed entities in text", () => {
|
|
26
|
+
const doc = parseHTML(
|
|
27
|
+
"<div>&lt;a href=&quot;test&quot;&gt;link&lt;/a&gt;</div>",
|
|
28
|
+
);
|
|
29
|
+
const div = doc.querySelector("div");
|
|
30
|
+
expect(div.innerHTML).toBe('&lt;a href="test"&gt;link&lt;/a&gt;');
|
|
31
|
+
});
|
|
32
|
+
|
|
33
|
+
it("should handle textContent correctly (decoded)", () => {
|
|
34
|
+
const doc = parseHTML("<p>&lt;div&gt;</p>");
|
|
35
|
+
const p = doc.querySelector("p");
|
|
36
|
+
expect(p.textContent).toBe("<div>");
|
|
37
|
+
});
|
|
38
|
+
|
|
39
|
+
it("should preserve entities in outerHTML", () => {
|
|
40
|
+
const doc = parseHTML("<p>&lt;test&gt;</p>");
|
|
41
|
+
const p = doc.querySelector("p");
|
|
42
|
+
expect(p.outerHTML).toBe("<p>&lt;test&gt;</p>");
|
|
43
|
+
});
|
|
44
|
+
|
|
45
|
+
it("should preserve entities in nested elements", () => {
|
|
46
|
+
const doc = parseHTML("<div><span>&lt;nested&gt;</span></div>");
|
|
47
|
+
const div = doc.querySelector("div");
|
|
48
|
+
expect(div.innerHTML).toBe("<span>&lt;nested&gt;</span>");
|
|
49
|
+
});
|
|
50
|
+
|
|
51
|
+
it("should handle multiple text nodes with entities", () => {
|
|
52
|
+
const doc = parseHTML("<p>&lt;first&gt; and &lt;second&gt;</p>");
|
|
53
|
+
const p = doc.querySelector("p");
|
|
54
|
+
expect(p.innerHTML).toBe("&lt;first&gt; and &lt;second&gt;");
|
|
55
|
+
});
|
|
56
|
+
|
|
57
|
+
it("should not double-escape already escaped content", () => {
|
|
58
|
+
const doc = parseHTML("<p>&amp;lt;</p>");
|
|
59
|
+
const p = doc.querySelector("p");
|
|
60
|
+
expect(p.textContent).toBe("&lt;");
|
|
61
|
+
expect(p.innerHTML).toBe("&amp;lt;");
|
|
62
|
+
});
|
|
63
|
+
|
|
64
|
+
it("should preserve entities after DOM manipulation", () => {
|
|
65
|
+
const doc = parseHTML("<div></div>");
|
|
66
|
+
const div = doc.querySelector("div");
|
|
67
|
+
const text = doc.createTextNode("<script>alert('xss')</script>");
|
|
68
|
+
div.appendChild(text);
|
|
69
|
+
expect(div.innerHTML).toBe("&lt;script&gt;alert('xss')&lt;/script&gt;");
|
|
70
|
+
});
|
|
71
|
+
});
|
|
@@ -2,50 +2,51 @@ import { describe, it, expect } from "bun:test";
|
|
|
2
2
|
import { parseHTML } from "../src/index";
|
|
3
3
|
|
|
4
4
|
describe("innerHTML with void elements", () => {
|
|
5
|
-
it(
|
|
6
|
-
const doc = parseHTML(
|
|
7
|
-
const element = doc.querySelector(
|
|
5
|
+
it("innerHTML should work with void elements", () => {
|
|
6
|
+
const doc = parseHTML("<custom></custom>");
|
|
7
|
+
const element = doc.querySelector("custom");
|
|
8
8
|
|
|
9
9
|
element!.innerHTML = '<meta name="test">';
|
|
10
10
|
expect(element!.innerHTML).toBe('<meta name="test">');
|
|
11
11
|
expect(element!.childNodes.length).toBe(1);
|
|
12
12
|
});
|
|
13
13
|
|
|
14
|
-
it(
|
|
15
|
-
const doc = parseHTML(
|
|
16
|
-
const element = doc.querySelector(
|
|
14
|
+
it("innerHTML should work with multiple void elements", () => {
|
|
15
|
+
const doc = parseHTML("<custom></custom>");
|
|
16
|
+
const element = doc.querySelector("custom");
|
|
17
17
|
|
|
18
18
|
element!.innerHTML = '<meta name="a"><link rel="b"><input type="c">';
|
|
19
19
|
expect(element!.childNodes.length).toBe(3);
|
|
20
20
|
});
|
|
21
21
|
|
|
22
|
-
it(
|
|
23
|
-
const doc = parseHTML(
|
|
24
|
-
const element = doc.querySelector(
|
|
22
|
+
it("innerHTML should work with mixed void and non-void elements", () => {
|
|
23
|
+
const doc = parseHTML("<custom></custom>");
|
|
24
|
+
const element = doc.querySelector("custom");
|
|
25
25
|
|
|
26
|
-
element!.innerHTML =
|
|
26
|
+
element!.innerHTML =
|
|
27
|
+
'<meta name="test"><div>Hello</div><br><span>World</span>';
|
|
27
28
|
expect(element!.childNodes.length).toBe(4);
|
|
28
|
-
expect(element!.children[0].tagName).toBe(
|
|
29
|
-
expect(element!.children[1].tagName).toBe(
|
|
30
|
-
expect(element!.children[2].tagName).toBe(
|
|
31
|
-
expect(element!.children[3].tagName).toBe(
|
|
29
|
+
expect(element!.children[0].tagName).toBe("META");
|
|
30
|
+
expect(element!.children[1].tagName).toBe("DIV");
|
|
31
|
+
expect(element!.children[2].tagName).toBe("BR");
|
|
32
|
+
expect(element!.children[3].tagName).toBe("SPAN");
|
|
32
33
|
});
|
|
33
34
|
|
|
34
|
-
it(
|
|
35
|
-
const doc = parseHTML(
|
|
36
|
-
const element = doc.querySelector(
|
|
35
|
+
it("innerHTML should work with void elements nested inside containers", () => {
|
|
36
|
+
const doc = parseHTML("<custom></custom>");
|
|
37
|
+
const element = doc.querySelector("custom");
|
|
37
38
|
|
|
38
39
|
element!.innerHTML = '<div><img src="test.jpg"><input type="text"></div>';
|
|
39
40
|
expect(element!.childNodes.length).toBe(1);
|
|
40
41
|
const div = element!.children[0];
|
|
41
42
|
expect(div.childNodes.length).toBe(2);
|
|
42
|
-
expect(div.children[0].tagName).toBe(
|
|
43
|
-
expect(div.children[1].tagName).toBe(
|
|
43
|
+
expect(div.children[0].tagName).toBe("IMG");
|
|
44
|
+
expect(div.children[1].tagName).toBe("INPUT");
|
|
44
45
|
});
|
|
45
46
|
|
|
46
|
-
it(
|
|
47
|
-
const doc = parseHTML(
|
|
48
|
-
const element = doc.querySelector(
|
|
47
|
+
it("innerHTML can be replaced multiple times with void elements", () => {
|
|
48
|
+
const doc = parseHTML("<custom></custom>");
|
|
49
|
+
const element = doc.querySelector("custom");
|
|
49
50
|
|
|
50
51
|
element!.innerHTML = '<meta name="first">';
|
|
51
52
|
expect(element!.childNodes.length).toBe(1);
|
|
@@ -53,17 +54,31 @@ describe("innerHTML with void elements", () => {
|
|
|
53
54
|
element!.innerHTML = '<link rel="second"><hr>';
|
|
54
55
|
expect(element!.childNodes.length).toBe(2);
|
|
55
56
|
|
|
56
|
-
element!.innerHTML =
|
|
57
|
+
element!.innerHTML = "";
|
|
57
58
|
expect(element!.childNodes.length).toBe(0);
|
|
58
59
|
});
|
|
59
60
|
|
|
60
|
-
it(
|
|
61
|
-
const doc = parseHTML(
|
|
62
|
-
const element = doc.querySelector(
|
|
61
|
+
it("innerHTML should work with all void element types", () => {
|
|
62
|
+
const doc = parseHTML("<custom></custom>");
|
|
63
|
+
const element = doc.querySelector("custom");
|
|
63
64
|
|
|
64
65
|
// Test all void elements
|
|
65
|
-
const voidElements = [
|
|
66
|
-
|
|
66
|
+
const voidElements = [
|
|
67
|
+
"area",
|
|
68
|
+
"base",
|
|
69
|
+
"br",
|
|
70
|
+
"col",
|
|
71
|
+
"embed",
|
|
72
|
+
"hr",
|
|
73
|
+
"img",
|
|
74
|
+
"input",
|
|
75
|
+
"link",
|
|
76
|
+
"meta",
|
|
77
|
+
"source",
|
|
78
|
+
"track",
|
|
79
|
+
"wbr",
|
|
80
|
+
];
|
|
81
|
+
|
|
67
82
|
for (const tag of voidElements) {
|
|
68
83
|
element!.innerHTML = `<${tag}>`;
|
|
69
84
|
expect(element!.childNodes.length).toBe(1);
|
|
@@ -71,14 +86,15 @@ describe("innerHTML with void elements", () => {
|
|
|
71
86
|
}
|
|
72
87
|
});
|
|
73
88
|
|
|
74
|
-
it(
|
|
75
|
-
const doc = parseHTML(
|
|
76
|
-
const element = doc.querySelector(
|
|
89
|
+
it("innerHTML with void elements preserves attributes", () => {
|
|
90
|
+
const doc = parseHTML("<custom></custom>");
|
|
91
|
+
const element = doc.querySelector("custom");
|
|
77
92
|
|
|
78
|
-
element!.innerHTML =
|
|
93
|
+
element!.innerHTML =
|
|
94
|
+
'<meta charset="utf-8" name="viewport" content="width=device-width">';
|
|
79
95
|
const meta = element!.children[0];
|
|
80
|
-
expect(meta.getAttribute(
|
|
81
|
-
expect(meta.getAttribute(
|
|
82
|
-
expect(meta.getAttribute(
|
|
96
|
+
expect(meta.getAttribute("charset")).toBe("utf-8");
|
|
97
|
+
expect(meta.getAttribute("name")).toBe("viewport");
|
|
98
|
+
expect(meta.getAttribute("content")).toBe("width=device-width");
|
|
83
99
|
});
|
|
84
|
-
});
|
|
100
|
+
});
|