@tkeron/html-parser 1.1.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm_deploy.yml +14 -4
- package/README.md +6 -6
- package/bun.lock +6 -8
- package/check-versions.ts +147 -0
- package/index.ts +4 -8
- package/package.json +5 -6
- package/src/dom-simulator/append-child.ts +130 -0
- package/src/dom-simulator/append.ts +18 -0
- package/src/dom-simulator/attributes.ts +23 -0
- package/src/dom-simulator/clone-node.ts +51 -0
- package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
- package/src/dom-simulator/create-cdata.ts +18 -0
- package/src/dom-simulator/create-comment.ts +23 -0
- package/src/dom-simulator/create-doctype.ts +24 -0
- package/src/dom-simulator/create-document.ts +81 -0
- package/src/dom-simulator/create-element.ts +195 -0
- package/src/dom-simulator/create-processing-instruction.ts +19 -0
- package/src/dom-simulator/create-temp-parent.ts +9 -0
- package/src/dom-simulator/create-text-node.ts +23 -0
- package/src/dom-simulator/escape-text-content.ts +6 -0
- package/src/dom-simulator/find-special-elements.ts +14 -0
- package/src/dom-simulator/get-text-content.ts +18 -0
- package/src/dom-simulator/index.ts +36 -0
- package/src/dom-simulator/inner-outer-html.ts +182 -0
- package/src/dom-simulator/insert-after.ts +20 -0
- package/src/dom-simulator/insert-before.ts +108 -0
- package/src/dom-simulator/matches.ts +26 -0
- package/src/dom-simulator/node-types.ts +26 -0
- package/src/dom-simulator/prepend.ts +24 -0
- package/src/dom-simulator/remove-child.ts +68 -0
- package/src/dom-simulator/remove.ts +7 -0
- package/src/dom-simulator/replace-child.ts +152 -0
- package/src/dom-simulator/set-text-content.ts +33 -0
- package/src/dom-simulator/update-element-content.ts +56 -0
- package/src/dom-simulator.ts +12 -1126
- package/src/encoding/constants.ts +8 -0
- package/src/encoding/detect-encoding.ts +21 -0
- package/src/encoding/index.ts +1 -0
- package/src/encoding/normalize-encoding.ts +6 -0
- package/src/html-entities.ts +2127 -0
- package/src/index.ts +5 -5
- package/src/parser/adoption-agency-helpers.ts +145 -0
- package/src/parser/constants.ts +137 -0
- package/src/parser/dom-to-ast.ts +79 -0
- package/src/parser/index.ts +9 -0
- package/src/parser/parse.ts +772 -0
- package/src/parser/types.ts +56 -0
- package/src/selectors/find-elements-descendant.ts +47 -0
- package/src/selectors/index.ts +2 -0
- package/src/selectors/matches-selector.ts +12 -0
- package/src/selectors/matches-token.ts +27 -0
- package/src/selectors/parse-selector.ts +48 -0
- package/src/selectors/query-selector-all.ts +43 -0
- package/src/selectors/query-selector.ts +6 -0
- package/src/selectors/types.ts +10 -0
- package/src/serializer/attributes.ts +74 -0
- package/src/serializer/escape.ts +13 -0
- package/src/serializer/index.ts +1 -0
- package/src/serializer/serialize-tokens.ts +511 -0
- package/src/tokenizer/calculate-position.ts +10 -0
- package/src/tokenizer/constants.ts +11 -0
- package/src/tokenizer/decode-entities.ts +64 -0
- package/src/tokenizer/index.ts +2 -0
- package/src/tokenizer/parse-attributes.ts +74 -0
- package/src/tokenizer/tokenize.ts +165 -0
- package/src/tokenizer/types.ts +25 -0
- package/tests/adoption-agency-helpers.test.ts +304 -0
- package/tests/advanced.test.ts +242 -221
- package/tests/cloneNode.test.ts +19 -66
- package/tests/custom-elements-head.test.ts +54 -55
- package/tests/dom-extended.test.ts +77 -64
- package/tests/dom-manipulation.test.ts +51 -24
- package/tests/dom.test.ts +15 -13
- package/tests/encoding/detect-encoding.test.ts +33 -0
- package/tests/google-dom.test.ts +2 -2
- package/tests/helpers/tokenizer-adapter.test.ts +29 -43
- package/tests/helpers/tokenizer-adapter.ts +36 -33
- package/tests/helpers/tree-adapter.test.ts +20 -20
- package/tests/helpers/tree-adapter.ts +34 -24
- package/tests/html-entities-text.test.ts +6 -2
- package/tests/innerhtml-void-elements.test.ts +52 -36
- package/tests/outerHTML-replacement.test.ts +37 -65
- package/tests/parser/dom-to-ast.test.ts +109 -0
- package/tests/parser/parse.test.ts +139 -0
- package/tests/parser.test.ts +281 -217
- package/tests/selectors/query-selector-all.test.ts +39 -0
- package/tests/selectors/query-selector.test.ts +42 -0
- package/tests/serializer/attributes.test.ts +132 -0
- package/tests/serializer/escape.test.ts +51 -0
- package/tests/serializer/serialize-tokens.test.ts +80 -0
- package/tests/serializer-core.test.ts +6 -6
- package/tests/serializer-injectmeta.test.ts +6 -6
- package/tests/serializer-optionaltags.test.ts +9 -6
- package/tests/serializer-options.test.ts +6 -6
- package/tests/serializer-whitespace.test.ts +6 -6
- package/tests/tokenizer/calculate-position.test.ts +34 -0
- package/tests/tokenizer/decode-entities.test.ts +31 -0
- package/tests/tokenizer/parse-attributes.test.ts +44 -0
- package/tests/tokenizer/tokenize.test.ts +757 -0
- package/tests/tokenizer-namedEntities.test.ts +10 -7
- package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
- package/tests/tokenizer.test.ts +268 -256
- package/tests/tree-construction-adoption01.test.ts +25 -16
- package/tests/tree-construction-adoption02.test.ts +30 -19
- package/tests/tree-construction-domjs-unsafe.test.ts +6 -4
- package/tests/tree-construction-entities02.test.ts +18 -16
- package/tests/tree-construction-html5test-com.test.ts +16 -10
- package/tests/tree-construction-math.test.ts +11 -9
- package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
- package/tests/tree-construction-noscript01.test.ts +11 -9
- package/tests/tree-construction-ruby.test.ts +6 -4
- package/tests/tree-construction-scriptdata01.test.ts +6 -4
- package/tests/tree-construction-svg.test.ts +6 -4
- package/tests/tree-construction-template.test.ts +6 -4
- package/tests/tree-construction-tests10.test.ts +6 -4
- package/tests/tree-construction-tests11.test.ts +6 -4
- package/tests/tree-construction-tests20.test.ts +7 -4
- package/tests/tree-construction-tests21.test.ts +7 -4
- package/tests/tree-construction-tests23.test.ts +7 -4
- package/tests/tree-construction-tests24.test.ts +7 -4
- package/tests/tree-construction-tests5.test.ts +6 -5
- package/tests/tree-construction-tests6.test.ts +6 -5
- package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
- package/tests/void-elements.test.ts +85 -40
- package/tsconfig.json +1 -1
- package/src/css-selector.ts +0 -185
- package/src/encoding.ts +0 -39
- package/src/parser.ts +0 -682
- package/src/serializer.ts +0 -450
- package/src/tokenizer.ts +0 -325
- package/tests/selectors.test.ts +0 -128
package/src/index.ts
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
import { tokenize } from
|
|
2
|
-
import { parse } from
|
|
1
|
+
import { tokenize } from "./tokenizer/index.js";
|
|
2
|
+
import { parse } from "./parser/index.js";
|
|
3
3
|
|
|
4
|
-
export
|
|
4
|
+
export const parseHTML = (html: string): any => {
|
|
5
5
|
const tokens = tokenize(html);
|
|
6
6
|
return parse(tokens);
|
|
7
|
-
}
|
|
7
|
+
};
|
|
8
8
|
|
|
9
|
-
export { parse } from
|
|
9
|
+
export { parse } from "./parser/index.js";
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
import { createElement } from "../dom-simulator/index.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Lower-case tag names that `isSpecialElement` reports as "special".
 *
 * NOTE(review): the WHATWG "special" category additionally names `keygen`,
 * `search`, and several MathML/SVG elements that are not listed here —
 * confirm the omission is intentional.
 */
const SPECIAL_ELEMENTS = new Set<string>([
  // a – b
  "address", "applet", "area", "article", "aside",
  "base", "basefont", "bgsound", "blockquote", "body", "br", "button",
  // c – d
  "caption", "center", "col", "colgroup",
  "dd", "details", "dir", "div", "dl", "dt",
  // e – f
  "embed",
  "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
  // h – i
  "h1", "h2", "h3", "h4", "h5", "h6",
  "head", "header", "hgroup", "hr", "html",
  "iframe", "img", "input",
  // l – o
  "li", "link", "listing",
  "main", "marquee", "menu", "meta",
  "nav", "noembed", "noframes", "noscript",
  "object", "ol",
  // p – s
  "p", "param", "plaintext", "pre",
  "script", "section", "select", "source", "style", "summary",
  // t – x
  "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead",
  "title", "tr", "track",
  "ul",
  "wbr",
  "xmp",
]);
|
|
86
|
+
|
|
87
|
+
/** Outcome of a successful stack search: the entry found and where it sits. */
type StackSearchResult = {
  /** Position of `element` inside the stack that was searched. */
  index: number;
  /** The stack entry that satisfied the search. */
  element: any;
};
|
|
91
|
+
|
|
92
|
+
/**
 * Walks `stack` from its last entry back to its first and returns the first
 * entry whose `tagName` equals `tagName`, compared case-insensitively.
 *
 * @param stack - Entries to search; entries with no truthy `tagName` are skipped.
 * @param tagName - Tag name to look for.
 * @returns The matching entry with its index in `stack`, or `null` if none matches.
 */
export const findFormattingElementInStack = (
  stack: any[],
  tagName: string,
): StackSearchResult | null => {
  const wanted = tagName.toLowerCase();
  let position = stack.length;
  while (position-- > 0) {
    const candidate = stack[position];
    if (!candidate.tagName) {
      continue;
    }
    if (candidate.tagName.toLowerCase() === wanted) {
      return { element: candidate, index: position };
    }
  }
  return null;
};
|
|
105
|
+
|
|
106
|
+
/**
 * Returns the first entry located after `formattingElementIndex` in `stack`
 * whose `tagName` is accepted by `isSpecialElement`.
 *
 * @param stack - Entries to search.
 * @param formattingElementIndex - Index of the entry after which the search begins.
 * @returns The matching entry with its index in `stack`, or `null` if none qualifies.
 */
export const findFurthestBlock = (
  stack: any[],
  formattingElementIndex: number,
): StackSearchResult | null => {
  let cursor = formattingElementIndex + 1;
  while (cursor < stack.length) {
    const candidate = stack[cursor];
    if (isSpecialElement(candidate.tagName)) {
      return { element: candidate, index: cursor };
    }
    cursor += 1;
  }
  return null;
};
|
|
118
|
+
|
|
119
|
+
/**
 * Returns the stack entry immediately before `formattingElementIndex`.
 *
 * @param stack - Stack to read from.
 * @param formattingElementIndex - Index of the entry whose predecessor is wanted.
 * @returns The entry at `formattingElementIndex - 1`, or `null` when the index
 *   is zero or negative, or when no entry exists at that position.
 */
export const getCommonAncestor = (
  stack: any[],
  formattingElementIndex: number,
): any | null => {
  if (formattingElementIndex <= 0) {
    return null;
  }
  // An index past the end of the stack would otherwise yield `undefined`;
  // normalise it so the only "no ancestor" value is `null`.
  return stack[formattingElementIndex - 1] ?? null;
};
|
|
128
|
+
|
|
129
|
+
/**
 * Reports whether `tagName`, lower-cased, is a member of `SPECIAL_ELEMENTS`.
 *
 * @param tagName - Tag name to test; a falsy value yields `false`.
 * @returns `true` when the lower-cased name is in the set, otherwise `false`.
 */
export const isSpecialElement = (tagName: string): boolean =>
  tagName ? SPECIAL_ELEMENTS.has(tagName.toLowerCase()) : false;
|
|
133
|
+
|
|
134
|
+
/**
 * Builds a new element via `createElement` using the lower-cased tag name of
 * `element` and a shallow copy of its `attributes` (an empty object when
 * `element.attributes` is falsy).
 *
 * @param element - Element to copy the tag name and attributes from.
 * @returns Whatever `createElement` returns for that tag name and attribute copy.
 */
export const cloneFormattingElement = (element: any): any => {
  let copiedAttributes: any;
  if (element.attributes) {
    copiedAttributes = { ...element.attributes };
  } else {
    copiedAttributes = {};
  }
  const lowerCasedTag = element.tagName.toLowerCase();
  return createElement(lowerCasedTag, copiedAttributes);
};
|
|
138
|
+
|
|
139
|
+
/**
 * Moves every entry of `source.childNodes` to the end of `target.childNodes`,
 * keeping their order and pointing each moved child's `parentNode` at `target`.
 *
 * When `source` and `target` are the same node nothing is moved; removing and
 * re-appending children of the same list would otherwise never terminate.
 *
 * NOTE(review): only `parentNode` and the two `childNodes` arrays are updated
 * here; if nodes carry sibling or first/last-child links, confirm callers
 * refresh them.
 *
 * @param source - Node whose `childNodes` array is emptied.
 * @param target - Node that receives the children.
 */
export const reparentChildren = (source: any, target: any): void => {
  if (source === target) {
    return;
  }
  // Detach all children in one step instead of repeatedly shifting the array.
  const moved = source.childNodes.splice(0, source.childNodes.length);
  for (const child of moved) {
    child.parentNode = target;
    target.childNodes.push(child);
  }
};
|
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
/** Lower-case tag names this module classifies as void elements. */
export const VOID_ELEMENTS = new Set<string>([
  "area", "base", "br", "col", "embed", "hr", "img",
  "input", "link", "meta", "param", "source", "track", "wbr",
]);
|
|
17
|
+
|
|
18
|
+
/**
 * Lower-case tag names this module groups as raw-text elements.
 *
 * NOTE(review): the HTML specification puts `textarea` and `title` in the
 * "escapable raw text" category rather than "raw text" — confirm consumers
 * of this set expect all four names.
 */
export const RAW_TEXT_ELEMENTS = new Set<string>(["script", "style", "textarea", "title"]);
|
|
24
|
+
|
|
25
|
+
/**
 * Maps a lower-case tag name to a list of lower-case tag names.
 *
 * NOTE(review): presumably the key is an opening tag and the list names the
 * open elements it implicitly closes — confirm against the parser that
 * consumes this table.
 */
export const AUTO_CLOSE_RULES: Record<string, string[]> = {
  li: ["li"],
  dt: ["dt", "dd"],
  dd: ["dt", "dd"],
  // Every tag below maps to its own fresh `["p"]` list.
  ...Object.fromEntries(
    [
      "address", "article", "aside", "blockquote", "center", "details",
      "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
      "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header",
      "hgroup", "hr", "listing", "main", "menu", "nav", "ol", "p", "pre",
      "section", "summary", "table", "ul",
    ].map((tag): [string, string[]] => [tag, ["p"]]),
  ),
  rt: ["rt", "rp"],
  rp: ["rt", "rp"],
  optgroup: ["optgroup"],
  option: ["option"],
  thead: ["tbody", "tfoot"],
  tbody: ["thead", "tbody", "tfoot"],
  tfoot: ["thead", "tbody"],
  tr: ["tr"],
  td: ["td", "th"],
  th: ["td", "th"],
};
|
|
75
|
+
|
|
76
|
+
/** Lower-case tag names this module classifies as formatting elements. */
export const FORMATTING_ELEMENTS = new Set<string>([
  "a", "b", "big", "code", "em", "font", "i",
  "nobr", "s", "small", "strike", "strong", "tt", "u",
]);
|
|
92
|
+
|
|
93
|
+
/**
 * Lower-case tag names grouped as table-scope elements: the table structure
 * tags plus `template` and `html`.
 */
export const TABLE_SCOPE_ELEMENTS = new Set<string>([
  "table", "tbody", "thead", "tfoot", "tr",
  "template", "html",
]);
|
|
102
|
+
|
|
103
|
+
/** Lower-case tag names grouped as table-context elements (table, its sections, and rows). */
export const TABLE_CONTEXT_ELEMENTS = new Set<string>(["table", "tbody", "thead", "tfoot", "tr"]);
|
|
110
|
+
|
|
111
|
+
/** Lower-case tag names listed as valid children of a `table` element. */
export const VALID_TABLE_CHILDREN = new Set<string>([
  "caption", "colgroup", "col",
  "tbody", "thead", "tfoot", "tr",
  "script", "template", "style",
]);
|
|
123
|
+
|
|
124
|
+
/** Lower-case tag names listed as valid children of a table section. */
export const VALID_TABLE_SECTION_CHILDREN = new Set<string>(["tr", "script", "template", "style"]);
|
|
130
|
+
|
|
131
|
+
/** Lower-case tag names listed as valid children of a `tr` element. */
export const VALID_TR_CHILDREN = new Set<string>(["td", "th", "script", "template", "style"]);
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
import type { ASTNode } from "./types.js";
|
|
2
|
+
import { ASTNodeType } from "./types.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Converts a DOM-like node, and recursively its `childNodes`, into an `ASTNode`.
 *
 * The result is chosen by `dom.nodeType`:
 * - `9`  → Document node with converted children
 * - `1`  → Element node (lower-cased tag name, copied attributes, converted
 *          children, `namespaceURI`, `isSelfClosing`)
 * - `3`  → Text node carrying `textContent`
 * - `8`  → Comment node carrying `data`
 * - `10` → Doctype node; `name` and `content` both fall back to `"html"`
 * - `4`  → CDATA node carrying `textContent`
 * - `7`  → node typed `"processing-instruction"` carrying `data` or `textContent`
 *
 * @param dom - Node to convert. A falsy value yields an empty Document node.
 * @returns The converted node; any other `nodeType` yields an empty Text node.
 */
export const domToAST = (dom: any): ASTNode => {
  if (!dom) {
    return { type: ASTNodeType.Document, children: [] };
  }

  // Shared by the Document and Element cases: convert children when present.
  const convertChildren = (): ASTNode[] =>
    dom.childNodes ? dom.childNodes.map(domToAST) : [];

  switch (dom.nodeType) {
    case 9:
      return { type: ASTNodeType.Document, children: convertChildren() };

    case 1: {
      const attributes: Record<string, string> = {};
      if (dom.attributes) {
        for (const [attrName, attrValue] of Object.entries(dom.attributes)) {
          attributes[attrName] = attrValue;
        }
      }
      return {
        type: ASTNodeType.Element,
        tagName: dom.tagName.toLowerCase(),
        attributes,
        children: convertChildren(),
        namespaceURI: dom.namespaceURI,
        isSelfClosing: dom.isSelfClosing || false,
      };
    }

    case 3:
      return { type: ASTNodeType.Text, content: dom.textContent || "" };

    case 8:
      return { type: ASTNodeType.Comment, content: (dom.data as string) || "" };

    case 10:
      return {
        type: ASTNodeType.Doctype,
        name: dom.name || "html",
        publicId: dom.publicId || "",
        systemId: dom.systemId || "",
        content: dom.name || "html",
      };

    case 4:
      return { type: ASTNodeType.CDATA, content: dom.textContent || "" };

    case 7:
      return {
        type: "processing-instruction" as any,
        content: dom.data || dom.textContent || "",
      };

    default:
      // Unrecognised node types become an empty text node.
      return { type: ASTNodeType.Text, content: "" };
  }
};
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
export { parse } from "./parse";
|
|
2
|
+
export { domToAST } from "./dom-to-ast";
|
|
3
|
+
export type { ParserState, ParseError, InsertionMode, ASTNode } from "./types";
|
|
4
|
+
export { ASTNodeType } from "./types";
|
|
5
|
+
export {
|
|
6
|
+
VOID_ELEMENTS,
|
|
7
|
+
RAW_TEXT_ELEMENTS,
|
|
8
|
+
AUTO_CLOSE_RULES,
|
|
9
|
+
} from "./constants";
|