@tkeron/html-parser 1.1.2 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (134) hide show
  1. package/.github/workflows/npm_deploy.yml +14 -4
  2. package/README.md +6 -6
  3. package/bun.lock +6 -8
  4. package/check-versions.ts +147 -0
  5. package/index.ts +4 -8
  6. package/package.json +5 -6
  7. package/src/dom-simulator/append-child.ts +130 -0
  8. package/src/dom-simulator/append.ts +18 -0
  9. package/src/dom-simulator/attributes.ts +23 -0
  10. package/src/dom-simulator/clone-node.ts +51 -0
  11. package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
  12. package/src/dom-simulator/create-cdata.ts +18 -0
  13. package/src/dom-simulator/create-comment.ts +23 -0
  14. package/src/dom-simulator/create-doctype.ts +24 -0
  15. package/src/dom-simulator/create-document.ts +81 -0
  16. package/src/dom-simulator/create-element.ts +195 -0
  17. package/src/dom-simulator/create-processing-instruction.ts +19 -0
  18. package/src/dom-simulator/create-temp-parent.ts +9 -0
  19. package/src/dom-simulator/create-text-node.ts +23 -0
  20. package/src/dom-simulator/escape-text-content.ts +6 -0
  21. package/src/dom-simulator/find-special-elements.ts +14 -0
  22. package/src/dom-simulator/get-text-content.ts +18 -0
  23. package/src/dom-simulator/index.ts +36 -0
  24. package/src/dom-simulator/inner-outer-html.ts +182 -0
  25. package/src/dom-simulator/insert-after.ts +20 -0
  26. package/src/dom-simulator/insert-before.ts +108 -0
  27. package/src/dom-simulator/matches.ts +26 -0
  28. package/src/dom-simulator/node-types.ts +26 -0
  29. package/src/dom-simulator/prepend.ts +24 -0
  30. package/src/dom-simulator/remove-child.ts +68 -0
  31. package/src/dom-simulator/remove.ts +7 -0
  32. package/src/dom-simulator/replace-child.ts +152 -0
  33. package/src/dom-simulator/set-text-content.ts +33 -0
  34. package/src/dom-simulator/update-element-content.ts +56 -0
  35. package/src/dom-simulator.ts +12 -1126
  36. package/src/encoding/constants.ts +8 -0
  37. package/src/encoding/detect-encoding.ts +21 -0
  38. package/src/encoding/index.ts +1 -0
  39. package/src/encoding/normalize-encoding.ts +6 -0
  40. package/src/html-entities.ts +2127 -0
  41. package/src/index.ts +5 -5
  42. package/src/parser/adoption-agency-helpers.ts +145 -0
  43. package/src/parser/constants.ts +137 -0
  44. package/src/parser/dom-to-ast.ts +79 -0
  45. package/src/parser/foster-parenting-helpers.ts +48 -0
  46. package/src/parser/implicit-table-structure.ts +65 -0
  47. package/src/parser/index.ts +9 -0
  48. package/src/parser/parse.ts +924 -0
  49. package/src/parser/types.ts +56 -0
  50. package/src/selectors/find-elements-descendant.ts +47 -0
  51. package/src/selectors/index.ts +2 -0
  52. package/src/selectors/matches-selector.ts +12 -0
  53. package/src/selectors/matches-token.ts +27 -0
  54. package/src/selectors/parse-selector.ts +48 -0
  55. package/src/selectors/query-selector-all.ts +43 -0
  56. package/src/selectors/query-selector.ts +6 -0
  57. package/src/selectors/types.ts +10 -0
  58. package/src/serializer/attributes.ts +74 -0
  59. package/src/serializer/escape.ts +13 -0
  60. package/src/serializer/index.ts +1 -0
  61. package/src/serializer/serialize-tokens.ts +511 -0
  62. package/src/tokenizer/calculate-position.ts +10 -0
  63. package/src/tokenizer/constants.ts +11 -0
  64. package/src/tokenizer/decode-entities.ts +64 -0
  65. package/src/tokenizer/index.ts +2 -0
  66. package/src/tokenizer/parse-attributes.ts +74 -0
  67. package/src/tokenizer/tokenize.ts +165 -0
  68. package/src/tokenizer/types.ts +25 -0
  69. package/tests/adoption-agency-helpers.test.ts +304 -0
  70. package/tests/advanced.test.ts +242 -221
  71. package/tests/cloneNode.test.ts +19 -66
  72. package/tests/custom-elements-head.test.ts +54 -55
  73. package/tests/dom-extended.test.ts +77 -64
  74. package/tests/dom-manipulation.test.ts +51 -24
  75. package/tests/dom.test.ts +15 -13
  76. package/tests/encoding/detect-encoding.test.ts +33 -0
  77. package/tests/foster-parenting.test.ts +127 -0
  78. package/tests/google-dom.test.ts +2 -2
  79. package/tests/helpers/tokenizer-adapter.test.ts +29 -43
  80. package/tests/helpers/tokenizer-adapter.ts +36 -33
  81. package/tests/helpers/tree-adapter.test.ts +20 -20
  82. package/tests/helpers/tree-adapter.ts +34 -24
  83. package/tests/html-entities-text.test.ts +6 -2
  84. package/tests/innerhtml-void-elements.test.ts +52 -36
  85. package/tests/outerHTML-replacement.test.ts +37 -65
  86. package/tests/parser/dom-to-ast.test.ts +109 -0
  87. package/tests/parser/parse.test.ts +139 -0
  88. package/tests/parser.test.ts +281 -217
  89. package/tests/selectors/query-selector-all.test.ts +39 -0
  90. package/tests/selectors/query-selector.test.ts +42 -0
  91. package/tests/serializer/attributes.test.ts +132 -0
  92. package/tests/serializer/escape.test.ts +51 -0
  93. package/tests/serializer/serialize-tokens.test.ts +80 -0
  94. package/tests/serializer-core.test.ts +6 -6
  95. package/tests/serializer-injectmeta.test.ts +6 -6
  96. package/tests/serializer-optionaltags.test.ts +9 -6
  97. package/tests/serializer-options.test.ts +6 -6
  98. package/tests/serializer-whitespace.test.ts +6 -6
  99. package/tests/tokenizer/calculate-position.test.ts +34 -0
  100. package/tests/tokenizer/decode-entities.test.ts +31 -0
  101. package/tests/tokenizer/parse-attributes.test.ts +44 -0
  102. package/tests/tokenizer/tokenize.test.ts +757 -0
  103. package/tests/tokenizer-namedEntities.test.ts +10 -7
  104. package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
  105. package/tests/tokenizer.test.ts +268 -256
  106. package/tests/tree-construction-adoption01.test.ts +25 -16
  107. package/tests/tree-construction-adoption02.test.ts +30 -19
  108. package/tests/tree-construction-domjs-unsafe.test.ts +6 -4
  109. package/tests/tree-construction-entities02.test.ts +18 -16
  110. package/tests/tree-construction-html5test-com.test.ts +16 -10
  111. package/tests/tree-construction-math.test.ts +11 -9
  112. package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
  113. package/tests/tree-construction-noscript01.test.ts +11 -9
  114. package/tests/tree-construction-ruby.test.ts +6 -4
  115. package/tests/tree-construction-scriptdata01.test.ts +6 -4
  116. package/tests/tree-construction-svg.test.ts +6 -4
  117. package/tests/tree-construction-template.test.ts +6 -4
  118. package/tests/tree-construction-tests10.test.ts +6 -4
  119. package/tests/tree-construction-tests11.test.ts +6 -4
  120. package/tests/tree-construction-tests20.test.ts +7 -4
  121. package/tests/tree-construction-tests21.test.ts +7 -4
  122. package/tests/tree-construction-tests23.test.ts +7 -4
  123. package/tests/tree-construction-tests24.test.ts +7 -4
  124. package/tests/tree-construction-tests5.test.ts +6 -5
  125. package/tests/tree-construction-tests6.test.ts +6 -5
  126. package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
  127. package/tests/void-elements.test.ts +85 -40
  128. package/tsconfig.json +1 -1
  129. package/src/css-selector.ts +0 -185
  130. package/src/encoding.ts +0 -39
  131. package/src/parser.ts +0 -682
  132. package/src/serializer.ts +0 -450
  133. package/src/tokenizer.ts +0 -325
  134. package/tests/selectors.test.ts +0 -128
package/src/index.ts CHANGED
@@ -1,9 +1,9 @@
1
- import { tokenize } from './tokenizer.js';
2
- import { parse } from './parser.js';
1
+ import { tokenize } from "./tokenizer/index.js";
2
+ import { parse } from "./parser/index.js";
3
3
 
4
- export function parseHTML(html: string): any {
4
+ export const parseHTML = (html: string): any => {
5
5
  const tokens = tokenize(html);
6
6
  return parse(tokens);
7
- }
7
+ };
8
8
 
9
- export { parse } from './parser';
9
+ export { parse } from "./parser/index.js";
@@ -0,0 +1,145 @@
1
+ import { createElement } from "../dom-simulator/index.js";
2
+
3
/**
 * Lower-case tag names that `isSpecialElement` reports as special.
 * Lookups are done on lower-cased names, so every entry here is lower-case.
 */
const SPECIAL_ELEMENTS = new Set<string>([
  "address", "applet", "area", "article", "aside",
  "base", "basefont", "bgsound", "blockquote", "body", "br", "button",
  "caption", "center", "col", "colgroup",
  "dd", "details", "dir", "div", "dl", "dt",
  "embed",
  "fieldset", "figcaption", "figure", "footer", "form", "frame", "frameset",
  "h1", "h2", "h3", "h4", "h5", "h6", "head", "header", "hgroup", "hr", "html",
  "iframe", "img", "input",
  "li", "link", "listing",
  "main", "marquee", "menu", "meta",
  "nav", "noembed", "noframes", "noscript",
  "object", "ol",
  "p", "param", "plaintext", "pre",
  "script", "section", "select", "source", "style", "summary",
  "table", "tbody", "td", "template", "textarea", "tfoot", "th", "thead",
  "title", "tr", "track",
  "ul",
  "wbr",
  "xmp",
]);
86
+
87
/** A stack entry paired with the position at which it was found. */
interface StackSearchResult {
  /** The entry taken from the searched stack. */
  element: any;
  /** Zero-based position of `element` within that stack. */
  index: number;
}
91
+
92
/**
 * Scans `stack` from its last entry back to its first and returns the first
 * entry whose `tagName` equals `tagName`, compared case-insensitively.
 *
 * @param stack - Entries to search; entries without a `tagName` are skipped.
 * @param tagName - Tag name to look for (any letter case).
 * @returns The matching entry and its index, or `null` when nothing matches.
 */
export const findFormattingElementInStack = (
  stack: any[],
  tagName: string,
): StackSearchResult | null => {
  const wanted = tagName.toLowerCase();
  let index = stack.length;
  while (index-- > 0) {
    const candidate = stack[index];
    if (!candidate.tagName) {
      continue;
    }
    if (candidate.tagName.toLowerCase() === wanted) {
      return { element: candidate, index };
    }
  }
  return null;
};
105
+
106
/**
 * Walks `stack` forward, starting just after `formattingElementIndex`, and
 * returns the first entry whose `tagName` is accepted by `isSpecialElement`.
 *
 * @param stack - Entries to search.
 * @param formattingElementIndex - Index of the formatting element; the search
 *   begins at the entry that follows it.
 * @returns The first special entry and its index, or `null` if none is found.
 */
export const findFurthestBlock = (
  stack: any[],
  formattingElementIndex: number,
): StackSearchResult | null => {
  let position = formattingElementIndex + 1;
  while (position < stack.length) {
    const candidate = stack[position];
    if (isSpecialElement(candidate.tagName)) {
      return { element: candidate, index: position };
    }
    position += 1;
  }
  return null;
};
118
+
119
/**
 * Returns the stack entry that sits immediately before the formatting element.
 *
 * @param stack - Entries to read from.
 * @param formattingElementIndex - Index of the formatting element in `stack`.
 * @returns `stack[formattingElementIndex - 1]`, or `null` when the index is
 *   zero or negative (nothing precedes the formatting element).
 */
export const getCommonAncestor = (
  stack: any[],
  formattingElementIndex: number,
): any | null =>
  formattingElementIndex <= 0 ? null : stack[formattingElementIndex - 1];
128
+
129
/**
 * Reports whether `tagName`, lower-cased, is a member of `SPECIAL_ELEMENTS`.
 *
 * @param tagName - Tag name in any letter case; a falsy value yields `false`.
 */
export const isSpecialElement = (tagName: string): boolean =>
  tagName ? SPECIAL_ELEMENTS.has(tagName.toLowerCase()) : false;
133
+
134
/**
 * Builds a new element via `createElement`, using the lower-cased tag name of
 * `element` and a shallow copy of its `attributes` (an empty object when
 * `attributes` is falsy).
 *
 * @param element - Element to copy; must expose a string `tagName`.
 * @returns Whatever `createElement` returns for that tag name and attributes.
 */
export const cloneFormattingElement = (element: any): any => {
  let copiedAttributes = {};
  if (element.attributes) {
    // Shallow copy so the new element does not share the attribute object.
    copiedAttributes = { ...element.attributes };
  }
  const lowerTag = element.tagName.toLowerCase();
  return createElement(lowerTag, copiedAttributes);
};
138
+
139
/**
 * Moves every entry of `source.childNodes` to the end of `target.childNodes`,
 * keeping their order and pointing each moved child's `parentNode` at `target`.
 *
 * Both `childNodes` arrays are mutated in place: `source.childNodes` ends up
 * empty and `target.childNodes` keeps its identity.
 *
 * @param source - Node whose children are removed.
 * @param target - Node that receives the children.
 */
export const reparentChildren = (source: any, target: any): void => {
  // Moving a node's children onto itself is a no-op. Without this guard a
  // shift-then-push loop would never drain the list.
  if (source === target) {
    return;
  }

  // Detach all children in one step (emptying the source array in place)
  // rather than shifting them off one at a time.
  const moved: any[] = source.childNodes.splice(0);
  for (const child of moved) {
    child.parentNode = target;
    target.childNodes.push(child);
  }
};
@@ -0,0 +1,137 @@
1
/** Lower-case tag names grouped under the name "void elements". */
export const VOID_ELEMENTS = new Set<string>([
  "area", "base", "br", "col", "embed", "hr", "img",
  "input", "link", "meta", "param", "source", "track", "wbr",
]);
17
+
18
/**
 * Lower-case tag names grouped under the name "raw text elements".
 * NOTE(review): how consumers treat the content of these elements is decided
 * outside this file — confirm against the parser before relying on it.
 */
export const RAW_TEXT_ELEMENTS = new Set<string>(["script", "style", "textarea", "title"]);
24
+
25
/**
 * Maps a lower-case tag name to a list of lower-case tag names.
 * NOTE(review): presumably "opening the key implicitly closes an open element
 * from the list" — the consumer is outside this file, so confirm there.
 */
export const AUTO_CLOSE_RULES: Record<string, string[]> = {
  // List items and definition-list entries.
  li: ["li"], dt: ["dt", "dd"], dd: ["dt", "dd"],

  // Every entry in this group maps to ["p"].
  address: ["p"], article: ["p"], aside: ["p"], blockquote: ["p"],
  center: ["p"], details: ["p"], dialog: ["p"], dir: ["p"], div: ["p"],
  dl: ["p"], fieldset: ["p"], figcaption: ["p"], figure: ["p"],
  footer: ["p"], form: ["p"],
  h1: ["p"], h2: ["p"], h3: ["p"], h4: ["p"], h5: ["p"], h6: ["p"],
  header: ["p"], hgroup: ["p"], hr: ["p"], listing: ["p"], main: ["p"],
  menu: ["p"], nav: ["p"], ol: ["p"], p: ["p"], pre: ["p"],
  section: ["p"], summary: ["p"], table: ["p"], ul: ["p"],

  // Ruby annotations.
  rt: ["rt", "rp"], rp: ["rt", "rp"],

  // Select options.
  optgroup: ["optgroup"], option: ["option"],

  // Table sections, rows and cells.
  thead: ["tbody", "tfoot"],
  tbody: ["thead", "tbody", "tfoot"],
  tfoot: ["thead", "tbody"],
  tr: ["tr"],
  td: ["td", "th"], th: ["td", "th"],
};
75
+
76
/** Lower-case tag names grouped under the name "formatting elements". */
export const FORMATTING_ELEMENTS = new Set<string>([
  "a", "b", "big", "code", "em", "font", "i",
  "nobr", "s", "small", "strike", "strong", "tt", "u",
]);
92
+
93
/** Lower-case tag names grouped under the name "table scope elements". */
export const TABLE_SCOPE_ELEMENTS = new Set<string>([
  "table", "tbody", "thead", "tfoot", "tr",
  "template", "html",
]);
102
+
103
/** Lower-case tag names grouped under the name "table context elements". */
export const TABLE_CONTEXT_ELEMENTS = new Set<string>(["table", "tbody", "thead", "tfoot", "tr"]);
110
+
111
/** Lower-case tag names grouped under the name "valid table children". */
export const VALID_TABLE_CHILDREN = new Set<string>([
  "caption", "colgroup", "col",
  "tbody", "thead", "tfoot", "tr",
  "script", "template", "style",
]);
123
+
124
/** Lower-case tag names grouped under the name "valid table section children". */
export const VALID_TABLE_SECTION_CHILDREN = new Set<string>(["tr", "script", "template", "style"]);
130
+
131
/** Lower-case tag names grouped under the name "valid tr children". */
export const VALID_TR_CHILDREN = new Set<string>(["td", "th", "script", "template", "style"]);
@@ -0,0 +1,79 @@
1
+ import type { ASTNode } from "./types.js";
2
+ import { ASTNodeType } from "./types.js";
3
+
4
/**
 * Converts a DOM-like node, and recursively its `childNodes`, into an AST node.
 *
 * The result depends on the numeric `nodeType` of `dom`:
 * - falsy `dom` → document node with no children
 * - `9` → document, `1` → element, `3` → text, `8` → comment,
 *   `10` → doctype, `4` → CDATA, `7` → processing instruction
 * - any other value → text node with empty content
 *
 * @param dom - Node to convert; only the properties read below are required.
 * @returns The AST node for `dom`.
 */
export const domToAST = (dom: any): ASTNode => {
  if (!dom) {
    return { type: ASTNodeType.Document, children: [] };
  }

  switch (dom.nodeType) {
    case 9:
      return {
        type: ASTNodeType.Document,
        children: dom.childNodes ? dom.childNodes.map(domToAST) : [],
      };

    case 1: {
      // Copy attributes into a fresh object so the AST does not alias the DOM's.
      const attributes: Record<string, string> = {};
      if (dom.attributes) {
        Object.entries(dom.attributes).forEach(([key, attrValue]) => {
          attributes[key] = attrValue;
        });
      }

      return {
        type: ASTNodeType.Element,
        tagName: dom.tagName.toLowerCase(),
        attributes,
        children: dom.childNodes ? dom.childNodes.map(domToAST) : [],
        namespaceURI: dom.namespaceURI,
        isSelfClosing: dom.isSelfClosing || false,
      };
    }

    case 3:
      return { type: ASTNodeType.Text, content: dom.textContent || "" };

    case 8:
      return { type: ASTNodeType.Comment, content: (dom.data as string) || "" };

    case 10:
      return {
        type: ASTNodeType.Doctype,
        name: dom.name || "html",
        publicId: dom.publicId || "",
        systemId: dom.systemId || "",
        // The doctype's content mirrors its name.
        content: dom.name || "html",
      };

    case 4:
      return { type: ASTNodeType.CDATA, content: dom.textContent || "" };

    case 7:
      return {
        // Emitted as a raw string tag rather than an ASTNodeType member.
        type: "processing-instruction" as any,
        content: dom.data || dom.textContent || "",
      };

    default:
      // Unrecognised node types collapse to an empty text node.
      return { type: ASTNodeType.Text, content: "" };
  }
};
@@ -0,0 +1,48 @@
1
/**
 * Merges the text node at `insertIndex` with one neighbouring text node.
 *
 * Text nodes are those with `nodeType === 3`. If the previous sibling is a
 * text node, the node's text is appended to it and the node is removed;
 * the next sibling is then left untouched. Otherwise, if the next sibling is
 * a text node, its text is appended to the node and the sibling is removed.
 * `nodeValue` of the surviving node is kept equal to its `textContent`.
 *
 * Nothing happens when `parent.childNodes` is missing or has fewer than two
 * entries, or when the entry at `insertIndex` is absent or not a text node.
 *
 * @param parent - Node whose `childNodes` array is edited in place.
 * @param insertIndex - Position of the text node to merge.
 */
export const mergeAdjacentTextNodes = (
  parent: any,
  insertIndex: number,
): void => {
  const siblings = parent.childNodes;
  if (!siblings || siblings.length < 2) {
    return;
  }

  const isText = (candidate: any): boolean =>
    Boolean(candidate) && candidate.nodeType === 3;
  const textOf = (textNode: any): string => textNode.textContent || "";

  const current = siblings[insertIndex];
  if (!isText(current)) {
    return;
  }

  // Prefer folding into the previous text node.
  const previous = insertIndex > 0 ? siblings[insertIndex - 1] : undefined;
  if (isText(previous)) {
    previous.textContent = textOf(previous) + textOf(current);
    previous.nodeValue = previous.textContent;
    siblings.splice(insertIndex, 1);
    return;
  }

  // Otherwise absorb the following text node, if there is one.
  const hasFollowing = insertIndex < siblings.length - 1;
  const following = hasFollowing ? siblings[insertIndex + 1] : undefined;
  if (isText(following)) {
    current.textContent = textOf(current) + textOf(following);
    current.nodeValue = current.textContent;
    siblings.splice(insertIndex + 1, 1);
  }
};
35
+
36
/**
 * Inserts `node` into `parent.childNodes` directly in front of `tableElement`
 * and sets `node.parentNode` to `parent`.
 *
 * @param parent - Node whose `childNodes` array is edited in place.
 * @param tableElement - Existing child that `node` is placed before.
 * @param node - Node to insert.
 * @returns The index at which `node` was inserted, or `-1` (with nothing
 *   changed) when `tableElement` is not among `parent.childNodes`.
 */
export const insertNodeBeforeTable = (
  parent: any,
  tableElement: any,
  node: any,
): number => {
  const tablePosition = parent.childNodes.indexOf(tableElement);
  if (tablePosition === -1) {
    return -1;
  }
  node.parentNode = parent;
  parent.childNodes.splice(tablePosition, 0, node);
  return tablePosition;
};
@@ -0,0 +1,65 @@
1
+ import { createElement, appendChild } from "../dom-simulator/index.js";
2
+
3
/** Lower-case tag names of table cells. */
export const CELL_ELEMENTS = new Set<string>(["td", "th"]);

/** Lower-case tag names of table sections. */
export const TABLE_SECTION_ELEMENTS = new Set<string>(["tbody", "thead", "tfoot"]);

/**
 * Decides whether placing `childTagName` under `parentTagName` calls for
 * implicit table structure. Both names are compared case-insensitively.
 *
 * - A cell (`td`/`th`) needs it when the parent is `table` or a table section.
 * - A `tr` needs it when the parent is `table`.
 * - Every other combination does not.
 *
 * @param parentTagName - Tag name of the prospective parent.
 * @param childTagName - Tag name of the element being inserted.
 */
export const shouldCreateImplicitTableStructure = (
  parentTagName: string,
  childTagName: string,
): boolean => {
  const parentTag = parentTagName.toLowerCase();
  const childTag = childTagName.toLowerCase();
  const parentIsTable = parentTag === "table";

  if (CELL_ELEMENTS.has(childTag)) {
    return parentIsTable || TABLE_SECTION_ELEMENTS.has(parentTag);
  }

  return childTag === "tr" && parentIsTable;
};
24
+
25
/**
 * Creates the wrapper elements missing between a table-related parent and the
 * child about to be inserted, appends them under the last entry of `stack`,
 * and pushes each created element onto `stack`.
 *
 * Tag names are compared case-insensitively:
 * - cell under `table` → creates `tbody` then `tr`; returns the `tr`
 * - cell under a table section → creates `tr`; returns it
 * - `tr` under `table` → creates `tbody`; returns it
 * - anything else → creates nothing; returns the last entry of `stack`
 *
 * @param stack - Stack that is extended in place; its last entry is the
 *   element the first wrapper is appended to.
 * @param parentTagName - Tag name of the current parent.
 * @param childTagName - Tag name of the element being inserted.
 * @returns The element the child should be placed under.
 */
export const createImplicitTableStructure = (
  stack: any[],
  parentTagName: string,
  childTagName: string,
): any => {
  const parentTag = parentTagName.toLowerCase();
  const childTag = childTagName.toLowerCase();
  const currentParent = stack[stack.length - 1];
  const parentIsTable = parentTag === "table";

  // Create an attribute-less element, attach it to `container`, and push it.
  const openImplicit = (tag: string, container: any): any => {
    const created = createElement(tag, {});
    appendChild(container, created);
    stack.push(created);
    return created;
  };

  if (CELL_ELEMENTS.has(childTag)) {
    if (parentIsTable) {
      const section = openImplicit("tbody", currentParent);
      return openImplicit("tr", section);
    }
    if (TABLE_SECTION_ELEMENTS.has(parentTag)) {
      return openImplicit("tr", currentParent);
    }
  }

  if (childTag === "tr" && parentIsTable) {
    return openImplicit("tbody", currentParent);
  }

  return currentParent;
};
@@ -0,0 +1,9 @@
1
+ export { parse } from "./parse";
2
+ export { domToAST } from "./dom-to-ast";
3
+ export type { ParserState, ParseError, InsertionMode, ASTNode } from "./types";
4
+ export { ASTNodeType } from "./types";
5
+ export {
6
+ VOID_ELEMENTS,
7
+ RAW_TEXT_ELEMENTS,
8
+ AUTO_CLOSE_RULES,
9
+ } from "./constants";