@tkeron/html-parser 1.1.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/.github/workflows/npm_deploy.yml +14 -4
  2. package/README.md +6 -6
  3. package/bun.lock +6 -8
  4. package/check-versions.ts +147 -0
  5. package/index.ts +4 -8
  6. package/package.json +5 -6
  7. package/src/dom-simulator/append-child.ts +130 -0
  8. package/src/dom-simulator/append.ts +18 -0
  9. package/src/dom-simulator/attributes.ts +23 -0
  10. package/src/dom-simulator/clone-node.ts +51 -0
  11. package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
  12. package/src/dom-simulator/create-cdata.ts +18 -0
  13. package/src/dom-simulator/create-comment.ts +23 -0
  14. package/src/dom-simulator/create-doctype.ts +24 -0
  15. package/src/dom-simulator/create-document.ts +81 -0
  16. package/src/dom-simulator/create-element.ts +195 -0
  17. package/src/dom-simulator/create-processing-instruction.ts +19 -0
  18. package/src/dom-simulator/create-temp-parent.ts +9 -0
  19. package/src/dom-simulator/create-text-node.ts +23 -0
  20. package/src/dom-simulator/escape-text-content.ts +6 -0
  21. package/src/dom-simulator/find-special-elements.ts +14 -0
  22. package/src/dom-simulator/get-text-content.ts +18 -0
  23. package/src/dom-simulator/index.ts +36 -0
  24. package/src/dom-simulator/inner-outer-html.ts +182 -0
  25. package/src/dom-simulator/insert-after.ts +20 -0
  26. package/src/dom-simulator/insert-before.ts +108 -0
  27. package/src/dom-simulator/matches.ts +26 -0
  28. package/src/dom-simulator/node-types.ts +26 -0
  29. package/src/dom-simulator/prepend.ts +24 -0
  30. package/src/dom-simulator/remove-child.ts +68 -0
  31. package/src/dom-simulator/remove.ts +7 -0
  32. package/src/dom-simulator/replace-child.ts +152 -0
  33. package/src/dom-simulator/set-text-content.ts +33 -0
  34. package/src/dom-simulator/update-element-content.ts +56 -0
  35. package/src/dom-simulator.ts +12 -1126
  36. package/src/encoding/constants.ts +8 -0
  37. package/src/encoding/detect-encoding.ts +21 -0
  38. package/src/encoding/index.ts +1 -0
  39. package/src/encoding/normalize-encoding.ts +6 -0
  40. package/src/html-entities.ts +2127 -0
  41. package/src/index.ts +5 -5
  42. package/src/parser/adoption-agency-helpers.ts +145 -0
  43. package/src/parser/constants.ts +137 -0
  44. package/src/parser/dom-to-ast.ts +79 -0
  45. package/src/parser/index.ts +9 -0
  46. package/src/parser/parse.ts +772 -0
  47. package/src/parser/types.ts +56 -0
  48. package/src/selectors/find-elements-descendant.ts +47 -0
  49. package/src/selectors/index.ts +2 -0
  50. package/src/selectors/matches-selector.ts +12 -0
  51. package/src/selectors/matches-token.ts +27 -0
  52. package/src/selectors/parse-selector.ts +48 -0
  53. package/src/selectors/query-selector-all.ts +43 -0
  54. package/src/selectors/query-selector.ts +6 -0
  55. package/src/selectors/types.ts +10 -0
  56. package/src/serializer/attributes.ts +74 -0
  57. package/src/serializer/escape.ts +13 -0
  58. package/src/serializer/index.ts +1 -0
  59. package/src/serializer/serialize-tokens.ts +511 -0
  60. package/src/tokenizer/calculate-position.ts +10 -0
  61. package/src/tokenizer/constants.ts +11 -0
  62. package/src/tokenizer/decode-entities.ts +64 -0
  63. package/src/tokenizer/index.ts +2 -0
  64. package/src/tokenizer/parse-attributes.ts +74 -0
  65. package/src/tokenizer/tokenize.ts +165 -0
  66. package/src/tokenizer/types.ts +25 -0
  67. package/tests/adoption-agency-helpers.test.ts +304 -0
  68. package/tests/advanced.test.ts +242 -221
  69. package/tests/cloneNode.test.ts +19 -66
  70. package/tests/custom-elements-head.test.ts +54 -55
  71. package/tests/dom-extended.test.ts +77 -64
  72. package/tests/dom-manipulation.test.ts +51 -24
  73. package/tests/dom.test.ts +15 -13
  74. package/tests/encoding/detect-encoding.test.ts +33 -0
  75. package/tests/google-dom.test.ts +2 -2
  76. package/tests/helpers/tokenizer-adapter.test.ts +29 -43
  77. package/tests/helpers/tokenizer-adapter.ts +36 -33
  78. package/tests/helpers/tree-adapter.test.ts +20 -20
  79. package/tests/helpers/tree-adapter.ts +34 -24
  80. package/tests/html-entities-text.test.ts +6 -2
  81. package/tests/innerhtml-void-elements.test.ts +52 -36
  82. package/tests/outerHTML-replacement.test.ts +37 -65
  83. package/tests/parser/dom-to-ast.test.ts +109 -0
  84. package/tests/parser/parse.test.ts +139 -0
  85. package/tests/parser.test.ts +281 -217
  86. package/tests/selectors/query-selector-all.test.ts +39 -0
  87. package/tests/selectors/query-selector.test.ts +42 -0
  88. package/tests/serializer/attributes.test.ts +132 -0
  89. package/tests/serializer/escape.test.ts +51 -0
  90. package/tests/serializer/serialize-tokens.test.ts +80 -0
  91. package/tests/serializer-core.test.ts +6 -6
  92. package/tests/serializer-injectmeta.test.ts +6 -6
  93. package/tests/serializer-optionaltags.test.ts +9 -6
  94. package/tests/serializer-options.test.ts +6 -6
  95. package/tests/serializer-whitespace.test.ts +6 -6
  96. package/tests/tokenizer/calculate-position.test.ts +34 -0
  97. package/tests/tokenizer/decode-entities.test.ts +31 -0
  98. package/tests/tokenizer/parse-attributes.test.ts +44 -0
  99. package/tests/tokenizer/tokenize.test.ts +757 -0
  100. package/tests/tokenizer-namedEntities.test.ts +10 -7
  101. package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
  102. package/tests/tokenizer.test.ts +268 -256
  103. package/tests/tree-construction-adoption01.test.ts +25 -16
  104. package/tests/tree-construction-adoption02.test.ts +30 -19
  105. package/tests/tree-construction-domjs-unsafe.test.ts +6 -4
  106. package/tests/tree-construction-entities02.test.ts +18 -16
  107. package/tests/tree-construction-html5test-com.test.ts +16 -10
  108. package/tests/tree-construction-math.test.ts +11 -9
  109. package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
  110. package/tests/tree-construction-noscript01.test.ts +11 -9
  111. package/tests/tree-construction-ruby.test.ts +6 -4
  112. package/tests/tree-construction-scriptdata01.test.ts +6 -4
  113. package/tests/tree-construction-svg.test.ts +6 -4
  114. package/tests/tree-construction-template.test.ts +6 -4
  115. package/tests/tree-construction-tests10.test.ts +6 -4
  116. package/tests/tree-construction-tests11.test.ts +6 -4
  117. package/tests/tree-construction-tests20.test.ts +7 -4
  118. package/tests/tree-construction-tests21.test.ts +7 -4
  119. package/tests/tree-construction-tests23.test.ts +7 -4
  120. package/tests/tree-construction-tests24.test.ts +7 -4
  121. package/tests/tree-construction-tests5.test.ts +6 -5
  122. package/tests/tree-construction-tests6.test.ts +6 -5
  123. package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
  124. package/tests/void-elements.test.ts +85 -40
  125. package/tsconfig.json +1 -1
  126. package/src/css-selector.ts +0 -185
  127. package/src/encoding.ts +0 -39
  128. package/src/parser.ts +0 -682
  129. package/src/serializer.ts +0 -450
  130. package/src/tokenizer.ts +0 -325
  131. package/tests/selectors.test.ts +0 -128
package/src/index.ts CHANGED
@@ -1,9 +1,9 @@
1
- import { tokenize } from './tokenizer.js';
2
- import { parse } from './parser.js';
1
+ import { tokenize } from "./tokenizer/index.js";
2
+ import { parse } from "./parser/index.js";
3
3
 
4
- export function parseHTML(html: string): any {
4
+ export const parseHTML = (html: string): any => {
5
5
  const tokens = tokenize(html);
6
6
  return parse(tokens);
7
- }
7
+ };
8
8
 
9
- export { parse } from './parser';
9
+ export { parse } from "./parser/index.js";
package/src/parser/adoption-agency-helpers.ts ADDED
@@ -0,0 +1,145 @@
1
+ import { createElement } from "../dom-simulator/index.js";
2
+
3
+ const SPECIAL_ELEMENTS = new Set([
4
+ "address",
5
+ "applet",
6
+ "area",
7
+ "article",
8
+ "aside",
9
+ "base",
10
+ "basefont",
11
+ "bgsound",
12
+ "blockquote",
13
+ "body",
14
+ "br",
15
+ "button",
16
+ "caption",
17
+ "center",
18
+ "col",
19
+ "colgroup",
20
+ "dd",
21
+ "details",
22
+ "dir",
23
+ "div",
24
+ "dl",
25
+ "dt",
26
+ "embed",
27
+ "fieldset",
28
+ "figcaption",
29
+ "figure",
30
+ "footer",
31
+ "form",
32
+ "frame",
33
+ "frameset",
34
+ "h1",
35
+ "h2",
36
+ "h3",
37
+ "h4",
38
+ "h5",
39
+ "h6",
40
+ "head",
41
+ "header",
42
+ "hgroup",
43
+ "hr",
44
+ "html",
45
+ "iframe",
46
+ "img",
47
+ "input",
48
+ "li",
49
+ "link",
50
+ "listing",
51
+ "main",
52
+ "marquee",
53
+ "menu",
54
+ "meta",
55
+ "nav",
56
+ "noembed",
57
+ "noframes",
58
+ "noscript",
59
+ "object",
60
+ "ol",
61
+ "p",
62
+ "param",
63
+ "plaintext",
64
+ "pre",
65
+ "script",
66
+ "section",
67
+ "select",
68
+ "source",
69
+ "style",
70
+ "summary",
71
+ "table",
72
+ "tbody",
73
+ "td",
74
+ "template",
75
+ "textarea",
76
+ "tfoot",
77
+ "th",
78
+ "thead",
79
+ "title",
80
+ "tr",
81
+ "track",
82
+ "ul",
83
+ "wbr",
84
+ "xmp",
85
+ ]);
86
+
87
+ type StackSearchResult = {
88
+ element: any;
89
+ index: number;
90
+ };
91
+
92
+ export const findFormattingElementInStack = (
93
+ stack: any[],
94
+ tagName: string,
95
+ ): StackSearchResult | null => {
96
+ const lowerTagName = tagName.toLowerCase();
97
+ for (let i = stack.length - 1; i >= 0; i--) {
98
+ const element = stack[i];
99
+ if (element.tagName && element.tagName.toLowerCase() === lowerTagName) {
100
+ return { element, index: i };
101
+ }
102
+ }
103
+ return null;
104
+ };
105
+
106
+ export const findFurthestBlock = (
107
+ stack: any[],
108
+ formattingElementIndex: number,
109
+ ): StackSearchResult | null => {
110
+ for (let i = formattingElementIndex + 1; i < stack.length; i++) {
111
+ const element = stack[i];
112
+ if (isSpecialElement(element.tagName)) {
113
+ return { element, index: i };
114
+ }
115
+ }
116
+ return null;
117
+ };
118
+
119
+ export const getCommonAncestor = (
120
+ stack: any[],
121
+ formattingElementIndex: number,
122
+ ): any | null => {
123
+ if (formattingElementIndex <= 0) {
124
+ return null;
125
+ }
126
+ return stack[formattingElementIndex - 1];
127
+ };
128
+
129
+ export const isSpecialElement = (tagName: string): boolean => {
130
+ if (!tagName) return false;
131
+ return SPECIAL_ELEMENTS.has(tagName.toLowerCase());
132
+ };
133
+
134
+ export const cloneFormattingElement = (element: any): any => {
135
+ const attributes = element.attributes ? { ...element.attributes } : {};
136
+ return createElement(element.tagName.toLowerCase(), attributes);
137
+ };
138
+
139
+ export const reparentChildren = (source: any, target: any): void => {
140
+ while (source.childNodes.length > 0) {
141
+ const child = source.childNodes.shift();
142
+ child.parentNode = target;
143
+ target.childNodes.push(child);
144
+ }
145
+ };
package/src/parser/constants.ts ADDED
@@ -0,0 +1,137 @@
1
+ export const VOID_ELEMENTS = new Set([
2
+ "area",
3
+ "base",
4
+ "br",
5
+ "col",
6
+ "embed",
7
+ "hr",
8
+ "img",
9
+ "input",
10
+ "link",
11
+ "meta",
12
+ "param",
13
+ "source",
14
+ "track",
15
+ "wbr",
16
+ ]);
17
+
18
+ export const RAW_TEXT_ELEMENTS = new Set([
19
+ "script",
20
+ "style",
21
+ "textarea",
22
+ "title",
23
+ ]);
24
+
25
+ export const AUTO_CLOSE_RULES: Record<string, string[]> = {
26
+ li: ["li"],
27
+ dt: ["dt", "dd"],
28
+ dd: ["dt", "dd"],
29
+ address: ["p"],
30
+ article: ["p"],
31
+ aside: ["p"],
32
+ blockquote: ["p"],
33
+ center: ["p"],
34
+ details: ["p"],
35
+ dialog: ["p"],
36
+ dir: ["p"],
37
+ div: ["p"],
38
+ dl: ["p"],
39
+ fieldset: ["p"],
40
+ figcaption: ["p"],
41
+ figure: ["p"],
42
+ footer: ["p"],
43
+ form: ["p"],
44
+ h1: ["p"],
45
+ h2: ["p"],
46
+ h3: ["p"],
47
+ h4: ["p"],
48
+ h5: ["p"],
49
+ h6: ["p"],
50
+ header: ["p"],
51
+ hgroup: ["p"],
52
+ hr: ["p"],
53
+ listing: ["p"],
54
+ main: ["p"],
55
+ menu: ["p"],
56
+ nav: ["p"],
57
+ ol: ["p"],
58
+ p: ["p"],
59
+ pre: ["p"],
60
+ section: ["p"],
61
+ summary: ["p"],
62
+ table: ["p"],
63
+ ul: ["p"],
64
+ rt: ["rt", "rp"],
65
+ rp: ["rt", "rp"],
66
+ optgroup: ["optgroup"],
67
+ option: ["option"],
68
+ thead: ["tbody", "tfoot"],
69
+ tbody: ["thead", "tbody", "tfoot"],
70
+ tfoot: ["thead", "tbody"],
71
+ tr: ["tr"],
72
+ td: ["td", "th"],
73
+ th: ["td", "th"],
74
+ };
75
+
76
+ export const FORMATTING_ELEMENTS = new Set([
77
+ "a",
78
+ "b",
79
+ "big",
80
+ "code",
81
+ "em",
82
+ "font",
83
+ "i",
84
+ "nobr",
85
+ "s",
86
+ "small",
87
+ "strike",
88
+ "strong",
89
+ "tt",
90
+ "u",
91
+ ]);
92
+
93
+ export const TABLE_SCOPE_ELEMENTS = new Set([
94
+ "table",
95
+ "tbody",
96
+ "thead",
97
+ "tfoot",
98
+ "tr",
99
+ "template",
100
+ "html",
101
+ ]);
102
+
103
+ export const TABLE_CONTEXT_ELEMENTS = new Set([
104
+ "table",
105
+ "tbody",
106
+ "thead",
107
+ "tfoot",
108
+ "tr",
109
+ ]);
110
+
111
+ export const VALID_TABLE_CHILDREN = new Set([
112
+ "caption",
113
+ "colgroup",
114
+ "col",
115
+ "tbody",
116
+ "thead",
117
+ "tfoot",
118
+ "tr",
119
+ "script",
120
+ "template",
121
+ "style",
122
+ ]);
123
+
124
+ export const VALID_TABLE_SECTION_CHILDREN = new Set([
125
+ "tr",
126
+ "script",
127
+ "template",
128
+ "style",
129
+ ]);
130
+
131
+ export const VALID_TR_CHILDREN = new Set([
132
+ "td",
133
+ "th",
134
+ "script",
135
+ "template",
136
+ "style",
137
+ ]);
package/src/parser/dom-to-ast.ts ADDED
@@ -0,0 +1,79 @@
1
+ import type { ASTNode } from "./types.js";
2
+ import { ASTNodeType } from "./types.js";
3
+
4
+ export const domToAST = (dom: any): ASTNode => {
5
+ if (!dom) {
6
+ return {
7
+ type: ASTNodeType.Document,
8
+ children: [],
9
+ };
10
+ }
11
+
12
+ if (dom.nodeType === 9) {
13
+ return {
14
+ type: ASTNodeType.Document,
15
+ children: dom.childNodes ? dom.childNodes.map(domToAST) : [],
16
+ };
17
+ }
18
+
19
+ if (dom.nodeType === 1) {
20
+ const attributes: Record<string, string> = {};
21
+ if (dom.attributes) {
22
+ for (const [name, value] of Object.entries(dom.attributes)) {
23
+ attributes[name] = value;
24
+ }
25
+ }
26
+
27
+ return {
28
+ type: ASTNodeType.Element,
29
+ tagName: dom.tagName.toLowerCase(),
30
+ attributes,
31
+ children: dom.childNodes ? dom.childNodes.map(domToAST) : [],
32
+ namespaceURI: dom.namespaceURI,
33
+ isSelfClosing: dom.isSelfClosing || false,
34
+ };
35
+ }
36
+
37
+ if (dom.nodeType === 3) {
38
+ return {
39
+ type: ASTNodeType.Text,
40
+ content: dom.textContent || "",
41
+ };
42
+ }
43
+
44
+ if (dom.nodeType === 8) {
45
+ return {
46
+ type: ASTNodeType.Comment,
47
+ content: (dom.data as string) || "",
48
+ };
49
+ }
50
+
51
+ if (dom.nodeType === 10) {
52
+ return {
53
+ type: ASTNodeType.Doctype,
54
+ name: dom.name || "html",
55
+ publicId: dom.publicId || "",
56
+ systemId: dom.systemId || "",
57
+ content: dom.name || "html",
58
+ };
59
+ }
60
+
61
+ if (dom.nodeType === 4) {
62
+ return {
63
+ type: ASTNodeType.CDATA,
64
+ content: dom.textContent || "",
65
+ };
66
+ }
67
+
68
+ if (dom.nodeType === 7) {
69
+ return {
70
+ type: "processing-instruction" as any,
71
+ content: dom.data || dom.textContent || "",
72
+ };
73
+ }
74
+
75
+ return {
76
+ type: ASTNodeType.Text,
77
+ content: "",
78
+ };
79
+ };
package/src/parser/index.ts ADDED
@@ -0,0 +1,9 @@
1
+ export { parse } from "./parse";
2
+ export { domToAST } from "./dom-to-ast";
3
+ export type { ParserState, ParseError, InsertionMode, ASTNode } from "./types";
4
+ export { ASTNodeType } from "./types";
5
+ export {
6
+ VOID_ELEMENTS,
7
+ RAW_TEXT_ELEMENTS,
8
+ AUTO_CLOSE_RULES,
9
+ } from "./constants";