@tkeron/html-parser 1.1.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. package/.github/workflows/npm_deploy.yml +14 -4
  2. package/README.md +6 -6
  3. package/bun.lock +6 -8
  4. package/check-versions.ts +147 -0
  5. package/index.ts +4 -8
  6. package/package.json +5 -6
  7. package/src/dom-simulator/append-child.ts +130 -0
  8. package/src/dom-simulator/append.ts +18 -0
  9. package/src/dom-simulator/attributes.ts +23 -0
  10. package/src/dom-simulator/clone-node.ts +51 -0
  11. package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
  12. package/src/dom-simulator/create-cdata.ts +18 -0
  13. package/src/dom-simulator/create-comment.ts +23 -0
  14. package/src/dom-simulator/create-doctype.ts +24 -0
  15. package/src/dom-simulator/create-document.ts +81 -0
  16. package/src/dom-simulator/create-element.ts +195 -0
  17. package/src/dom-simulator/create-processing-instruction.ts +19 -0
  18. package/src/dom-simulator/create-temp-parent.ts +9 -0
  19. package/src/dom-simulator/create-text-node.ts +23 -0
  20. package/src/dom-simulator/escape-text-content.ts +6 -0
  21. package/src/dom-simulator/find-special-elements.ts +14 -0
  22. package/src/dom-simulator/get-text-content.ts +18 -0
  23. package/src/dom-simulator/index.ts +36 -0
  24. package/src/dom-simulator/inner-outer-html.ts +182 -0
  25. package/src/dom-simulator/insert-after.ts +20 -0
  26. package/src/dom-simulator/insert-before.ts +108 -0
  27. package/src/dom-simulator/matches.ts +26 -0
  28. package/src/dom-simulator/node-types.ts +26 -0
  29. package/src/dom-simulator/prepend.ts +24 -0
  30. package/src/dom-simulator/remove-child.ts +68 -0
  31. package/src/dom-simulator/remove.ts +7 -0
  32. package/src/dom-simulator/replace-child.ts +152 -0
  33. package/src/dom-simulator/set-text-content.ts +33 -0
  34. package/src/dom-simulator/update-element-content.ts +56 -0
  35. package/src/dom-simulator.ts +12 -1126
  36. package/src/encoding/constants.ts +8 -0
  37. package/src/encoding/detect-encoding.ts +21 -0
  38. package/src/encoding/index.ts +1 -0
  39. package/src/encoding/normalize-encoding.ts +6 -0
  40. package/src/html-entities.ts +2127 -0
  41. package/src/index.ts +5 -5
  42. package/src/parser/adoption-agency-helpers.ts +145 -0
  43. package/src/parser/constants.ts +137 -0
  44. package/src/parser/dom-to-ast.ts +79 -0
  45. package/src/parser/index.ts +9 -0
  46. package/src/parser/parse.ts +772 -0
  47. package/src/parser/types.ts +56 -0
  48. package/src/selectors/find-elements-descendant.ts +47 -0
  49. package/src/selectors/index.ts +2 -0
  50. package/src/selectors/matches-selector.ts +12 -0
  51. package/src/selectors/matches-token.ts +27 -0
  52. package/src/selectors/parse-selector.ts +48 -0
  53. package/src/selectors/query-selector-all.ts +43 -0
  54. package/src/selectors/query-selector.ts +6 -0
  55. package/src/selectors/types.ts +10 -0
  56. package/src/serializer/attributes.ts +74 -0
  57. package/src/serializer/escape.ts +13 -0
  58. package/src/serializer/index.ts +1 -0
  59. package/src/serializer/serialize-tokens.ts +511 -0
  60. package/src/tokenizer/calculate-position.ts +10 -0
  61. package/src/tokenizer/constants.ts +11 -0
  62. package/src/tokenizer/decode-entities.ts +64 -0
  63. package/src/tokenizer/index.ts +2 -0
  64. package/src/tokenizer/parse-attributes.ts +74 -0
  65. package/src/tokenizer/tokenize.ts +165 -0
  66. package/src/tokenizer/types.ts +25 -0
  67. package/tests/adoption-agency-helpers.test.ts +304 -0
  68. package/tests/advanced.test.ts +242 -221
  69. package/tests/cloneNode.test.ts +19 -66
  70. package/tests/custom-elements-head.test.ts +54 -55
  71. package/tests/dom-extended.test.ts +77 -64
  72. package/tests/dom-manipulation.test.ts +51 -24
  73. package/tests/dom.test.ts +15 -13
  74. package/tests/encoding/detect-encoding.test.ts +33 -0
  75. package/tests/google-dom.test.ts +2 -2
  76. package/tests/helpers/tokenizer-adapter.test.ts +29 -43
  77. package/tests/helpers/tokenizer-adapter.ts +36 -33
  78. package/tests/helpers/tree-adapter.test.ts +20 -20
  79. package/tests/helpers/tree-adapter.ts +34 -24
  80. package/tests/html-entities-text.test.ts +6 -2
  81. package/tests/innerhtml-void-elements.test.ts +52 -36
  82. package/tests/outerHTML-replacement.test.ts +37 -65
  83. package/tests/parser/dom-to-ast.test.ts +109 -0
  84. package/tests/parser/parse.test.ts +139 -0
  85. package/tests/parser.test.ts +281 -217
  86. package/tests/selectors/query-selector-all.test.ts +39 -0
  87. package/tests/selectors/query-selector.test.ts +42 -0
  88. package/tests/serializer/attributes.test.ts +132 -0
  89. package/tests/serializer/escape.test.ts +51 -0
  90. package/tests/serializer/serialize-tokens.test.ts +80 -0
  91. package/tests/serializer-core.test.ts +6 -6
  92. package/tests/serializer-injectmeta.test.ts +6 -6
  93. package/tests/serializer-optionaltags.test.ts +9 -6
  94. package/tests/serializer-options.test.ts +6 -6
  95. package/tests/serializer-whitespace.test.ts +6 -6
  96. package/tests/tokenizer/calculate-position.test.ts +34 -0
  97. package/tests/tokenizer/decode-entities.test.ts +31 -0
  98. package/tests/tokenizer/parse-attributes.test.ts +44 -0
  99. package/tests/tokenizer/tokenize.test.ts +757 -0
  100. package/tests/tokenizer-namedEntities.test.ts +10 -7
  101. package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
  102. package/tests/tokenizer.test.ts +268 -256
  103. package/tests/tree-construction-adoption01.test.ts +25 -16
  104. package/tests/tree-construction-adoption02.test.ts +30 -19
  105. package/tests/tree-construction-domjs-unsafe.test.ts +6 -4
  106. package/tests/tree-construction-entities02.test.ts +18 -16
  107. package/tests/tree-construction-html5test-com.test.ts +16 -10
  108. package/tests/tree-construction-math.test.ts +11 -9
  109. package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
  110. package/tests/tree-construction-noscript01.test.ts +11 -9
  111. package/tests/tree-construction-ruby.test.ts +6 -4
  112. package/tests/tree-construction-scriptdata01.test.ts +6 -4
  113. package/tests/tree-construction-svg.test.ts +6 -4
  114. package/tests/tree-construction-template.test.ts +6 -4
  115. package/tests/tree-construction-tests10.test.ts +6 -4
  116. package/tests/tree-construction-tests11.test.ts +6 -4
  117. package/tests/tree-construction-tests20.test.ts +7 -4
  118. package/tests/tree-construction-tests21.test.ts +7 -4
  119. package/tests/tree-construction-tests23.test.ts +7 -4
  120. package/tests/tree-construction-tests24.test.ts +7 -4
  121. package/tests/tree-construction-tests5.test.ts +6 -5
  122. package/tests/tree-construction-tests6.test.ts +6 -5
  123. package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
  124. package/tests/void-elements.test.ts +85 -40
  125. package/tsconfig.json +1 -1
  126. package/src/css-selector.ts +0 -185
  127. package/src/encoding.ts +0 -39
  128. package/src/parser.ts +0 -682
  129. package/src/serializer.ts +0 -450
  130. package/src/tokenizer.ts +0 -325
  131. package/tests/selectors.test.ts +0 -128
@@ -0,0 +1,56 @@
1
+ import type { Token } from "../tokenizer/index.js";
2
+
3
+ export interface ParserState { // State record for the parser; its consumer is not visible here (presumably parser/parse.ts).
4
+ tokens: Token[]; // Token type comes from the tokenizer module.
5
+ position: number; // NOTE(review): presumably the index of the current entry in `tokens` - confirm in parse.ts.
6
+ length: number; // NOTE(review): presumably `tokens.length` cached - confirm in parse.ts.
7
+ stack: any[]; // NOTE(review): presumably the stack of open elements - untyped, confirm.
8
+ root: any; // NOTE(review): presumably the document node being built - untyped, confirm.
9
+ insertionMode: InsertionMode; // One of the InsertionMode enum values.
10
+ errors: ParseError[]; // ParseError records collected in this state.
11
+ explicitHead?: boolean; // Optional flag; NOTE(review): presumably set when the input has its own <head> - confirm.
12
+ activeFormattingElements: any[]; // NOTE(review): presumably the adoption-agency formatting list - confirm.
13
+ formElementPointer?: any; // Optional; NOTE(review): presumably the current <form> element - confirm.
14
+ }
15
+
16
+ export interface ParseError { // One diagnostic entry; ParserState.errors holds an array of these.
17
+ message: string; // Human-readable description.
18
+ position: number; // NOTE(review): presumably an absolute offset into the input - confirm.
19
+ line: number; // NOTE(review): origin (0- or 1-based) is not established here - confirm.
20
+ column: number; // NOTE(review): origin (0- or 1-based) is not established here - confirm.
21
+ severity: "error" | "warning"; // Only these two levels are allowed.
22
+ }
23
+
24
+ export enum InsertionMode { // String-valued enum with six members; used as the type of ParserState.insertionMode.
25
+ Initial = "initial",
26
+ BeforeHtml = "beforeHtml",
27
+ BeforeHead = "beforeHead",
28
+ InHead = "inHead",
29
+ AfterHead = "afterHead",
30
+ InBody = "inBody", // NOTE(review): names resemble the HTML tree-construction insertion modes; only this subset is declared.
31
+ }
32
+
33
+ export enum ASTNodeType { // String-valued discriminator stored in ASTNode.type.
34
+ Document = "document",
35
+ Element = "element",
36
+ Text = "text",
37
+ Comment = "comment",
38
+ Doctype = "doctype",
39
+ CDATA = "cdata",
40
+ }
41
+
42
+ export interface ASTNode { // Tree node; only `type` is required, every other field is optional.
43
+ type: ASTNodeType; // Node kind discriminator.
44
+ tagName?: string; // NOTE(review): presumably set for element nodes only - confirm.
45
+ value?: string;
46
+ attributes?: Record<string, string>; // Attribute name -> string value map.
47
+ children?: ASTNode[]; // Child nodes; recursive.
48
+ isSelfClosing?: boolean;
49
+ content?: string;
50
+ name?: string;
51
+ publicId?: string;
52
+ systemId?: string; // NOTE(review): name/publicId/systemId presumably describe doctype nodes - confirm.
53
+ namespaceURI?: string;
54
+ target?: string;
55
+ data?: string; // NOTE(review): target/data look like processing-instruction fields, yet ASTNodeType has no such member - confirm.
56
+ }
@@ -0,0 +1,47 @@
1
+ import type { SelectorGroup } from "./types.js";
2
+ import { matchesSelector } from "./matches-selector.js";
3
+
4
/**
 * Appends to `results` every element below `node` that satisfies a chain of
 * descendant-combined selector groups (e.g. `div ul li`).
 *
 * The walk is a single depth-first pass. `groupIndex` is the index of the
 * group that still has to be matched on the current ancestor path: when an
 * element matches a non-final group, its subtree is searched for the next
 * group; otherwise the subtree is searched for the same group. Matching the
 * earliest possible ancestor is always sufficient for a descendant chain, so
 * each node is visited exactly once. Consequently:
 *  - an element is never reported twice, even when several of its ancestors
 *    match the same earlier group;
 *  - elements nested inside another match of the final group are reported too;
 *  - matches are appended in document order.
 *
 * @param node Subtree root. Only its descendants are examined, never `node`
 *   itself. Nodes without `childNodes` are treated as leaves.
 * @param selectorGroups Compound selectors, outermost ancestor first.
 * @param groupIndex Index of the group to match next (top-level callers pass 0).
 *   An out-of-range index makes the call a no-op.
 * @param results Output array that receives the matching elements.
 */
export const findElementsDescendant = (
  node: any,
  selectorGroups: SelectorGroup[],
  groupIndex: number,
  results: any[],
): void => {
  const currentGroup = selectorGroups[groupIndex];
  if (!currentGroup) {
    return;
  }

  const isLastGroup = groupIndex === selectorGroups.length - 1;

  for (const child of node.childNodes || []) {
    let nextIndex = groupIndex;

    // nodeType 1 marks element nodes; other node kinds can never match.
    if (child.nodeType === 1 && matchesSelector(child, currentGroup.tokens)) {
      if (isLastGroup) {
        results.push(child);
      } else {
        // This ancestor satisfies the current group; look for the next one below it.
        nextIndex = groupIndex + 1;
      }
    }

    // Always descend: deeper nodes may still match, including inside a match.
    findElementsDescendant(child, selectorGroups, nextIndex, results);
  }
};
@@ -0,0 +1,2 @@
1
// Public entry points of the selector engine. The specifiers use the ".js"
// extension, matching the other relative imports in the selectors and
// serializer modules.
export { querySelectorAll } from "./query-selector-all.js";
export { querySelector } from "./query-selector.js";
@@ -0,0 +1,12 @@
1
+ import type { SelectorToken } from "./types.js";
2
+ import { matchesToken } from "./matches-token.js";
3
+
4
/**
 * Tests an element against one compound selector.
 *
 * The element matches when every token matches. An empty token list matches
 * anything, without consulting `matchesToken` at all.
 *
 * @param element Candidate element.
 * @param tokens Tokens of a single compound selector.
 * @returns `true` when no token rejects the element.
 */
export const matchesSelector = (
  element: any,
  tokens: SelectorToken[],
): boolean => {
  for (const token of tokens) {
    if (!matchesToken(element, token)) {
      return false;
    }
  }
  return true;
};
@@ -0,0 +1,27 @@
1
+ import type { SelectorToken } from "./types.js";
2
+
3
/**
 * Tests a single element against one simple-selector token.
 *
 * @param element Node-like object exposing `tagName` and, optionally, an
 *   `attributes` map. Anything falsy or lacking a `tagName` never matches.
 * @param token Selector token of type tag, class, id or attribute.
 * @returns `true` when the element satisfies the token; `false` for unknown
 *   token types.
 */
export const matchesToken = (element: any, token: SelectorToken): boolean => {
  if (!element || !element.tagName) {
    return false;
  }

  const attributes = element.attributes;

  switch (token.type) {
    case "tag":
      // Token values are compared against the lower-cased tag name.
      return element.tagName.toLowerCase() === token.value;
    case "class": {
      const classAttr = attributes?.class || attributes?.className || "";
      // String() guards against a non-string class value, which has no split().
      const classes = String(classAttr).split(/\s+/).filter(Boolean);
      return classes.includes(token.value);
    }
    case "id":
      return attributes?.id === token.value;
    case "attribute": {
      const attributeName = token.attributeName || "";
      // Only own entries count as attributes; otherwise inherited members of a
      // plain object (e.g. "toString", "constructor") would look like
      // attributes that every element has.
      const attrValue =
        attributes != null &&
        Object.prototype.hasOwnProperty.call(attributes, attributeName)
          ? attributes[attributeName]
          : undefined;
      if (token.attributeValue === undefined) {
        // No value in the selector: a pure presence test.
        return attrValue !== undefined;
      }
      return attrValue === token.attributeValue;
    }
    default:
      return false;
  }
};
@@ -0,0 +1,48 @@
1
+ import type { SelectorGroup, SelectorToken } from "./types.js";
2
+
3
/** One attribute selector: `[name]`, `[name=value]`, `[name="value"]` or `[name='value']`. */
const ATTRIBUTE_SELECTOR = /\[([^=\]]+)(?:=["']?([^"'\]]*?)["']?)?\]/g;
/** One `#id` selector. */
const ID_SELECTOR = /#([a-zA-Z0-9][a-zA-Z0-9_-]*)/g;
/** One `.class` selector. */
const CLASS_SELECTOR = /\.([a-zA-Z][a-zA-Z0-9_-]*)/g;

/**
 * Splits a selector into its whitespace-separated compound selectors while
 * leaving whitespace inside `[...]` untouched, so that
 * `[title="hello world"]` stays in one piece.
 * An empty selector yields a single empty part.
 */
const splitCompoundSelectors = (selector: string): string[] => {
  const parts: string[] = [];
  let current = "";
  let bracketDepth = 0;

  for (const ch of selector.trim()) {
    if (ch === "[") {
      bracketDepth++;
    } else if (ch === "]" && bracketDepth > 0) {
      bracketDepth--;
    }

    if (bracketDepth === 0 && /\s/.test(ch)) {
      if (current !== "") {
        parts.push(current);
        current = "";
      }
      continue;
    }
    current += ch;
  }

  if (current !== "" || parts.length === 0) {
    parts.push(current);
  }
  return parts;
};

/**
 * Parses a selector made of compound selectors joined by descendant
 * (whitespace) combinators into one `SelectorGroup` per compound selector.
 *
 * Each group lists its tokens in the order tag, ids, classes, attributes.
 * `*` (and any part with no recognisable simple selector) yields an empty
 * token list. Attribute selectors are extracted before ids and classes are
 * scanned, so `#` and `.` inside an attribute value (`a[href="#top"]`) are not
 * mistaken for id/class selectors. An explicit empty attribute value
 * (`[a=""]`) is kept as `""`; `attributeValue` is `undefined` only when the
 * selector has no `=` part.
 *
 * @param selector Selector text; other combinators and selector lists are not
 *   interpreted.
 * @returns Groups ordered from outermost ancestor to target element.
 */
export const parseSelector = (selector: string): SelectorGroup[] =>
  splitCompoundSelectors(selector).map((part) => {
    const trimmed = part.trim();
    if (trimmed === "*") {
      return { tokens: [] };
    }

    const tokens: SelectorToken[] = [];
    let remaining = trimmed;

    const tagMatch = /^[a-zA-Z][a-zA-Z0-9-]*/.exec(remaining);
    if (tagMatch) {
      const tagName = tagMatch[0];
      tokens.push({ type: "tag", value: tagName.toLowerCase() });
      remaining = remaining.slice(tagName.length);
    }

    // Pull attribute selectors out first; they are appended after the
    // id/class tokens below to keep the established token order.
    const attributeTokens: SelectorToken[] = [];
    remaining = remaining.replace(
      ATTRIBUTE_SELECTOR,
      (_whole: string, rawName: string, rawValue?: string) => {
        const attributeName = rawName.trim();
        attributeTokens.push({
          type: "attribute",
          value: attributeName,
          attributeName,
          attributeValue: rawValue === undefined ? undefined : rawValue.trim(),
        });
        return "";
      },
    );

    for (const [, id] of remaining.matchAll(ID_SELECTOR)) {
      if (id !== undefined) {
        tokens.push({ type: "id", value: id });
      }
    }
    remaining = remaining.replace(ID_SELECTOR, "");

    for (const [, className] of remaining.matchAll(CLASS_SELECTOR)) {
      if (className !== undefined) {
        tokens.push({ type: "class", value: className });
      }
    }

    tokens.push(...attributeTokens);
    return { tokens };
  });
@@ -0,0 +1,43 @@
1
+ import type { SelectorGroup, SelectorToken } from "./types.js";
2
+ import { parseSelector } from "./parse-selector.js";
3
+ import { matchesSelector } from "./matches-selector.js";
4
+ import { findElementsDescendant } from "./find-elements-descendant.js";
5
+
6
/**
 * Depth-first search for a single compound selector.
 *
 * `node` itself is tested first (only element nodes, i.e. nodeType 1, can
 * match), then each of its children is searched recursively, so matches are
 * appended to `results` in pre-order.
 *
 * @param node Node to test and descend into; a missing `childNodes` is
 *   treated as having no children.
 * @param tokens Tokens of the compound selector.
 * @param results Output array that receives the matching elements.
 */
const findElementsSimple = (
  node: any,
  tokens: SelectorToken[],
  results: any[],
): void => {
  const isMatchingElement =
    node.nodeType === 1 && matchesSelector(node, tokens);
  if (isMatchingElement) {
    results.push(node);
  }

  const children = node.childNodes || [];
  for (const child of children) {
    findElementsSimple(child, tokens, results);
  }
};
21
+
22
/**
 * Dispatches to the right search strategy for the parsed selector.
 *
 * Exactly one group: plain search via `findElementsSimple`.
 * Any other count (including zero): descendant-chain search starting at
 * group 0.
 *
 * NOTE(review): `findElementsSimple` tests `node` itself, while
 * `findElementsDescendant` only looks at its descendants, so a single-group
 * selector can return the root element. Confirm whether callers rely on this.
 *
 * @param node Root of the search.
 * @param selectorGroups Parsed selector groups.
 * @param results Output array that receives the matching elements.
 */
const findElements = (
  node: any,
  selectorGroups: SelectorGroup[],
  results: any[],
): void => {
  if (selectorGroups.length !== 1) {
    findElementsDescendant(node, selectorGroups, 0, results);
    return;
  }

  const [onlyGroup] = selectorGroups;
  if (onlyGroup) {
    findElementsSimple(node, onlyGroup.tokens, results);
  }
};
37
+
38
/**
 * Returns every element under `root` that matches `selector`.
 *
 * @param root Node to search from.
 * @param selector Selector text, parsed with `parseSelector`.
 * @returns A new array of matching elements; empty when nothing matches.
 */
export const querySelectorAll = (root: any, selector: string): any[] => {
  const matches: any[] = [];
  findElements(root, parseSelector(selector), matches);
  return matches;
};
+ };
@@ -0,0 +1,6 @@
1
+ import { querySelectorAll } from "./query-selector-all.js";
2
+
3
/**
 * Returns the first element that `querySelectorAll` finds for `selector`,
 * or `null` when there is none.
 *
 * @param root Node to search from.
 * @param selector Selector text.
 */
export const querySelector = (root: any, selector: string): any | null => {
  const first = querySelectorAll(root, selector)[0];
  return first ? first : null;
};
@@ -0,0 +1,10 @@
1
+ export interface SelectorToken { // One simple selector inside a compound selector.
2
+ type: "tag" | "class" | "id" | "attribute"; // Which kind of simple selector this is.
3
+ value: string; // Tag name (lower-cased by parseSelector), class name, id, or - for attribute tokens - the attribute name.
4
+ attributeName?: string; // Set by parseSelector on attribute tokens only; same text as `value`.
5
+ attributeValue?: string; // Expected attribute value; when undefined, matchesToken only checks that the attribute exists.
6
+ }
7
+
8
+ export interface SelectorGroup { // One whitespace-separated part of a selector, as produced by parseSelector.
9
+ tokens: SelectorToken[]; // All tokens must match the same element; empty for "*", which matches any element.
10
+ }
@@ -0,0 +1,74 @@
1
/**
 * Tells whether an attribute value has to be quoted when serialized.
 *
 * Quoting is required for the empty string and for any value containing a
 * tab, line feed, carriage return, form feed, space, double quote, single
 * quote, `=`, backtick or `>`.
 *
 * @param value Raw attribute value.
 * @returns `true` when the value cannot be written unquoted.
 */
export const needsQuotes = (value: string): boolean => {
  if (value === "") {
    return true;
  }
  return /[\t\n\r\f "'=`>]/.test(value);
};
4
+
5
/**
 * Serializes one attribute as `name`, `name=value` or a quoted `name="value"`.
 *
 * Rules, in order:
 *  1. Unless `minimize_boolean_attributes` is `false`, an attribute whose
 *     value equals its name is written as the bare name.
 *  2. `&` is always escaped; `<` only when `escape_lt_in_attrs` is set.
 *  3. The value is left unquoted unless `needsQuotes(value)` holds,
 *     `quote_attr_values` is set, or a `quote_char` is given.
 *  4. With `quote_char`, that character wraps the value; `'` is escaped when
 *     the quote is `'`, otherwise `"` is escaped.
 *  5. Without `quote_char`, single quotes are used only when the value
 *     contains `"` but no `'`; otherwise double quotes with `"` escaped.
 *
 * @param name Attribute name, written verbatim.
 * @param value Raw attribute value.
 * @param options Serializer options; `escape_rcdata` is accepted but not read here.
 * @returns The serialized attribute without a leading space.
 */
export const serializeAttribute = (
  name: string,
  value: string,
  options?: {
    quote_char?: string;
    quote_attr_values?: boolean;
    minimize_boolean_attributes?: boolean;
    escape_lt_in_attrs?: boolean;
    escape_rcdata?: boolean;
  },
): string => {
  if (options?.minimize_boolean_attributes !== false && value === name) {
    return name;
  }

  const forcedQuote = options?.quote_char;

  // Escaping shared by the quoted and unquoted forms.
  const ampEscaped = value.replace(/&/g, "&amp;");
  const baseEscaped = options?.escape_lt_in_attrs
    ? ampEscaped.replace(/</g, "&lt;")
    : ampEscaped;

  const mustQuote =
    needsQuotes(value) || options?.quote_attr_values || !!forcedQuote;
  if (!mustQuote) {
    return `${name}=${baseEscaped}`;
  }

  if (forcedQuote) {
    const quotedBody =
      forcedQuote === "'"
        ? baseEscaped.replace(/'/g, "&#39;")
        : baseEscaped.replace(/"/g, "&quot;");
    return `${name}=${forcedQuote}${quotedBody}${forcedQuote}`;
  }

  const hasDoubleQuote = value.includes('"');
  const hasSingleQuote = value.includes("'");
  if (hasDoubleQuote && !hasSingleQuote) {
    // Single quotes avoid escaping the embedded double quotes.
    return `${name}='${baseEscaped}'`;
  }
  return `${name}="${baseEscaped.replace(/"/g, "&quot;")}"`;
};
52
+
53
/**
 * Serializes a set of attributes, sorted by name, each preceded by a space.
 *
 * @param attrs Either an array of `{ name, value }` records or a
 *   name -> value object; any falsy value means "no attributes".
 * @param options Passed through unchanged to `serializeAttribute`.
 * @returns The concatenated attributes, or `""` when there are none. Names are
 *   ordered with `String.prototype.localeCompare`.
 */
export const serializeAttributes = (
  attrs: any,
  options?: {
    quote_char?: string;
    quote_attr_values?: boolean;
    minimize_boolean_attributes?: boolean;
    escape_lt_in_attrs?: boolean;
    use_trailing_solidus?: boolean;
    escape_rcdata?: boolean;
  },
): string => {
  // Both branches build a fresh array, so the in-place sort never touches `attrs`.
  const pairs: [string, string][] = Array.isArray(attrs)
    ? attrs.map((attr: any): [string, string] => [attr.name, attr.value])
    : attrs
      ? Object.entries(attrs)
      : [];

  pairs.sort(([left], [right]) => left.localeCompare(right));

  let serialized = "";
  for (const [name, value] of pairs) {
    serialized += " " + serializeAttribute(name, value, options);
  }
  return serialized;
};
@@ -0,0 +1,13 @@
1
/**
 * Escapes text for use as HTML character data: `&`, `<` and `>` become
 * `&amp;`, `&lt;` and `&gt;`.
 *
 * `&` is handled first so the ampersands of the entities added afterwards are
 * not escaped a second time.
 *
 * @param text Raw text.
 * @returns The escaped text.
 */
export const escapeText = (text: string): string => {
  const ampersandsEscaped = text.replace(/&/g, "&amp;");
  const lessThanEscaped = ampersandsEscaped.replace(/</g, "&lt;");
  return lessThanEscaped.replace(/>/g, "&gt;");
};
7
+
8
/**
 * Escapes a value for use inside a quoted HTML attribute: `&`, `"` and `'`
 * become `&amp;`, `&quot;` and `&#39;`.
 *
 * `&` is handled first so the ampersands of the entities added afterwards are
 * not escaped a second time. `<` and `>` are left untouched.
 *
 * @param value Raw attribute value.
 * @returns The escaped value.
 */
export const escapeAttributeValue = (value: string): string => {
  const ampersandsEscaped = value.replace(/&/g, "&amp;");
  const doubleQuotesEscaped = ampersandsEscaped.replace(/"/g, "&quot;");
  return doubleQuotesEscaped.replace(/'/g, "&#39;");
};
@@ -0,0 +1 @@
1
+ export { serializeTokens } from "./serialize-tokens.js";