@tkeron/html-parser 1.1.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/workflows/npm_deploy.yml +14 -4
- package/README.md +6 -6
- package/bun.lock +6 -8
- package/check-versions.ts +147 -0
- package/index.ts +4 -8
- package/package.json +5 -6
- package/src/dom-simulator/append-child.ts +130 -0
- package/src/dom-simulator/append.ts +18 -0
- package/src/dom-simulator/attributes.ts +23 -0
- package/src/dom-simulator/clone-node.ts +51 -0
- package/src/dom-simulator/convert-ast-node-to-dom.ts +37 -0
- package/src/dom-simulator/create-cdata.ts +18 -0
- package/src/dom-simulator/create-comment.ts +23 -0
- package/src/dom-simulator/create-doctype.ts +24 -0
- package/src/dom-simulator/create-document.ts +81 -0
- package/src/dom-simulator/create-element.ts +195 -0
- package/src/dom-simulator/create-processing-instruction.ts +19 -0
- package/src/dom-simulator/create-temp-parent.ts +9 -0
- package/src/dom-simulator/create-text-node.ts +23 -0
- package/src/dom-simulator/escape-text-content.ts +6 -0
- package/src/dom-simulator/find-special-elements.ts +14 -0
- package/src/dom-simulator/get-text-content.ts +18 -0
- package/src/dom-simulator/index.ts +36 -0
- package/src/dom-simulator/inner-outer-html.ts +182 -0
- package/src/dom-simulator/insert-after.ts +20 -0
- package/src/dom-simulator/insert-before.ts +108 -0
- package/src/dom-simulator/matches.ts +26 -0
- package/src/dom-simulator/node-types.ts +26 -0
- package/src/dom-simulator/prepend.ts +24 -0
- package/src/dom-simulator/remove-child.ts +68 -0
- package/src/dom-simulator/remove.ts +7 -0
- package/src/dom-simulator/replace-child.ts +152 -0
- package/src/dom-simulator/set-text-content.ts +33 -0
- package/src/dom-simulator/update-element-content.ts +56 -0
- package/src/dom-simulator.ts +12 -1126
- package/src/encoding/constants.ts +8 -0
- package/src/encoding/detect-encoding.ts +21 -0
- package/src/encoding/index.ts +1 -0
- package/src/encoding/normalize-encoding.ts +6 -0
- package/src/html-entities.ts +2127 -0
- package/src/index.ts +5 -5
- package/src/parser/adoption-agency-helpers.ts +145 -0
- package/src/parser/constants.ts +137 -0
- package/src/parser/dom-to-ast.ts +79 -0
- package/src/parser/index.ts +9 -0
- package/src/parser/parse.ts +772 -0
- package/src/parser/types.ts +56 -0
- package/src/selectors/find-elements-descendant.ts +47 -0
- package/src/selectors/index.ts +2 -0
- package/src/selectors/matches-selector.ts +12 -0
- package/src/selectors/matches-token.ts +27 -0
- package/src/selectors/parse-selector.ts +48 -0
- package/src/selectors/query-selector-all.ts +43 -0
- package/src/selectors/query-selector.ts +6 -0
- package/src/selectors/types.ts +10 -0
- package/src/serializer/attributes.ts +74 -0
- package/src/serializer/escape.ts +13 -0
- package/src/serializer/index.ts +1 -0
- package/src/serializer/serialize-tokens.ts +511 -0
- package/src/tokenizer/calculate-position.ts +10 -0
- package/src/tokenizer/constants.ts +11 -0
- package/src/tokenizer/decode-entities.ts +64 -0
- package/src/tokenizer/index.ts +2 -0
- package/src/tokenizer/parse-attributes.ts +74 -0
- package/src/tokenizer/tokenize.ts +165 -0
- package/src/tokenizer/types.ts +25 -0
- package/tests/adoption-agency-helpers.test.ts +304 -0
- package/tests/advanced.test.ts +242 -221
- package/tests/cloneNode.test.ts +19 -66
- package/tests/custom-elements-head.test.ts +54 -55
- package/tests/dom-extended.test.ts +77 -64
- package/tests/dom-manipulation.test.ts +51 -24
- package/tests/dom.test.ts +15 -13
- package/tests/encoding/detect-encoding.test.ts +33 -0
- package/tests/google-dom.test.ts +2 -2
- package/tests/helpers/tokenizer-adapter.test.ts +29 -43
- package/tests/helpers/tokenizer-adapter.ts +36 -33
- package/tests/helpers/tree-adapter.test.ts +20 -20
- package/tests/helpers/tree-adapter.ts +34 -24
- package/tests/html-entities-text.test.ts +6 -2
- package/tests/innerhtml-void-elements.test.ts +52 -36
- package/tests/outerHTML-replacement.test.ts +37 -65
- package/tests/parser/dom-to-ast.test.ts +109 -0
- package/tests/parser/parse.test.ts +139 -0
- package/tests/parser.test.ts +281 -217
- package/tests/selectors/query-selector-all.test.ts +39 -0
- package/tests/selectors/query-selector.test.ts +42 -0
- package/tests/serializer/attributes.test.ts +132 -0
- package/tests/serializer/escape.test.ts +51 -0
- package/tests/serializer/serialize-tokens.test.ts +80 -0
- package/tests/serializer-core.test.ts +6 -6
- package/tests/serializer-injectmeta.test.ts +6 -6
- package/tests/serializer-optionaltags.test.ts +9 -6
- package/tests/serializer-options.test.ts +6 -6
- package/tests/serializer-whitespace.test.ts +6 -6
- package/tests/tokenizer/calculate-position.test.ts +34 -0
- package/tests/tokenizer/decode-entities.test.ts +31 -0
- package/tests/tokenizer/parse-attributes.test.ts +44 -0
- package/tests/tokenizer/tokenize.test.ts +757 -0
- package/tests/tokenizer-namedEntities.test.ts +10 -7
- package/tests/tokenizer-pendingSpecChanges.test.ts +10 -7
- package/tests/tokenizer.test.ts +268 -256
- package/tests/tree-construction-adoption01.test.ts +25 -16
- package/tests/tree-construction-adoption02.test.ts +30 -19
- package/tests/tree-construction-domjs-unsafe.test.ts +6 -4
- package/tests/tree-construction-entities02.test.ts +18 -16
- package/tests/tree-construction-html5test-com.test.ts +16 -10
- package/tests/tree-construction-math.test.ts +11 -9
- package/tests/tree-construction-namespace-sensitivity.test.ts +11 -9
- package/tests/tree-construction-noscript01.test.ts +11 -9
- package/tests/tree-construction-ruby.test.ts +6 -4
- package/tests/tree-construction-scriptdata01.test.ts +6 -4
- package/tests/tree-construction-svg.test.ts +6 -4
- package/tests/tree-construction-template.test.ts +6 -4
- package/tests/tree-construction-tests10.test.ts +6 -4
- package/tests/tree-construction-tests11.test.ts +6 -4
- package/tests/tree-construction-tests20.test.ts +7 -4
- package/tests/tree-construction-tests21.test.ts +7 -4
- package/tests/tree-construction-tests23.test.ts +7 -4
- package/tests/tree-construction-tests24.test.ts +7 -4
- package/tests/tree-construction-tests5.test.ts +6 -5
- package/tests/tree-construction-tests6.test.ts +6 -5
- package/tests/tree-construction-tests_innerHTML_1.test.ts +6 -5
- package/tests/void-elements.test.ts +85 -40
- package/tsconfig.json +1 -1
- package/src/css-selector.ts +0 -185
- package/src/encoding.ts +0 -39
- package/src/parser.ts +0 -682
- package/src/serializer.ts +0 -450
- package/src/tokenizer.ts +0 -325
- package/tests/selectors.test.ts +0 -128
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import type { Token } from "../tokenizer/index.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Shape of the state object carried through a parse run.
 *
 * NOTE(review): the per-field notes below are derived from names and types
 * only; confirm the exact semantics against the parser implementation.
 */
export interface ParserState {
  /** Token list being consumed. */
  tokens: Token[];
  /** Numeric cursor; presumably an index into `tokens` — TODO confirm. */
  position: number;
  /** Numeric length; presumably `tokens.length` captured up front — TODO confirm. */
  length: number;
  /** Untyped stack; presumably the stack of open elements — TODO confirm. */
  stack: any[];
  /** Untyped root node of the tree under construction. */
  root: any;
  /** Current insertion mode. */
  insertionMode: InsertionMode;
  /** Parse errors collected so far. */
  errors: ParseError[];
  /** Optional flag; presumably set when a head tag appeared in the input — TODO confirm. */
  explicitHead?: boolean;
  /** Untyped list of active formatting elements. */
  activeFormattingElements: any[];
  /** Optional untyped reference to a form element. */
  formElementPointer?: any;
}
|
|
15
|
+
|
|
16
|
+
/**
 * A single diagnostic recorded while parsing.
 */
export interface ParseError {
  /** Human-readable description of the problem. */
  message: string;
  /** Numeric position of the problem; presumably an offset into the input — TODO confirm. */
  position: number;
  /** Line number of the problem (base not visible here — TODO confirm 0- or 1-based). */
  line: number;
  /** Column number of the problem (base not visible here — TODO confirm 0- or 1-based). */
  column: number;
  /** Whether the diagnostic is an error or only a warning. */
  severity: "error" | "warning";
}
|
|
23
|
+
|
|
24
|
+
/**
 * String-valued insertion modes used by the parser state.
 *
 * Each member's runtime value is the camelCase form of its name.
 */
export enum InsertionMode {
  /** Value `"initial"`. */
  Initial = "initial",
  /** Value `"beforeHtml"`. */
  BeforeHtml = "beforeHtml",
  /** Value `"beforeHead"`. */
  BeforeHead = "beforeHead",
  /** Value `"inHead"`. */
  InHead = "inHead",
  /** Value `"afterHead"`. */
  AfterHead = "afterHead",
  /** Value `"inBody"`. */
  InBody = "inBody",
}
|
|
32
|
+
|
|
33
|
+
/**
 * String-valued discriminator for the kinds of AST node.
 *
 * Each member's runtime value is the lowercase form of its name.
 */
export enum ASTNodeType {
  /** Value `"document"`. */
  Document = "document",
  /** Value `"element"`. */
  Element = "element",
  /** Value `"text"`. */
  Text = "text",
  /** Value `"comment"`. */
  Comment = "comment",
  /** Value `"doctype"`. */
  Doctype = "doctype",
  /** Value `"cdata"`. */
  CDATA = "cdata",
}
|
|
41
|
+
|
|
42
|
+
/**
 * A node of the parsed tree.
 *
 * Only `type` is required. Which optional fields are populated presumably
 * depends on `type` — verify against the code that produces these nodes.
 */
export interface ASTNode {
  /** Kind of node. */
  type: ASTNodeType;
  /** Tag name (optional). */
  tagName?: string;
  /** String value (optional). */
  value?: string;
  /** Attribute name → attribute value map (optional). */
  attributes?: Record<string, string>;
  /** Child nodes (optional). */
  children?: ASTNode[];
  /** Self-closing flag (optional). */
  isSelfClosing?: boolean;
  /** String content (optional). */
  content?: string;
  /** Name (optional); presumably the doctype name — TODO confirm. */
  name?: string;
  /** Public identifier (optional). */
  publicId?: string;
  /** System identifier (optional). */
  systemId?: string;
  /** Namespace URI (optional). */
  namespaceURI?: string;
  /** Target (optional); presumably for processing instructions — TODO confirm. */
  target?: string;
  /** Data (optional). */
  data?: string;
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import type { SelectorGroup } from "./types.js";
|
|
2
|
+
import { matchesSelector } from "./matches-selector.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Collects, in document order, every descendant of `node` that matches a
 * descendant-combinator selector (`A B C`).
 *
 * The subtree is walked exactly once. While walking, `groupIndex` is the
 * number of leading selector groups already matched by ancestors of the
 * current position. Matching each group at the earliest possible ancestor
 * is sufficient for descendant combinators, so every element is tested
 * against one group only and can be reported at most once.
 *
 * This replaces a traversal that re-entered the same subtree with both
 * `groupIndex` and `groupIndex + 1`, which could push the same element more
 * than once, and that stopped descending below an element matching the
 * final group, which missed nested matches.
 *
 * @param node Node whose `childNodes` are searched; `node` itself is never
 *   tested or reported.
 * @param selectorGroups Compound selectors, outermost ancestor first.
 * @param groupIndex Index of the group to match next; callers start at 0.
 *   Out-of-range values make the call a no-op.
 * @param results Output array; matching elements are appended to it.
 */
export const findElementsDescendant = (
  node: any,
  selectorGroups: SelectorGroup[],
  groupIndex: number,
  results: any[],
): void => {
  if (groupIndex >= selectorGroups.length) {
    return;
  }

  const currentGroup = selectorGroups[groupIndex];
  if (!currentGroup) {
    return;
  }

  const lastIndex = selectorGroups.length - 1;

  for (const child of node.childNodes || []) {
    // Group that descendants of `child` must match next.
    let nextIndex = groupIndex;

    if (child.nodeType === 1 && matchesSelector(child, currentGroup.tokens)) {
      if (groupIndex === lastIndex) {
        // All ancestor groups are already satisfied: this is a result.
        results.push(child);
      } else {
        // `child` satisfies this ancestor group; its descendants move on.
        nextIndex = groupIndex + 1;
      }
    }

    // Always descend (non-element nodes included) so nested matches are
    // found; each subtree is entered once, which rules out duplicates.
    findElementsDescendant(child, selectorGroups, nextIndex, results);
  }
};
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import type { SelectorToken } from "./types.js";
|
|
2
|
+
import { matchesToken } from "./matches-token.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Tests an element against one compound selector.
 *
 * @param element Candidate element (untyped).
 * @param tokens Simple-selector tokens that must all hold. An empty list
 *   matches every element.
 * @returns `true` when no token is rejected by `matchesToken`.
 */
export const matchesSelector = (
  element: any,
  tokens: SelectorToken[],
): boolean => {
  const hasFailingToken = tokens.some(
    (token) => !matchesToken(element, token),
  );
  return !hasFailingToken;
};
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import type { SelectorToken } from "./types.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Tests an element against a single simple-selector token.
 *
 * @param element Candidate element (untyped). Anything falsy, or without a
 *   truthy `tagName`, never matches.
 * @param token Token to test; `token.type` selects the comparison.
 * @returns Whether the element satisfies the token. Unknown token types
 *   yield `false`.
 */
export const matchesToken = (element: any, token: SelectorToken): boolean => {
  const isUsable = Boolean(element) && Boolean(element.tagName);
  if (!isUsable) {
    return false;
  }

  const kind = token.type;

  if (kind === "tag") {
    // Tag names are compared after lower-casing the element's tag name.
    return element.tagName.toLowerCase() === token.value;
  }

  if (kind === "class") {
    // The class list is read from `class`, falling back to `className`.
    const rawClasses =
      element.attributes?.class || element.attributes?.className || "";
    const classList = rawClasses.split(/\s+/).filter(Boolean);
    return classList.includes(token.value);
  }

  if (kind === "id") {
    return element.attributes?.id === token.value;
  }

  if (kind === "attribute") {
    const actual = element.attributes?.[token.attributeName || ""];
    // No expected value means a pure presence test.
    return token.attributeValue === undefined
      ? actual !== undefined
      : actual === token.attributeValue;
  }

  return false;
};
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import type { SelectorGroup, SelectorToken } from "./types.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Parses a selector made of compound selectors separated by whitespace
 * (descendant combinators) into one token group per compound selector.
 *
 * Supported simple selectors: `*`, tag names, `#id`, `.class`, `[attr]` and
 * `[attr=value]` (value optionally quoted).
 *
 * Within a group the tokens are emitted as: tag, ids, classes, attributes.
 *
 * Fixes over the previous implementation:
 * - Attribute selectors are extracted before ids and classes, so `.` or `#`
 *   inside an attribute value (`a[href="index.html"]`, `a[href="#top"]`)
 *   is no longer mistaken for a class or id and cut out of the value.
 * - `[attr=""]` keeps its empty expected value instead of degrading to a
 *   presence test.
 * - Whitespace inside a closed `[...]` no longer splits the selector
 *   (`[title="hello world"]`).
 *
 * @param selector Selector text.
 * @returns One group per compound selector, outermost ancestor first. `*`
 *   and the empty string produce a group with no tokens.
 */
export const parseSelector = (selector: string): SelectorGroup[] => {
  // Split on whitespace, but treat a closed `[...]` as one unit. An
  // unclosed `[` falls through to `\S`, so malformed input still splits on
  // plain whitespace. `match` returns null only for an empty selector,
  // which is kept as a single empty part.
  const parts: string[] = selector.trim().match(/(?:\[[^\]]*\]|\S)+/g) ?? [
    "",
  ];

  return parts.map((part) => {
    const trimmed = part.trim();

    if (trimmed === "*") {
      return { tokens: [] };
    }

    const tokens: SelectorToken[] = [];
    let remaining = trimmed;

    const tagName = remaining.match(/^([a-zA-Z][a-zA-Z0-9-]*)/)?.[1];
    if (tagName) {
      tokens.push({ type: "tag", value: tagName.toLowerCase() });
      remaining = remaining.slice(tagName.length);
    }

    // Pull attribute selectors out first so their values cannot be
    // misread by the id/class patterns below. They are buffered and
    // appended last to keep the established token order.
    const attributePattern = /\[([^=\]]+)(?:=["']?([^"'\]]*?)["']?)?\]/g;
    const attributeTokens: SelectorToken[] = [];
    for (const match of remaining.matchAll(attributePattern)) {
      const attributeName = (match[1] ?? "").trim();
      const rawValue = match[2];
      attributeTokens.push({
        type: "attribute",
        value: attributeName,
        attributeName,
        // `undefined` only when there was no `=`; an empty value is kept.
        attributeValue: rawValue !== undefined ? rawValue.trim() : undefined,
      });
    }
    remaining = remaining.replace(attributePattern, "");

    for (const match of remaining.matchAll(/#([a-zA-Z0-9][a-zA-Z0-9_-]*)/g)) {
      tokens.push({ type: "id", value: match[1] ?? "" });
    }
    remaining = remaining.replace(/#[a-zA-Z0-9][a-zA-Z0-9_-]*/g, "");

    for (const match of remaining.matchAll(/\.([a-zA-Z][a-zA-Z0-9_-]*)/g)) {
      tokens.push({ type: "class", value: match[1] ?? "" });
    }

    tokens.push(...attributeTokens);

    return { tokens };
  });
};
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
import type { SelectorGroup, SelectorToken } from "./types.js";
|
|
2
|
+
import { parseSelector } from "./parse-selector.js";
|
|
3
|
+
import { matchesSelector } from "./matches-selector.js";
|
|
4
|
+
import { findElementsDescendant } from "./find-elements-descendant.js";
|
|
5
|
+
|
|
6
|
+
/**
 * Pre-order walk that collects every element satisfying one compound
 * selector.
 *
 * NOTE(review): `node` itself is tested as well as its descendants, so the
 * starting node can appear in `results`.
 *
 * @param node Node to start from (untyped).
 * @param tokens Tokens of the compound selector.
 * @param results Output array; matches are appended in document order.
 */
const findElementsSimple = (
  node: any,
  tokens: SelectorToken[],
  results: any[],
): void => {
  const isElement = node.nodeType === 1;
  if (isElement && matchesSelector(node, tokens)) {
    results.push(node);
  }

  const children = node.childNodes || [];
  for (const child of children) {
    findElementsSimple(child, tokens, results);
  }
};
|
|
21
|
+
|
|
22
|
+
/**
 * Dispatches to the matching strategy for the parsed selector: a plain
 * subtree scan for a single compound selector, the descendant-combinator
 * search otherwise.
 *
 * @param node Node to search from (untyped).
 * @param selectorGroups Parsed selector groups.
 * @param results Output array; matches are appended to it.
 */
const findElements = (
  node: any,
  selectorGroups: SelectorGroup[],
  results: any[],
): void => {
  if (selectorGroups.length !== 1) {
    findElementsDescendant(node, selectorGroups, 0, results);
    return;
  }

  const onlyGroup = selectorGroups[0];
  if (!onlyGroup) {
    return;
  }

  findElementsSimple(node, onlyGroup.tokens, results);
};
|
|
37
|
+
|
|
38
|
+
/**
 * Returns every element reachable from `root` that matches `selector`.
 *
 * @param root Node to search from (untyped).
 * @param selector Selector text, parsed with `parseSelector`.
 * @returns A new array of matching elements; empty when nothing matches.
 */
export const querySelectorAll = (root: any, selector: string): any[] => {
  const matches: any[] = [];
  findElements(root, parseSelector(selector), matches);
  return matches;
};
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
 * Reports whether an attribute value has to be quoted when serialized.
 *
 * @param value Raw attribute value.
 * @returns `true` for the empty string, or when the value contains a tab,
 *   line feed, carriage return, form feed, space, `"`, `'`, `=`, a backtick
 *   or `>`.
 */
export const needsQuotes = (value: string): boolean => {
  if (value.length === 0) {
    return true;
  }
  return /[\t\n\r\f "'=`>]/.test(value);
};
|
|
4
|
+
|
|
5
|
+
/**
 * Serializes one attribute as `name`, `name=value`, `name="value"` or
 * `name='value'`.
 *
 * The entity replacement strings are restored here (`&amp;`, `&lt;`,
 * `&quot;`, `&#39;`); with the entities decoded, the replacements were
 * no-ops or unparseable string literals.
 *
 * Rules, in order:
 * 1. Unless `minimize_boolean_attributes` is `false`, a value equal to the
 *    name is minimized to just the name.
 * 2. `&` is always escaped; `<` is escaped only with `escape_lt_in_attrs`.
 * 3. The value is left unquoted when `needsQuotes` allows it and neither
 *    `quote_attr_values` nor `quote_char` is set.
 * 4. With `quote_char`, that character delimits the value. A `'` delimiter
 *    escapes `'` in the value; any other delimiter escapes `"`.
 * 5. Otherwise double quotes are used, except that a value containing `"`
 *    but no `'` is wrapped in single quotes so nothing needs escaping.
 *
 * @param name Attribute name, emitted verbatim.
 * @param value Raw attribute value.
 * @param options Serializer options; `escape_rcdata` is accepted but not
 *   read by this function.
 * @returns The serialized attribute without a leading space.
 */
export const serializeAttribute = (
  name: string,
  value: string,
  options?: {
    quote_char?: string;
    quote_attr_values?: boolean;
    minimize_boolean_attributes?: boolean;
    escape_lt_in_attrs?: boolean;
    escape_rcdata?: boolean;
  },
): string => {
  if (options?.minimize_boolean_attributes !== false && value === name) {
    return name;
  }

  // Escaping common to the quoted and unquoted forms.
  let escaped = value.replace(/&/g, "&amp;");
  if (options?.escape_lt_in_attrs) {
    escaped = escaped.replace(/</g, "&lt;");
  }

  const forcedQuote = options?.quote_char;
  const mustQuote =
    needsQuotes(value) ||
    Boolean(options?.quote_attr_values) ||
    Boolean(forcedQuote);

  if (!mustQuote) {
    return `${name}=${escaped}`;
  }

  if (forcedQuote) {
    // Only the delimiter character itself has to be escaped.
    escaped =
      forcedQuote === "'"
        ? escaped.replace(/'/g, "&#39;")
        : escaped.replace(/"/g, "&quot;");
    return `${name}=${forcedQuote}${escaped}${forcedQuote}`;
  }

  // Prefer the delimiter that avoids escaping when only `"` is present.
  if (value.includes('"') && !value.includes("'")) {
    return `${name}='${escaped}'`;
  }

  return `${name}="${escaped.replace(/"/g, "&quot;")}"`;
};
|
|
52
|
+
|
|
53
|
+
/**
 * Serializes a set of attributes, each preceded by a single space.
 *
 * @param attrs Either an array of objects with `name`/`value` properties,
 *   or an object mapping names to values. Any other falsy input yields
 *   an empty string.
 * @param options Serializer options, passed through to
 *   `serializeAttribute` unchanged.
 * @returns The concatenated attributes, ordered by name using
 *   `String.prototype.localeCompare`.
 */
export const serializeAttributes = (
  attrs: any,
  options?: {
    quote_char?: string;
    quote_attr_values?: boolean;
    minimize_boolean_attributes?: boolean;
    escape_lt_in_attrs?: boolean;
    use_trailing_solidus?: boolean;
    escape_rcdata?: boolean;
  },
): string => {
  const pairs: [string, string][] = Array.isArray(attrs)
    ? attrs.map((attr: any): [string, string] => [attr.name, attr.value])
    : attrs
      ? Object.entries(attrs)
      : [];

  // `pairs` is a fresh array, so sorting in place affects no caller data.
  pairs.sort((left, right) => left[0].localeCompare(right[0]));

  let serialized = "";
  for (const [name, value] of pairs) {
    serialized += " " + serializeAttribute(name, value, options);
  }
  return serialized;
};
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
 * Escapes text for use as HTML character data.
 *
 * The entity replacement strings are restored here; with the entities
 * decoded, every replacement mapped a character to itself and the function
 * escaped nothing.
 *
 * @param text Raw text.
 * @returns `text` with `&`, `<` and `>` replaced by `&amp;`, `&lt;` and
 *   `&gt;`.
 */
export const escapeText = (text: string): string => {
  return text
    // `&` must go first so the entities produced below are not re-escaped.
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;");
};
|
|
7
|
+
|
|
8
|
+
/**
 * Escapes a value so it can sit inside either a double- or single-quoted
 * HTML attribute.
 *
 * The entity replacement strings are restored here; with the entities
 * decoded, the string literals did not parse.
 *
 * @param value Raw attribute value.
 * @returns `value` with `&`, `"` and `'` replaced by `&amp;`, `&quot;` and
 *   `&#39;`.
 */
export const escapeAttributeValue = (value: string): string => {
  return value
    // `&` must go first so the entities produced below are not re-escaped.
    .replace(/&/g, "&amp;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { serializeTokens } from "./serialize-tokens.js";
|