feedsweep 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +75 -19
- package/dist/common.d.ts +8 -8
- package/dist/common.js +65 -42
- package/dist/defaults.d.ts +2 -2
- package/dist/defaults.js +48 -15
- package/dist/embeds/youtube.js +2 -2
- package/dist/index.d.ts +17 -10
- package/dist/index.js +25 -14
- package/dist/parsers/linkedom.d.ts +4 -0
- package/dist/parsers/linkedom.js +38 -0
- package/dist/transforms/dom/convertBreaksToParagraphs.d.ts +6 -0
- package/dist/transforms/dom/convertBreaksToParagraphs.js +80 -0
- package/dist/transforms/dom/decodeDoubleEncodedTags.d.ts +6 -0
- package/dist/transforms/dom/decodeDoubleEncodedTags.js +30 -0
- package/dist/transforms/dom/enrichEmbedPlaceholders.d.ts +6 -0
- package/dist/transforms/dom/enrichEmbedPlaceholders.js +32 -0
- package/dist/transforms/dom/fixLazyImages.js +33 -13
- package/dist/transforms/dom/highlightCode.js +3 -2
- package/dist/transforms/dom/injectEnclosures.d.ts +6 -0
- package/dist/transforms/dom/injectEnclosures.js +66 -0
- package/dist/transforms/dom/mergeConsecutiveOneLinerPres.js +1 -1
- package/dist/transforms/dom/mergeFragmentedLists.d.ts +6 -0
- package/dist/transforms/dom/mergeFragmentedLists.js +84 -0
- package/dist/transforms/dom/proxyAssetUrls.d.ts +6 -0
- package/dist/transforms/dom/proxyAssetUrls.js +64 -0
- package/dist/transforms/dom/removeTrackingPixels.js +22 -25
- package/dist/transforms/dom/replaceEmbedsWithPlaceholders.js +24 -25
- package/dist/transforms/dom/replacePreLineBreaks.js +3 -4
- package/dist/transforms/dom/resolveRelativeUrls.js +44 -30
- package/dist/transforms/dom/stripComments.js +5 -15
- package/dist/transforms/dom/stripDeadAnchors.d.ts +6 -0
- package/dist/transforms/dom/stripDeadAnchors.js +20 -0
- package/dist/transforms/dom/stripDuplicateTitleHeading.d.ts +6 -0
- package/dist/transforms/dom/stripDuplicateTitleHeading.js +31 -0
- package/dist/transforms/dom/stripEmptyTags.d.ts +6 -0
- package/dist/transforms/dom/stripEmptyTags.js +53 -0
- package/dist/transforms/dom/stripInterBlockBreaks.js +28 -8
- package/dist/transforms/dom/stripParagraphBoundaryBreaks.js +26 -6
- package/dist/transforms/dom/stripTrackingParams.js +7 -6
- package/dist/transforms/dom/trimPreWhitespace.js +4 -3
- package/dist/transforms/dom/unwrapDoublyNestedLists.d.ts +6 -0
- package/dist/transforms/dom/unwrapDoublyNestedLists.js +41 -0
- package/dist/transforms/dom/unwrapRedirectUrls.js +4 -2
- package/dist/transforms/dom/unwrapWrappers.d.ts +6 -0
- package/dist/transforms/dom/unwrapWrappers.js +30 -0
- package/dist/transforms/string/paragraphizePlainText.js +1 -1
- package/dist/transforms/string/stripOversizedBase64Sources.d.ts +6 -0
- package/dist/transforms/string/stripOversizedBase64Sources.js +13 -0
- package/dist/transforms/string/unwrapCdataComments.d.ts +6 -0
- package/dist/transforms/string/unwrapCdataComments.js +10 -0
- package/dist/types.d.ts +37 -7
- package/dist/unwraps/google.js +1 -1
- package/dist/unwraps/googleNewsModern.js +7 -3
- package/package.json +15 -5
- package/dist/transforms/dom/injectEnclosureEmbedPlaceholders.d.ts +0 -6
- package/dist/transforms/dom/injectEnclosureEmbedPlaceholders.js +0 -33
- package/dist/transforms/dom/simplifyFigures.d.ts +0 -6
- package/dist/transforms/dom/simplifyFigures.js +0 -27
- package/dist/transforms/string/decodeDoubleEncodedTags.d.ts +0 -6
- package/dist/transforms/string/decodeDoubleEncodedTags.js +0 -23
- package/dist/transforms/string/stripEmptyTags.d.ts +0 -6
- package/dist/transforms/string/stripEmptyTags.js +0 -25
- package/dist/transforms/string/stripOrphanedClosingTags.d.ts +0 -6
- package/dist/transforms/string/stripOrphanedClosingTags.js +0 -28
- package/dist/transforms/string/unwrapWrappers.d.ts +0 -6
- package/dist/transforms/string/unwrapWrappers.js +0 -10
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { parseHTML } from "linkedom";
|
|
2
|
+
//#region src/parsers/linkedom.ts
|
|
3
|
+
/**
 * Lower-cases every attribute name on every element of `document`, in place.
 *
 * When two attributes of one element collapse to the same lower-cased name,
 * the first one (in attribute order) wins and the later ones are dropped.
 * Elements whose attribute names are already lower-case and unique are not
 * touched at all.
 *
 * @param document Document whose elements are normalised.
 */
const normalizeAttributeCase = (document) => {
  for (const element of document.querySelectorAll("*")) {
    // Copy name/value pairs up front: the attributes are removed and re-added below.
    const snapshot = Array.from(element.attributes, (attribute) => [attribute.name, attribute.value]);
    const normalized = new Map();
    let dirty = false;
    for (const [rawName, value] of snapshot) {
      const key = rawName.toLowerCase();
      if (key !== rawName) dirty = true;
      if (normalized.has(key)) {
        // Case-insensitive duplicate: keep the earlier value.
        dirty = true;
      } else {
        normalized.set(key, value);
      }
    }
    if (dirty) {
      for (const [rawName] of snapshot) element.removeAttribute(rawName);
      for (const [key, value] of normalized) element.setAttribute(key, value);
    }
  }
};
|
|
25
|
+
// Matches one whole <svg ...>...</svg> region (non-greedy, case-insensitive).
const svgRegionRegex = /<svg\b[^>]*>[\s\S]*?<\/svg>/gi;
// Matches a self-closing tag: group 1 is the tag name, group 2 the raw attribute text.
const svgSelfCloseRegex = /<([a-z][a-z0-9-]*)((?:\s[^>]*)?)\s*\/>/gi;
/**
 * Rewrites self-closing tags (`<path ... />`) found inside `<svg>` regions
 * into explicit open/close pairs (`<path ...></path>`). Markup outside of
 * `<svg>...</svg>` regions is returned unchanged.
 *
 * @param html Raw HTML string.
 * @returns The HTML with self-closing tags inside SVG regions expanded.
 */
const expandSvgSelfClose = (html) => {
  const expandTags = (svgBlock) => {
    return svgBlock.replace(svgSelfCloseRegex, (_match, tagName, attributes) => {
      return `<${tagName}${attributes}></${tagName}>`;
    });
  };
  return html.replace(svgRegionRegex, (svgBlock) => expandTags(svgBlock));
};
|
|
32
|
+
/**
 * Parses an HTML fragment into a document.
 *
 * The fragment is first passed through `expandSvgSelfClose`, wrapped in a
 * minimal `<!doctype html>` page as the body content, parsed with linkedom's
 * `parseHTML`, and finally run through `normalizeAttributeCase`.
 *
 * @param html HTML fragment to parse.
 * @returns The parsed document.
 */
const parseHtml = (html) => {
  const bodyMarkup = expandSvgSelfClose(html);
  const page = `<!doctype html><html><head></head><body>${bodyMarkup}</body></html>`;
  const { document } = parseHTML(page);
  normalizeAttributeCase(document);
  return document;
};
|
|
37
|
+
//#endregion
|
|
38
|
+
export { parseHtml };
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { Node, hasAncestorWithTagName, isBlockElement, isBr, isWhitespaceText } from "../../common.js";
|
|
2
|
+
//#region src/transforms/dom/convertBreaksToParagraphs.ts
|
|
3
|
+
// Elements whose direct children are examined for <br> separators.
const processContainersSelector = "body, div, blockquote, td, li, article, section, main, header, footer, aside";
// Containers with one of these tags as an ancestor are skipped.
const preOrCodeTags = new Set(["pre", "code"]);
/**
 * Creates a DOM transform that turns runs of two or more `<br>` between the
 * direct children of a container into paragraph boundaries.
 *
 * For every matching container that has at least one direct `<br>` child:
 * - the children are split into chunks at each run of >= 2 `<br>` (nodes for
 *   which `isWhitespaceText` is true may sit between the breaks of a run);
 * - chunks with no content are dropped;
 * - chunks containing a block element are kept as-is;
 * - every other chunk is wrapped in a new `<p>`.
 * Containers that yield a single chunk are left untouched.
 *
 * @returns A function taking the document to transform in place.
 */
const convertBreaksToParagraphs = () => {
  const openChunk = (start) => ({
    start,
    end: start,
    hasContent: false,
    hasBlock: false
  });

  // True when at least one direct child of `container` is a <br>.
  const hasDirectBreak = (container) => {
    for (let node = container.firstChild; node; node = node.nextSibling) {
      if (isBr(node)) return true;
    }
    return false;
  };

  // Splits the child list into chunks of [start, end) indexes.
  const splitIntoChunks = (nodes) => {
    const total = nodes.length;
    const chunks = [];
    let open = openChunk(0);
    let index = 0;
    while (index < total) {
      const node = nodes[index];
      if (!isBr(node)) {
        const nodeType = node.nodeType;
        if (nodeType === Node.ELEMENT_NODE) {
          open.hasContent = true;
          if (isBlockElement(node)) open.hasBlock = true;
        } else if (nodeType === Node.TEXT_NODE && !open.hasContent && node.textContent?.trim()) {
          open.hasContent = true;
        }
        index += 1;
        continue;
      }
      // Measure the run of breaks starting here, skipping whitespace text in between.
      let breaks = 1;
      let cursor = index + 1;
      for (; cursor < total; cursor += 1) {
        const candidate = nodes[cursor];
        if (isBr(candidate)) breaks += 1;
        else if (!isWhitespaceText(candidate)) break;
      }
      if (breaks < 2) {
        // A lone <br> is ordinary inline content of the current chunk.
        open.hasContent = true;
        index += 1;
        continue;
      }
      open.end = index;
      chunks.push(open);
      open = openChunk(cursor);
      index = cursor;
    }
    open.end = total;
    chunks.push(open);
    return chunks;
  };

  return (document) => {
    for (const container of document.querySelectorAll(processContainersSelector)) {
      if (!hasDirectBreak(container)) continue;
      if (hasAncestorWithTagName(container, preOrCodeTags)) continue;
      const nodes = [];
      for (let node = container.firstChild; node; node = node.nextSibling) nodes.push(node);
      const chunks = splitIntoChunks(nodes);
      if (chunks.length < 2) continue;
      const rebuilt = [];
      for (const { start, end, hasContent, hasBlock } of chunks) {
        if (!hasContent) continue;
        if (hasBlock) {
          for (let position = start; position < end; position += 1) rebuilt.push(nodes[position]);
          continue;
        }
        const paragraph = document.createElement("p");
        for (let position = start; position < end; position += 1) paragraph.appendChild(nodes[position]);
        rebuilt.push(paragraph);
      }
      container.replaceChildren(...rebuilt);
    }
  };
};
|
|
79
|
+
//#endregion
|
|
80
|
+
export { convertBreaksToParagraphs };
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import { NodeFilter, hasAncestorWithTagName } from "../../common.js";
|
|
2
|
+
//#region src/transforms/dom/decodeDoubleEncodedTags.ts
|
|
3
|
+
// Text inside these elements is never re-parsed as markup.
const opaqueTags = new Set([
  "code",
  "pre",
  "script",
  "style",
  "textarea",
  "noscript"
]);
// Detects something shaped like an opening or closing tag inside text data.
const tagInTextRegex = /<\/?[a-zA-Z][\w-]*[^<>]*>/;
/**
 * Creates a DOM transform that re-parses text nodes containing tag-like
 * markup and replaces each such text node with the parsed nodes.
 *
 * Nothing is done when `<body>` has no element children. Text nodes that have
 * an ancestor listed in `opaqueTags` are left alone.
 *
 * The candidate text nodes are collected first and replaced afterwards. A
 * TreeWalker is not repositioned when its current node is removed, so
 * replacing nodes during the walk only works on DOM implementations that
 * snapshot the node list; on a standards-following walker the remaining text
 * nodes would never be reached.
 *
 * NOTE(review): the text is assigned to `innerHTML`, so feed-supplied markup
 * becomes live DOM here; presumably sanitisation happens elsewhere in the
 * pipeline - confirm.
 *
 * @returns A function taking the document to transform in place.
 */
const decodeDoubleEncodedTags = () => {
  return (document) => {
    if (document.body.children.length === 0) return;
    document.body.normalize();
    const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
    const pending = [];
    for (let node = walker.nextNode(); node !== null; node = walker.nextNode()) {
      const data = node.data;
      if (!data.includes("<") || !tagInTextRegex.test(data)) continue;
      if (hasAncestorWithTagName(node, opaqueTags)) continue;
      pending.push(node);
    }
    if (pending.length === 0) return;
    // One scratch element is reused for every re-parse.
    const scratch = document.createElement("div");
    for (const text of pending) {
      scratch.innerHTML = text.data;
      text.replaceWith(...scratch.childNodes);
    }
  };
};
|
|
29
|
+
//#endregion
|
|
30
|
+
export { decodeDoubleEncodedTags };
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import { applyEmbedMetadata } from "../../common.js";
|
|
2
|
+
//#region src/transforms/dom/enrichEmbedPlaceholders.ts
|
|
3
|
+
/**
 * Creates a DOM transform that asks `context.enrichEmbedFn` for extra
 * metadata about every embed placeholder and applies it to the placeholder.
 *
 * Placeholders are elements carrying both `data-embed-provider` and
 * `data-embed-id`. `enrichEmbedFn` receives one `{ provider, id }` entry per
 * placeholder and is expected to resolve to a Map-like object keyed by
 * `"<provider>:<id>"`. Metadata is applied through `applyEmbedMetadata` with
 * `{ setIfMissing: true }`.
 *
 * Enrichment is best-effort: if `enrichEmbedFn` throws or rejects, or if it
 * resolves to something without a `get` method (for example `undefined`), the
 * document is left unchanged instead of failing the transform.
 *
 * @param context Transform context; only `enrichEmbedFn` is read.
 * @returns A no-op when no `enrichEmbedFn` is configured, otherwise an async
 *   function taking the document to enrich in place.
 */
const enrichEmbedPlaceholders = (context) => {
  const enrichEmbedFn = context.enrichEmbedFn;
  if (!enrichEmbedFn) return () => {};
  return async (document) => {
    const placeholders = document.querySelectorAll("[data-embed-provider][data-embed-id]");
    const count = placeholders.length;
    if (!count) return;
    const embeds = new Array(count);
    for (let i = 0; i < count; i++) {
      const element = placeholders[i];
      embeds[i] = {
        provider: element.getAttribute("data-embed-provider") ?? "",
        id: element.getAttribute("data-embed-id") ?? ""
      };
    }
    let enriched;
    try {
      enriched = await enrichEmbedFn(embeds);
    } catch {
      // Deliberate best-effort: a failing enricher must not break the pipeline.
      return;
    }
    // Treat a missing or malformed result like a failed lookup rather than throwing.
    if (typeof enriched?.get !== "function") return;
    for (let i = 0; i < count; i++) {
      const embed = embeds[i];
      const data = enriched.get(`${embed.provider}:${embed.id}`);
      if (data) applyEmbedMetadata(placeholders[i], data, { setIfMissing: true });
    }
  };
};
|
|
31
|
+
//#endregion
|
|
32
|
+
export { enrichEmbedPlaceholders };
|
|
@@ -5,29 +5,49 @@ const isUrlShaped = (value) => {
|
|
|
5
5
|
return urlShapeRegex.test(value) && !value.startsWith("{") && !value.startsWith("[");
|
|
6
6
|
};
|
|
7
7
|
const fixLazyImages = (context) => {
|
|
8
|
+
const lazySrcSet = new Set(context.lazySrcAttributes);
|
|
9
|
+
const lazySrcsetSet = new Set(context.lazySrcsetAttributes);
|
|
10
|
+
const { lazySrcAttributes, lazySrcsetAttributes } = context;
|
|
8
11
|
return (document) => {
|
|
9
12
|
const images = document.querySelectorAll("img");
|
|
10
13
|
for (const image of images) {
|
|
11
|
-
let
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
if (!
|
|
15
|
-
|
|
16
|
-
|
|
14
|
+
let hasSrcCandidate = false;
|
|
15
|
+
let hasSrcsetCandidate = false;
|
|
16
|
+
for (const name of image.getAttributeNames()) {
|
|
17
|
+
if (!hasSrcCandidate && lazySrcSet.has(name)) hasSrcCandidate = true;
|
|
18
|
+
if (!hasSrcsetCandidate && lazySrcsetSet.has(name)) hasSrcsetCandidate = true;
|
|
19
|
+
if (hasSrcCandidate && hasSrcsetCandidate) break;
|
|
20
|
+
}
|
|
21
|
+
if (hasSrcCandidate) {
|
|
22
|
+
let srcResolved = false;
|
|
23
|
+
for (const attribute of lazySrcAttributes) {
|
|
24
|
+
const value = image.getAttribute(attribute);
|
|
25
|
+
if (value === null) continue;
|
|
26
|
+
if (!srcResolved && value && isUrlShaped(value)) {
|
|
27
|
+
image.setAttribute("src", value);
|
|
28
|
+
srcResolved = true;
|
|
29
|
+
}
|
|
30
|
+
image.removeAttribute(attribute);
|
|
17
31
|
}
|
|
18
|
-
image.removeAttribute(attribute);
|
|
19
32
|
}
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
33
|
+
if (hasSrcsetCandidate) {
|
|
34
|
+
let srcsetResolved = false;
|
|
35
|
+
for (const attribute of lazySrcsetAttributes) {
|
|
36
|
+
const value = image.getAttribute(attribute);
|
|
37
|
+
if (value === null) continue;
|
|
38
|
+
if (!srcsetResolved && value && isUrlShaped(value)) {
|
|
39
|
+
image.setAttribute("srcset", value);
|
|
40
|
+
srcsetResolved = true;
|
|
41
|
+
}
|
|
42
|
+
image.removeAttribute(attribute);
|
|
43
|
+
}
|
|
24
44
|
}
|
|
25
45
|
}
|
|
26
46
|
const noscripts = document.querySelectorAll("noscript");
|
|
27
47
|
for (const noscript of noscripts) {
|
|
28
48
|
const sibling = noscript.previousElementSibling;
|
|
29
|
-
if (sibling?.
|
|
30
|
-
const inner = noscript.
|
|
49
|
+
if (sibling?.localName !== "img") continue;
|
|
50
|
+
const inner = noscript.innerHTML;
|
|
31
51
|
if (!imgPattern.test(inner)) continue;
|
|
32
52
|
sibling.remove();
|
|
33
53
|
noscript.outerHTML = inner;
|
|
@@ -18,9 +18,10 @@ const highlightCode = () => {
|
|
|
18
18
|
for (const pre of pres) {
|
|
19
19
|
const code = pre.querySelector("code");
|
|
20
20
|
if (!code) continue;
|
|
21
|
-
const text = code.textContent ?? "";
|
|
22
|
-
if (!text.trim()) continue;
|
|
23
21
|
const language = detectLanguage(pre, code);
|
|
22
|
+
if (!language && code.children.length > 0) continue;
|
|
23
|
+
const text = code.textContent;
|
|
24
|
+
if (!text?.trim()) continue;
|
|
24
25
|
code.innerHTML = (language && hljs.getLanguage(language) ? hljs.highlight(text, { language }) : hljs.highlightAuto(text)).value;
|
|
25
26
|
code.classList.add("hljs");
|
|
26
27
|
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import { createEmbedPlaceholder } from "../../common.js";
|
|
2
|
+
//#region src/transforms/dom/injectEnclosures.ts
|
|
3
|
+
const existingMediaSelector = "audio[src], video[src], iframe[src], source[src], [data-embed-src]";
|
|
4
|
+
/**
 * Tells whether an enclosure is audio: either its `medium` is `"audio"` or
 * its `type` starts with `"audio/"`.
 *
 * @param enclosure Enclosure with optional `medium` and `type` fields.
 * @returns `true` for audio enclosures, otherwise `false`.
 */
const isAudioEnclosure = (enclosure) => {
  if (enclosure.medium === "audio") return true;
  return enclosure.type?.startsWith("audio/") === true;
};
|
|
7
|
+
/**
 * Tells whether an enclosure is video: either its `medium` is `"video"` or
 * its `type` starts with `"video/"`.
 *
 * @param enclosure Enclosure with optional `medium` and `type` fields.
 * @returns `true` for video enclosures, otherwise `false`.
 */
const isVideoEnclosure = (enclosure) => {
  if (enclosure.medium === "video") return true;
  return enclosure.type?.startsWith("video/") === true;
};
|
|
10
|
+
/**
 * Tries to resolve an enclosure URL to embed metadata.
 *
 * A detached `<iframe src=url>` probe is created and offered to each resolver
 * whose `selector` matches it; the first truthy result of `resolver.extract`
 * is returned.
 *
 * @param url Enclosure URL.
 * @param resolvers Resolvers with a `selector` string and an `extract(element)` method.
 * @param document Document used to create the probe element.
 * @returns The first truthy metadata, or `undefined` when no resolver produced any.
 */
const resolveEnclosure = async (url, resolvers, document) => {
  const probe = document.createElement("iframe");
  probe.setAttribute("src", url);
  for (const resolver of resolvers) {
    if (!probe.matches(resolver.selector)) continue;
    const metadata = await resolver.extract(probe);
    if (metadata) return metadata;
  }
  return undefined;
};
|
|
18
|
+
/**
 * Collects the URLs of media already present in the document, so the same
 * URL is not injected a second time.
 *
 * For every element matching `existingMediaSelector`, both the `src` and the
 * `data-embed-src` attribute are recorded when non-empty. Reading both (rather
 * than `src ?? data-embed-src`) matters because `??` only falls back on
 * `null`: an element with an empty `src`, or with both attributes set, would
 * otherwise never contribute its `data-embed-src`.
 *
 * @param document Document to scan.
 * @returns Set of the raw attribute values found.
 */
const collectExistingMediaUrls = (document) => {
  const urls = /* @__PURE__ */ new Set();
  for (const element of document.querySelectorAll(existingMediaSelector)) {
    for (const attribute of ["src", "data-embed-src"]) {
      const value = element.getAttribute(attribute);
      if (value) urls.add(value);
    }
  }
  return urls;
};
|
|
26
|
+
/**
 * Builds a native `<audio>` or `<video>` element for an enclosure.
 *
 * The element always gets `src` (the enclosure URL), an empty `controls`
 * attribute and `preload="none"`. For `"video"` it additionally gets `width`
 * and `height` when those enclosure fields are truthy, and `poster` from the
 * first thumbnail URL when `context.resolveUrlFn(poster, context.baseUrl)`
 * returns a truthy value (the original, unresolved URL is what gets written).
 *
 * @param document Document used to create the element.
 * @param tagName Tag to create; only `"video"` receives the extra attributes.
 * @param enclosure Enclosure providing `url`, and optionally `width`, `height`, `thumbnails`.
 * @param context Transform context providing `resolveUrlFn` and `baseUrl`.
 * @returns The newly created, detached element.
 */
const createNativeMediaElement = (document, tagName, enclosure, context) => {
  const media = document.createElement(tagName);
  const baseAttributes = [
    ["src", enclosure.url],
    ["controls", ""],
    ["preload", "none"]
  ];
  for (const [name, value] of baseAttributes) media.setAttribute(name, value);
  if (tagName !== "video") return media;
  for (const dimension of ["width", "height"]) {
    const size = enclosure[dimension];
    if (size) media.setAttribute(dimension, String(size));
  }
  const posterUrl = enclosure.thumbnails?.[0]?.url;
  const posterAccepted = posterUrl && context.resolveUrlFn(posterUrl, context.baseUrl);
  if (posterAccepted) media.setAttribute("poster", posterUrl);
  return media;
};
|
|
39
|
+
/**
 * Creates a DOM transform that injects feed enclosures at the top of
 * `<body>` when the document does not already reference them.
 *
 * For each enclosure, in order:
 * 1. skip it if its URL is already present (see `collectExistingMediaUrls`)
 *    or if `context.resolveUrlFn(url, context.baseUrl)` is falsy;
 * 2. if an embed resolver recognises the URL, prepend an embed placeholder;
 * 3. otherwise prepend a native `<audio>` / `<video>` element for audio /
 *    video enclosures; any other enclosure is ignored.
 *
 * NOTE(review): every injected node is prepended, so with several enclosures
 * the later ones end up above the earlier ones - confirm this is intended.
 *
 * @param context Transform context (`enclosures`, `embedResolvers`, `resolveUrlFn`, `baseUrl`).
 * @returns A no-op when there are no enclosures, otherwise an async function
 *   taking the document to modify in place.
 */
const injectEnclosures = (context) => {
  if (!context.enclosures?.length) return () => {};
  const enclosures = context.enclosures;
  return async (document) => {
    const knownUrls = collectExistingMediaUrls(document);
    // Puts a node at the top of <body> and remembers its URL as present.
    const inject = (node, url) => {
      document.body.prepend(node);
      knownUrls.add(url);
    };
    for (const enclosure of enclosures) {
      if (knownUrls.has(enclosure.url)) continue;
      if (!context.resolveUrlFn(enclosure.url, context.baseUrl)) continue;
      const resolved = await resolveEnclosure(enclosure.url, context.embedResolvers, document);
      if (resolved) {
        inject(createEmbedPlaceholder(document, enclosure.url, resolved), enclosure.url);
      } else if (isAudioEnclosure(enclosure)) {
        inject(createNativeMediaElement(document, "audio", enclosure, context), enclosure.url);
      } else if (isVideoEnclosure(enclosure)) {
        inject(createNativeMediaElement(document, "video", enclosure, context), enclosure.url);
      }
    }
  };
};
|
|
65
|
+
//#endregion
|
|
66
|
+
export { injectEnclosures };
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import { Node, isWhitespaceText } from "../../common.js";
|
|
2
|
+
//#region src/transforms/dom/mergeFragmentedLists.ts
|
|
3
|
+
/**
 * Creates a DOM transform that merges runs of adjacent `<ul>` (or `<ol>`)
 * elements into the first list of the run.
 *
 * Two lists belong to the same run when they have the same tag, are separated
 * only by nodes that `nextMergeableSibling` skips, have equal attributes
 * (compared against the first list of the run) and contain nothing but list
 * items according to `hasOnlyListItemChildren`. While merging, comments
 * between the lists are removed, text nodes between them are moved into the
 * first list, and the emptied follower lists are removed.
 *
 * @returns A function taking the document to transform in place.
 */
const mergeFragmentedLists = () => {
  return (document) => {
    for (const head of document.querySelectorAll("ul, ol")) {
      // Lists already merged into an earlier one are detached by now.
      if (!head.parentNode) continue;
      const tagName = head.localName;
      let follower = nextMergeableSibling(head, tagName);
      if (!follower) continue;
      if (!hasOnlyListItemChildren(head)) continue;
      const followers = [];
      while (follower && attributesEqual(head, follower) && hasOnlyListItemChildren(follower)) {
        followers.push(follower);
        follower = nextMergeableSibling(follower, tagName);
      }
      for (const extra of followers) {
        // Clear out whatever sits between the head list and this follower.
        let gap = head.nextSibling;
        while (gap && gap !== extra) {
          const upcoming = gap.nextSibling;
          switch (gap.nodeType) {
            case Node.COMMENT_NODE:
              gap.parentNode?.removeChild(gap);
              break;
            case Node.TEXT_NODE:
              head.appendChild(gap);
              break;
          }
          gap = upcoming;
        }
        while (extra.firstChild) head.appendChild(extra.firstChild);
        extra.remove();
      }
    }
  };
};
|
|
38
|
+
/**
 * Finds the next sibling element of `from` if it is a merge candidate.
 *
 * Comment nodes and text nodes for which `isWhitespaceText` is true are
 * skipped. The search stops at the first element: it is returned when its
 * `localName` equals `localName`. Any other text node, any other node type,
 * a non-matching element, or running out of siblings yields `undefined`.
 *
 * @param from Node whose following siblings are inspected.
 * @param localName Tag name the candidate must have.
 * @returns The matching sibling element, or `undefined`.
 */
const nextMergeableSibling = (from, localName) => {
  for (let node = from.nextSibling; node; node = node.nextSibling) {
    switch (node.nodeType) {
      case Node.ELEMENT_NODE:
        return node.localName === localName ? node : undefined;
      case Node.TEXT_NODE:
        if (!isWhitespaceText(node)) return undefined;
        break;
      case Node.COMMENT_NODE:
        break;
      default:
        return undefined;
    }
  }
  return undefined;
};
|
|
55
|
+
/**
 * Checks that a list contains nothing but list items.
 *
 * Allowed direct children are `<li>` elements, comment nodes, and text nodes
 * for which `isWhitespaceText` is true. Any other child makes the result
 * `false`; a list without children yields `true`.
 *
 * @param list List element to inspect.
 * @returns `true` when every direct child is allowed.
 */
const hasOnlyListItemChildren = (list) => {
  for (let child = list.firstChild; child; child = child.nextSibling) {
    switch (child.nodeType) {
      case Node.ELEMENT_NODE:
        if (child.localName !== "li") return false;
        break;
      case Node.TEXT_NODE:
        if (!isWhitespaceText(child)) return false;
        break;
      case Node.COMMENT_NODE:
        break;
      default:
        return false;
    }
  }
  return true;
};
|
|
70
|
+
/**
 * Compares the attributes of two elements.
 *
 * The elements are equal when both have no attributes, or when they have the
 * same number of attributes and every attribute of `a` exists on `b` with an
 * identical value.
 *
 * @param a First element.
 * @param b Second element.
 * @returns `true` when both elements carry the same attributes and values.
 */
const attributesEqual = (a, b) => {
  const hasAny = a.hasAttributes();
  if (hasAny !== b.hasAttributes()) return false;
  if (!hasAny) return true;
  const own = Array.from(a.attributes);
  if (own.length !== b.getAttributeNames().length) return false;
  return own.every((attribute) => b.getAttribute(attribute.name) === attribute.value);
};
|
|
83
|
+
//#endregion
|
|
84
|
+
export { mergeFragmentedLists };
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { parseSrcset, stringifySrcset } from "srcset";
|
|
2
|
+
//#region src/transforms/dom/proxyAssetUrls.ts
|
|
3
|
+
/**
 * Derives the asset type of an element from its parent element's tag.
 *
 * @param element Element whose parent is inspected.
 * @returns `"video"` or `"audio"` when the parent is that element, otherwise
 *   `"image"` (including when there is no parent element).
 */
const sourceTypeFromParent = (element) => {
  const parentTag = element.parentElement?.localName;
  return parentTag === "video" || parentTag === "audio" ? parentTag : "image";
};
|
|
9
|
+
/**
 * Tells whether a URL may be handed to the asset proxy.
 *
 * Inline `data:` URIs are not proxyable. The scheme is matched
 * case-insensitively and leading whitespace is ignored, because URL schemes
 * are case-insensitive and URL parsers strip leading whitespace: `DATA:...`
 * and ` data:...` are still data URIs.
 *
 * @param url Attribute value to test.
 * @returns `false` for `data:` URIs, `true` for everything else.
 */
const isProxyableUrl = (url) => {
  return !/^\s*data:/i.test(url);
};
|
|
12
|
+
/**
 * Rewrites one URL attribute of an element through the asset proxy.
 *
 * Nothing happens when the attribute is missing or empty, when the value is
 * not proxyable (see `isProxyableUrl`), or when `assetProxyFn` returns a
 * falsy value.
 *
 * @param element Element to update in place.
 * @param attribute Name of the attribute holding the URL.
 * @param type Asset type passed through to `assetProxyFn`.
 * @param assetProxyFn Called as `assetProxyFn(url, type)`; returns the proxied URL.
 */
const proxyAttribute = (element, attribute, type, assetProxyFn) => {
  const current = element.getAttribute(attribute);
  if (current && isProxyableUrl(current)) {
    const replacement = assetProxyFn(current, type);
    if (replacement) element.setAttribute(attribute, replacement);
  }
};
|
|
18
|
+
/**
 * Rewrites every candidate URL of an element's `srcset` through the asset
 * proxy and writes the re-serialised `srcset` back.
 *
 * Nothing happens when the attribute is missing or empty. Candidates that are
 * not proxyable (see `isProxyableUrl`) are kept as they are. A candidate also
 * keeps its original URL whenever `assetProxyFn` returns any falsy value -
 * the same rule `proxyAttribute` applies - so an empty-string result can no
 * longer blank out a candidate URL.
 *
 * @param element Element to update in place.
 * @param type Asset type passed through to `assetProxyFn`.
 * @param assetProxyFn Called as `assetProxyFn(url, type)`; returns the proxied URL.
 */
const proxySrcset = (element, type, assetProxyFn) => {
  const srcset = element.getAttribute("srcset");
  if (!srcset) return;
  const rewritten = parseSrcset(srcset).map((entry) => {
    if (!isProxyableUrl(entry.url)) return entry;
    return {
      ...entry,
      url: assetProxyFn(entry.url, type) || entry.url
    };
  });
  element.setAttribute("srcset", stringifySrcset(rewritten));
};
|
|
30
|
+
/**
 * Creates a DOM transform that routes asset URLs through `assetProxyFn`.
 *
 * Handled per element:
 * - `img`: `src` and `srcset` as `"image"`;
 * - `video`: `src` as `"video"`, `poster` as `"image"`;
 * - `audio`: `src` as `"audio"`;
 * - `source`: `src` typed by its parent (see `sourceTypeFromParent`), `srcset` as `"image"`;
 * - `track`: `src` typed by its parent;
 * - `image`: `href` when present, otherwise `xlink:href`, as `"image"`.
 * In addition, `data-embed-thumbnail` and `data-embed-avatar` are proxied as
 * `"image"` on any selected element that carries them.
 *
 * @param context Transform context; only `assetProxyFn` is read.
 * @returns A no-op when no `assetProxyFn` is configured, otherwise a function
 *   taking the document to rewrite in place.
 */
const proxyAssetUrls = ({ assetProxyFn }) => {
  if (!assetProxyFn) return () => {};
  const rewriteByTag = new Map([
    ["img", (element) => {
      proxyAttribute(element, "src", "image", assetProxyFn);
      proxySrcset(element, "image", assetProxyFn);
    }],
    ["video", (element) => {
      proxyAttribute(element, "src", "video", assetProxyFn);
      proxyAttribute(element, "poster", "image", assetProxyFn);
    }],
    ["audio", (element) => {
      proxyAttribute(element, "src", "audio", assetProxyFn);
    }],
    ["source", (element) => {
      proxyAttribute(element, "src", sourceTypeFromParent(element), assetProxyFn);
      proxySrcset(element, "image", assetProxyFn);
    }],
    ["track", (element) => {
      proxyAttribute(element, "src", sourceTypeFromParent(element), assetProxyFn);
    }],
    ["image", (element) => {
      const linkAttribute = element.hasAttribute("href") ? "href" : "xlink:href";
      proxyAttribute(element, linkAttribute, "image", assetProxyFn);
    }]
  ]);
  const embedImageAttributes = ["data-embed-thumbnail", "data-embed-avatar"];
  return (document) => {
    const elements = document.querySelectorAll("img, video, audio, source, track, image, [data-embed-thumbnail], [data-embed-avatar]");
    for (const element of elements) {
      rewriteByTag.get(element.localName)?.(element);
      for (const attribute of embedImageAttributes) {
        if (element.hasAttribute(attribute)) proxyAttribute(element, attribute, "image", assetProxyFn);
      }
    }
  };
};
|
|
63
|
+
//#endregion
|
|
64
|
+
export { proxyAssetUrls };
|
|
@@ -1,6 +1,5 @@
|
|
|
1
|
+
import { getDimensions } from "../../common.js";
|
|
1
2
|
//#region src/transforms/dom/removeTrackingPixels.ts
|
|
2
|
-
const styleWidthRegex = /(?:^|;)\s*width\s*:\s*([0-9]*\.?[0-9]+)\s*(?:px)?\s*(?:;|$)/i;
|
|
3
|
-
const styleHeightRegex = /(?:^|;)\s*height\s*:\s*([0-9]*\.?[0-9]+)\s*(?:px)?\s*(?:;|$)/i;
|
|
4
3
|
const styleDisplayNoneRegex = /(?:^|;)\s*display\s*:\s*none/i;
|
|
5
4
|
const styleVisibilityHiddenRegex = /(?:^|;)\s*visibility\s*:\s*hidden/i;
|
|
6
5
|
const styleOpacityZeroRegex = /(?:^|;)\s*opacity\s*:\s*0(?:\.0+)?\s*(?:;|$)/i;
|
|
@@ -13,45 +12,43 @@ const buildPathRegex = (segments) => {
|
|
|
13
12
|
const isTrackingUrl = (src, hosts, pathRegex) => {
|
|
14
13
|
try {
|
|
15
14
|
const url = new URL(src, "http://placeholder/");
|
|
16
|
-
|
|
15
|
+
const hostname = url.hostname;
|
|
16
|
+
if (hosts.size > 0) {
|
|
17
|
+
if (hosts.has(hostname)) return true;
|
|
18
|
+
for (const host of hosts) if (hostname.endsWith(`.${host}`)) return true;
|
|
19
|
+
}
|
|
17
20
|
return pathRegex?.test(url.pathname) ?? false;
|
|
18
21
|
} catch {
|
|
19
22
|
return false;
|
|
20
23
|
}
|
|
21
24
|
};
|
|
22
|
-
const
|
|
23
|
-
|
|
24
|
-
if (attribute !== null) {
|
|
25
|
-
const value = Number(attribute);
|
|
26
|
-
if (Number.isFinite(value)) return value;
|
|
27
|
-
}
|
|
28
|
-
const style = image.getAttribute("style");
|
|
29
|
-
if (style) {
|
|
30
|
-
const regex = prop === "width" ? styleWidthRegex : styleHeightRegex;
|
|
31
|
-
const match = style.match(regex);
|
|
32
|
-
if (match) return Number(match[1]);
|
|
33
|
-
}
|
|
25
|
+
const isPixelDimension = (value) => {
|
|
26
|
+
return value !== void 0 && value <= pixelDimensionLimit;
|
|
34
27
|
};
|
|
35
|
-
const
|
|
28
|
+
const isPixelSized = (image) => {
|
|
29
|
+
const { width, height } = getDimensions(image);
|
|
30
|
+
return isPixelDimension(width) || isPixelDimension(height);
|
|
31
|
+
};
|
|
32
|
+
const isHiddenImage = (image, style) => {
|
|
36
33
|
if (image.hasAttribute("hidden")) return true;
|
|
37
|
-
const style = image.getAttribute("style");
|
|
38
34
|
if (!style) return false;
|
|
39
35
|
return styleDisplayNoneRegex.test(style) || styleVisibilityHiddenRegex.test(style) || styleOpacityZeroRegex.test(style);
|
|
40
36
|
};
|
|
41
|
-
const isPixelSized = (image) => {
|
|
42
|
-
const width = getDimension(image, "width");
|
|
43
|
-
const height = getDimension(image, "height");
|
|
44
|
-
return width !== void 0 && width <= pixelDimensionLimit || height !== void 0 && height <= pixelDimensionLimit;
|
|
45
|
-
};
|
|
46
37
|
const removeTrackingPixels = (context) => {
|
|
47
38
|
const hosts = new Set(context.trackingHosts);
|
|
48
39
|
const pathRegex = buildPathRegex(context.trackingPathSegments);
|
|
40
|
+
const hasUrlChecks = hosts.size > 0 || pathRegex !== null;
|
|
49
41
|
return (document) => {
|
|
50
42
|
const images = document.querySelectorAll("img");
|
|
51
43
|
for (const image of images) {
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
44
|
+
if (isPixelSized(image) || isHiddenImage(image, image.getAttribute("style"))) {
|
|
45
|
+
image.remove();
|
|
46
|
+
continue;
|
|
47
|
+
}
|
|
48
|
+
if (hasUrlChecks) {
|
|
49
|
+
const src = image.getAttribute("src");
|
|
50
|
+
if (src && isTrackingUrl(src, hosts, pathRegex)) image.remove();
|
|
51
|
+
}
|
|
55
52
|
}
|
|
56
53
|
};
|
|
57
54
|
};
|
|
@@ -1,34 +1,33 @@
|
|
|
1
|
-
import { createEmbedPlaceholder } from "../../common.js";
|
|
2
|
-
import { coerceNumber } from "../../utils.js";
|
|
1
|
+
import { createEmbedPlaceholder, getDimensions } from "../../common.js";
|
|
3
2
|
//#region src/transforms/dom/replaceEmbedsWithPlaceholders.ts
|
|
4
3
|
const replaceEmbedsWithPlaceholders = (context) => {
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
4
|
+
const { embedResolvers, resolveUrlFn, baseUrl } = context;
|
|
5
|
+
return async (document) => {
|
|
6
|
+
const iframeSnapshot = document.getElementsByTagName("iframe");
|
|
7
|
+
const hasIframes = iframeSnapshot.length > 0;
|
|
8
|
+
for (const resolver of embedResolvers) {
|
|
9
|
+
if (!hasIframes && resolver.selector.startsWith("iframe")) continue;
|
|
10
|
+
for (const element of document.querySelectorAll(resolver.selector)) {
|
|
11
|
+
const metadata = await resolver.extract(element);
|
|
10
12
|
if (!metadata) continue;
|
|
11
|
-
if (!
|
|
12
|
-
if (metadata.url && !
|
|
13
|
-
const width =
|
|
14
|
-
const
|
|
15
|
-
const placeholder = createEmbedPlaceholder(document, metadata.src, metadata.type ?? "iframe", {
|
|
13
|
+
if (!resolveUrlFn(metadata.src, baseUrl)) continue;
|
|
14
|
+
if (metadata.url && !resolveUrlFn(metadata.url, baseUrl)) continue;
|
|
15
|
+
const { width, height } = getDimensions(element);
|
|
16
|
+
const placeholderMetadata = width === void 0 && height === void 0 ? metadata : {
|
|
16
17
|
...metadata,
|
|
17
|
-
width,
|
|
18
|
-
height
|
|
19
|
-
}
|
|
20
|
-
element.replaceWith(
|
|
18
|
+
width: width ?? metadata.width,
|
|
19
|
+
height: height ?? metadata.height
|
|
20
|
+
};
|
|
21
|
+
element.replaceWith(createEmbedPlaceholder(document, metadata.src, placeholderMetadata));
|
|
21
22
|
}
|
|
22
23
|
}
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
if (!
|
|
26
|
-
const
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
height
|
|
31
|
-
}));
|
|
24
|
+
if (!hasIframes) return;
|
|
25
|
+
for (const iframe of iframeSnapshot) {
|
|
26
|
+
if (!iframe.parentNode) continue;
|
|
27
|
+
const src = iframe.getAttribute("src");
|
|
28
|
+
if (!src) continue;
|
|
29
|
+
if (!resolveUrlFn(src, baseUrl)) continue;
|
|
30
|
+
iframe.replaceWith(createEmbedPlaceholder(document, src, getDimensions(iframe)));
|
|
32
31
|
}
|
|
33
32
|
};
|
|
34
33
|
};
|