feedsweep 1.0.0 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -17
- package/dist/common.d.ts +10 -6
- package/dist/common.js +99 -28
- package/dist/defaults.d.ts +2 -1
- package/dist/defaults.js +46 -17
- package/dist/embeds/youtube.js +2 -2
- package/dist/index.d.ts +16 -10
- package/dist/index.js +23 -13
- package/dist/transforms/dom/convertBreaksToParagraphs.d.ts +6 -0
- package/dist/transforms/dom/convertBreaksToParagraphs.js +80 -0
- package/dist/transforms/dom/decodeDoubleEncodedTags.d.ts +6 -0
- package/dist/transforms/dom/decodeDoubleEncodedTags.js +30 -0
- package/dist/transforms/dom/enrichEmbedPlaceholders.d.ts +6 -0
- package/dist/transforms/dom/enrichEmbedPlaceholders.js +32 -0
- package/dist/transforms/dom/fixLazyImages.js +37 -13
- package/dist/transforms/dom/highlightCode.js +3 -2
- package/dist/transforms/dom/injectEnclosures.d.ts +6 -0
- package/dist/transforms/dom/injectEnclosures.js +66 -0
- package/dist/transforms/dom/mergeConsecutiveOneLinerPres.js +1 -1
- package/dist/transforms/dom/mergeFragmentedLists.d.ts +6 -0
- package/dist/transforms/dom/mergeFragmentedLists.js +84 -0
- package/dist/transforms/dom/proxyAssetUrls.d.ts +6 -0
- package/dist/transforms/dom/proxyAssetUrls.js +64 -0
- package/dist/transforms/dom/removeTrackingPixels.js +22 -25
- package/dist/transforms/dom/replaceEmbedsWithPlaceholders.js +24 -25
- package/dist/transforms/dom/replacePreLineBreaks.js +3 -4
- package/dist/transforms/dom/resolveRelativeUrls.js +44 -30
- package/dist/transforms/dom/stripComments.js +5 -15
- package/dist/transforms/dom/stripDeadAnchors.d.ts +6 -0
- package/dist/transforms/dom/stripDeadAnchors.js +20 -0
- package/dist/transforms/dom/stripDuplicateTitleHeading.d.ts +6 -0
- package/dist/transforms/dom/stripDuplicateTitleHeading.js +31 -0
- package/dist/transforms/dom/stripEmptyTags.d.ts +6 -0
- package/dist/transforms/dom/stripEmptyTags.js +53 -0
- package/dist/transforms/dom/stripInterBlockBreaks.js +28 -8
- package/dist/transforms/dom/stripParagraphBoundaryBreaks.js +26 -6
- package/dist/transforms/dom/stripTrackingParams.js +7 -6
- package/dist/transforms/dom/trimPreWhitespace.js +4 -3
- package/dist/transforms/dom/unwrapDoublyNestedLists.d.ts +6 -0
- package/dist/transforms/dom/unwrapDoublyNestedLists.js +41 -0
- package/dist/transforms/dom/unwrapRedirectUrls.js +4 -2
- package/dist/transforms/dom/unwrapWrappers.d.ts +6 -0
- package/dist/transforms/dom/unwrapWrappers.js +30 -0
- package/dist/transforms/string/paragraphizePlainText.js +1 -1
- package/dist/transforms/string/unwrapCdataComments.d.ts +6 -0
- package/dist/transforms/string/unwrapCdataComments.js +10 -0
- package/dist/types.d.ts +35 -6
- package/dist/unwraps/google.js +1 -1
- package/dist/unwraps/googleNewsModern.js +7 -3
- package/package.json +2 -2
- package/dist/transforms/dom/injectEnclosureEmbedPlaceholders.d.ts +0 -6
- package/dist/transforms/dom/injectEnclosureEmbedPlaceholders.js +0 -33
- package/dist/transforms/dom/simplifyFigures.d.ts +0 -6
- package/dist/transforms/dom/simplifyFigures.js +0 -27
- package/dist/transforms/string/decodeDoubleEncodedTags.d.ts +0 -6
- package/dist/transforms/string/decodeDoubleEncodedTags.js +0 -23
- package/dist/transforms/string/stripEmptyTags.d.ts +0 -6
- package/dist/transforms/string/stripEmptyTags.js +0 -25
- package/dist/transforms/string/stripOrphanedClosingTags.d.ts +0 -6
- package/dist/transforms/string/stripOrphanedClosingTags.js +0 -28
- package/dist/transforms/string/unwrapWrappers.d.ts +0 -6
- package/dist/transforms/string/unwrapWrappers.js +0 -10
|
@@ -1,40 +1,54 @@
|
|
|
1
1
|
import { resolveUrl } from "feedcanon";
|
|
2
2
|
import { parseSrcset, stringifySrcset } from "srcset";
|
|
3
3
|
//#region src/transforms/dom/resolveRelativeUrls.ts
|
|
4
|
+
const absoluteOrOpaqueUrl = /^(?:https?:|data:|mailto:|tel:|javascript:)/i;
|
|
5
|
+
const srcsetSeparator = /,\s+/;
|
|
4
6
|
const resolveRelativeUrls = ({ baseUrl }) => {
|
|
5
7
|
return (document) => {
|
|
6
8
|
if (!baseUrl) return;
|
|
7
|
-
const
|
|
8
|
-
for (const anchor of anchors) {
|
|
9
|
-
const href = anchor.getAttribute("href");
|
|
10
|
-
if (!href) continue;
|
|
11
|
-
const resolved = resolveUrl(href, baseUrl);
|
|
12
|
-
if (resolved) anchor.setAttribute("href", resolved);
|
|
13
|
-
}
|
|
14
|
-
const elementsWithSrc = document.querySelectorAll("[src]");
|
|
15
|
-
for (const element of elementsWithSrc) {
|
|
16
|
-
const src = element.getAttribute("src");
|
|
17
|
-
if (!src) continue;
|
|
18
|
-
const resolved = resolveUrl(src, baseUrl);
|
|
19
|
-
if (resolved) element.setAttribute("src", resolved);
|
|
20
|
-
}
|
|
21
|
-
const videos = document.querySelectorAll("video[poster]");
|
|
22
|
-
for (const video of videos) {
|
|
23
|
-
const poster = video.getAttribute("poster");
|
|
24
|
-
if (!poster) continue;
|
|
25
|
-
const resolved = resolveUrl(poster, baseUrl);
|
|
26
|
-
if (resolved) video.setAttribute("poster", resolved);
|
|
27
|
-
}
|
|
28
|
-
const elements = document.querySelectorAll("img, source");
|
|
9
|
+
const elements = document.querySelectorAll("a[href], [src], video[poster], img[srcset], source[srcset]");
|
|
29
10
|
for (const element of elements) {
|
|
30
|
-
const
|
|
31
|
-
if (
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
11
|
+
const localName = element.localName;
|
|
12
|
+
if (localName === "a") {
|
|
13
|
+
const href = element.getAttribute("href");
|
|
14
|
+
if (href && !href.startsWith("#") && !absoluteOrOpaqueUrl.test(href)) {
|
|
15
|
+
const resolved = resolveUrl(href, baseUrl);
|
|
16
|
+
if (resolved) element.setAttribute("href", resolved);
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
const src = element.getAttribute("src");
|
|
20
|
+
if (src && !absoluteOrOpaqueUrl.test(src)) {
|
|
21
|
+
const resolved = resolveUrl(src, baseUrl);
|
|
22
|
+
if (resolved) element.setAttribute("src", resolved);
|
|
23
|
+
}
|
|
24
|
+
if (localName === "video") {
|
|
25
|
+
const poster = element.getAttribute("poster");
|
|
26
|
+
if (poster && !absoluteOrOpaqueUrl.test(poster)) {
|
|
27
|
+
const resolved = resolveUrl(poster, baseUrl);
|
|
28
|
+
if (resolved) element.setAttribute("poster", resolved);
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
if (localName === "img" || localName === "source") {
|
|
32
|
+
const srcset = element.getAttribute("srcset");
|
|
33
|
+
if (srcset) {
|
|
34
|
+
let needsResolution = false;
|
|
35
|
+
const candidates = srcset.split(srcsetSeparator);
|
|
36
|
+
for (const candidate of candidates) {
|
|
37
|
+
const trimmed = candidate.trimStart();
|
|
38
|
+
if (trimmed && !absoluteOrOpaqueUrl.test(trimmed)) {
|
|
39
|
+
needsResolution = true;
|
|
40
|
+
break;
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
if (needsResolution) {
|
|
44
|
+
const resolved = parseSrcset(srcset).map((entry) => ({
|
|
45
|
+
...entry,
|
|
46
|
+
url: resolveUrl(entry.url, baseUrl) ?? entry.url
|
|
47
|
+
}));
|
|
48
|
+
element.setAttribute("srcset", stringifySrcset(resolved));
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
}
|
|
38
52
|
}
|
|
39
53
|
};
|
|
40
54
|
};
|
|
@@ -1,22 +1,12 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { NodeFilter, hasAncestorWithTagName } from "../../common.js";
|
|
2
2
|
//#region src/transforms/dom/stripComments.ts
|
|
3
3
|
const codeBlockTags = new Set(["pre", "code"]);
|
|
4
4
|
const stripComments = () => {
|
|
5
5
|
return (document) => {
|
|
6
|
-
const
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
if (!inCodeBlock) child.remove();
|
|
11
|
-
continue;
|
|
12
|
-
}
|
|
13
|
-
if (child.nodeType === Node.ELEMENT_NODE) {
|
|
14
|
-
const element = child;
|
|
15
|
-
visit(element, inCodeBlock || codeBlockTags.has(element.tagName.toLowerCase()));
|
|
16
|
-
}
|
|
17
|
-
}
|
|
18
|
-
};
|
|
19
|
-
visit(document.body, false);
|
|
6
|
+
const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_COMMENT);
|
|
7
|
+
const comments = [];
|
|
8
|
+
for (let node = walker.nextNode(); node !== null; node = walker.nextNode()) comments.push(node);
|
|
9
|
+
for (const comment of comments) if (!hasAncestorWithTagName(comment, codeBlockTags, document.body)) comment.remove();
|
|
20
10
|
};
|
|
21
11
|
};
|
|
22
12
|
//#endregion
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
//#region src/transforms/dom/stripDeadAnchors.ts
|
|
2
|
+
const javascriptSchemeRegex = /^javascript:/i;
|
|
3
|
+
const stripDeadAnchors = () => {
|
|
4
|
+
return (document) => {
|
|
5
|
+
const anchors = document.querySelectorAll("a");
|
|
6
|
+
for (const anchor of anchors) {
|
|
7
|
+
const href = anchor.getAttribute("href");
|
|
8
|
+
if (href === null) continue;
|
|
9
|
+
const trimmed = href.trim();
|
|
10
|
+
if (!(trimmed === "" || trimmed === "#" || javascriptSchemeRegex.test(trimmed))) continue;
|
|
11
|
+
if (anchor.hasAttribute("id") || anchor.hasAttribute("name")) continue;
|
|
12
|
+
const parent = anchor.parentNode;
|
|
13
|
+
if (!parent) continue;
|
|
14
|
+
while (anchor.firstChild) parent.insertBefore(anchor.firstChild, anchor);
|
|
15
|
+
anchor.remove();
|
|
16
|
+
}
|
|
17
|
+
};
|
|
18
|
+
};
|
|
19
|
+
//#endregion
|
|
20
|
+
export { stripDeadAnchors };
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
//#region src/transforms/dom/stripDuplicateTitleHeading.ts
|
|
2
|
+
const headingSelector = "h1, h2, h3, h4, h5, h6";
|
|
3
|
+
const mediaSelector = "img, picture, video, audio, iframe, svg";
|
|
4
|
+
const normalize = (value) => value.trim().toLowerCase().replace(/\s+/g, " ");
|
|
5
|
+
const stripDuplicateTitleHeading = (context) => {
|
|
6
|
+
const articleTitle = context.articleTitle;
|
|
7
|
+
const title = articleTitle && articleTitle.trim().length > 0 ? normalize(articleTitle) : "";
|
|
8
|
+
if (!title) return () => {};
|
|
9
|
+
return (document) => {
|
|
10
|
+
let heading = document.querySelector(headingSelector);
|
|
11
|
+
let text = heading?.textContent?.trim() ?? "";
|
|
12
|
+
if (heading && text.length === 0) {
|
|
13
|
+
heading = null;
|
|
14
|
+
for (const candidate of document.querySelectorAll(headingSelector)) {
|
|
15
|
+
const candidateText = candidate.textContent?.trim() ?? "";
|
|
16
|
+
if (candidateText.length > 0) {
|
|
17
|
+
heading = candidate;
|
|
18
|
+
text = candidateText;
|
|
19
|
+
break;
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
if (!heading) return;
|
|
24
|
+
if (text.toLowerCase().replace(/\s+/g, " ") !== title) return;
|
|
25
|
+
if (heading.querySelector(headingSelector)) return;
|
|
26
|
+
if (heading.querySelector(mediaSelector)) return;
|
|
27
|
+
heading.remove();
|
|
28
|
+
};
|
|
29
|
+
};
|
|
30
|
+
//#endregion
|
|
31
|
+
export { stripDuplicateTitleHeading };
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
import { Node } from "../../common.js";
|
|
2
|
+
//#region src/transforms/dom/stripEmptyTags.ts
|
|
3
|
+
const preserveWhenEmpty = new Set([
|
|
4
|
+
"iframe",
|
|
5
|
+
"video",
|
|
6
|
+
"audio",
|
|
7
|
+
"img",
|
|
8
|
+
"source",
|
|
9
|
+
"area",
|
|
10
|
+
"base",
|
|
11
|
+
"br",
|
|
12
|
+
"col",
|
|
13
|
+
"embed",
|
|
14
|
+
"hr",
|
|
15
|
+
"input",
|
|
16
|
+
"link",
|
|
17
|
+
"meta",
|
|
18
|
+
"param",
|
|
19
|
+
"track",
|
|
20
|
+
"wbr"
|
|
21
|
+
]);
|
|
22
|
+
const stripEmptyTags = () => {
|
|
23
|
+
return (document) => {
|
|
24
|
+
const all = document.body.querySelectorAll("*");
|
|
25
|
+
for (let i = all.length - 1; i >= 0; i--) {
|
|
26
|
+
const element = all[i];
|
|
27
|
+
if (!element.parentNode) continue;
|
|
28
|
+
const tagName = element.localName;
|
|
29
|
+
if (preserveWhenEmpty.has(tagName)) continue;
|
|
30
|
+
if (tagName.includes("-")) continue;
|
|
31
|
+
const childNodes = element.childNodes;
|
|
32
|
+
const childCount = childNodes.length;
|
|
33
|
+
let hasContent = false;
|
|
34
|
+
for (let j = 0; j < childCount; j++) {
|
|
35
|
+
const child = childNodes[j];
|
|
36
|
+
const nodeType = child.nodeType;
|
|
37
|
+
if (nodeType === Node.ELEMENT_NODE) {
|
|
38
|
+
hasContent = true;
|
|
39
|
+
break;
|
|
40
|
+
}
|
|
41
|
+
if (nodeType === Node.TEXT_NODE && child.data.trim().length > 0) {
|
|
42
|
+
hasContent = true;
|
|
43
|
+
break;
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
if (hasContent) continue;
|
|
47
|
+
if (childCount > 0) element.replaceWith(" ");
|
|
48
|
+
else element.remove();
|
|
49
|
+
}
|
|
50
|
+
};
|
|
51
|
+
};
|
|
52
|
+
//#endregion
|
|
53
|
+
export { stripEmptyTags };
|
|
@@ -1,16 +1,36 @@
|
|
|
1
|
-
import { isBlockElement, isSkippable } from "../../common.js";
|
|
1
|
+
import { isBlockElement, isBr, isSkippable } from "../../common.js";
|
|
2
2
|
//#region src/transforms/dom/stripInterBlockBreaks.ts
|
|
3
3
|
const stripInterBlockBreaks = () => {
|
|
4
4
|
return (document) => {
|
|
5
5
|
const brs = document.querySelectorAll("br");
|
|
6
|
+
const parents = /* @__PURE__ */ new Set();
|
|
6
7
|
for (const br of brs) {
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
8
|
+
const parent = br.parentNode;
|
|
9
|
+
if (parent) parents.add(parent);
|
|
10
|
+
}
|
|
11
|
+
for (const parent of parents) {
|
|
12
|
+
let runBrs = null;
|
|
13
|
+
let previousBoundary = null;
|
|
14
|
+
let child = parent.firstChild;
|
|
15
|
+
while (child !== null) {
|
|
16
|
+
const nextChild = child.nextSibling;
|
|
17
|
+
if (isSkippable(child)) {
|
|
18
|
+
if (isBr(child)) if (runBrs === null) runBrs = [child];
|
|
19
|
+
else runBrs.push(child);
|
|
20
|
+
} else {
|
|
21
|
+
if (runBrs !== null) {
|
|
22
|
+
const previousIsBlock = !previousBoundary || isBlockElement(previousBoundary);
|
|
23
|
+
const nextIsBlock = isBlockElement(child);
|
|
24
|
+
if (previousIsBlock && nextIsBlock) for (const br of runBrs) br.remove();
|
|
25
|
+
runBrs = null;
|
|
26
|
+
}
|
|
27
|
+
previousBoundary = child;
|
|
28
|
+
}
|
|
29
|
+
child = nextChild;
|
|
30
|
+
}
|
|
31
|
+
if (runBrs !== null) {
|
|
32
|
+
if (!previousBoundary || isBlockElement(previousBoundary)) for (const br of runBrs) br.remove();
|
|
33
|
+
}
|
|
14
34
|
}
|
|
15
35
|
};
|
|
16
36
|
};
|
|
@@ -4,20 +4,40 @@ const stripParagraphBoundaryBreaks = () => {
|
|
|
4
4
|
return (document) => {
|
|
5
5
|
const paragraphs = document.querySelectorAll("p");
|
|
6
6
|
for (const paragraph of paragraphs) {
|
|
7
|
-
const leading = [];
|
|
8
7
|
let cursor = paragraph.firstChild;
|
|
8
|
+
let leadingHasBr = false;
|
|
9
|
+
let leadingEnd = null;
|
|
9
10
|
while (cursor && isSkippable(cursor)) {
|
|
10
|
-
|
|
11
|
+
if (!leadingHasBr && isBr(cursor)) leadingHasBr = true;
|
|
12
|
+
leadingEnd = cursor;
|
|
11
13
|
cursor = cursor.nextSibling;
|
|
12
14
|
}
|
|
13
|
-
if (
|
|
14
|
-
|
|
15
|
+
if (leadingHasBr) {
|
|
16
|
+
let node = paragraph.firstChild;
|
|
17
|
+
while (node) {
|
|
18
|
+
const next = node.nextSibling;
|
|
19
|
+
node.remove();
|
|
20
|
+
if (node === leadingEnd) break;
|
|
21
|
+
node = next;
|
|
22
|
+
}
|
|
23
|
+
}
|
|
15
24
|
cursor = paragraph.lastChild;
|
|
25
|
+
let trailingHasBr = false;
|
|
26
|
+
let trailingEnd = null;
|
|
16
27
|
while (cursor && isSkippable(cursor)) {
|
|
17
|
-
|
|
28
|
+
if (!trailingHasBr && isBr(cursor)) trailingHasBr = true;
|
|
29
|
+
trailingEnd = cursor;
|
|
18
30
|
cursor = cursor.previousSibling;
|
|
19
31
|
}
|
|
20
|
-
if (
|
|
32
|
+
if (trailingHasBr) {
|
|
33
|
+
let node = paragraph.lastChild;
|
|
34
|
+
while (node) {
|
|
35
|
+
const prev = node.previousSibling;
|
|
36
|
+
node.remove();
|
|
37
|
+
if (node === trailingEnd) break;
|
|
38
|
+
node = prev;
|
|
39
|
+
}
|
|
40
|
+
}
|
|
21
41
|
}
|
|
22
42
|
};
|
|
23
43
|
};
|
|
@@ -1,19 +1,20 @@
|
|
|
1
1
|
import { defaultStrippedParams } from "feedcanon";
|
|
2
2
|
//#region src/transforms/dom/stripTrackingParams.ts
|
|
3
|
+
const strippedParamSet = new Set(defaultStrippedParams);
|
|
3
4
|
const stripTrackingParams = () => {
|
|
4
5
|
return (document) => {
|
|
5
6
|
const anchors = document.querySelectorAll("a[href]");
|
|
6
7
|
for (const anchor of anchors) {
|
|
7
8
|
const href = anchor.getAttribute("href");
|
|
8
|
-
if (!href) continue;
|
|
9
|
+
if (!href || href.indexOf("?") === -1) continue;
|
|
9
10
|
try {
|
|
10
11
|
const url = new URL(href);
|
|
11
|
-
|
|
12
|
-
for (const
|
|
13
|
-
|
|
14
|
-
|
|
12
|
+
const toDelete = [];
|
|
13
|
+
for (const key of url.searchParams.keys()) if (strippedParamSet.has(key)) toDelete.push(key);
|
|
14
|
+
if (toDelete.length > 0) {
|
|
15
|
+
for (const key of toDelete) url.searchParams.delete(key);
|
|
16
|
+
anchor.setAttribute("href", url.toString());
|
|
15
17
|
}
|
|
16
|
-
if (changed) anchor.setAttribute("href", url.toString());
|
|
17
18
|
} catch {}
|
|
18
19
|
}
|
|
19
20
|
};
|
|
@@ -7,12 +7,13 @@ const trimPreWhitespace = () => {
|
|
|
7
7
|
const pres = document.querySelectorAll("pre");
|
|
8
8
|
for (const pre of pres) {
|
|
9
9
|
const target = pre.querySelector("code") ?? pre;
|
|
10
|
-
const
|
|
10
|
+
const original = target.innerHTML;
|
|
11
|
+
const trimmed = original.replace(trailingWhitespaceRegex, "").replace(leadingBlankLinesRegex, "");
|
|
11
12
|
const lines = trimmed.split("\n");
|
|
12
13
|
const indents = lines.filter((line) => line.trim().length > 0).map((line) => line.match(leadingIndentRegex)?.[1].length ?? 0);
|
|
13
14
|
const common = Math.min(...indents);
|
|
14
|
-
|
|
15
|
-
|
|
15
|
+
const result = common > 0 ? lines.map((line) => line.slice(common)).join("\n") : trimmed;
|
|
16
|
+
if (result !== original) target.innerHTML = result;
|
|
16
17
|
}
|
|
17
18
|
};
|
|
18
19
|
};
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { Node } from "../../common.js";
|
|
2
|
+
//#region src/transforms/dom/unwrapDoublyNestedLists.ts
|
|
3
|
+
const unwrapDoublyNestedLists = () => {
|
|
4
|
+
return (document) => {
|
|
5
|
+
const lists = document.querySelectorAll("ul, ol");
|
|
6
|
+
for (const outer of lists) {
|
|
7
|
+
const wrapper = outer.firstElementChild;
|
|
8
|
+
if (wrapper === null || wrapper.nextElementSibling !== null) continue;
|
|
9
|
+
if (wrapper.localName !== "li") continue;
|
|
10
|
+
const outerTag = outer.localName;
|
|
11
|
+
let inner = null;
|
|
12
|
+
let elementDisqualified = false;
|
|
13
|
+
for (let element = wrapper.firstElementChild; element !== null; element = element.nextElementSibling) {
|
|
14
|
+
const localName = element.localName;
|
|
15
|
+
if (localName === "br") continue;
|
|
16
|
+
if (inner !== null || localName !== outerTag) {
|
|
17
|
+
elementDisqualified = true;
|
|
18
|
+
break;
|
|
19
|
+
}
|
|
20
|
+
inner = element;
|
|
21
|
+
}
|
|
22
|
+
if (elementDisqualified || inner === null) continue;
|
|
23
|
+
let textDisqualified = false;
|
|
24
|
+
for (let node = wrapper.firstChild; node !== null; node = node.nextSibling) if (node.nodeType === Node.TEXT_NODE && node.textContent?.trim()) {
|
|
25
|
+
textDisqualified = true;
|
|
26
|
+
break;
|
|
27
|
+
}
|
|
28
|
+
if (textDisqualified) continue;
|
|
29
|
+
const parent = outer.parentNode;
|
|
30
|
+
if (parent === null) continue;
|
|
31
|
+
for (let node = wrapper.firstChild; node !== null;) {
|
|
32
|
+
const next = node.nextSibling;
|
|
33
|
+
if (node.nodeType === Node.TEXT_NODE || node === inner) parent.insertBefore(node, outer);
|
|
34
|
+
node = next;
|
|
35
|
+
}
|
|
36
|
+
outer.remove();
|
|
37
|
+
}
|
|
38
|
+
};
|
|
39
|
+
};
|
|
40
|
+
//#endregion
|
|
41
|
+
export { unwrapDoublyNestedLists };
|
|
@@ -8,13 +8,15 @@ const extractRedirectTarget = (url, extractors) => {
|
|
|
8
8
|
const unwrapRedirectUrls = (context) => {
|
|
9
9
|
return (document) => {
|
|
10
10
|
const anchors = document.querySelectorAll("a[href]");
|
|
11
|
+
const unwrappers = context.urlUnwrappers;
|
|
12
|
+
if (unwrappers.length === 0) return;
|
|
11
13
|
for (const anchor of anchors) {
|
|
12
14
|
const href = anchor.getAttribute("href");
|
|
13
15
|
if (!href) continue;
|
|
14
16
|
try {
|
|
15
17
|
const url = new URL(href);
|
|
16
|
-
for (const
|
|
17
|
-
const target =
|
|
18
|
+
for (const unwrap of unwrappers) {
|
|
19
|
+
const target = unwrap(url);
|
|
18
20
|
if (target) {
|
|
19
21
|
anchor.setAttribute("href", target);
|
|
20
22
|
break;
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
//#region src/transforms/dom/unwrapWrappers.ts
|
|
2
|
+
const wrapperTags = new Set([
|
|
3
|
+
"div",
|
|
4
|
+
"article",
|
|
5
|
+
"section",
|
|
6
|
+
"main",
|
|
7
|
+
"header",
|
|
8
|
+
"footer"
|
|
9
|
+
]);
|
|
10
|
+
const hasEmbedAttribute = (element) => {
|
|
11
|
+
const attributes = element.attributes;
|
|
12
|
+
for (let i = 0, n = attributes.length; i < n; i++) if (attributes[i].name.startsWith("data-embed")) return true;
|
|
13
|
+
return false;
|
|
14
|
+
};
|
|
15
|
+
const unwrapWrappers = () => {
|
|
16
|
+
return (document) => {
|
|
17
|
+
const candidates = document.body.querySelectorAll("*");
|
|
18
|
+
for (let i = 0, n = candidates.length; i < n; i++) {
|
|
19
|
+
const element = candidates[i];
|
|
20
|
+
if (!wrapperTags.has(element.localName)) continue;
|
|
21
|
+
const parent = element.parentNode;
|
|
22
|
+
if (!parent) continue;
|
|
23
|
+
if (hasEmbedAttribute(element)) continue;
|
|
24
|
+
while (element.firstChild) parent.insertBefore(element.firstChild, element);
|
|
25
|
+
element.remove();
|
|
26
|
+
}
|
|
27
|
+
};
|
|
28
|
+
};
|
|
29
|
+
//#endregion
|
|
30
|
+
export { unwrapWrappers };
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { autop } from "@wordpress/autop";
|
|
2
2
|
//#region src/transforms/string/paragraphizePlainText.ts
|
|
3
|
-
const hasHtmlRegex = /<[a-z][a-z0-9]*[\s
|
|
3
|
+
const hasHtmlRegex = /<[a-z][a-z0-9]*[\s/>]/i;
|
|
4
4
|
const paragraphizePlainText = () => {
|
|
5
5
|
return (html) => {
|
|
6
6
|
return hasHtmlRegex.test(html) ? html : autop(html);
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
//#region src/transforms/string/unwrapCdataComments.ts
|
|
2
|
+
const cdataWrapperRegex = /<!--\s*\[CDATA\[([\s\S]*?)\]\]\s*-->/g;
|
|
3
|
+
const unwrapCdataComments = () => {
|
|
4
|
+
return (html) => {
|
|
5
|
+
if (!html.includes("[CDATA[")) return html;
|
|
6
|
+
return html.replace(cdataWrapperRegex, (_match, inner) => inner);
|
|
7
|
+
};
|
|
8
|
+
};
|
|
9
|
+
//#endregion
|
|
10
|
+
export { unwrapCdataComments };
|
package/dist/types.d.ts
CHANGED
|
@@ -1,52 +1,81 @@
|
|
|
1
1
|
import { DiscoverResolveUrlFn } from "feedscout";
|
|
2
2
|
|
|
3
3
|
//#region src/types.d.ts
|
|
4
|
+
type MaybePromise<T> = T | Promise<T>;
|
|
5
|
+
type EnclosureThumbnail = {
|
|
6
|
+
url: string;
|
|
7
|
+
width?: number;
|
|
8
|
+
height?: number;
|
|
9
|
+
};
|
|
4
10
|
type Enclosure = {
|
|
5
11
|
url: string;
|
|
6
12
|
type?: string;
|
|
7
13
|
medium?: string;
|
|
14
|
+
width?: number;
|
|
15
|
+
height?: number;
|
|
16
|
+
duration?: number;
|
|
17
|
+
title?: string;
|
|
18
|
+
description?: string;
|
|
19
|
+
thumbnails?: Array<EnclosureThumbnail>;
|
|
8
20
|
};
|
|
9
21
|
type ResolveUrlFn = DiscoverResolveUrlFn;
|
|
10
22
|
type EmbedResolverResult = {
|
|
11
23
|
provider: string;
|
|
24
|
+
id?: string;
|
|
12
25
|
src: string;
|
|
13
26
|
url?: string;
|
|
14
27
|
thumbnail?: string;
|
|
15
|
-
type?: 'video' | 'audio' | 'iframe';
|
|
16
28
|
width?: number;
|
|
17
29
|
height?: number;
|
|
30
|
+
title?: string;
|
|
31
|
+
description?: string;
|
|
18
32
|
author?: string;
|
|
19
|
-
|
|
33
|
+
avatar?: string;
|
|
34
|
+
duration?: number;
|
|
20
35
|
};
|
|
36
|
+
type EnrichEmbedFn = (embeds: Array<{
|
|
37
|
+
provider: string;
|
|
38
|
+
id: string;
|
|
39
|
+
}>) => MaybePromise<Map<string, Partial<EmbedResolverResult>>>;
|
|
21
40
|
type EmbedResolver = {
|
|
22
41
|
selector: string;
|
|
23
|
-
extract: (element: Element) => EmbedResolverResult | undefined
|
|
42
|
+
extract: (element: Element) => MaybePromise<EmbedResolverResult | undefined>;
|
|
24
43
|
};
|
|
25
44
|
type UrlUnwrapper = (url: URL) => string | undefined;
|
|
45
|
+
type AssetType = 'image' | 'video' | 'audio';
|
|
46
|
+
type AssetProxyFn = (url: string, type: AssetType) => string | undefined;
|
|
26
47
|
type TransformContext = {
|
|
27
48
|
baseUrl?: string;
|
|
28
49
|
enclosures?: Array<Enclosure>;
|
|
29
50
|
embedResolvers: Array<EmbedResolver>;
|
|
30
51
|
lazySrcAttributes: Array<string>;
|
|
52
|
+
lazySrcsetAttributes: Array<string>;
|
|
31
53
|
trackingHosts: Array<string>;
|
|
32
54
|
trackingPathSegments: Array<string>;
|
|
33
55
|
urlUnwrappers: Array<UrlUnwrapper>;
|
|
34
56
|
resolveUrlFn: ResolveUrlFn;
|
|
57
|
+
assetProxyFn?: AssetProxyFn;
|
|
58
|
+
enrichEmbedFn?: EnrichEmbedFn;
|
|
59
|
+
articleTitle?: string;
|
|
35
60
|
};
|
|
36
|
-
type DomTransform = (context: TransformContext) => (document: Document) => void
|
|
37
|
-
type StringTransform = (context: TransformContext) => (html: string) => string
|
|
61
|
+
type DomTransform = (context: TransformContext) => (document: Document) => MaybePromise<void>;
|
|
62
|
+
type StringTransform = (context: TransformContext) => (html: string) => MaybePromise<string>;
|
|
38
63
|
type TransformContentOptions = {
|
|
39
64
|
baseUrl?: string;
|
|
40
65
|
enclosures?: Array<Enclosure>;
|
|
41
66
|
embedResolvers?: Array<EmbedResolver>;
|
|
42
67
|
lazySrcAttributes?: Array<string>;
|
|
68
|
+
lazySrcsetAttributes?: Array<string>;
|
|
43
69
|
trackingHosts?: Array<string>;
|
|
44
70
|
trackingPathSegments?: Array<string>;
|
|
45
71
|
urlUnwrappers?: Array<UrlUnwrapper>;
|
|
46
72
|
resolveUrlFn?: ResolveUrlFn;
|
|
73
|
+
assetProxyFn?: AssetProxyFn;
|
|
74
|
+
enrichEmbedFn?: EnrichEmbedFn;
|
|
75
|
+
articleTitle?: string;
|
|
47
76
|
stringTransforms?: Array<StringTransform>;
|
|
48
77
|
domTransforms?: Array<DomTransform>;
|
|
49
78
|
finalStringTransforms?: Array<StringTransform>;
|
|
50
79
|
};
|
|
51
80
|
//#endregion
|
|
52
|
-
export { DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext, UrlUnwrapper };
|
|
81
|
+
export { AssetProxyFn, AssetType, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext, UrlUnwrapper };
|
package/dist/unwraps/google.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { createParamExtractor } from "../utils.js";
|
|
2
2
|
const unwrapGoogle = createParamExtractor({
|
|
3
|
-
hosts: /^(?:
|
|
3
|
+
hosts: /^(?:[a-z0-9-]+\.)*google\.(?:com|[a-z]{2,3}(?:\.[a-z]{2,3})?)$/,
|
|
4
4
|
path: "/url",
|
|
5
5
|
params: ["url", "q"]
|
|
6
6
|
});
|
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
import { isHostOf } from "feedscout/utils";
|
|
2
2
|
//#region src/unwraps/googleNewsModern.ts
|
|
3
|
+
const articleIdRegex = /^\/(?:rss\/)?articles\/([\w-]+)/;
|
|
4
|
+
const base64UrlMinusRegex = /-/g;
|
|
5
|
+
const base64UrlUnderscoreRegex = /_/g;
|
|
6
|
+
const protobufFramingRegex = /\x08\x13".+?(https?:\/\/[^\xd2]+)\xd2\x01/;
|
|
3
7
|
const unwrapGoogleNewsModern = (url) => {
|
|
4
8
|
if (!isHostOf(url.href, "news.google.com")) return;
|
|
5
|
-
const match = url.pathname.match(
|
|
9
|
+
const match = url.pathname.match(articleIdRegex);
|
|
6
10
|
if (!match) return;
|
|
7
|
-
const padded = match[1].replace(
|
|
8
|
-
return Buffer.from(padded, "base64").toString("latin1").match(
|
|
11
|
+
const padded = match[1].replace(base64UrlMinusRegex, "+").replace(base64UrlUnderscoreRegex, "/");
|
|
12
|
+
return Buffer.from(padded, "base64").toString("latin1").match(protobufFramingRegex)?.[1];
|
|
9
13
|
};
|
|
10
14
|
//#endregion
|
|
11
15
|
export { unwrapGoogleNewsModern };
|
package/package.json
CHANGED
|
@@ -39,7 +39,7 @@
|
|
|
39
39
|
"build": "tsdown src/index.ts src/defaults.ts --format esm --dts --clean --unbundle --no-fixed-extension"
|
|
40
40
|
},
|
|
41
41
|
"dependencies": {
|
|
42
|
-
"@wordpress/autop": "^4.
|
|
42
|
+
"@wordpress/autop": "^4.46.0",
|
|
43
43
|
"highlight.js": "^11.11.1",
|
|
44
44
|
"linkedom": "^0.18.12",
|
|
45
45
|
"linkifyjs": "^4.3.2",
|
|
@@ -54,5 +54,5 @@
|
|
|
54
54
|
"kvalita": "^1.13.0",
|
|
55
55
|
"tsdown": "^0.22.0"
|
|
56
56
|
},
|
|
57
|
-
"version": "1.0.0"
|
|
57
|
+
"version": "1.1.0"
|
|
58
58
|
}
|