feedsweep 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +75 -19
- package/dist/common.d.ts +8 -8
- package/dist/common.js +65 -42
- package/dist/defaults.d.ts +2 -2
- package/dist/defaults.js +48 -15
- package/dist/embeds/youtube.js +2 -2
- package/dist/index.d.ts +17 -10
- package/dist/index.js +25 -14
- package/dist/parsers/linkedom.d.ts +4 -0
- package/dist/parsers/linkedom.js +38 -0
- package/dist/transforms/dom/convertBreaksToParagraphs.d.ts +6 -0
- package/dist/transforms/dom/convertBreaksToParagraphs.js +80 -0
- package/dist/transforms/dom/decodeDoubleEncodedTags.d.ts +6 -0
- package/dist/transforms/dom/decodeDoubleEncodedTags.js +30 -0
- package/dist/transforms/dom/enrichEmbedPlaceholders.d.ts +6 -0
- package/dist/transforms/dom/enrichEmbedPlaceholders.js +32 -0
- package/dist/transforms/dom/fixLazyImages.js +33 -13
- package/dist/transforms/dom/highlightCode.js +3 -2
- package/dist/transforms/dom/injectEnclosures.d.ts +6 -0
- package/dist/transforms/dom/injectEnclosures.js +66 -0
- package/dist/transforms/dom/mergeConsecutiveOneLinerPres.js +1 -1
- package/dist/transforms/dom/mergeFragmentedLists.d.ts +6 -0
- package/dist/transforms/dom/mergeFragmentedLists.js +84 -0
- package/dist/transforms/dom/proxyAssetUrls.d.ts +6 -0
- package/dist/transforms/dom/proxyAssetUrls.js +64 -0
- package/dist/transforms/dom/removeTrackingPixels.js +22 -25
- package/dist/transforms/dom/replaceEmbedsWithPlaceholders.js +24 -25
- package/dist/transforms/dom/replacePreLineBreaks.js +3 -4
- package/dist/transforms/dom/resolveRelativeUrls.js +44 -30
- package/dist/transforms/dom/stripComments.js +5 -15
- package/dist/transforms/dom/stripDeadAnchors.d.ts +6 -0
- package/dist/transforms/dom/stripDeadAnchors.js +20 -0
- package/dist/transforms/dom/stripDuplicateTitleHeading.d.ts +6 -0
- package/dist/transforms/dom/stripDuplicateTitleHeading.js +31 -0
- package/dist/transforms/dom/stripEmptyTags.d.ts +6 -0
- package/dist/transforms/dom/stripEmptyTags.js +53 -0
- package/dist/transforms/dom/stripInterBlockBreaks.js +28 -8
- package/dist/transforms/dom/stripParagraphBoundaryBreaks.js +26 -6
- package/dist/transforms/dom/stripTrackingParams.js +7 -6
- package/dist/transforms/dom/trimPreWhitespace.js +4 -3
- package/dist/transforms/dom/unwrapDoublyNestedLists.d.ts +6 -0
- package/dist/transforms/dom/unwrapDoublyNestedLists.js +41 -0
- package/dist/transforms/dom/unwrapRedirectUrls.js +4 -2
- package/dist/transforms/dom/unwrapWrappers.d.ts +6 -0
- package/dist/transforms/dom/unwrapWrappers.js +30 -0
- package/dist/transforms/string/paragraphizePlainText.js +1 -1
- package/dist/transforms/string/stripOversizedBase64Sources.d.ts +6 -0
- package/dist/transforms/string/stripOversizedBase64Sources.js +13 -0
- package/dist/transforms/string/unwrapCdataComments.d.ts +6 -0
- package/dist/transforms/string/unwrapCdataComments.js +10 -0
- package/dist/types.d.ts +37 -7
- package/dist/unwraps/google.js +1 -1
- package/dist/unwraps/googleNewsModern.js +7 -3
- package/package.json +15 -5
- package/dist/transforms/dom/injectEnclosureEmbedPlaceholders.d.ts +0 -6
- package/dist/transforms/dom/injectEnclosureEmbedPlaceholders.js +0 -33
- package/dist/transforms/dom/simplifyFigures.d.ts +0 -6
- package/dist/transforms/dom/simplifyFigures.js +0 -27
- package/dist/transforms/string/decodeDoubleEncodedTags.d.ts +0 -6
- package/dist/transforms/string/decodeDoubleEncodedTags.js +0 -23
- package/dist/transforms/string/stripEmptyTags.d.ts +0 -6
- package/dist/transforms/string/stripEmptyTags.js +0 -25
- package/dist/transforms/string/stripOrphanedClosingTags.d.ts +0 -6
- package/dist/transforms/string/stripOrphanedClosingTags.js +0 -28
- package/dist/transforms/string/unwrapWrappers.d.ts +0 -6
- package/dist/transforms/string/unwrapWrappers.js +0 -10
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
import { isHostOf } from "feedscout/utils";
|
|
2
2
|
//#region src/unwraps/googleNewsModern.ts
|
|
3
|
+
const articleIdRegex = /^\/(?:rss\/)?articles\/([\w-]+)/;
|
|
4
|
+
const base64UrlMinusRegex = /-/g;
|
|
5
|
+
const base64UrlUnderscoreRegex = /_/g;
|
|
6
|
+
const protobufFramingRegex = /\x08\x13".+?(https?:\/\/[^\xd2]+)\xd2\x01/;
|
|
3
7
|
const unwrapGoogleNewsModern = (url) => {
|
|
4
8
|
if (!isHostOf(url.href, "news.google.com")) return;
|
|
5
|
-
const match = url.pathname.match(
|
|
9
|
+
const match = url.pathname.match(articleIdRegex);
|
|
6
10
|
if (!match) return;
|
|
7
|
-
const padded = match[1].replace(
|
|
8
|
-
return Buffer.from(padded, "base64").toString("latin1").match(
|
|
11
|
+
const padded = match[1].replace(base64UrlMinusRegex, "+").replace(base64UrlUnderscoreRegex, "/");
|
|
12
|
+
return Buffer.from(padded, "base64").toString("latin1").match(protobufFramingRegex)?.[1];
|
|
9
13
|
};
|
|
10
14
|
//#endregion
|
|
11
15
|
export { unwrapGoogleNewsModern };
|
package/package.json
CHANGED
|
@@ -29,6 +29,10 @@
|
|
|
29
29
|
"./defaults": {
|
|
30
30
|
"types": "./dist/defaults.d.ts",
|
|
31
31
|
"default": "./dist/defaults.js"
|
|
32
|
+
},
|
|
33
|
+
"./linkedom": {
|
|
34
|
+
"types": "./dist/parsers/linkedom.d.ts",
|
|
35
|
+
"default": "./dist/parsers/linkedom.js"
|
|
32
36
|
}
|
|
33
37
|
},
|
|
34
38
|
"files": [
|
|
@@ -36,23 +40,29 @@
|
|
|
36
40
|
],
|
|
37
41
|
"scripts": {
|
|
38
42
|
"prepare": "lefthook install",
|
|
39
|
-
"build": "tsdown src/index.ts src/defaults.ts --format esm --dts --clean --unbundle --no-fixed-extension"
|
|
43
|
+
"build": "tsdown src/index.ts src/defaults.ts src/parsers/linkedom.ts --format esm --dts --clean --unbundle --no-fixed-extension"
|
|
40
44
|
},
|
|
41
45
|
"dependencies": {
|
|
42
|
-
"@wordpress/autop": "^4.
|
|
46
|
+
"@wordpress/autop": "^4.46.0",
|
|
43
47
|
"highlight.js": "^11.11.1",
|
|
44
|
-
"linkedom": "^0.18.12",
|
|
45
48
|
"linkifyjs": "^4.3.2",
|
|
46
49
|
"srcset": "^5.0.3"
|
|
47
50
|
},
|
|
48
51
|
"peerDependencies": {
|
|
49
52
|
"feedcanon": "^2.0.0-next.3",
|
|
50
|
-
"feedscout": "^2.0.0-next.2"
|
|
53
|
+
"feedscout": "^2.0.0-next.2",
|
|
54
|
+
"linkedom": "^0.18.12"
|
|
55
|
+
},
|
|
56
|
+
"peerDependenciesMeta": {
|
|
57
|
+
"linkedom": {
|
|
58
|
+
"optional": true
|
|
59
|
+
}
|
|
51
60
|
},
|
|
52
61
|
"devDependencies": {
|
|
53
62
|
"@types/bun": "^1.3.13",
|
|
54
63
|
"kvalita": "^1.13.0",
|
|
64
|
+
"linkedom": "^0.18.12",
|
|
55
65
|
"tsdown": "^0.22.0"
|
|
56
66
|
},
|
|
57
|
-
"version": "1.0.0"
|
|
67
|
+
"version": "1.2.0"
|
|
58
68
|
}
|
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
import { createEmbedPlaceholder } from "../../common.js";
|
|
2
|
-
//#region src/transforms/dom/injectEnclosureEmbedPlaceholders.ts
|
|
3
|
-
const isAudioEnclosure = (enclosure) => {
|
|
4
|
-
return enclosure.medium === "audio" || !!enclosure.type?.startsWith("audio/");
|
|
5
|
-
};
|
|
6
|
-
const isVideoEnclosure = (enclosure) => {
|
|
7
|
-
return enclosure.medium === "video" || !!enclosure.type?.startsWith("video/");
|
|
8
|
-
};
|
|
9
|
-
const resolveEnclosure = (url, resolvers, document) => {
|
|
10
|
-
const probe = document.createElement("iframe");
|
|
11
|
-
probe.setAttribute("src", url);
|
|
12
|
-
for (const resolver of resolvers) if (probe.matches(resolver.selector)) {
|
|
13
|
-
const metadata = resolver.extract(probe);
|
|
14
|
-
if (metadata) return metadata;
|
|
15
|
-
}
|
|
16
|
-
};
|
|
17
|
-
const injectEnclosureEmbedPlaceholders = (context) => {
|
|
18
|
-
return (document) => {
|
|
19
|
-
if (!context.enclosures?.length) return;
|
|
20
|
-
const html = document.toString();
|
|
21
|
-
for (const enclosure of context.enclosures) {
|
|
22
|
-
if (html.includes(enclosure.url)) continue;
|
|
23
|
-
if (!context.resolveUrlFn(enclosure.url, context.baseUrl)) continue;
|
|
24
|
-
const resolved = resolveEnclosure(enclosure.url, context.embedResolvers, document);
|
|
25
|
-
if (!resolved && !isAudioEnclosure(enclosure) && !isVideoEnclosure(enclosure)) continue;
|
|
26
|
-
const type = resolved?.type ?? (isAudioEnclosure(enclosure) ? "audio" : "video");
|
|
27
|
-
const placeholder = createEmbedPlaceholder(document, enclosure.url, type, resolved);
|
|
28
|
-
document.body.prepend(placeholder);
|
|
29
|
-
}
|
|
30
|
-
};
|
|
31
|
-
};
|
|
32
|
-
//#endregion
|
|
33
|
-
export { injectEnclosureEmbedPlaceholders };
|
|
@@ -1,27 +0,0 @@
|
|
|
1
|
-
import { Node, unwrapOuterTag } from "../../common.js";
|
|
2
|
-
//#region src/transforms/dom/simplifyFigures.ts
|
|
3
|
-
const figureWrapperRegex = /^<(p|div|span)(\s[^>]*)?>[\s\n]*([\s\S]*)[\s\n]*<\/\1>$/i;
|
|
4
|
-
const mediaContentRegex = /<(img|picture|video|audio)[\s>]/i;
|
|
5
|
-
const isMediaOnly = (html) => {
|
|
6
|
-
return html.replace(/<\/?(img|picture|video|audio|source)(\s[^>]*)?>/gi, "").trim() === "" && mediaContentRegex.test(html);
|
|
7
|
-
};
|
|
8
|
-
const simplifyFigures = () => {
|
|
9
|
-
return (document) => {
|
|
10
|
-
const figures = document.querySelectorAll("figure");
|
|
11
|
-
for (const figure of figures) {
|
|
12
|
-
for (const child of [...figure.children]) {
|
|
13
|
-
if (child.tagName.toLowerCase() === "figcaption") continue;
|
|
14
|
-
const unwrapped = unwrapOuterTag(child.outerHTML, figureWrapperRegex);
|
|
15
|
-
if (unwrapped !== child.outerHTML && isMediaOnly(unwrapped)) child.outerHTML = unwrapped;
|
|
16
|
-
}
|
|
17
|
-
const captions = figure.querySelectorAll("figcaption");
|
|
18
|
-
for (const caption of captions) {
|
|
19
|
-
const elements = [...caption.children];
|
|
20
|
-
if (elements.length !== 1 || elements[0].tagName.toLowerCase() !== "div") continue;
|
|
21
|
-
if (![...caption.childNodes].some((node) => node.nodeType === Node.TEXT_NODE && (node.textContent ?? "").trim() !== "")) caption.innerHTML = elements[0].innerHTML;
|
|
22
|
-
}
|
|
23
|
-
}
|
|
24
|
-
};
|
|
25
|
-
};
|
|
26
|
-
//#endregion
|
|
27
|
-
export { simplifyFigures };
|
|
@@ -1,23 +0,0 @@
|
|
|
1
|
-
//#region src/transforms/string/decodeDoubleEncodedTags.ts
|
|
2
|
-
const hasHtmlRegex = /<[a-z][a-z0-9]*[\s>]/i;
|
|
3
|
-
const encodedTagRegex = /<(\/?)([a-zA-Z][\w-]*)((?:[^&]|&(?!gt;))*)>/g;
|
|
4
|
-
const hasEncodedTagRegex = /<[a-zA-Z/]/;
|
|
5
|
-
const codeBlockRegex = /<(code|pre)(\s[^>]*)?>[\s\S]*?<\/\1>/gi;
|
|
6
|
-
const decodeDoubleEncodedTags = () => {
|
|
7
|
-
return (html) => {
|
|
8
|
-
if (!hasHtmlRegex.test(html) || !hasEncodedTagRegex.test(html)) return html;
|
|
9
|
-
let result = "";
|
|
10
|
-
let lastIndex = 0;
|
|
11
|
-
for (const match of html.matchAll(codeBlockRegex)) {
|
|
12
|
-
const matchStart = match.index;
|
|
13
|
-
const matchEnd = matchStart + match[0].length;
|
|
14
|
-
result += html.slice(lastIndex, matchStart).replace(encodedTagRegex, "<$1$2$3>");
|
|
15
|
-
result += match[0];
|
|
16
|
-
lastIndex = matchEnd;
|
|
17
|
-
}
|
|
18
|
-
result += html.slice(lastIndex).replace(encodedTagRegex, "<$1$2$3>");
|
|
19
|
-
return result;
|
|
20
|
-
};
|
|
21
|
-
};
|
|
22
|
-
//#endregion
|
|
23
|
-
export { decodeDoubleEncodedTags };
|
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
//#region src/transforms/string/stripEmptyTags.ts
|
|
2
|
-
const emptyTagRegex = /<([a-z][a-z0-9]*)(\s[^>]*)?>(\s*)<\/\1>/gi;
|
|
3
|
-
const preserveWhenEmpty = new Set([
|
|
4
|
-
"iframe",
|
|
5
|
-
"video",
|
|
6
|
-
"audio",
|
|
7
|
-
"img",
|
|
8
|
-
"source"
|
|
9
|
-
]);
|
|
10
|
-
const stripEmptyTags = () => {
|
|
11
|
-
return (html) => {
|
|
12
|
-
let previous = "";
|
|
13
|
-
let result = html;
|
|
14
|
-
while (result !== previous) {
|
|
15
|
-
previous = result;
|
|
16
|
-
result = result.replace(emptyTagRegex, (match, tagName, _attrs, content) => {
|
|
17
|
-
if (preserveWhenEmpty.has(tagName.toLowerCase())) return match;
|
|
18
|
-
return content.length > 0 ? " " : "";
|
|
19
|
-
});
|
|
20
|
-
}
|
|
21
|
-
return result;
|
|
22
|
-
};
|
|
23
|
-
};
|
|
24
|
-
//#endregion
|
|
25
|
-
export { stripEmptyTags };
|
|
@@ -1,28 +0,0 @@
|
|
|
1
|
-
//#region src/transforms/string/stripOrphanedClosingTags.ts
|
|
2
|
-
const orphanTags = new Set([
|
|
3
|
-
"p",
|
|
4
|
-
"h1",
|
|
5
|
-
"h2",
|
|
6
|
-
"h3",
|
|
7
|
-
"h4",
|
|
8
|
-
"h5",
|
|
9
|
-
"h6"
|
|
10
|
-
]);
|
|
11
|
-
const orphanTagRegex = /<(\/?([a-z][a-z0-9]*))(\s[^>]*)?\/?>/gi;
|
|
12
|
-
const stripOrphanedClosingTags = () => {
|
|
13
|
-
return (html) => {
|
|
14
|
-
const counts = {};
|
|
15
|
-
return html.replace(orphanTagRegex, (match, _full, tagName) => {
|
|
16
|
-
const name = tagName.toLowerCase();
|
|
17
|
-
if (!orphanTags.has(name)) return match;
|
|
18
|
-
if (match[1] === "/") {
|
|
19
|
-
const count = counts[name] ?? 0;
|
|
20
|
-
if (count <= 0) return "";
|
|
21
|
-
counts[name] = count - 1;
|
|
22
|
-
} else counts[name] = (counts[name] ?? 0) + 1;
|
|
23
|
-
return match;
|
|
24
|
-
});
|
|
25
|
-
};
|
|
26
|
-
};
|
|
27
|
-
//#endregion
|
|
28
|
-
export { stripOrphanedClosingTags };
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
import { unwrapOuterTag } from "../../common.js";
|
|
2
|
-
//#region src/transforms/string/unwrapWrappers.ts
|
|
3
|
-
const wrapperRegex = /^<(div|article|section|main|header|footer)(\s[^>]*)?>[\s\n]*([\s\S]*)[\s\n]*<\/\1>$/i;
|
|
4
|
-
const unwrapWrappers = () => {
|
|
5
|
-
return (html) => {
|
|
6
|
-
return unwrapOuterTag(html, wrapperRegex);
|
|
7
|
-
};
|
|
8
|
-
};
|
|
9
|
-
//#endregion
|
|
10
|
-
export { unwrapWrappers };
|