feedsweep 1.0.0 → 1.2.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. package/README.md +75 -19
  2. package/dist/common.d.ts +8 -8
  3. package/dist/common.js +65 -42
  4. package/dist/defaults.d.ts +2 -2
  5. package/dist/defaults.js +48 -15
  6. package/dist/embeds/youtube.js +2 -2
  7. package/dist/index.d.ts +17 -10
  8. package/dist/index.js +25 -14
  9. package/dist/parsers/linkedom.d.ts +4 -0
  10. package/dist/parsers/linkedom.js +38 -0
  11. package/dist/transforms/dom/convertBreaksToParagraphs.d.ts +6 -0
  12. package/dist/transforms/dom/convertBreaksToParagraphs.js +80 -0
  13. package/dist/transforms/dom/decodeDoubleEncodedTags.d.ts +6 -0
  14. package/dist/transforms/dom/decodeDoubleEncodedTags.js +30 -0
  15. package/dist/transforms/dom/enrichEmbedPlaceholders.d.ts +6 -0
  16. package/dist/transforms/dom/enrichEmbedPlaceholders.js +32 -0
  17. package/dist/transforms/dom/fixLazyImages.js +33 -13
  18. package/dist/transforms/dom/highlightCode.js +3 -2
  19. package/dist/transforms/dom/injectEnclosures.d.ts +6 -0
  20. package/dist/transforms/dom/injectEnclosures.js +66 -0
  21. package/dist/transforms/dom/mergeConsecutiveOneLinerPres.js +1 -1
  22. package/dist/transforms/dom/mergeFragmentedLists.d.ts +6 -0
  23. package/dist/transforms/dom/mergeFragmentedLists.js +84 -0
  24. package/dist/transforms/dom/proxyAssetUrls.d.ts +6 -0
  25. package/dist/transforms/dom/proxyAssetUrls.js +64 -0
  26. package/dist/transforms/dom/removeTrackingPixels.js +22 -25
  27. package/dist/transforms/dom/replaceEmbedsWithPlaceholders.js +24 -25
  28. package/dist/transforms/dom/replacePreLineBreaks.js +3 -4
  29. package/dist/transforms/dom/resolveRelativeUrls.js +44 -30
  30. package/dist/transforms/dom/stripComments.js +5 -15
  31. package/dist/transforms/dom/stripDeadAnchors.d.ts +6 -0
  32. package/dist/transforms/dom/stripDeadAnchors.js +20 -0
  33. package/dist/transforms/dom/stripDuplicateTitleHeading.d.ts +6 -0
  34. package/dist/transforms/dom/stripDuplicateTitleHeading.js +31 -0
  35. package/dist/transforms/dom/stripEmptyTags.d.ts +6 -0
  36. package/dist/transforms/dom/stripEmptyTags.js +53 -0
  37. package/dist/transforms/dom/stripInterBlockBreaks.js +28 -8
  38. package/dist/transforms/dom/stripParagraphBoundaryBreaks.js +26 -6
  39. package/dist/transforms/dom/stripTrackingParams.js +7 -6
  40. package/dist/transforms/dom/trimPreWhitespace.js +4 -3
  41. package/dist/transforms/dom/unwrapDoublyNestedLists.d.ts +6 -0
  42. package/dist/transforms/dom/unwrapDoublyNestedLists.js +41 -0
  43. package/dist/transforms/dom/unwrapRedirectUrls.js +4 -2
  44. package/dist/transforms/dom/unwrapWrappers.d.ts +6 -0
  45. package/dist/transforms/dom/unwrapWrappers.js +30 -0
  46. package/dist/transforms/string/paragraphizePlainText.js +1 -1
  47. package/dist/transforms/string/stripOversizedBase64Sources.d.ts +6 -0
  48. package/dist/transforms/string/stripOversizedBase64Sources.js +13 -0
  49. package/dist/transforms/string/unwrapCdataComments.d.ts +6 -0
  50. package/dist/transforms/string/unwrapCdataComments.js +10 -0
  51. package/dist/types.d.ts +37 -7
  52. package/dist/unwraps/google.js +1 -1
  53. package/dist/unwraps/googleNewsModern.js +7 -3
  54. package/package.json +15 -5
  55. package/dist/transforms/dom/injectEnclosureEmbedPlaceholders.d.ts +0 -6
  56. package/dist/transforms/dom/injectEnclosureEmbedPlaceholders.js +0 -33
  57. package/dist/transforms/dom/simplifyFigures.d.ts +0 -6
  58. package/dist/transforms/dom/simplifyFigures.js +0 -27
  59. package/dist/transforms/string/decodeDoubleEncodedTags.d.ts +0 -6
  60. package/dist/transforms/string/decodeDoubleEncodedTags.js +0 -23
  61. package/dist/transforms/string/stripEmptyTags.d.ts +0 -6
  62. package/dist/transforms/string/stripEmptyTags.js +0 -25
  63. package/dist/transforms/string/stripOrphanedClosingTags.d.ts +0 -6
  64. package/dist/transforms/string/stripOrphanedClosingTags.js +0 -28
  65. package/dist/transforms/string/unwrapWrappers.d.ts +0 -6
  66. package/dist/transforms/string/unwrapWrappers.js +0 -10
@@ -1,11 +1,15 @@
1
1
  import { isHostOf } from "feedscout/utils";
2
2
  //#region src/unwraps/googleNewsModern.ts
3
+ const articleIdRegex = /^\/(?:rss\/)?articles\/([\w-]+)/;
4
+ const base64UrlMinusRegex = /-/g;
5
+ const base64UrlUnderscoreRegex = /_/g;
6
+ const protobufFramingRegex = /\x08\x13".+?(https?:\/\/[^\xd2]+)\xd2\x01/;
3
7
  const unwrapGoogleNewsModern = (url) => {
4
8
  if (!isHostOf(url.href, "news.google.com")) return;
5
- const match = url.pathname.match(/^\/(?:rss\/)?articles\/([\w-]+)/);
9
+ const match = url.pathname.match(articleIdRegex);
6
10
  if (!match) return;
7
- const padded = match[1].replace(/-/g, "+").replace(/_/g, "/");
8
- return Buffer.from(padded, "base64").toString("latin1").match(/\x08\x13".+?(https?:\/\/[^\xd2]+)\xd2\x01/)?.[1];
11
+ const padded = match[1].replace(base64UrlMinusRegex, "+").replace(base64UrlUnderscoreRegex, "/");
12
+ return Buffer.from(padded, "base64").toString("latin1").match(protobufFramingRegex)?.[1];
9
13
  };
10
14
  //#endregion
11
15
  export { unwrapGoogleNewsModern };
package/package.json CHANGED
@@ -29,6 +29,10 @@
29
29
  "./defaults": {
30
30
  "types": "./dist/defaults.d.ts",
31
31
  "default": "./dist/defaults.js"
32
+ },
33
+ "./linkedom": {
34
+ "types": "./dist/parsers/linkedom.d.ts",
35
+ "default": "./dist/parsers/linkedom.js"
32
36
  }
33
37
  },
34
38
  "files": [
@@ -36,23 +40,29 @@
36
40
  ],
37
41
  "scripts": {
38
42
  "prepare": "lefthook install",
39
- "build": "tsdown src/index.ts src/defaults.ts --format esm --dts --clean --unbundle --no-fixed-extension"
43
+ "build": "tsdown src/index.ts src/defaults.ts src/parsers/linkedom.ts --format esm --dts --clean --unbundle --no-fixed-extension"
40
44
  },
41
45
  "dependencies": {
42
- "@wordpress/autop": "^4.45.0",
46
+ "@wordpress/autop": "^4.46.0",
43
47
  "highlight.js": "^11.11.1",
44
- "linkedom": "^0.18.12",
45
48
  "linkifyjs": "^4.3.2",
46
49
  "srcset": "^5.0.3"
47
50
  },
48
51
  "peerDependencies": {
49
52
  "feedcanon": "^2.0.0-next.3",
50
- "feedscout": "^2.0.0-next.2"
53
+ "feedscout": "^2.0.0-next.2",
54
+ "linkedom": "^0.18.12"
55
+ },
56
+ "peerDependenciesMeta": {
57
+ "linkedom": {
58
+ "optional": true
59
+ }
51
60
  },
52
61
  "devDependencies": {
53
62
  "@types/bun": "^1.3.13",
54
63
  "kvalita": "^1.13.0",
64
+ "linkedom": "^0.18.12",
55
65
  "tsdown": "^0.22.0"
56
66
  },
57
- "version": "1.0.0"
67
+ "version": "1.2.0"
58
68
  }
@@ -1,6 +0,0 @@
1
- import { DomTransform } from "../../types.js";
2
-
3
- //#region src/transforms/dom/injectEnclosureEmbedPlaceholders.d.ts
4
- declare const injectEnclosureEmbedPlaceholders: DomTransform;
5
- //#endregion
6
- export { injectEnclosureEmbedPlaceholders };
@@ -1,33 +0,0 @@
1
- import { createEmbedPlaceholder } from "../../common.js";
2
- //#region src/transforms/dom/injectEnclosureEmbedPlaceholders.ts
3
- const isAudioEnclosure = (enclosure) => {
4
- return enclosure.medium === "audio" || !!enclosure.type?.startsWith("audio/");
5
- };
6
- const isVideoEnclosure = (enclosure) => {
7
- return enclosure.medium === "video" || !!enclosure.type?.startsWith("video/");
8
- };
9
- const resolveEnclosure = (url, resolvers, document) => {
10
- const probe = document.createElement("iframe");
11
- probe.setAttribute("src", url);
12
- for (const resolver of resolvers) if (probe.matches(resolver.selector)) {
13
- const metadata = resolver.extract(probe);
14
- if (metadata) return metadata;
15
- }
16
- };
17
- const injectEnclosureEmbedPlaceholders = (context) => {
18
- return (document) => {
19
- if (!context.enclosures?.length) return;
20
- const html = document.toString();
21
- for (const enclosure of context.enclosures) {
22
- if (html.includes(enclosure.url)) continue;
23
- if (!context.resolveUrlFn(enclosure.url, context.baseUrl)) continue;
24
- const resolved = resolveEnclosure(enclosure.url, context.embedResolvers, document);
25
- if (!resolved && !isAudioEnclosure(enclosure) && !isVideoEnclosure(enclosure)) continue;
26
- const type = resolved?.type ?? (isAudioEnclosure(enclosure) ? "audio" : "video");
27
- const placeholder = createEmbedPlaceholder(document, enclosure.url, type, resolved);
28
- document.body.prepend(placeholder);
29
- }
30
- };
31
- };
32
- //#endregion
33
- export { injectEnclosureEmbedPlaceholders };
@@ -1,6 +0,0 @@
1
- import { DomTransform } from "../../types.js";
2
-
3
- //#region src/transforms/dom/simplifyFigures.d.ts
4
- declare const simplifyFigures: DomTransform;
5
- //#endregion
6
- export { simplifyFigures };
@@ -1,27 +0,0 @@
1
- import { Node, unwrapOuterTag } from "../../common.js";
2
- //#region src/transforms/dom/simplifyFigures.ts
3
- const figureWrapperRegex = /^<(p|div|span)(\s[^>]*)?>[\s\n]*([\s\S]*)[\s\n]*<\/\1>$/i;
4
- const mediaContentRegex = /<(img|picture|video|audio)[\s>]/i;
5
- const isMediaOnly = (html) => {
6
- return html.replace(/<\/?(img|picture|video|audio|source)(\s[^>]*)?>/gi, "").trim() === "" && mediaContentRegex.test(html);
7
- };
8
- const simplifyFigures = () => {
9
- return (document) => {
10
- const figures = document.querySelectorAll("figure");
11
- for (const figure of figures) {
12
- for (const child of [...figure.children]) {
13
- if (child.tagName.toLowerCase() === "figcaption") continue;
14
- const unwrapped = unwrapOuterTag(child.outerHTML, figureWrapperRegex);
15
- if (unwrapped !== child.outerHTML && isMediaOnly(unwrapped)) child.outerHTML = unwrapped;
16
- }
17
- const captions = figure.querySelectorAll("figcaption");
18
- for (const caption of captions) {
19
- const elements = [...caption.children];
20
- if (elements.length !== 1 || elements[0].tagName.toLowerCase() !== "div") continue;
21
- if (![...caption.childNodes].some((node) => node.nodeType === Node.TEXT_NODE && (node.textContent ?? "").trim() !== "")) caption.innerHTML = elements[0].innerHTML;
22
- }
23
- }
24
- };
25
- };
26
- //#endregion
27
- export { simplifyFigures };
@@ -1,6 +0,0 @@
1
- import { StringTransform } from "../../types.js";
2
-
3
- //#region src/transforms/string/decodeDoubleEncodedTags.d.ts
4
- declare const decodeDoubleEncodedTags: StringTransform;
5
- //#endregion
6
- export { decodeDoubleEncodedTags };
@@ -1,23 +0,0 @@
1
- //#region src/transforms/string/decodeDoubleEncodedTags.ts
2
- const hasHtmlRegex = /<[a-z][a-z0-9]*[\s>]/i;
3
- const encodedTagRegex = /&lt;(\/?)([a-zA-Z][\w-]*)((?:[^&]|&(?!gt;))*)&gt;/g;
4
- const hasEncodedTagRegex = /&lt;[a-zA-Z/]/;
5
- const codeBlockRegex = /<(code|pre)(\s[^>]*)?>[\s\S]*?<\/\1>/gi;
6
- const decodeDoubleEncodedTags = () => {
7
- return (html) => {
8
- if (!hasHtmlRegex.test(html) || !hasEncodedTagRegex.test(html)) return html;
9
- let result = "";
10
- let lastIndex = 0;
11
- for (const match of html.matchAll(codeBlockRegex)) {
12
- const matchStart = match.index;
13
- const matchEnd = matchStart + match[0].length;
14
- result += html.slice(lastIndex, matchStart).replace(encodedTagRegex, "<$1$2$3>");
15
- result += match[0];
16
- lastIndex = matchEnd;
17
- }
18
- result += html.slice(lastIndex).replace(encodedTagRegex, "<$1$2$3>");
19
- return result;
20
- };
21
- };
22
- //#endregion
23
- export { decodeDoubleEncodedTags };
@@ -1,6 +0,0 @@
1
- import { StringTransform } from "../../types.js";
2
-
3
- //#region src/transforms/string/stripEmptyTags.d.ts
4
- declare const stripEmptyTags: StringTransform;
5
- //#endregion
6
- export { stripEmptyTags };
@@ -1,25 +0,0 @@
1
- //#region src/transforms/string/stripEmptyTags.ts
2
- const emptyTagRegex = /<([a-z][a-z0-9]*)(\s[^>]*)?>(\s*)<\/\1>/gi;
3
- const preserveWhenEmpty = new Set([
4
- "iframe",
5
- "video",
6
- "audio",
7
- "img",
8
- "source"
9
- ]);
10
- const stripEmptyTags = () => {
11
- return (html) => {
12
- let previous = "";
13
- let result = html;
14
- while (result !== previous) {
15
- previous = result;
16
- result = result.replace(emptyTagRegex, (match, tagName, _attrs, content) => {
17
- if (preserveWhenEmpty.has(tagName.toLowerCase())) return match;
18
- return content.length > 0 ? " " : "";
19
- });
20
- }
21
- return result;
22
- };
23
- };
24
- //#endregion
25
- export { stripEmptyTags };
@@ -1,6 +0,0 @@
1
- import { StringTransform } from "../../types.js";
2
-
3
- //#region src/transforms/string/stripOrphanedClosingTags.d.ts
4
- declare const stripOrphanedClosingTags: StringTransform;
5
- //#endregion
6
- export { stripOrphanedClosingTags };
@@ -1,28 +0,0 @@
1
- //#region src/transforms/string/stripOrphanedClosingTags.ts
2
- const orphanTags = new Set([
3
- "p",
4
- "h1",
5
- "h2",
6
- "h3",
7
- "h4",
8
- "h5",
9
- "h6"
10
- ]);
11
- const orphanTagRegex = /<(\/?([a-z][a-z0-9]*))(\s[^>]*)?\/?>/gi;
12
- const stripOrphanedClosingTags = () => {
13
- return (html) => {
14
- const counts = {};
15
- return html.replace(orphanTagRegex, (match, _full, tagName) => {
16
- const name = tagName.toLowerCase();
17
- if (!orphanTags.has(name)) return match;
18
- if (match[1] === "/") {
19
- const count = counts[name] ?? 0;
20
- if (count <= 0) return "";
21
- counts[name] = count - 1;
22
- } else counts[name] = (counts[name] ?? 0) + 1;
23
- return match;
24
- });
25
- };
26
- };
27
- //#endregion
28
- export { stripOrphanedClosingTags };
@@ -1,6 +0,0 @@
1
- import { StringTransform } from "../../types.js";
2
-
3
- //#region src/transforms/string/unwrapWrappers.d.ts
4
- declare const unwrapWrappers: StringTransform;
5
- //#endregion
6
- export { unwrapWrappers };
@@ -1,10 +0,0 @@
1
- import { unwrapOuterTag } from "../../common.js";
2
- //#region src/transforms/string/unwrapWrappers.ts
3
- const wrapperRegex = /^<(div|article|section|main|header|footer)(\s[^>]*)?>[\s\n]*([\s\S]*)[\s\n]*<\/\1>$/i;
4
- const unwrapWrappers = () => {
5
- return (html) => {
6
- return unwrapOuterTag(html, wrapperRegex);
7
- };
8
- };
9
- //#endregion
10
- export { unwrapWrappers };