feedsweep 1.0.0 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62):
  1. package/README.md +29 -17
  2. package/dist/common.d.ts +10 -6
  3. package/dist/common.js +99 -28
  4. package/dist/defaults.d.ts +2 -1
  5. package/dist/defaults.js +46 -17
  6. package/dist/embeds/youtube.js +2 -2
  7. package/dist/index.d.ts +16 -10
  8. package/dist/index.js +23 -13
  9. package/dist/transforms/dom/convertBreaksToParagraphs.d.ts +6 -0
  10. package/dist/transforms/dom/convertBreaksToParagraphs.js +80 -0
  11. package/dist/transforms/dom/decodeDoubleEncodedTags.d.ts +6 -0
  12. package/dist/transforms/dom/decodeDoubleEncodedTags.js +30 -0
  13. package/dist/transforms/dom/enrichEmbedPlaceholders.d.ts +6 -0
  14. package/dist/transforms/dom/enrichEmbedPlaceholders.js +32 -0
  15. package/dist/transforms/dom/fixLazyImages.js +37 -13
  16. package/dist/transforms/dom/highlightCode.js +3 -2
  17. package/dist/transforms/dom/injectEnclosures.d.ts +6 -0
  18. package/dist/transforms/dom/injectEnclosures.js +66 -0
  19. package/dist/transforms/dom/mergeConsecutiveOneLinerPres.js +1 -1
  20. package/dist/transforms/dom/mergeFragmentedLists.d.ts +6 -0
  21. package/dist/transforms/dom/mergeFragmentedLists.js +84 -0
  22. package/dist/transforms/dom/proxyAssetUrls.d.ts +6 -0
  23. package/dist/transforms/dom/proxyAssetUrls.js +64 -0
  24. package/dist/transforms/dom/removeTrackingPixels.js +22 -25
  25. package/dist/transforms/dom/replaceEmbedsWithPlaceholders.js +24 -25
  26. package/dist/transforms/dom/replacePreLineBreaks.js +3 -4
  27. package/dist/transforms/dom/resolveRelativeUrls.js +44 -30
  28. package/dist/transforms/dom/stripComments.js +5 -15
  29. package/dist/transforms/dom/stripDeadAnchors.d.ts +6 -0
  30. package/dist/transforms/dom/stripDeadAnchors.js +20 -0
  31. package/dist/transforms/dom/stripDuplicateTitleHeading.d.ts +6 -0
  32. package/dist/transforms/dom/stripDuplicateTitleHeading.js +31 -0
  33. package/dist/transforms/dom/stripEmptyTags.d.ts +6 -0
  34. package/dist/transforms/dom/stripEmptyTags.js +53 -0
  35. package/dist/transforms/dom/stripInterBlockBreaks.js +28 -8
  36. package/dist/transforms/dom/stripParagraphBoundaryBreaks.js +26 -6
  37. package/dist/transforms/dom/stripTrackingParams.js +7 -6
  38. package/dist/transforms/dom/trimPreWhitespace.js +4 -3
  39. package/dist/transforms/dom/unwrapDoublyNestedLists.d.ts +6 -0
  40. package/dist/transforms/dom/unwrapDoublyNestedLists.js +41 -0
  41. package/dist/transforms/dom/unwrapRedirectUrls.js +4 -2
  42. package/dist/transforms/dom/unwrapWrappers.d.ts +6 -0
  43. package/dist/transforms/dom/unwrapWrappers.js +30 -0
  44. package/dist/transforms/string/paragraphizePlainText.js +1 -1
  45. package/dist/transforms/string/unwrapCdataComments.d.ts +6 -0
  46. package/dist/transforms/string/unwrapCdataComments.js +10 -0
  47. package/dist/types.d.ts +35 -6
  48. package/dist/unwraps/google.js +1 -1
  49. package/dist/unwraps/googleNewsModern.js +7 -3
  50. package/package.json +2 -2
  51. package/dist/transforms/dom/injectEnclosureEmbedPlaceholders.d.ts +0 -6
  52. package/dist/transforms/dom/injectEnclosureEmbedPlaceholders.js +0 -33
  53. package/dist/transforms/dom/simplifyFigures.d.ts +0 -6
  54. package/dist/transforms/dom/simplifyFigures.js +0 -27
  55. package/dist/transforms/string/decodeDoubleEncodedTags.d.ts +0 -6
  56. package/dist/transforms/string/decodeDoubleEncodedTags.js +0 -23
  57. package/dist/transforms/string/stripEmptyTags.d.ts +0 -6
  58. package/dist/transforms/string/stripEmptyTags.js +0 -25
  59. package/dist/transforms/string/stripOrphanedClosingTags.d.ts +0 -6
  60. package/dist/transforms/string/stripOrphanedClosingTags.js +0 -28
  61. package/dist/transforms/string/unwrapWrappers.d.ts +0 -6
  62. package/dist/transforms/string/unwrapWrappers.js +0 -10
@@ -1,40 +1,54 @@
1
1
  import { resolveUrl } from "feedcanon";
2
2
  import { parseSrcset, stringifySrcset } from "srcset";
3
3
  //#region src/transforms/dom/resolveRelativeUrls.ts
4
+ const absoluteOrOpaqueUrl = /^(?:https?:|data:|mailto:|tel:|javascript:)/i;
5
+ const srcsetSeparator = /,\s+/;
4
6
  const resolveRelativeUrls = ({ baseUrl }) => {
5
7
  return (document) => {
6
8
  if (!baseUrl) return;
7
- const anchors = document.querySelectorAll("a[href]");
8
- for (const anchor of anchors) {
9
- const href = anchor.getAttribute("href");
10
- if (!href) continue;
11
- const resolved = resolveUrl(href, baseUrl);
12
- if (resolved) anchor.setAttribute("href", resolved);
13
- }
14
- const elementsWithSrc = document.querySelectorAll("[src]");
15
- for (const element of elementsWithSrc) {
16
- const src = element.getAttribute("src");
17
- if (!src) continue;
18
- const resolved = resolveUrl(src, baseUrl);
19
- if (resolved) element.setAttribute("src", resolved);
20
- }
21
- const videos = document.querySelectorAll("video[poster]");
22
- for (const video of videos) {
23
- const poster = video.getAttribute("poster");
24
- if (!poster) continue;
25
- const resolved = resolveUrl(poster, baseUrl);
26
- if (resolved) video.setAttribute("poster", resolved);
27
- }
28
- const elements = document.querySelectorAll("img, source");
9
+ const elements = document.querySelectorAll("a[href], [src], video[poster], img[srcset], source[srcset]");
29
10
  for (const element of elements) {
30
- const srcset = element.getAttribute("srcset") ?? element.getAttribute("srcSet");
31
- if (!srcset) continue;
32
- const resolved = parseSrcset(srcset).map((entry) => ({
33
- ...entry,
34
- url: resolveUrl(entry.url, baseUrl) ?? entry.url
35
- }));
36
- element.removeAttribute("srcSet");
37
- element.setAttribute("srcset", stringifySrcset(resolved));
11
+ const localName = element.localName;
12
+ if (localName === "a") {
13
+ const href = element.getAttribute("href");
14
+ if (href && !href.startsWith("#") && !absoluteOrOpaqueUrl.test(href)) {
15
+ const resolved = resolveUrl(href, baseUrl);
16
+ if (resolved) element.setAttribute("href", resolved);
17
+ }
18
+ }
19
+ const src = element.getAttribute("src");
20
+ if (src && !absoluteOrOpaqueUrl.test(src)) {
21
+ const resolved = resolveUrl(src, baseUrl);
22
+ if (resolved) element.setAttribute("src", resolved);
23
+ }
24
+ if (localName === "video") {
25
+ const poster = element.getAttribute("poster");
26
+ if (poster && !absoluteOrOpaqueUrl.test(poster)) {
27
+ const resolved = resolveUrl(poster, baseUrl);
28
+ if (resolved) element.setAttribute("poster", resolved);
29
+ }
30
+ }
31
+ if (localName === "img" || localName === "source") {
32
+ const srcset = element.getAttribute("srcset");
33
+ if (srcset) {
34
+ let needsResolution = false;
35
+ const candidates = srcset.split(srcsetSeparator);
36
+ for (const candidate of candidates) {
37
+ const trimmed = candidate.trimStart();
38
+ if (trimmed && !absoluteOrOpaqueUrl.test(trimmed)) {
39
+ needsResolution = true;
40
+ break;
41
+ }
42
+ }
43
+ if (needsResolution) {
44
+ const resolved = parseSrcset(srcset).map((entry) => ({
45
+ ...entry,
46
+ url: resolveUrl(entry.url, baseUrl) ?? entry.url
47
+ }));
48
+ element.setAttribute("srcset", stringifySrcset(resolved));
49
+ }
50
+ }
51
+ }
38
52
  }
39
53
  };
40
54
  };
@@ -1,22 +1,12 @@
1
- import { Node } from "../../common.js";
1
+ import { NodeFilter, hasAncestorWithTagName } from "../../common.js";
2
2
  //#region src/transforms/dom/stripComments.ts
3
3
  const codeBlockTags = new Set(["pre", "code"]);
4
4
  const stripComments = () => {
5
5
  return (document) => {
6
- const visit = (node, inCodeBlock) => {
7
- const children = Array.from(node.childNodes);
8
- for (const child of children) {
9
- if (child.nodeType === Node.COMMENT_NODE) {
10
- if (!inCodeBlock) child.remove();
11
- continue;
12
- }
13
- if (child.nodeType === Node.ELEMENT_NODE) {
14
- const element = child;
15
- visit(element, inCodeBlock || codeBlockTags.has(element.tagName.toLowerCase()));
16
- }
17
- }
18
- };
19
- visit(document.body, false);
6
+ const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_COMMENT);
7
+ const comments = [];
8
+ for (let node = walker.nextNode(); node !== null; node = walker.nextNode()) comments.push(node);
9
+ for (const comment of comments) if (!hasAncestorWithTagName(comment, codeBlockTags, document.body)) comment.remove();
20
10
  };
21
11
  };
22
12
  //#endregion
@@ -0,0 +1,6 @@
1
+ import { DomTransform } from "../../types.js";
2
+
3
+ //#region src/transforms/dom/stripDeadAnchors.d.ts
4
+ declare const stripDeadAnchors: DomTransform;
5
+ //#endregion
6
+ export { stripDeadAnchors };
@@ -0,0 +1,20 @@
1
+ //#region src/transforms/dom/stripDeadAnchors.ts
2
+ const javascriptSchemeRegex = /^javascript:/i;
3
+ const stripDeadAnchors = () => {
4
+ return (document) => {
5
+ const anchors = document.querySelectorAll("a");
6
+ for (const anchor of anchors) {
7
+ const href = anchor.getAttribute("href");
8
+ if (href === null) continue;
9
+ const trimmed = href.trim();
10
+ if (!(trimmed === "" || trimmed === "#" || javascriptSchemeRegex.test(trimmed))) continue;
11
+ if (anchor.hasAttribute("id") || anchor.hasAttribute("name")) continue;
12
+ const parent = anchor.parentNode;
13
+ if (!parent) continue;
14
+ while (anchor.firstChild) parent.insertBefore(anchor.firstChild, anchor);
15
+ anchor.remove();
16
+ }
17
+ };
18
+ };
19
+ //#endregion
20
+ export { stripDeadAnchors };
@@ -0,0 +1,6 @@
1
+ import { DomTransform } from "../../types.js";
2
+
3
+ //#region src/transforms/dom/stripDuplicateTitleHeading.d.ts
4
+ declare const stripDuplicateTitleHeading: DomTransform;
5
+ //#endregion
6
+ export { stripDuplicateTitleHeading };
@@ -0,0 +1,31 @@
1
+ //#region src/transforms/dom/stripDuplicateTitleHeading.ts
2
+ const headingSelector = "h1, h2, h3, h4, h5, h6";
3
+ const mediaSelector = "img, picture, video, audio, iframe, svg";
4
+ const normalize = (value) => value.trim().toLowerCase().replace(/\s+/g, " ");
5
+ const stripDuplicateTitleHeading = (context) => {
6
+ const articleTitle = context.articleTitle;
7
+ const title = articleTitle && articleTitle.trim().length > 0 ? normalize(articleTitle) : "";
8
+ if (!title) return () => {};
9
+ return (document) => {
10
+ let heading = document.querySelector(headingSelector);
11
+ let text = heading?.textContent?.trim() ?? "";
12
+ if (heading && text.length === 0) {
13
+ heading = null;
14
+ for (const candidate of document.querySelectorAll(headingSelector)) {
15
+ const candidateText = candidate.textContent?.trim() ?? "";
16
+ if (candidateText.length > 0) {
17
+ heading = candidate;
18
+ text = candidateText;
19
+ break;
20
+ }
21
+ }
22
+ }
23
+ if (!heading) return;
24
+ if (text.toLowerCase().replace(/\s+/g, " ") !== title) return;
25
+ if (heading.querySelector(headingSelector)) return;
26
+ if (heading.querySelector(mediaSelector)) return;
27
+ heading.remove();
28
+ };
29
+ };
30
+ //#endregion
31
+ export { stripDuplicateTitleHeading };
@@ -0,0 +1,6 @@
1
+ import { DomTransform } from "../../types.js";
2
+
3
+ //#region src/transforms/dom/stripEmptyTags.d.ts
4
+ declare const stripEmptyTags: DomTransform;
5
+ //#endregion
6
+ export { stripEmptyTags };
@@ -0,0 +1,53 @@
1
+ import { Node } from "../../common.js";
2
+ //#region src/transforms/dom/stripEmptyTags.ts
3
+ const preserveWhenEmpty = new Set([
4
+ "iframe",
5
+ "video",
6
+ "audio",
7
+ "img",
8
+ "source",
9
+ "area",
10
+ "base",
11
+ "br",
12
+ "col",
13
+ "embed",
14
+ "hr",
15
+ "input",
16
+ "link",
17
+ "meta",
18
+ "param",
19
+ "track",
20
+ "wbr"
21
+ ]);
22
+ const stripEmptyTags = () => {
23
+ return (document) => {
24
+ const all = document.body.querySelectorAll("*");
25
+ for (let i = all.length - 1; i >= 0; i--) {
26
+ const element = all[i];
27
+ if (!element.parentNode) continue;
28
+ const tagName = element.localName;
29
+ if (preserveWhenEmpty.has(tagName)) continue;
30
+ if (tagName.includes("-")) continue;
31
+ const childNodes = element.childNodes;
32
+ const childCount = childNodes.length;
33
+ let hasContent = false;
34
+ for (let j = 0; j < childCount; j++) {
35
+ const child = childNodes[j];
36
+ const nodeType = child.nodeType;
37
+ if (nodeType === Node.ELEMENT_NODE) {
38
+ hasContent = true;
39
+ break;
40
+ }
41
+ if (nodeType === Node.TEXT_NODE && child.data.trim().length > 0) {
42
+ hasContent = true;
43
+ break;
44
+ }
45
+ }
46
+ if (hasContent) continue;
47
+ if (childCount > 0) element.replaceWith(" ");
48
+ else element.remove();
49
+ }
50
+ };
51
+ };
52
+ //#endregion
53
+ export { stripEmptyTags };
@@ -1,16 +1,36 @@
1
- import { isBlockElement, isSkippable } from "../../common.js";
1
+ import { isBlockElement, isBr, isSkippable } from "../../common.js";
2
2
  //#region src/transforms/dom/stripInterBlockBreaks.ts
3
3
  const stripInterBlockBreaks = () => {
4
4
  return (document) => {
5
5
  const brs = document.querySelectorAll("br");
6
+ const parents = /* @__PURE__ */ new Set();
6
7
  for (const br of brs) {
7
- let previous = br.previousSibling;
8
- while (previous && isSkippable(previous)) previous = previous.previousSibling;
9
- let next = br.nextSibling;
10
- while (next && isSkippable(next)) next = next.nextSibling;
11
- const previousIsBlock = !previous || isBlockElement(previous);
12
- const nextIsBlock = !next || isBlockElement(next);
13
- if (previousIsBlock && nextIsBlock) br.remove();
8
+ const parent = br.parentNode;
9
+ if (parent) parents.add(parent);
10
+ }
11
+ for (const parent of parents) {
12
+ let runBrs = null;
13
+ let previousBoundary = null;
14
+ let child = parent.firstChild;
15
+ while (child !== null) {
16
+ const nextChild = child.nextSibling;
17
+ if (isSkippable(child)) {
18
+ if (isBr(child)) if (runBrs === null) runBrs = [child];
19
+ else runBrs.push(child);
20
+ } else {
21
+ if (runBrs !== null) {
22
+ const previousIsBlock = !previousBoundary || isBlockElement(previousBoundary);
23
+ const nextIsBlock = isBlockElement(child);
24
+ if (previousIsBlock && nextIsBlock) for (const br of runBrs) br.remove();
25
+ runBrs = null;
26
+ }
27
+ previousBoundary = child;
28
+ }
29
+ child = nextChild;
30
+ }
31
+ if (runBrs !== null) {
32
+ if (!previousBoundary || isBlockElement(previousBoundary)) for (const br of runBrs) br.remove();
33
+ }
14
34
  }
15
35
  };
16
36
  };
@@ -4,20 +4,40 @@ const stripParagraphBoundaryBreaks = () => {
4
4
  return (document) => {
5
5
  const paragraphs = document.querySelectorAll("p");
6
6
  for (const paragraph of paragraphs) {
7
- const leading = [];
8
7
  let cursor = paragraph.firstChild;
8
+ let leadingHasBr = false;
9
+ let leadingEnd = null;
9
10
  while (cursor && isSkippable(cursor)) {
10
- leading.push(cursor);
11
+ if (!leadingHasBr && isBr(cursor)) leadingHasBr = true;
12
+ leadingEnd = cursor;
11
13
  cursor = cursor.nextSibling;
12
14
  }
13
- if (leading.some(isBr)) for (const node of leading) node.remove();
14
- const trailing = [];
15
+ if (leadingHasBr) {
16
+ let node = paragraph.firstChild;
17
+ while (node) {
18
+ const next = node.nextSibling;
19
+ node.remove();
20
+ if (node === leadingEnd) break;
21
+ node = next;
22
+ }
23
+ }
15
24
  cursor = paragraph.lastChild;
25
+ let trailingHasBr = false;
26
+ let trailingEnd = null;
16
27
  while (cursor && isSkippable(cursor)) {
17
- trailing.push(cursor);
28
+ if (!trailingHasBr && isBr(cursor)) trailingHasBr = true;
29
+ trailingEnd = cursor;
18
30
  cursor = cursor.previousSibling;
19
31
  }
20
- if (trailing.some(isBr)) for (const node of trailing) node.remove();
32
+ if (trailingHasBr) {
33
+ let node = paragraph.lastChild;
34
+ while (node) {
35
+ const prev = node.previousSibling;
36
+ node.remove();
37
+ if (node === trailingEnd) break;
38
+ node = prev;
39
+ }
40
+ }
21
41
  }
22
42
  };
23
43
  };
@@ -1,19 +1,20 @@
1
1
  import { defaultStrippedParams } from "feedcanon";
2
2
  //#region src/transforms/dom/stripTrackingParams.ts
3
+ const strippedParamSet = new Set(defaultStrippedParams);
3
4
  const stripTrackingParams = () => {
4
5
  return (document) => {
5
6
  const anchors = document.querySelectorAll("a[href]");
6
7
  for (const anchor of anchors) {
7
8
  const href = anchor.getAttribute("href");
8
- if (!href) continue;
9
+ if (!href || href.indexOf("?") === -1) continue;
9
10
  try {
10
11
  const url = new URL(href);
11
- let changed = false;
12
- for (const param of defaultStrippedParams) if (url.searchParams.has(param)) {
13
- url.searchParams.delete(param);
14
- changed = true;
12
+ const toDelete = [];
13
+ for (const key of url.searchParams.keys()) if (strippedParamSet.has(key)) toDelete.push(key);
14
+ if (toDelete.length > 0) {
15
+ for (const key of toDelete) url.searchParams.delete(key);
16
+ anchor.setAttribute("href", url.toString());
15
17
  }
16
- if (changed) anchor.setAttribute("href", url.toString());
17
18
  } catch {}
18
19
  }
19
20
  };
@@ -7,12 +7,13 @@ const trimPreWhitespace = () => {
7
7
  const pres = document.querySelectorAll("pre");
8
8
  for (const pre of pres) {
9
9
  const target = pre.querySelector("code") ?? pre;
10
- const trimmed = target.innerHTML.replace(trailingWhitespaceRegex, "").replace(leadingBlankLinesRegex, "");
10
+ const original = target.innerHTML;
11
+ const trimmed = original.replace(trailingWhitespaceRegex, "").replace(leadingBlankLinesRegex, "");
11
12
  const lines = trimmed.split("\n");
12
13
  const indents = lines.filter((line) => line.trim().length > 0).map((line) => line.match(leadingIndentRegex)?.[1].length ?? 0);
13
14
  const common = Math.min(...indents);
14
- if (common > 0) target.innerHTML = lines.map((line) => line.slice(common)).join("\n");
15
- else target.innerHTML = trimmed;
15
+ const result = common > 0 ? lines.map((line) => line.slice(common)).join("\n") : trimmed;
16
+ if (result !== original) target.innerHTML = result;
16
17
  }
17
18
  };
18
19
  };
@@ -0,0 +1,6 @@
1
+ import { DomTransform } from "../../types.js";
2
+
3
+ //#region src/transforms/dom/unwrapDoublyNestedLists.d.ts
4
+ declare const unwrapDoublyNestedLists: DomTransform;
5
+ //#endregion
6
+ export { unwrapDoublyNestedLists };
@@ -0,0 +1,41 @@
1
+ import { Node } from "../../common.js";
2
+ //#region src/transforms/dom/unwrapDoublyNestedLists.ts
3
+ const unwrapDoublyNestedLists = () => {
4
+ return (document) => {
5
+ const lists = document.querySelectorAll("ul, ol");
6
+ for (const outer of lists) {
7
+ const wrapper = outer.firstElementChild;
8
+ if (wrapper === null || wrapper.nextElementSibling !== null) continue;
9
+ if (wrapper.localName !== "li") continue;
10
+ const outerTag = outer.localName;
11
+ let inner = null;
12
+ let elementDisqualified = false;
13
+ for (let element = wrapper.firstElementChild; element !== null; element = element.nextElementSibling) {
14
+ const localName = element.localName;
15
+ if (localName === "br") continue;
16
+ if (inner !== null || localName !== outerTag) {
17
+ elementDisqualified = true;
18
+ break;
19
+ }
20
+ inner = element;
21
+ }
22
+ if (elementDisqualified || inner === null) continue;
23
+ let textDisqualified = false;
24
+ for (let node = wrapper.firstChild; node !== null; node = node.nextSibling) if (node.nodeType === Node.TEXT_NODE && node.textContent?.trim()) {
25
+ textDisqualified = true;
26
+ break;
27
+ }
28
+ if (textDisqualified) continue;
29
+ const parent = outer.parentNode;
30
+ if (parent === null) continue;
31
+ for (let node = wrapper.firstChild; node !== null;) {
32
+ const next = node.nextSibling;
33
+ if (node.nodeType === Node.TEXT_NODE || node === inner) parent.insertBefore(node, outer);
34
+ node = next;
35
+ }
36
+ outer.remove();
37
+ }
38
+ };
39
+ };
40
+ //#endregion
41
+ export { unwrapDoublyNestedLists };
@@ -8,13 +8,15 @@ const extractRedirectTarget = (url, extractors) => {
8
8
  const unwrapRedirectUrls = (context) => {
9
9
  return (document) => {
10
10
  const anchors = document.querySelectorAll("a[href]");
11
+ const unwrappers = context.urlUnwrappers;
12
+ if (unwrappers.length === 0) return;
11
13
  for (const anchor of anchors) {
12
14
  const href = anchor.getAttribute("href");
13
15
  if (!href) continue;
14
16
  try {
15
17
  const url = new URL(href);
16
- for (const extractor of context.urlUnwrappers) {
17
- const target = extractor(url);
18
+ for (const unwrap of unwrappers) {
19
+ const target = unwrap(url);
18
20
  if (target) {
19
21
  anchor.setAttribute("href", target);
20
22
  break;
@@ -0,0 +1,6 @@
1
+ import { DomTransform } from "../../types.js";
2
+
3
+ //#region src/transforms/dom/unwrapWrappers.d.ts
4
+ declare const unwrapWrappers: DomTransform;
5
+ //#endregion
6
+ export { unwrapWrappers };
@@ -0,0 +1,30 @@
1
+ //#region src/transforms/dom/unwrapWrappers.ts
2
+ const wrapperTags = new Set([
3
+ "div",
4
+ "article",
5
+ "section",
6
+ "main",
7
+ "header",
8
+ "footer"
9
+ ]);
10
+ const hasEmbedAttribute = (element) => {
11
+ const attributes = element.attributes;
12
+ for (let i = 0, n = attributes.length; i < n; i++) if (attributes[i].name.startsWith("data-embed")) return true;
13
+ return false;
14
+ };
15
+ const unwrapWrappers = () => {
16
+ return (document) => {
17
+ const candidates = document.body.querySelectorAll("*");
18
+ for (let i = 0, n = candidates.length; i < n; i++) {
19
+ const element = candidates[i];
20
+ if (!wrapperTags.has(element.localName)) continue;
21
+ const parent = element.parentNode;
22
+ if (!parent) continue;
23
+ if (hasEmbedAttribute(element)) continue;
24
+ while (element.firstChild) parent.insertBefore(element.firstChild, element);
25
+ element.remove();
26
+ }
27
+ };
28
+ };
29
+ //#endregion
30
+ export { unwrapWrappers };
@@ -1,6 +1,6 @@
1
1
  import { autop } from "@wordpress/autop";
2
2
  //#region src/transforms/string/paragraphizePlainText.ts
3
- const hasHtmlRegex = /<[a-z][a-z0-9]*[\s>]/i;
3
+ const hasHtmlRegex = /<[a-z][a-z0-9]*[\s/>]/i;
4
4
  const paragraphizePlainText = () => {
5
5
  return (html) => {
6
6
  return hasHtmlRegex.test(html) ? html : autop(html);
@@ -0,0 +1,6 @@
1
+ import { StringTransform } from "../../types.js";
2
+
3
+ //#region src/transforms/string/unwrapCdataComments.d.ts
4
+ declare const unwrapCdataComments: StringTransform;
5
+ //#endregion
6
+ export { unwrapCdataComments };
@@ -0,0 +1,10 @@
1
+ //#region src/transforms/string/unwrapCdataComments.ts
2
+ const cdataWrapperRegex = /<!--\s*\[CDATA\[([\s\S]*?)\]\]\s*-->/g;
3
+ const unwrapCdataComments = () => {
4
+ return (html) => {
5
+ if (!html.includes("[CDATA[")) return html;
6
+ return html.replace(cdataWrapperRegex, (_match, inner) => inner);
7
+ };
8
+ };
9
+ //#endregion
10
+ export { unwrapCdataComments };
package/dist/types.d.ts CHANGED
@@ -1,52 +1,81 @@
1
1
  import { DiscoverResolveUrlFn } from "feedscout";
2
2
 
3
3
  //#region src/types.d.ts
4
+ type MaybePromise<T> = T | Promise<T>;
5
+ type EnclosureThumbnail = {
6
+ url: string;
7
+ width?: number;
8
+ height?: number;
9
+ };
4
10
  type Enclosure = {
5
11
  url: string;
6
12
  type?: string;
7
13
  medium?: string;
14
+ width?: number;
15
+ height?: number;
16
+ duration?: number;
17
+ title?: string;
18
+ description?: string;
19
+ thumbnails?: Array<EnclosureThumbnail>;
8
20
  };
9
21
  type ResolveUrlFn = DiscoverResolveUrlFn;
10
22
  type EmbedResolverResult = {
11
23
  provider: string;
24
+ id?: string;
12
25
  src: string;
13
26
  url?: string;
14
27
  thumbnail?: string;
15
- type?: 'video' | 'audio' | 'iframe';
16
28
  width?: number;
17
29
  height?: number;
30
+ title?: string;
31
+ description?: string;
18
32
  author?: string;
19
- text?: string;
33
+ avatar?: string;
34
+ duration?: number;
20
35
  };
36
+ type EnrichEmbedFn = (embeds: Array<{
37
+ provider: string;
38
+ id: string;
39
+ }>) => MaybePromise<Map<string, Partial<EmbedResolverResult>>>;
21
40
  type EmbedResolver = {
22
41
  selector: string;
23
- extract: (element: Element) => EmbedResolverResult | undefined;
42
+ extract: (element: Element) => MaybePromise<EmbedResolverResult | undefined>;
24
43
  };
25
44
  type UrlUnwrapper = (url: URL) => string | undefined;
45
+ type AssetType = 'image' | 'video' | 'audio';
46
+ type AssetProxyFn = (url: string, type: AssetType) => string | undefined;
26
47
  type TransformContext = {
27
48
  baseUrl?: string;
28
49
  enclosures?: Array<Enclosure>;
29
50
  embedResolvers: Array<EmbedResolver>;
30
51
  lazySrcAttributes: Array<string>;
52
+ lazySrcsetAttributes: Array<string>;
31
53
  trackingHosts: Array<string>;
32
54
  trackingPathSegments: Array<string>;
33
55
  urlUnwrappers: Array<UrlUnwrapper>;
34
56
  resolveUrlFn: ResolveUrlFn;
57
+ assetProxyFn?: AssetProxyFn;
58
+ enrichEmbedFn?: EnrichEmbedFn;
59
+ articleTitle?: string;
35
60
  };
36
- type DomTransform = (context: TransformContext) => (document: Document) => void;
37
- type StringTransform = (context: TransformContext) => (html: string) => string;
61
+ type DomTransform = (context: TransformContext) => (document: Document) => MaybePromise<void>;
62
+ type StringTransform = (context: TransformContext) => (html: string) => MaybePromise<string>;
38
63
  type TransformContentOptions = {
39
64
  baseUrl?: string;
40
65
  enclosures?: Array<Enclosure>;
41
66
  embedResolvers?: Array<EmbedResolver>;
42
67
  lazySrcAttributes?: Array<string>;
68
+ lazySrcsetAttributes?: Array<string>;
43
69
  trackingHosts?: Array<string>;
44
70
  trackingPathSegments?: Array<string>;
45
71
  urlUnwrappers?: Array<UrlUnwrapper>;
46
72
  resolveUrlFn?: ResolveUrlFn;
73
+ assetProxyFn?: AssetProxyFn;
74
+ enrichEmbedFn?: EnrichEmbedFn;
75
+ articleTitle?: string;
47
76
  stringTransforms?: Array<StringTransform>;
48
77
  domTransforms?: Array<DomTransform>;
49
78
  finalStringTransforms?: Array<StringTransform>;
50
79
  };
51
80
  //#endregion
52
- export { DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext, UrlUnwrapper };
81
+ export { AssetProxyFn, AssetType, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext, UrlUnwrapper };
@@ -1,6 +1,6 @@
1
1
  import { createParamExtractor } from "../utils.js";
2
2
  const unwrapGoogle = createParamExtractor({
3
- hosts: /^(?:www\.)?google\.(?:com|[a-z]{2,3}(?:\.[a-z]{2,3})?)$/,
3
+ hosts: /^(?:[a-z0-9-]+\.)*google\.(?:com|[a-z]{2,3}(?:\.[a-z]{2,3})?)$/,
4
4
  path: "/url",
5
5
  params: ["url", "q"]
6
6
  });
@@ -1,11 +1,15 @@
1
1
  import { isHostOf } from "feedscout/utils";
2
2
  //#region src/unwraps/googleNewsModern.ts
3
+ const articleIdRegex = /^\/(?:rss\/)?articles\/([\w-]+)/;
4
+ const base64UrlMinusRegex = /-/g;
5
+ const base64UrlUnderscoreRegex = /_/g;
6
+ const protobufFramingRegex = /\x08\x13".+?(https?:\/\/[^\xd2]+)\xd2\x01/;
3
7
  const unwrapGoogleNewsModern = (url) => {
4
8
  if (!isHostOf(url.href, "news.google.com")) return;
5
- const match = url.pathname.match(/^\/(?:rss\/)?articles\/([\w-]+)/);
9
+ const match = url.pathname.match(articleIdRegex);
6
10
  if (!match) return;
7
- const padded = match[1].replace(/-/g, "+").replace(/_/g, "/");
8
- return Buffer.from(padded, "base64").toString("latin1").match(/\x08\x13".+?(https?:\/\/[^\xd2]+)\xd2\x01/)?.[1];
11
+ const padded = match[1].replace(base64UrlMinusRegex, "+").replace(base64UrlUnderscoreRegex, "/");
12
+ return Buffer.from(padded, "base64").toString("latin1").match(protobufFramingRegex)?.[1];
9
13
  };
10
14
  //#endregion
11
15
  export { unwrapGoogleNewsModern };
package/package.json CHANGED
@@ -39,7 +39,7 @@
39
39
  "build": "tsdown src/index.ts src/defaults.ts --format esm --dts --clean --unbundle --no-fixed-extension"
40
40
  },
41
41
  "dependencies": {
42
- "@wordpress/autop": "^4.45.0",
42
+ "@wordpress/autop": "^4.46.0",
43
43
  "highlight.js": "^11.11.1",
44
44
  "linkedom": "^0.18.12",
45
45
  "linkifyjs": "^4.3.2",
@@ -54,5 +54,5 @@
54
54
  "kvalita": "^1.13.0",
55
55
  "tsdown": "^0.22.0"
56
56
  },
57
- "version": "1.0.0"
57
+ "version": "1.1.0"
58
58
  }