feedsweep 1.0.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. package/README.md +75 -19
  2. package/dist/common.d.ts +8 -8
  3. package/dist/common.js +65 -42
  4. package/dist/defaults.d.ts +2 -2
  5. package/dist/defaults.js +48 -15
  6. package/dist/embeds/youtube.js +2 -2
  7. package/dist/index.d.ts +17 -10
  8. package/dist/index.js +25 -14
  9. package/dist/parsers/linkedom.d.ts +4 -0
  10. package/dist/parsers/linkedom.js +38 -0
  11. package/dist/transforms/dom/convertBreaksToParagraphs.d.ts +6 -0
  12. package/dist/transforms/dom/convertBreaksToParagraphs.js +80 -0
  13. package/dist/transforms/dom/decodeDoubleEncodedTags.d.ts +6 -0
  14. package/dist/transforms/dom/decodeDoubleEncodedTags.js +30 -0
  15. package/dist/transforms/dom/enrichEmbedPlaceholders.d.ts +6 -0
  16. package/dist/transforms/dom/enrichEmbedPlaceholders.js +32 -0
  17. package/dist/transforms/dom/fixLazyImages.js +33 -13
  18. package/dist/transforms/dom/highlightCode.js +3 -2
  19. package/dist/transforms/dom/injectEnclosures.d.ts +6 -0
  20. package/dist/transforms/dom/injectEnclosures.js +66 -0
  21. package/dist/transforms/dom/mergeConsecutiveOneLinerPres.js +1 -1
  22. package/dist/transforms/dom/mergeFragmentedLists.d.ts +6 -0
  23. package/dist/transforms/dom/mergeFragmentedLists.js +84 -0
  24. package/dist/transforms/dom/proxyAssetUrls.d.ts +6 -0
  25. package/dist/transforms/dom/proxyAssetUrls.js +64 -0
  26. package/dist/transforms/dom/removeTrackingPixels.js +22 -25
  27. package/dist/transforms/dom/replaceEmbedsWithPlaceholders.js +24 -25
  28. package/dist/transforms/dom/replacePreLineBreaks.js +3 -4
  29. package/dist/transforms/dom/resolveRelativeUrls.js +44 -30
  30. package/dist/transforms/dom/stripComments.js +5 -15
  31. package/dist/transforms/dom/stripDeadAnchors.d.ts +6 -0
  32. package/dist/transforms/dom/stripDeadAnchors.js +20 -0
  33. package/dist/transforms/dom/stripDuplicateTitleHeading.d.ts +6 -0
  34. package/dist/transforms/dom/stripDuplicateTitleHeading.js +31 -0
  35. package/dist/transforms/dom/stripEmptyTags.d.ts +6 -0
  36. package/dist/transforms/dom/stripEmptyTags.js +53 -0
  37. package/dist/transforms/dom/stripInterBlockBreaks.js +28 -8
  38. package/dist/transforms/dom/stripParagraphBoundaryBreaks.js +26 -6
  39. package/dist/transforms/dom/stripTrackingParams.js +7 -6
  40. package/dist/transforms/dom/trimPreWhitespace.js +4 -3
  41. package/dist/transforms/dom/unwrapDoublyNestedLists.d.ts +6 -0
  42. package/dist/transforms/dom/unwrapDoublyNestedLists.js +41 -0
  43. package/dist/transforms/dom/unwrapRedirectUrls.js +4 -2
  44. package/dist/transforms/dom/unwrapWrappers.d.ts +6 -0
  45. package/dist/transforms/dom/unwrapWrappers.js +30 -0
  46. package/dist/transforms/string/paragraphizePlainText.js +1 -1
  47. package/dist/transforms/string/stripOversizedBase64Sources.d.ts +6 -0
  48. package/dist/transforms/string/stripOversizedBase64Sources.js +13 -0
  49. package/dist/transforms/string/unwrapCdataComments.d.ts +6 -0
  50. package/dist/transforms/string/unwrapCdataComments.js +10 -0
  51. package/dist/types.d.ts +37 -7
  52. package/dist/unwraps/google.js +1 -1
  53. package/dist/unwraps/googleNewsModern.js +7 -3
  54. package/package.json +15 -5
  55. package/dist/transforms/dom/injectEnclosureEmbedPlaceholders.d.ts +0 -6
  56. package/dist/transforms/dom/injectEnclosureEmbedPlaceholders.js +0 -33
  57. package/dist/transforms/dom/simplifyFigures.d.ts +0 -6
  58. package/dist/transforms/dom/simplifyFigures.js +0 -27
  59. package/dist/transforms/string/decodeDoubleEncodedTags.d.ts +0 -6
  60. package/dist/transforms/string/decodeDoubleEncodedTags.js +0 -23
  61. package/dist/transforms/string/stripEmptyTags.d.ts +0 -6
  62. package/dist/transforms/string/stripEmptyTags.js +0 -25
  63. package/dist/transforms/string/stripOrphanedClosingTags.d.ts +0 -6
  64. package/dist/transforms/string/stripOrphanedClosingTags.js +0 -28
  65. package/dist/transforms/string/unwrapWrappers.d.ts +0 -6
  66. package/dist/transforms/string/unwrapWrappers.js +0 -10
package/README.md CHANGED
@@ -11,15 +11,19 @@ Feedsweep takes raw feed item HTML and runs it through a pipeline that genuinely
11
11
  ## Installation
12
12
 
13
13
  ```bash
14
- npm install feedsweep
14
+ npm install feedsweep linkedom
15
15
  ```
16
16
 
17
+ `linkedom` is an optional peer dependency. You only need it if you use the bundled `parseHtml` helper — see [DOM library](#dom-library) for jsdom / happy-dom / browser-native alternatives.
18
+
17
19
  ## Quick Start
18
20
 
19
21
  ```typescript
20
22
  import { transformContent } from 'feedsweep'
23
+ import { parseHtml } from 'feedsweep/linkedom'
21
24
 
22
- const result = transformContent('<p>Check <img data-src="photo.jpg"> and visit /about</p>', {
25
+ const result = await transformContent('<p>Check <img data-src="photo.jpg"> and visit /about</p>', {
26
+ parseHtmlFn: parseHtml,
23
27
  baseUrl: 'https://example.com/post/1',
24
28
  })
25
29
  ```
@@ -30,41 +34,93 @@ Inventory of every transform exported from the package. Most are enabled by defa
30
34
 
31
35
  | Transform | Description |
32
36
  | --- | --- |
33
- | `stripOrphanedClosingTags` | Remove unmatched `</p>` / `</div>` close tags |
34
37
  | `decodeDoubleEncodedTags` | Decode `&lt;tag&gt;` back to `<tag>` in mixed content |
35
- | `unwrapWrappers` | Remove outer `<div>`, `<article>`, `<section>` wrappers |
36
- | `paragraphizePlainText` | Wrap plain text in `<p>` tags |
37
- | `stripEmptyTags` | Remove empty `<p>`, `<div>`, `<span>` and other tags |
38
- | `stripComments` | Remove HTML `<!-- comments -->` |
39
38
  | `fixLazyImages` | Move `data-src` / `data-original` to real `src` |
40
- | `resolveRelativeUrls` | Convert relative URLs to absolute using base URL |
41
- | `unwrapRedirectUrls` | Remove Google/Bing/Facebook/etc. redirect wrappers |
42
- | `stripTrackingParams` | Remove UTM and other tracking parameters |
43
- | `removeTrackingPixels` | Strip 1×1 tracking pixel images |
44
- | `stripInterBlockBreaks` | Remove `<br>` tags between block elements |
45
- | `stripParagraphBoundaryBreaks` | Remove `<br>` tags adjacent to paragraph boundaries |
46
- | `highlightCode` | Syntax-highlight `<code>` blocks with highlight.js |
47
39
  | `mergeConsecutiveOneLinerPres` | Merge consecutive single-line `<pre>` tags |
48
40
  | `replacePreLineBreaks` | Replace `<br>` with `\n` inside `<pre>` |
49
- | `trimPreWhitespace` | Remove common leading indentation from `<pre>` |
50
- | `linkifyUrls` | Wrap bare URLs in `<a>` tags |
41
+ | `stripInterBlockBreaks` | Remove `<br>` tags between block elements |
42
+ | `stripParagraphBoundaryBreaks` | Remove `<br>` tags adjacent to paragraph boundaries |
43
+ | `stripDuplicateTitleHeading` | Remove first `<h1>`–`<h6>` matching article title |
44
+ | `unwrapRedirectUrls` | Remove Google/Bing/Facebook/etc. redirect wrappers |
45
+ | `stripDeadAnchors` | Unwrap `<a>` with empty, `#`, or `javascript:` href |
46
+ | `removeTrackingPixels` | Strip 1×1 tracking pixel images |
47
+ | `stripTrackingParams` | Remove UTM and other tracking parameters |
48
+ | `convertBreaksToParagraphs` | Convert `<br><br>` runs into semantic `<p>` blocks |
49
+ | `injectEnclosures` | Inject feed enclosures into content as native `<audio>`/`<video>` or iframe placeholders |
51
50
  | `replaceEmbedsWithPlaceholders` | Convert `<iframe>` to embed placeholders |
52
- | `injectEnclosureEmbedPlaceholders` | Add audio/video enclosures to content |
53
- | `simplifyFigures` | Unwrap `<figure>` when the figcaption is empty or redundant |
51
+ | `enrichEmbedPlaceholders` | Populate placeholder metadata (`title`, `description`, `duration`, etc.) via a caller-supplied async fn. Opt-in; not in defaults |
52
+ | `proxyAssetUrls` | Rewrite image, video, and audio URLs through a caller-supplied proxy |
53
+ | `resolveRelativeUrls` | Convert relative URLs to absolute using base URL |
54
+ | `unwrapWrappers` | Remove outer `<div>`, `<article>`, `<section>` wrappers |
55
+ | `unwrapDoublyNestedLists` | Unwrap `<ul>`/`<ol>` that wrap a single `<li>` containing a same-type list |
56
+ | `mergeFragmentedLists` | Merge consecutive sibling `<ul>` / `<ol>` lists with matching attributes |
57
+ | `paragraphizePlainText` | Wrap plain text in `<p>` tags |
58
+ | `stripOversizedBase64Sources` | Drop base64 `src`/`srcset`/`poster` payloads larger than 50 KB before parsing |
59
+ | `linkifyUrls` | Wrap bare URLs in `<a>` tags |
60
+ | `trimPreWhitespace` | Remove common leading indentation from `<pre>` |
61
+ | `highlightCode` | Syntax-highlight `<code>` blocks with highlight.js |
62
+ | `stripEmptyTags` | Remove empty `<p>`, `<div>`, `<span>` and other tags |
63
+ | `stripComments` | Remove HTML `<!-- comments -->` |
64
+ | `unwrapCdataComments` | Strip malformed `<!--[CDATA[ … ]]-->` wrappers before parsing so the wrapped article reaches the DOM as real HTML |
54
65
 
55
66
  ## Options
56
67
 
57
68
  ```typescript
58
69
  import { fixLazyImages, resolveRelativeUrls, transformContent } from 'feedsweep'
70
+ import { parseHtml } from 'feedsweep/linkedom'
59
71
 
60
72
  const result = transformContent(html, {
73
+ // Required: function that turns an HTML string into a `Document`. See "DOM library".
74
+ parseHtmlFn: parseHtml,
61
75
  // Base URL for resolving relative URLs.
62
76
  baseUrl: 'https://example.com/post/1',
63
77
  // Feed item enclosures (audio/video).
64
78
  enclosures: [{ url: 'https://example.com/audio.mp3', type: 'audio/mpeg' }],
79
+ // Route image/video/audio URLs through a proxy. Return `undefined` to leave a URL untouched.
80
+ assetProxyFn: (url, type) => `https://proxy.example.com/?type=${type}&url=${encodeURIComponent(url)}`,
81
+ // Populate embed placeholder metadata from a remote source (e.g. YouTube oEmbed).
82
+ enrichEmbedFn: async (embeds) => {
83
+ return new Map(embeds.map(({ provider, id }) => [`${provider}:${id}`, { title: '…' }]))
84
+ },
65
85
  // Run a custom DOM transform pipeline (omit to use defaults).
66
86
  domTransforms: [fixLazyImages, resolveRelativeUrls],
67
87
  })
68
88
  ```
69
89
 
70
- The `stringTransforms`, `domTransforms`, and `finalStringTransforms` options each fully replace the corresponding default phase when provided. Every transform is also exported individually from `feedsweep`, so you can compose any pipeline — list them explicitly to build from scratch, or spread `defaultDomTransforms` (etc.) from `feedsweep/defaults` to extend or filter the defaults.
90
+ The `stringTransforms` and `domTransforms` options each fully replace the corresponding default phase when provided. Every transform is also exported individually from `feedsweep`, so you can compose any pipeline — list them explicitly to build from scratch, or spread `defaultDomTransforms` (etc.) from `feedsweep/defaults` to extend or filter the defaults.
91
+
92
+ ## DOM library
93
+
94
+ Feedsweep is parser-agnostic. You provide `parseHtmlFn` — a function that turns an HTML string into a `Document`. Use any DOM library that produces a standards-compliant `Document`.
95
+
96
+ ```typescript
97
+ // linkedom (recommended default)
98
+ import { transformContent } from 'feedsweep'
99
+ import { parseHtml } from 'feedsweep/linkedom'
100
+
101
+ await transformContent(html, { parseHtmlFn: parseHtml, baseUrl })
102
+
103
+ // jsdom
104
+ import { transformContent } from 'feedsweep'
105
+ import { JSDOM } from 'jsdom'
106
+
107
+ await transformContent(html, {
108
+ parseHtmlFn: (raw) => new JSDOM(`<!doctype html><body>${raw}</body>`).window.document,
109
+ baseUrl,
110
+ })
111
+
112
+ // happy-dom
113
+ import { transformContent } from 'feedsweep'
114
+ import { Window } from 'happy-dom'
115
+
116
+ await transformContent(html, {
117
+ parseHtmlFn: (raw) => {
118
+ const window = new Window()
119
+ window.document.body.innerHTML = raw
120
+ return window.document
121
+ },
122
+ baseUrl,
123
+ })
124
+ ```
125
+
126
+ The bundled `feedsweep/linkedom` parser bakes in two workarounds for linkedom-specific spec violations (attribute case-folding and SVG XML mode). jsdom and happy-dom do not need them.
package/dist/common.d.ts CHANGED
@@ -1,11 +1,11 @@
1
- import { EmbedResolverResult } from "./types.js";
1
+ import { EmbedResolverResult, MaybePromise } from "./types.js";
2
2
 
3
3
  //#region src/common.d.ts
4
- declare const stripOversizedBase64Sources: (html: string, maxSize: number) => string;
5
- declare const parseFragment: (html: string) => Document;
6
- declare const transformHtml: (html: string, transform: (document: Document) => void) => string;
7
- declare const applyDomTransforms: (html: string, transforms: Array<(document: Document) => void>) => string;
8
- declare const applyStringTransforms: (html: string, transforms: Array<(html: string) => string>) => string;
9
- declare const createEmbedPlaceholder: (document: Document, src: string, type: "video" | "audio" | "iframe", metadata?: Partial<EmbedResolverResult>) => HTMLElement;
4
+ declare const applyDomTransforms: (document: Document, transforms: Array<(document: Document) => MaybePromise<void>>) => Promise<string>;
5
+ declare const applyStringTransforms: (html: string, transforms: Array<(html: string) => MaybePromise<string>>) => Promise<string>;
6
+ declare const applyEmbedMetadata: (element: HTMLElement, metadata: Partial<EmbedResolverResult>, options?: {
7
+ setIfMissing?: boolean;
8
+ }) => void;
9
+ declare const createEmbedPlaceholder: (document: Document, src: string, metadata?: Partial<EmbedResolverResult>) => HTMLElement;
10
10
  //#endregion
11
- export { applyDomTransforms, applyStringTransforms, createEmbedPlaceholder, parseFragment, stripOversizedBase64Sources, transformHtml };
11
+ export { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder };
package/dist/common.js CHANGED
@@ -1,39 +1,27 @@
1
+ import { coerceNumber } from "./utils.js";
1
2
  import { resolveUrl } from "feedcanon";
2
- import { parseHTML } from "linkedom";
3
3
  //#region src/common.ts
4
4
  const Node = {
5
5
  ELEMENT_NODE: 1,
6
6
  TEXT_NODE: 3,
7
7
  COMMENT_NODE: 8
8
8
  };
9
- const base64SrcRegex = /((?:src|srcset|poster)=["'])data:[^"']*;base64,[^"']*(["'])/g;
9
+ const NodeFilter = {
10
+ SHOW_ELEMENT: 1,
11
+ SHOW_TEXT: 4,
12
+ SHOW_COMMENT: 128
13
+ };
10
14
  const safeThumbnailDataUrlRegex = /^data:image\/(png|jpe?g|gif|webp|avif);/i;
11
15
  const isSafeThumbnailUrl = (url) => {
12
16
  return resolveUrl(url) !== void 0 || safeThumbnailDataUrlRegex.test(url);
13
17
  };
14
- const stripOversizedBase64Sources = (html, maxSize) => {
15
- return html.replace(base64SrcRegex, (match, prefix, suffix) => {
16
- if (match.length < maxSize) return match;
17
- return `${prefix}${suffix}`;
18
- });
19
- };
20
- const parseFragment = (html) => {
21
- const { document } = parseHTML(`<!doctype html><html><head></head><body>${html}</body></html>`);
22
- return document;
23
- };
24
- const transformHtml = (html, transform) => {
25
- const document = parseFragment(html);
26
- transform(document);
18
+ const applyDomTransforms = async (document, transforms) => {
19
+ for (const transform of transforms) await transform(document);
27
20
  return document.body.innerHTML;
28
21
  };
29
- const applyDomTransforms = (html, transforms) => {
30
- const document = parseFragment(stripOversizedBase64Sources(html, 50 * 1024));
31
- for (const transform of transforms) transform(document);
32
- return document.body.innerHTML;
33
- };
34
- const applyStringTransforms = (html, transforms) => {
22
+ const applyStringTransforms = async (html, transforms) => {
35
23
  let output = html;
36
- for (const transform of transforms) output = transform(output);
24
+ for (const transform of transforms) output = await transform(output);
37
25
  return output;
38
26
  };
39
27
  const blockElements = new Set([
@@ -71,10 +59,10 @@ const blockElements = new Set([
71
59
  "ul"
72
60
  ]);
73
61
  const isWhitespaceText = (node) => {
74
- return node.nodeType === Node.TEXT_NODE && !(node.textContent ?? "").trim();
62
+ return node.nodeType === Node.TEXT_NODE && !node.textContent?.trim();
75
63
  };
76
64
  const isBr = (node) => {
77
- return node.nodeType === Node.ELEMENT_NODE && node.tagName.toLowerCase() === "br";
65
+ return node.nodeType === Node.ELEMENT_NODE && node.localName === "br";
78
66
  };
79
67
  const isComment = (node) => {
80
68
  return node.nodeType === Node.COMMENT_NODE;
@@ -83,28 +71,63 @@ const isSkippable = (node) => {
83
71
  return isWhitespaceText(node) || isBr(node) || isComment(node);
84
72
  };
85
73
  const isBlockElement = (node) => {
86
- return node.nodeType === Node.ELEMENT_NODE && blockElements.has(node.tagName.toLowerCase());
74
+ return node.nodeType === Node.ELEMENT_NODE && blockElements.has(node.localName);
87
75
  };
88
- const unwrapOuterTag = (html, pattern) => {
89
- let result = html.trim();
90
- let match = pattern.exec(result);
91
- while (match) {
92
- result = match[3].trim();
93
- match = pattern.exec(result);
76
+ const hasAncestorWithTagName = (node, tagSet, stopAt) => {
77
+ let ancestor = node.parentNode;
78
+ while (ancestor !== null && ancestor !== stopAt) {
79
+ if (ancestor.nodeType === Node.ELEMENT_NODE && tagSet.has(ancestor.localName)) return true;
80
+ ancestor = ancestor.parentNode;
94
81
  }
95
- return result;
82
+ return false;
83
+ };
84
+ const styleWidthRegex = /(?:^|;)\s*width\s*:\s*([0-9]*\.?[0-9]+)\s*(?:px)?\s*(?:;|$)/i;
85
+ const styleHeightRegex = /(?:^|;)\s*height\s*:\s*([0-9]*\.?[0-9]+)\s*(?:px)?\s*(?:;|$)/i;
86
+ const getDimensions = (element) => {
87
+ const width = coerceNumber(element.getAttribute("width"));
88
+ const height = coerceNumber(element.getAttribute("height"));
89
+ if (width !== void 0 && height !== void 0) return {
90
+ width,
91
+ height
92
+ };
93
+ const style = element.getAttribute("style");
94
+ if (!style) return {
95
+ width,
96
+ height
97
+ };
98
+ const fromStyle = (regex) => {
99
+ const match = regex.exec(style);
100
+ return match ? coerceNumber(match[1]) : void 0;
101
+ };
102
+ return {
103
+ width: width ?? fromStyle(styleWidthRegex),
104
+ height: height ?? fromStyle(styleHeightRegex)
105
+ };
106
+ };
107
+ const applyEmbedMetadata = (element, metadata, options) => {
108
+ const setIfMissing = options?.setIfMissing ?? false;
109
+ const set = (name, value) => {
110
+ if (setIfMissing && element.hasAttribute(name)) return;
111
+ element.setAttribute(name, value);
112
+ };
113
+ if (metadata.provider) set("data-embed-provider", metadata.provider);
114
+ if (metadata.id) set("data-embed-id", metadata.id);
115
+ if (metadata.src) set("data-embed-src", metadata.src);
116
+ if (metadata.url) set("data-embed-url", metadata.url);
117
+ if (metadata.thumbnail && isSafeThumbnailUrl(metadata.thumbnail)) set("data-embed-thumbnail", metadata.thumbnail);
118
+ if (metadata.width) set("data-embed-width", String(metadata.width));
119
+ if (metadata.height) set("data-embed-height", String(metadata.height));
120
+ if (metadata.title) set("data-embed-title", metadata.title);
121
+ if (metadata.description) set("data-embed-description", metadata.description);
122
+ if (metadata.author) set("data-embed-author", metadata.author);
123
+ if (metadata.avatar && isSafeThumbnailUrl(metadata.avatar)) set("data-embed-avatar", metadata.avatar);
124
+ if (metadata.duration) set("data-embed-duration", String(metadata.duration));
96
125
  };
97
- const createEmbedPlaceholder = (document, src, type, metadata) => {
126
+ const createEmbedPlaceholder = (document, src, metadata) => {
98
127
  const element = document.createElement("div");
99
- element.setAttribute("data-embed", metadata?.type ?? type);
128
+ element.setAttribute("data-embed", "iframe");
100
129
  element.setAttribute("data-embed-src", metadata?.src ?? src);
101
- if (metadata?.provider) element.setAttribute("data-embed-provider", metadata.provider);
102
- if (metadata?.url) element.setAttribute("data-embed-url", metadata.url);
103
- if (metadata?.thumbnail && isSafeThumbnailUrl(metadata.thumbnail)) element.setAttribute("data-embed-thumbnail", metadata.thumbnail);
104
- if (metadata?.width) element.setAttribute("data-embed-width", String(metadata.width));
105
- if (metadata?.height) element.setAttribute("data-embed-height", String(metadata.height));
106
- if (metadata?.author) element.setAttribute("data-embed-author", metadata.author);
107
- if (metadata?.text) element.setAttribute("data-embed-text", metadata.text);
130
+ if (metadata) applyEmbedMetadata(element, metadata);
108
131
  const fallbackUrl = metadata?.url ?? metadata?.src ?? src;
109
132
  const link = document.createElement("a");
110
133
  link.setAttribute("href", fallbackUrl);
@@ -113,4 +136,4 @@ const createEmbedPlaceholder = (document, src, type, metadata) => {
113
136
  return element;
114
137
  };
115
138
  //#endregion
116
- export { Node, applyDomTransforms, applyStringTransforms, createEmbedPlaceholder, isBlockElement, isBr, isSkippable, parseFragment, stripOversizedBase64Sources, transformHtml, unwrapOuterTag };
139
+ export { Node, NodeFilter, applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder, getDimensions, hasAncestorWithTagName, isBlockElement, isBr, isSkippable, isWhitespaceText };
@@ -3,12 +3,12 @@ import { DomTransform, EmbedResolver, ResolveUrlFn, StringTransform, UrlUnwrappe
3
3
  //#region src/defaults.d.ts
4
4
  declare const defaultStringTransforms: Array<StringTransform>;
5
5
  declare const defaultDomTransforms: Array<DomTransform>;
6
- declare const defaultFinalStringTransforms: Array<StringTransform>;
7
6
  declare const defaultEmbedResolvers: Array<EmbedResolver>;
8
7
  declare const defaultResolveUrlFn: ResolveUrlFn;
9
8
  declare const defaultLazySrcAttributes: string[];
9
+ declare const defaultLazySrcsetAttributes: string[];
10
10
  declare const defaultTrackingHosts: string[];
11
11
  declare const defaultTrackingPathSegments: string[];
12
12
  declare const defaultUrlUnwrappers: Array<UrlUnwrapper>;
13
13
  //#endregion
14
- export { defaultDomTransforms, defaultEmbedResolvers, defaultFinalStringTransforms, defaultLazySrcAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
14
+ export { defaultDomTransforms, defaultEmbedResolvers, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
package/dist/defaults.js CHANGED
@@ -1,24 +1,31 @@
1
1
  import { youtubeEmbedResolver } from "./embeds/youtube.js";
2
+ import { convertBreaksToParagraphs } from "./transforms/dom/convertBreaksToParagraphs.js";
3
+ import { decodeDoubleEncodedTags } from "./transforms/dom/decodeDoubleEncodedTags.js";
2
4
  import { fixLazyImages } from "./transforms/dom/fixLazyImages.js";
3
5
  import { highlightCode } from "./transforms/dom/highlightCode.js";
4
- import { injectEnclosureEmbedPlaceholders } from "./transforms/dom/injectEnclosureEmbedPlaceholders.js";
6
+ import { injectEnclosures } from "./transforms/dom/injectEnclosures.js";
5
7
  import { linkifyUrls } from "./transforms/dom/linkifyUrls.js";
6
8
  import { mergeConsecutiveOneLinerPres } from "./transforms/dom/mergeConsecutiveOneLinerPres.js";
9
+ import { mergeFragmentedLists } from "./transforms/dom/mergeFragmentedLists.js";
10
+ import { proxyAssetUrls } from "./transforms/dom/proxyAssetUrls.js";
7
11
  import { removeTrackingPixels } from "./transforms/dom/removeTrackingPixels.js";
8
12
  import { replaceEmbedsWithPlaceholders } from "./transforms/dom/replaceEmbedsWithPlaceholders.js";
9
13
  import { replacePreLineBreaks } from "./transforms/dom/replacePreLineBreaks.js";
10
14
  import { resolveRelativeUrls } from "./transforms/dom/resolveRelativeUrls.js";
11
15
  import { stripComments } from "./transforms/dom/stripComments.js";
16
+ import { stripDeadAnchors } from "./transforms/dom/stripDeadAnchors.js";
17
+ import { stripDuplicateTitleHeading } from "./transforms/dom/stripDuplicateTitleHeading.js";
18
+ import { stripEmptyTags } from "./transforms/dom/stripEmptyTags.js";
12
19
  import { stripInterBlockBreaks } from "./transforms/dom/stripInterBlockBreaks.js";
13
20
  import { stripParagraphBoundaryBreaks } from "./transforms/dom/stripParagraphBoundaryBreaks.js";
14
21
  import { stripTrackingParams } from "./transforms/dom/stripTrackingParams.js";
15
22
  import { trimPreWhitespace } from "./transforms/dom/trimPreWhitespace.js";
23
+ import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedLists.js";
16
24
  import { unwrapRedirectUrls } from "./transforms/dom/unwrapRedirectUrls.js";
17
- import { decodeDoubleEncodedTags } from "./transforms/string/decodeDoubleEncodedTags.js";
25
+ import { unwrapWrappers } from "./transforms/dom/unwrapWrappers.js";
18
26
  import { paragraphizePlainText } from "./transforms/string/paragraphizePlainText.js";
19
- import { stripEmptyTags } from "./transforms/string/stripEmptyTags.js";
20
- import { stripOrphanedClosingTags } from "./transforms/string/stripOrphanedClosingTags.js";
21
- import { unwrapWrappers } from "./transforms/string/unwrapWrappers.js";
27
+ import { stripOversizedBase64Sources } from "./transforms/string/stripOversizedBase64Sources.js";
28
+ import { unwrapCdataComments } from "./transforms/string/unwrapCdataComments.js";
22
29
  import { unwrapBing } from "./unwraps/bing.js";
23
30
  import { unwrapFacebookShim } from "./unwraps/facebook.js";
24
31
  import { unwrapGoogle } from "./unwraps/google.js";
@@ -34,30 +41,36 @@ import { unwrapYouTube } from "./unwraps/youtube.js";
34
41
  import { resolveUrl } from "feedcanon";
35
42
  //#region src/defaults.ts
36
43
  const defaultStringTransforms = [
37
- stripOrphanedClosingTags,
38
- decodeDoubleEncodedTags,
39
- unwrapWrappers,
40
- paragraphizePlainText,
41
- stripEmptyTags
44
+ stripOversizedBase64Sources,
45
+ unwrapCdataComments,
46
+ paragraphizePlainText
42
47
  ];
43
48
  const defaultDomTransforms = [
49
+ decodeDoubleEncodedTags,
44
50
  stripComments,
51
+ unwrapDoublyNestedLists,
52
+ stripDuplicateTitleHeading,
45
53
  fixLazyImages,
46
54
  resolveRelativeUrls,
47
55
  unwrapRedirectUrls,
56
+ stripDeadAnchors,
48
57
  stripTrackingParams,
49
58
  removeTrackingPixels,
59
+ convertBreaksToParagraphs,
50
60
  stripInterBlockBreaks,
51
61
  stripParagraphBoundaryBreaks,
62
+ mergeFragmentedLists,
52
63
  highlightCode,
53
64
  mergeConsecutiveOneLinerPres,
54
65
  replacePreLineBreaks,
55
66
  trimPreWhitespace,
56
67
  linkifyUrls,
57
68
  replaceEmbedsWithPlaceholders,
58
- injectEnclosureEmbedPlaceholders
69
+ injectEnclosures,
70
+ proxyAssetUrls,
71
+ unwrapWrappers,
72
+ stripEmptyTags
59
73
  ];
60
- const defaultFinalStringTransforms = [stripEmptyTags];
61
74
  const defaultEmbedResolvers = [youtubeEmbedResolver];
62
75
  const defaultResolveUrlFn = (url, baseUrl) => resolveUrl(url, baseUrl);
63
76
  const defaultLazySrcAttributes = [
@@ -75,9 +88,27 @@ const defaultLazySrcAttributes = [
75
88
  "data-image-src",
76
89
  "data-canonical-src",
77
90
  "data-img-url",
91
+ "nitro-lazy-src",
78
92
  "data-orig",
79
93
  "data-runner-src"
80
94
  ];
95
+ const defaultLazySrcsetAttributes = [
96
+ "data-srcset",
97
+ "data-tf-srcset",
98
+ "data-lazy-srcset",
99
+ "data-image-srcset",
100
+ "data-modal-srcset",
101
+ "data-splide-lazy-srcset",
102
+ "data-alt-srcset",
103
+ "fifu-data-srcset",
104
+ "data-thumb-srcset",
105
+ "data-vp-popup-img-srcset",
106
+ "data-original-srcset",
107
+ "data-pswp-srcset",
108
+ "data-nectar-img-srcset",
109
+ "nitro-lazy-srcset",
110
+ "data-flickity-lazyload-srcset"
111
+ ];
81
112
  const defaultTrackingHosts = [
82
113
  "feedsportal.com",
83
114
  "stats.wordpress.com",
@@ -102,12 +133,14 @@ const defaultTrackingHosts = [
102
133
  "quantserve.com",
103
134
  "chartbeat.com",
104
135
  "moatads.com",
105
- "sentry.io"
136
+ "sentry.io",
137
+ "hubspot.com"
106
138
  ];
107
139
  const defaultTrackingPathSegments = [
108
140
  "pixel",
109
141
  "beacon",
110
- "count"
142
+ "count",
143
+ "impression"
111
144
  ];
112
145
  const defaultUrlUnwrappers = [
113
146
  unwrapBing,
@@ -124,4 +157,4 @@ const defaultUrlUnwrappers = [
124
157
  unwrapRedditOut
125
158
  ];
126
159
  //#endregion
127
- export { defaultDomTransforms, defaultEmbedResolvers, defaultFinalStringTransforms, defaultLazySrcAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
160
+ export { defaultDomTransforms, defaultEmbedResolvers, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
@@ -32,10 +32,10 @@ const youtubeResolveEmbed = (url) => {
32
32
  if (!videoId) return;
33
33
  return {
34
34
  provider: "youtube",
35
+ id: videoId,
35
36
  src: `https://www.youtube-nocookie.com/embed/${videoId}`,
36
37
  url: `https://www.youtube.com/watch?v=${videoId}`,
37
- thumbnail: composeThumbnailUrl(videoId),
38
- type: "iframe"
38
+ thumbnail: composeThumbnailUrl(videoId)
39
39
  };
40
40
  };
41
41
  const youtubeEmbedResolver = {
package/dist/index.d.ts CHANGED
@@ -1,28 +1,35 @@
1
- import { DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext } from "./types.js";
1
+ import { AssetProxyFn, AssetType, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ParseHtmlFn, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext } from "./types.js";
2
2
  import { defaultResolveUrlFn } from "./defaults.js";
3
- import { applyDomTransforms, applyStringTransforms, createEmbedPlaceholder, parseFragment, stripOversizedBase64Sources, transformHtml } from "./common.js";
3
+ import { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder } from "./common.js";
4
4
  import { composeThumbnailUrl, extractVideoId, youtubeEmbedResolver, youtubeResolveEmbed } from "./embeds/youtube.js";
5
+ import { convertBreaksToParagraphs } from "./transforms/dom/convertBreaksToParagraphs.js";
6
+ import { decodeDoubleEncodedTags } from "./transforms/dom/decodeDoubleEncodedTags.js";
7
+ import { enrichEmbedPlaceholders } from "./transforms/dom/enrichEmbedPlaceholders.js";
5
8
  import { fixLazyImages } from "./transforms/dom/fixLazyImages.js";
6
9
  import { detectLanguage, highlightCode } from "./transforms/dom/highlightCode.js";
7
- import { injectEnclosureEmbedPlaceholders } from "./transforms/dom/injectEnclosureEmbedPlaceholders.js";
10
+ import { injectEnclosures } from "./transforms/dom/injectEnclosures.js";
8
11
  import { linkifyUrls } from "./transforms/dom/linkifyUrls.js";
9
12
  import { mergeConsecutiveOneLinerPres } from "./transforms/dom/mergeConsecutiveOneLinerPres.js";
13
+ import { mergeFragmentedLists } from "./transforms/dom/mergeFragmentedLists.js";
14
+ import { proxyAssetUrls } from "./transforms/dom/proxyAssetUrls.js";
10
15
  import { removeTrackingPixels } from "./transforms/dom/removeTrackingPixels.js";
11
16
  import { replaceEmbedsWithPlaceholders } from "./transforms/dom/replaceEmbedsWithPlaceholders.js";
12
17
  import { replacePreLineBreaks } from "./transforms/dom/replacePreLineBreaks.js";
13
18
  import { resolveRelativeUrls } from "./transforms/dom/resolveRelativeUrls.js";
14
- import { simplifyFigures } from "./transforms/dom/simplifyFigures.js";
15
19
  import { stripComments } from "./transforms/dom/stripComments.js";
20
+ import { stripDeadAnchors } from "./transforms/dom/stripDeadAnchors.js";
21
+ import { stripDuplicateTitleHeading } from "./transforms/dom/stripDuplicateTitleHeading.js";
22
+ import { stripEmptyTags } from "./transforms/dom/stripEmptyTags.js";
16
23
  import { stripInterBlockBreaks } from "./transforms/dom/stripInterBlockBreaks.js";
17
24
  import { stripParagraphBoundaryBreaks } from "./transforms/dom/stripParagraphBoundaryBreaks.js";
18
25
  import { stripTrackingParams } from "./transforms/dom/stripTrackingParams.js";
19
26
  import { trimPreWhitespace } from "./transforms/dom/trimPreWhitespace.js";
27
+ import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedLists.js";
20
28
  import { extractRedirectTarget, unwrapRedirectUrls } from "./transforms/dom/unwrapRedirectUrls.js";
21
- import { decodeDoubleEncodedTags } from "./transforms/string/decodeDoubleEncodedTags.js";
29
+ import { unwrapWrappers } from "./transforms/dom/unwrapWrappers.js";
22
30
  import { paragraphizePlainText } from "./transforms/string/paragraphizePlainText.js";
23
- import { stripEmptyTags } from "./transforms/string/stripEmptyTags.js";
24
- import { stripOrphanedClosingTags } from "./transforms/string/stripOrphanedClosingTags.js";
25
- import { unwrapWrappers } from "./transforms/string/unwrapWrappers.js";
31
+ import { stripOversizedBase64Sources } from "./transforms/string/stripOversizedBase64Sources.js";
32
+ import { unwrapCdataComments } from "./transforms/string/unwrapCdataComments.js";
26
33
  import { unwrapAceml } from "./unwraps/aceml.js";
27
34
  import { unwrapAdjust } from "./unwraps/adjust.js";
28
35
  import { unwrapAmazonAffiliate } from "./unwraps/amazonAffiliate.js";
@@ -100,6 +107,6 @@ import { unwrapZhihu } from "./unwraps/zhihu.js";
100
107
  import { ParamExtractorConfig, chooseBaseUrl, coerceNumber, createParamExtractor } from "./utils.js";
101
108
 
102
109
  //#region src/index.d.ts
103
- declare const transformContent: (html: string, options?: TransformContentOptions) => string;
110
+ declare const transformContent: (html: string, options: TransformContentOptions) => Promise<string>;
104
111
  //#endregion
105
- export { type DomTransform, type EmbedResolver, type EmbedResolverResult, type Enclosure, type ParamExtractorConfig, type ResolveUrlFn, type StringTransform, type TransformContentOptions, type TransformContext, applyDomTransforms, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode, injectEnclosureEmbedPlaceholders, linkifyUrls, mergeConsecutiveOneLinerPres, paragraphizePlainText, parseFragment, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, simplifyFigures, stripComments, stripEmptyTags, stripInterBlockBreaks, stripOrphanedClosingTags, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, transformContent, transformHtml, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, 
unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, youtubeEmbedResolver, youtubeResolveEmbed };
112
+ export { type AssetProxyFn, type AssetType, type DomTransform, type EmbedResolver, type EmbedResolverResult, type Enclosure, type EnrichEmbedFn, type MaybePromise, type ParamExtractorConfig, type ParseHtmlFn, type ResolveUrlFn, type StringTransform, type TransformContentOptions, type TransformContext, applyDomTransforms, applyEmbedMetadata, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBreaksToParagraphs, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode, injectEnclosures, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, paragraphizePlainText, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInterBlockBreaks, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, transformContent, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, 
unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, youtubeEmbedResolver, youtubeResolveEmbed };
package/dist/index.js CHANGED
@@ -1,26 +1,33 @@
1
- import { applyDomTransforms, applyStringTransforms, createEmbedPlaceholder, parseFragment, stripOversizedBase64Sources, transformHtml } from "./common.js";
1
+ import { chooseBaseUrl, coerceNumber, createParamExtractor } from "./utils.js";
2
+ import { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder } from "./common.js";
2
3
  import { composeThumbnailUrl, extractVideoId, youtubeEmbedResolver, youtubeResolveEmbed } from "./embeds/youtube.js";
4
+ import { convertBreaksToParagraphs } from "./transforms/dom/convertBreaksToParagraphs.js";
5
+ import { decodeDoubleEncodedTags } from "./transforms/dom/decodeDoubleEncodedTags.js";
3
6
  import { fixLazyImages } from "./transforms/dom/fixLazyImages.js";
4
7
  import { detectLanguage, highlightCode } from "./transforms/dom/highlightCode.js";
5
- import { injectEnclosureEmbedPlaceholders } from "./transforms/dom/injectEnclosureEmbedPlaceholders.js";
8
+ import { injectEnclosures } from "./transforms/dom/injectEnclosures.js";
6
9
  import { linkifyUrls } from "./transforms/dom/linkifyUrls.js";
7
10
  import { mergeConsecutiveOneLinerPres } from "./transforms/dom/mergeConsecutiveOneLinerPres.js";
11
+ import { mergeFragmentedLists } from "./transforms/dom/mergeFragmentedLists.js";
12
+ import { proxyAssetUrls } from "./transforms/dom/proxyAssetUrls.js";
8
13
  import { removeTrackingPixels } from "./transforms/dom/removeTrackingPixels.js";
9
- import { chooseBaseUrl, coerceNumber, createParamExtractor } from "./utils.js";
10
14
  import { replaceEmbedsWithPlaceholders } from "./transforms/dom/replaceEmbedsWithPlaceholders.js";
11
15
  import { replacePreLineBreaks } from "./transforms/dom/replacePreLineBreaks.js";
12
16
  import { resolveRelativeUrls } from "./transforms/dom/resolveRelativeUrls.js";
13
17
  import { stripComments } from "./transforms/dom/stripComments.js";
18
+ import { stripDeadAnchors } from "./transforms/dom/stripDeadAnchors.js";
19
+ import { stripDuplicateTitleHeading } from "./transforms/dom/stripDuplicateTitleHeading.js";
20
+ import { stripEmptyTags } from "./transforms/dom/stripEmptyTags.js";
14
21
  import { stripInterBlockBreaks } from "./transforms/dom/stripInterBlockBreaks.js";
15
22
  import { stripParagraphBoundaryBreaks } from "./transforms/dom/stripParagraphBoundaryBreaks.js";
16
23
  import { stripTrackingParams } from "./transforms/dom/stripTrackingParams.js";
17
24
  import { trimPreWhitespace } from "./transforms/dom/trimPreWhitespace.js";
25
+ import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedLists.js";
18
26
  import { extractRedirectTarget, unwrapRedirectUrls } from "./transforms/dom/unwrapRedirectUrls.js";
19
- import { decodeDoubleEncodedTags } from "./transforms/string/decodeDoubleEncodedTags.js";
27
+ import { unwrapWrappers } from "./transforms/dom/unwrapWrappers.js";
20
28
  import { paragraphizePlainText } from "./transforms/string/paragraphizePlainText.js";
21
- import { stripEmptyTags } from "./transforms/string/stripEmptyTags.js";
22
- import { stripOrphanedClosingTags } from "./transforms/string/stripOrphanedClosingTags.js";
23
- import { unwrapWrappers } from "./transforms/string/unwrapWrappers.js";
29
+ import { stripOversizedBase64Sources } from "./transforms/string/stripOversizedBase64Sources.js";
30
+ import { unwrapCdataComments } from "./transforms/string/unwrapCdataComments.js";
24
31
  import { unwrapBing } from "./unwraps/bing.js";
25
32
  import { unwrapFacebookShim } from "./unwraps/facebook.js";
26
33
  import { unwrapGoogle } from "./unwraps/google.js";
@@ -33,8 +40,8 @@ import { unwrapRedditOut } from "./unwraps/redditOut.js";
33
40
  import { unwrapVkAway } from "./unwraps/vkAway.js";
34
41
  import { unwrapYahooSearch } from "./unwraps/yahooSearch.js";
35
42
  import { unwrapYouTube } from "./unwraps/youtube.js";
36
- import { defaultDomTransforms, defaultEmbedResolvers, defaultFinalStringTransforms, defaultLazySrcAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers } from "./defaults.js";
37
- import { simplifyFigures } from "./transforms/dom/simplifyFigures.js";
43
+ import { defaultDomTransforms, defaultEmbedResolvers, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers } from "./defaults.js";
44
+ import { enrichEmbedPlaceholders } from "./transforms/dom/enrichEmbedPlaceholders.js";
38
45
  import { unwrapAceml } from "./unwraps/aceml.js";
39
46
  import { unwrapAdjust } from "./unwraps/adjust.js";
40
47
  import { unwrapAmazonAffiliate } from "./unwraps/amazonAffiliate.js";
@@ -98,21 +105,25 @@ import { unwrapWebArchive } from "./unwraps/webArchive.js";
98
105
  import { unwrapYandexTurbo } from "./unwraps/yandexTurbo.js";
99
106
  import { unwrapZhihu } from "./unwraps/zhihu.js";
100
107
  //#region src/index.ts
101
- const transformContent = (html, options = {}) => {
108
+ const transformContent = async (html, options) => {
102
109
  const context = {
103
110
  baseUrl: options.baseUrl,
104
111
  enclosures: options.enclosures,
105
112
  embedResolvers: options.embedResolvers ?? defaultEmbedResolvers,
106
113
  lazySrcAttributes: options.lazySrcAttributes ?? defaultLazySrcAttributes,
114
+ lazySrcsetAttributes: options.lazySrcsetAttributes ?? defaultLazySrcsetAttributes,
107
115
  trackingHosts: options.trackingHosts ?? defaultTrackingHosts,
108
116
  trackingPathSegments: options.trackingPathSegments ?? defaultTrackingPathSegments,
109
117
  urlUnwrappers: options.urlUnwrappers ?? defaultUrlUnwrappers,
110
- resolveUrlFn: options.resolveUrlFn ?? defaultResolveUrlFn
118
+ resolveUrlFn: options.resolveUrlFn ?? defaultResolveUrlFn,
119
+ assetProxyFn: options.assetProxyFn,
120
+ enrichEmbedFn: options.enrichEmbedFn,
121
+ articleTitle: options.articleTitle
111
122
  };
112
123
  const stringFns = options.stringTransforms ?? defaultStringTransforms;
113
124
  const domFns = options.domTransforms ?? defaultDomTransforms;
114
- const finalFns = options.finalStringTransforms ?? defaultFinalStringTransforms;
115
- return applyStringTransforms(applyDomTransforms(applyStringTransforms(html, stringFns.map((transform) => transform(context))), domFns.map((transform) => transform(context))), finalFns.map((transform) => transform(context)));
125
+ const afterString = await applyStringTransforms(html, stringFns.map((transform) => transform(context)));
126
+ return await applyDomTransforms(await options.parseHtmlFn(afterString), domFns.map((transform) => transform(context)));
116
127
  };
117
128
  //#endregion
118
- export { applyDomTransforms, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode, injectEnclosureEmbedPlaceholders, linkifyUrls, mergeConsecutiveOneLinerPres, paragraphizePlainText, parseFragment, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, simplifyFigures, stripComments, stripEmptyTags, stripInterBlockBreaks, stripOrphanedClosingTags, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, transformContent, transformHtml, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, 
unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, youtubeEmbedResolver, youtubeResolveEmbed };
129
+ export { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBreaksToParagraphs, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode, injectEnclosures, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, paragraphizePlainText, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInterBlockBreaks, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, transformContent, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, 
unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, youtubeEmbedResolver, youtubeResolveEmbed };
@@ -0,0 +1,4 @@
1
+ //#region src/parsers/linkedom.d.ts
2
+ declare const parseHtml: (html: string) => Document;
3
+ //#endregion
4
+ export { parseHtml };