feedsweep 1.0.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +75 -19
- package/dist/common.d.ts +8 -8
- package/dist/common.js +65 -42
- package/dist/defaults.d.ts +2 -2
- package/dist/defaults.js +48 -15
- package/dist/embeds/youtube.js +2 -2
- package/dist/index.d.ts +17 -10
- package/dist/index.js +25 -14
- package/dist/parsers/linkedom.d.ts +4 -0
- package/dist/parsers/linkedom.js +38 -0
- package/dist/transforms/dom/convertBreaksToParagraphs.d.ts +6 -0
- package/dist/transforms/dom/convertBreaksToParagraphs.js +80 -0
- package/dist/transforms/dom/decodeDoubleEncodedTags.d.ts +6 -0
- package/dist/transforms/dom/decodeDoubleEncodedTags.js +30 -0
- package/dist/transforms/dom/enrichEmbedPlaceholders.d.ts +6 -0
- package/dist/transforms/dom/enrichEmbedPlaceholders.js +32 -0
- package/dist/transforms/dom/fixLazyImages.js +33 -13
- package/dist/transforms/dom/highlightCode.js +3 -2
- package/dist/transforms/dom/injectEnclosures.d.ts +6 -0
- package/dist/transforms/dom/injectEnclosures.js +66 -0
- package/dist/transforms/dom/mergeConsecutiveOneLinerPres.js +1 -1
- package/dist/transforms/dom/mergeFragmentedLists.d.ts +6 -0
- package/dist/transforms/dom/mergeFragmentedLists.js +84 -0
- package/dist/transforms/dom/proxyAssetUrls.d.ts +6 -0
- package/dist/transforms/dom/proxyAssetUrls.js +64 -0
- package/dist/transforms/dom/removeTrackingPixels.js +22 -25
- package/dist/transforms/dom/replaceEmbedsWithPlaceholders.js +24 -25
- package/dist/transforms/dom/replacePreLineBreaks.js +3 -4
- package/dist/transforms/dom/resolveRelativeUrls.js +44 -30
- package/dist/transforms/dom/stripComments.js +5 -15
- package/dist/transforms/dom/stripDeadAnchors.d.ts +6 -0
- package/dist/transforms/dom/stripDeadAnchors.js +20 -0
- package/dist/transforms/dom/stripDuplicateTitleHeading.d.ts +6 -0
- package/dist/transforms/dom/stripDuplicateTitleHeading.js +31 -0
- package/dist/transforms/dom/stripEmptyTags.d.ts +6 -0
- package/dist/transforms/dom/stripEmptyTags.js +53 -0
- package/dist/transforms/dom/stripInterBlockBreaks.js +28 -8
- package/dist/transforms/dom/stripParagraphBoundaryBreaks.js +26 -6
- package/dist/transforms/dom/stripTrackingParams.js +7 -6
- package/dist/transforms/dom/trimPreWhitespace.js +4 -3
- package/dist/transforms/dom/unwrapDoublyNestedLists.d.ts +6 -0
- package/dist/transforms/dom/unwrapDoublyNestedLists.js +41 -0
- package/dist/transforms/dom/unwrapRedirectUrls.js +4 -2
- package/dist/transforms/dom/unwrapWrappers.d.ts +6 -0
- package/dist/transforms/dom/unwrapWrappers.js +30 -0
- package/dist/transforms/string/paragraphizePlainText.js +1 -1
- package/dist/transforms/string/stripOversizedBase64Sources.d.ts +6 -0
- package/dist/transforms/string/stripOversizedBase64Sources.js +13 -0
- package/dist/transforms/string/unwrapCdataComments.d.ts +6 -0
- package/dist/transforms/string/unwrapCdataComments.js +10 -0
- package/dist/types.d.ts +37 -7
- package/dist/unwraps/google.js +1 -1
- package/dist/unwraps/googleNewsModern.js +7 -3
- package/package.json +15 -5
- package/dist/transforms/dom/injectEnclosureEmbedPlaceholders.d.ts +0 -6
- package/dist/transforms/dom/injectEnclosureEmbedPlaceholders.js +0 -33
- package/dist/transforms/dom/simplifyFigures.d.ts +0 -6
- package/dist/transforms/dom/simplifyFigures.js +0 -27
- package/dist/transforms/string/decodeDoubleEncodedTags.d.ts +0 -6
- package/dist/transforms/string/decodeDoubleEncodedTags.js +0 -23
- package/dist/transforms/string/stripEmptyTags.d.ts +0 -6
- package/dist/transforms/string/stripEmptyTags.js +0 -25
- package/dist/transforms/string/stripOrphanedClosingTags.d.ts +0 -6
- package/dist/transforms/string/stripOrphanedClosingTags.js +0 -28
- package/dist/transforms/string/unwrapWrappers.d.ts +0 -6
- package/dist/transforms/string/unwrapWrappers.js +0 -10
package/README.md
CHANGED
|
@@ -11,15 +11,19 @@ Feedsweep takes raw feed item HTML and runs it through a pipeline that genuinely
|
|
|
11
11
|
## Installation
|
|
12
12
|
|
|
13
13
|
```bash
|
|
14
|
-
npm install feedsweep
|
|
14
|
+
npm install feedsweep linkedom
|
|
15
15
|
```
|
|
16
16
|
|
|
17
|
+
`linkedom` is an optional peer dependency. You only need it if you use the bundled `parseHtml` helper — see [DOM library](#dom-library) for jsdom / happy-dom / browser-native alternatives.
|
|
18
|
+
|
|
17
19
|
## Quick Start
|
|
18
20
|
|
|
19
21
|
```typescript
|
|
20
22
|
import { transformContent } from 'feedsweep'
|
|
23
|
+
import { parseHtml } from 'feedsweep/linkedom'
|
|
21
24
|
|
|
22
|
-
const result = transformContent('<p>Check <img data-src="photo.jpg"> and visit /about</p>', {
|
|
25
|
+
const result = await transformContent('<p>Check <img data-src="photo.jpg"> and visit /about</p>', {
|
|
26
|
+
parseHtmlFn: parseHtml,
|
|
23
27
|
baseUrl: 'https://example.com/post/1',
|
|
24
28
|
})
|
|
25
29
|
```
|
|
@@ -30,41 +34,93 @@ Inventory of every transform exported from the package. Most are enabled by defa
|
|
|
30
34
|
|
|
31
35
|
| Transform | Description |
|
|
32
36
|
| --- | --- |
|
|
33
|
-
| `stripOrphanedClosingTags` | Remove unmatched `</p>` / `</div>` close tags |
|
|
34
37
|
| `decodeDoubleEncodedTags` | Decode `&lt;tag&gt;` back to `<tag>` in mixed content |
|
|
35
|
-
| `unwrapWrappers` | Remove outer `<div>`, `<article>`, `<section>` wrappers |
|
|
36
|
-
| `paragraphizePlainText` | Wrap plain text in `<p>` tags |
|
|
37
|
-
| `stripEmptyTags` | Remove empty `<p>`, `<div>`, `<span>` and other tags |
|
|
38
|
-
| `stripComments` | Remove HTML `<!-- comments -->` |
|
|
39
38
|
| `fixLazyImages` | Move `data-src` / `data-original` to real `src` |
|
|
40
|
-
| `resolveRelativeUrls` | Convert relative URLs to absolute using base URL |
|
|
41
|
-
| `unwrapRedirectUrls` | Remove Google/Bing/Facebook/etc. redirect wrappers |
|
|
42
|
-
| `stripTrackingParams` | Remove UTM and other tracking parameters |
|
|
43
|
-
| `removeTrackingPixels` | Strip 1×1 tracking pixel images |
|
|
44
|
-
| `stripInterBlockBreaks` | Remove `<br>` tags between block elements |
|
|
45
|
-
| `stripParagraphBoundaryBreaks` | Remove `<br>` tags adjacent to paragraph boundaries |
|
|
46
|
-
| `highlightCode` | Syntax-highlight `<code>` blocks with highlight.js |
|
|
47
39
|
| `mergeConsecutiveOneLinerPres` | Merge consecutive single-line `<pre>` tags |
|
|
48
40
|
| `replacePreLineBreaks` | Replace `<br>` with `\n` inside `<pre>` |
|
|
49
|
-
| `
|
|
50
|
-
| `
|
|
41
|
+
| `stripInterBlockBreaks` | Remove `<br>` tags between block elements |
|
|
42
|
+
| `stripParagraphBoundaryBreaks` | Remove `<br>` tags adjacent to paragraph boundaries |
|
|
43
|
+
| `stripDuplicateTitleHeading` | Remove first `<h1>`–`<h6>` matching article title |
|
|
44
|
+
| `unwrapRedirectUrls` | Remove Google/Bing/Facebook/etc. redirect wrappers |
|
|
45
|
+
| `stripDeadAnchors` | Unwrap `<a>` with empty, `#`, or `javascript:` href |
|
|
46
|
+
| `removeTrackingPixels` | Strip 1×1 tracking pixel images |
|
|
47
|
+
| `stripTrackingParams` | Remove UTM and other tracking parameters |
|
|
48
|
+
| `convertBreaksToParagraphs` | Convert `<br><br>` runs into semantic `<p>` blocks |
|
|
49
|
+
| `injectEnclosures` | Inject feed enclosures into content as native `<audio>`/`<video>` or iframe placeholders |
|
|
51
50
|
| `replaceEmbedsWithPlaceholders` | Convert `<iframe>` to embed placeholders |
|
|
52
|
-
| `
|
|
53
|
-
| `
|
|
51
|
+
| `enrichEmbedPlaceholders` | Populate placeholder metadata (`title`, `description`, `duration`, etc.) via a caller-supplied async fn. Opt-in; not in defaults |
|
|
52
|
+
| `proxyAssetUrls` | Rewrite image, video, and audio URLs through a caller-supplied proxy |
|
|
53
|
+
| `resolveRelativeUrls` | Convert relative URLs to absolute using base URL |
|
|
54
|
+
| `unwrapWrappers` | Remove outer `<div>`, `<article>`, `<section>` wrappers |
|
|
55
|
+
| `unwrapDoublyNestedLists` | Unwrap `<ul>`/`<ol>` that wrap a single `<li>` containing a same-type list |
|
|
56
|
+
| `mergeFragmentedLists` | Merge consecutive sibling `<ul>` / `<ol>` lists with matching attributes |
|
|
57
|
+
| `paragraphizePlainText` | Wrap plain text in `<p>` tags |
|
|
58
|
+
| `stripOversizedBase64Sources` | Drop base64 `src`/`srcset`/`poster` payloads larger than 50 KB before parsing |
|
|
59
|
+
| `linkifyUrls` | Wrap bare URLs in `<a>` tags |
|
|
60
|
+
| `trimPreWhitespace` | Remove common leading indentation from `<pre>` |
|
|
61
|
+
| `highlightCode` | Syntax-highlight `<code>` blocks with highlight.js |
|
|
62
|
+
| `stripEmptyTags` | Remove empty `<p>`, `<div>`, `<span>` and other tags |
|
|
63
|
+
| `stripComments` | Remove HTML `<!-- comments -->` |
|
|
64
|
+
| `unwrapCdataComments` | Strip malformed `<!--[CDATA[ … ]]-->` wrappers before parsing so the wrapped article reaches the DOM as real HTML |
|
|
54
65
|
|
|
55
66
|
## Options
|
|
56
67
|
|
|
57
68
|
```typescript
|
|
58
69
|
import { fixLazyImages, resolveRelativeUrls, transformContent } from 'feedsweep'
|
|
70
|
+
import { parseHtml } from 'feedsweep/linkedom'
|
|
59
71
|
|
|
60
72
|
const result = await transformContent(html, {
|
|
73
|
+
// Required: function that turns an HTML string into a `Document`. See "DOM library".
|
|
74
|
+
parseHtmlFn: parseHtml,
|
|
61
75
|
// Base URL for resolving relative URLs.
|
|
62
76
|
baseUrl: 'https://example.com/post/1',
|
|
63
77
|
// Feed item enclosures (audio/video).
|
|
64
78
|
enclosures: [{ url: 'https://example.com/audio.mp3', type: 'audio/mpeg' }],
|
|
79
|
+
// Route image/video/audio URLs through a proxy. Return `undefined` to leave a URL untouched.
|
|
80
|
+
assetProxyFn: (url, type) => `https://proxy.example.com/?type=${type}&url=${encodeURIComponent(url)}`,
|
|
81
|
+
// Populate embed placeholder metadata from a remote source (e.g. YouTube oEmbed).
|
|
82
|
+
enrichEmbedFn: async (embeds) => {
|
|
83
|
+
return new Map(embeds.map(({ provider, id }) => [`${provider}:${id}`, { title: '…' }]))
|
|
84
|
+
},
|
|
65
85
|
// Run a custom DOM transform pipeline (omit to use defaults).
|
|
66
86
|
domTransforms: [fixLazyImages, resolveRelativeUrls],
|
|
67
87
|
})
|
|
68
88
|
```
|
|
69
89
|
|
|
70
|
-
The `stringTransforms
|
|
90
|
+
The `stringTransforms` and `domTransforms` options each fully replace the corresponding default phase when provided. Every transform is also exported individually from `feedsweep`, so you can compose any pipeline — list them explicitly to build from scratch, or spread `defaultDomTransforms` (etc.) from `feedsweep/defaults` to extend or filter the defaults.
|
|
91
|
+
|
|
92
|
+
## DOM library
|
|
93
|
+
|
|
94
|
+
Feedsweep is parser-agnostic. You provide `parseHtmlFn` — a function that turns an HTML string into a `Document`. Use any DOM library that produces a standards-compliant `Document`.
|
|
95
|
+
|
|
96
|
+
```typescript
|
|
97
|
+
// linkedom (recommended default)
|
|
98
|
+
import { transformContent } from 'feedsweep'
|
|
99
|
+
import { parseHtml } from 'feedsweep/linkedom'
|
|
100
|
+
|
|
101
|
+
await transformContent(html, { parseHtmlFn: parseHtml, baseUrl })
|
|
102
|
+
|
|
103
|
+
// jsdom
|
|
104
|
+
import { transformContent } from 'feedsweep'
|
|
105
|
+
import { JSDOM } from 'jsdom'
|
|
106
|
+
|
|
107
|
+
await transformContent(html, {
|
|
108
|
+
parseHtmlFn: (raw) => new JSDOM(`<!doctype html><body>${raw}</body>`).window.document,
|
|
109
|
+
baseUrl,
|
|
110
|
+
})
|
|
111
|
+
|
|
112
|
+
// happy-dom
|
|
113
|
+
import { transformContent } from 'feedsweep'
|
|
114
|
+
import { Window } from 'happy-dom'
|
|
115
|
+
|
|
116
|
+
await transformContent(html, {
|
|
117
|
+
parseHtmlFn: (raw) => {
|
|
118
|
+
const window = new Window()
|
|
119
|
+
window.document.body.innerHTML = raw
|
|
120
|
+
return window.document
|
|
121
|
+
},
|
|
122
|
+
baseUrl,
|
|
123
|
+
})
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
The bundled `feedsweep/linkedom` parser bakes in two workarounds for linkedom-specific spec violations (attribute case-folding and SVG XML mode). jsdom and happy-dom do not need them.
|
package/dist/common.d.ts
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
import { EmbedResolverResult } from "./types.js";
|
|
1
|
+
import { EmbedResolverResult, MaybePromise } from "./types.js";
|
|
2
2
|
|
|
3
3
|
//#region src/common.d.ts
|
|
4
|
-
declare const
|
|
5
|
-
declare const
|
|
6
|
-
declare const
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
declare const createEmbedPlaceholder: (document: Document, src: string,
|
|
4
|
+
declare const applyDomTransforms: (document: Document, transforms: Array<(document: Document) => MaybePromise<void>>) => Promise<string>;
|
|
5
|
+
declare const applyStringTransforms: (html: string, transforms: Array<(html: string) => MaybePromise<string>>) => Promise<string>;
|
|
6
|
+
declare const applyEmbedMetadata: (element: HTMLElement, metadata: Partial<EmbedResolverResult>, options?: {
|
|
7
|
+
setIfMissing?: boolean;
|
|
8
|
+
}) => void;
|
|
9
|
+
declare const createEmbedPlaceholder: (document: Document, src: string, metadata?: Partial<EmbedResolverResult>) => HTMLElement;
|
|
10
10
|
//#endregion
|
|
11
|
-
export { applyDomTransforms, applyStringTransforms, createEmbedPlaceholder
|
|
11
|
+
export { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder };
|
package/dist/common.js
CHANGED
|
@@ -1,39 +1,27 @@
|
|
|
1
|
+
import { coerceNumber } from "./utils.js";
|
|
1
2
|
import { resolveUrl } from "feedcanon";
|
|
2
|
-
import { parseHTML } from "linkedom";
|
|
3
3
|
//#region src/common.ts
|
|
4
4
|
const Node = {
|
|
5
5
|
ELEMENT_NODE: 1,
|
|
6
6
|
TEXT_NODE: 3,
|
|
7
7
|
COMMENT_NODE: 8
|
|
8
8
|
};
|
|
9
|
-
const
|
|
9
|
+
const NodeFilter = {
|
|
10
|
+
SHOW_ELEMENT: 1,
|
|
11
|
+
SHOW_TEXT: 4,
|
|
12
|
+
SHOW_COMMENT: 128
|
|
13
|
+
};
|
|
10
14
|
const safeThumbnailDataUrlRegex = /^data:image\/(png|jpe?g|gif|webp|avif);/i;
|
|
11
15
|
const isSafeThumbnailUrl = (url) => {
|
|
12
16
|
return resolveUrl(url) !== void 0 || safeThumbnailDataUrlRegex.test(url);
|
|
13
17
|
};
|
|
14
|
-
const
|
|
15
|
-
|
|
16
|
-
if (match.length < maxSize) return match;
|
|
17
|
-
return `${prefix}${suffix}`;
|
|
18
|
-
});
|
|
19
|
-
};
|
|
20
|
-
const parseFragment = (html) => {
|
|
21
|
-
const { document } = parseHTML(`<!doctype html><html><head></head><body>${html}</body></html>`);
|
|
22
|
-
return document;
|
|
23
|
-
};
|
|
24
|
-
const transformHtml = (html, transform) => {
|
|
25
|
-
const document = parseFragment(html);
|
|
26
|
-
transform(document);
|
|
18
|
+
const applyDomTransforms = async (document, transforms) => {
|
|
19
|
+
for (const transform of transforms) await transform(document);
|
|
27
20
|
return document.body.innerHTML;
|
|
28
21
|
};
|
|
29
|
-
const
|
|
30
|
-
const document = parseFragment(stripOversizedBase64Sources(html, 50 * 1024));
|
|
31
|
-
for (const transform of transforms) transform(document);
|
|
32
|
-
return document.body.innerHTML;
|
|
33
|
-
};
|
|
34
|
-
const applyStringTransforms = (html, transforms) => {
|
|
22
|
+
const applyStringTransforms = async (html, transforms) => {
|
|
35
23
|
let output = html;
|
|
36
|
-
for (const transform of transforms) output = transform(output);
|
|
24
|
+
for (const transform of transforms) output = await transform(output);
|
|
37
25
|
return output;
|
|
38
26
|
};
|
|
39
27
|
const blockElements = new Set([
|
|
@@ -71,10 +59,10 @@ const blockElements = new Set([
|
|
|
71
59
|
"ul"
|
|
72
60
|
]);
|
|
73
61
|
const isWhitespaceText = (node) => {
|
|
74
|
-
return node.nodeType === Node.TEXT_NODE && !
|
|
62
|
+
return node.nodeType === Node.TEXT_NODE && !node.textContent?.trim();
|
|
75
63
|
};
|
|
76
64
|
const isBr = (node) => {
|
|
77
|
-
return node.nodeType === Node.ELEMENT_NODE && node.
|
|
65
|
+
return node.nodeType === Node.ELEMENT_NODE && node.localName === "br";
|
|
78
66
|
};
|
|
79
67
|
const isComment = (node) => {
|
|
80
68
|
return node.nodeType === Node.COMMENT_NODE;
|
|
@@ -83,28 +71,63 @@ const isSkippable = (node) => {
|
|
|
83
71
|
return isWhitespaceText(node) || isBr(node) || isComment(node);
|
|
84
72
|
};
|
|
85
73
|
const isBlockElement = (node) => {
|
|
86
|
-
return node.nodeType === Node.ELEMENT_NODE && blockElements.has(node.
|
|
74
|
+
return node.nodeType === Node.ELEMENT_NODE && blockElements.has(node.localName);
|
|
87
75
|
};
|
|
88
|
-
const
|
|
89
|
-
let
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
match = pattern.exec(result);
|
|
76
|
+
const hasAncestorWithTagName = (node, tagSet, stopAt) => {
|
|
77
|
+
let ancestor = node.parentNode;
|
|
78
|
+
while (ancestor !== null && ancestor !== stopAt) {
|
|
79
|
+
if (ancestor.nodeType === Node.ELEMENT_NODE && tagSet.has(ancestor.localName)) return true;
|
|
80
|
+
ancestor = ancestor.parentNode;
|
|
94
81
|
}
|
|
95
|
-
return
|
|
82
|
+
return false;
|
|
83
|
+
};
|
|
84
|
+
const styleWidthRegex = /(?:^|;)\s*width\s*:\s*([0-9]*\.?[0-9]+)\s*(?:px)?\s*(?:;|$)/i;
|
|
85
|
+
const styleHeightRegex = /(?:^|;)\s*height\s*:\s*([0-9]*\.?[0-9]+)\s*(?:px)?\s*(?:;|$)/i;
|
|
86
|
+
const getDimensions = (element) => {
|
|
87
|
+
const width = coerceNumber(element.getAttribute("width"));
|
|
88
|
+
const height = coerceNumber(element.getAttribute("height"));
|
|
89
|
+
if (width !== void 0 && height !== void 0) return {
|
|
90
|
+
width,
|
|
91
|
+
height
|
|
92
|
+
};
|
|
93
|
+
const style = element.getAttribute("style");
|
|
94
|
+
if (!style) return {
|
|
95
|
+
width,
|
|
96
|
+
height
|
|
97
|
+
};
|
|
98
|
+
const fromStyle = (regex) => {
|
|
99
|
+
const match = regex.exec(style);
|
|
100
|
+
return match ? coerceNumber(match[1]) : void 0;
|
|
101
|
+
};
|
|
102
|
+
return {
|
|
103
|
+
width: width ?? fromStyle(styleWidthRegex),
|
|
104
|
+
height: height ?? fromStyle(styleHeightRegex)
|
|
105
|
+
};
|
|
106
|
+
};
|
|
107
|
+
const applyEmbedMetadata = (element, metadata, options) => {
|
|
108
|
+
const setIfMissing = options?.setIfMissing ?? false;
|
|
109
|
+
const set = (name, value) => {
|
|
110
|
+
if (setIfMissing && element.hasAttribute(name)) return;
|
|
111
|
+
element.setAttribute(name, value);
|
|
112
|
+
};
|
|
113
|
+
if (metadata.provider) set("data-embed-provider", metadata.provider);
|
|
114
|
+
if (metadata.id) set("data-embed-id", metadata.id);
|
|
115
|
+
if (metadata.src) set("data-embed-src", metadata.src);
|
|
116
|
+
if (metadata.url) set("data-embed-url", metadata.url);
|
|
117
|
+
if (metadata.thumbnail && isSafeThumbnailUrl(metadata.thumbnail)) set("data-embed-thumbnail", metadata.thumbnail);
|
|
118
|
+
if (metadata.width) set("data-embed-width", String(metadata.width));
|
|
119
|
+
if (metadata.height) set("data-embed-height", String(metadata.height));
|
|
120
|
+
if (metadata.title) set("data-embed-title", metadata.title);
|
|
121
|
+
if (metadata.description) set("data-embed-description", metadata.description);
|
|
122
|
+
if (metadata.author) set("data-embed-author", metadata.author);
|
|
123
|
+
if (metadata.avatar && isSafeThumbnailUrl(metadata.avatar)) set("data-embed-avatar", metadata.avatar);
|
|
124
|
+
if (metadata.duration) set("data-embed-duration", String(metadata.duration));
|
|
96
125
|
};
|
|
97
|
-
const createEmbedPlaceholder = (document, src,
|
|
126
|
+
const createEmbedPlaceholder = (document, src, metadata) => {
|
|
98
127
|
const element = document.createElement("div");
|
|
99
|
-
element.setAttribute("data-embed",
|
|
128
|
+
element.setAttribute("data-embed", "iframe");
|
|
100
129
|
element.setAttribute("data-embed-src", metadata?.src ?? src);
|
|
101
|
-
if (metadata
|
|
102
|
-
if (metadata?.url) element.setAttribute("data-embed-url", metadata.url);
|
|
103
|
-
if (metadata?.thumbnail && isSafeThumbnailUrl(metadata.thumbnail)) element.setAttribute("data-embed-thumbnail", metadata.thumbnail);
|
|
104
|
-
if (metadata?.width) element.setAttribute("data-embed-width", String(metadata.width));
|
|
105
|
-
if (metadata?.height) element.setAttribute("data-embed-height", String(metadata.height));
|
|
106
|
-
if (metadata?.author) element.setAttribute("data-embed-author", metadata.author);
|
|
107
|
-
if (metadata?.text) element.setAttribute("data-embed-text", metadata.text);
|
|
130
|
+
if (metadata) applyEmbedMetadata(element, metadata);
|
|
108
131
|
const fallbackUrl = metadata?.url ?? metadata?.src ?? src;
|
|
109
132
|
const link = document.createElement("a");
|
|
110
133
|
link.setAttribute("href", fallbackUrl);
|
|
@@ -113,4 +136,4 @@ const createEmbedPlaceholder = (document, src, type, metadata) => {
|
|
|
113
136
|
return element;
|
|
114
137
|
};
|
|
115
138
|
//#endregion
|
|
116
|
-
export { Node, applyDomTransforms, applyStringTransforms, createEmbedPlaceholder,
|
|
139
|
+
export { Node, NodeFilter, applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder, getDimensions, hasAncestorWithTagName, isBlockElement, isBr, isSkippable, isWhitespaceText };
|
package/dist/defaults.d.ts
CHANGED
|
@@ -3,12 +3,12 @@ import { DomTransform, EmbedResolver, ResolveUrlFn, StringTransform, UrlUnwrappe
|
|
|
3
3
|
//#region src/defaults.d.ts
|
|
4
4
|
declare const defaultStringTransforms: Array<StringTransform>;
|
|
5
5
|
declare const defaultDomTransforms: Array<DomTransform>;
|
|
6
|
-
declare const defaultFinalStringTransforms: Array<StringTransform>;
|
|
7
6
|
declare const defaultEmbedResolvers: Array<EmbedResolver>;
|
|
8
7
|
declare const defaultResolveUrlFn: ResolveUrlFn;
|
|
9
8
|
declare const defaultLazySrcAttributes: string[];
|
|
9
|
+
declare const defaultLazySrcsetAttributes: string[];
|
|
10
10
|
declare const defaultTrackingHosts: string[];
|
|
11
11
|
declare const defaultTrackingPathSegments: string[];
|
|
12
12
|
declare const defaultUrlUnwrappers: Array<UrlUnwrapper>;
|
|
13
13
|
//#endregion
|
|
14
|
-
export { defaultDomTransforms, defaultEmbedResolvers,
|
|
14
|
+
export { defaultDomTransforms, defaultEmbedResolvers, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
|
package/dist/defaults.js
CHANGED
|
@@ -1,24 +1,31 @@
|
|
|
1
1
|
import { youtubeEmbedResolver } from "./embeds/youtube.js";
|
|
2
|
+
import { convertBreaksToParagraphs } from "./transforms/dom/convertBreaksToParagraphs.js";
|
|
3
|
+
import { decodeDoubleEncodedTags } from "./transforms/dom/decodeDoubleEncodedTags.js";
|
|
2
4
|
import { fixLazyImages } from "./transforms/dom/fixLazyImages.js";
|
|
3
5
|
import { highlightCode } from "./transforms/dom/highlightCode.js";
|
|
4
|
-
import {
|
|
6
|
+
import { injectEnclosures } from "./transforms/dom/injectEnclosures.js";
|
|
5
7
|
import { linkifyUrls } from "./transforms/dom/linkifyUrls.js";
|
|
6
8
|
import { mergeConsecutiveOneLinerPres } from "./transforms/dom/mergeConsecutiveOneLinerPres.js";
|
|
9
|
+
import { mergeFragmentedLists } from "./transforms/dom/mergeFragmentedLists.js";
|
|
10
|
+
import { proxyAssetUrls } from "./transforms/dom/proxyAssetUrls.js";
|
|
7
11
|
import { removeTrackingPixels } from "./transforms/dom/removeTrackingPixels.js";
|
|
8
12
|
import { replaceEmbedsWithPlaceholders } from "./transforms/dom/replaceEmbedsWithPlaceholders.js";
|
|
9
13
|
import { replacePreLineBreaks } from "./transforms/dom/replacePreLineBreaks.js";
|
|
10
14
|
import { resolveRelativeUrls } from "./transforms/dom/resolveRelativeUrls.js";
|
|
11
15
|
import { stripComments } from "./transforms/dom/stripComments.js";
|
|
16
|
+
import { stripDeadAnchors } from "./transforms/dom/stripDeadAnchors.js";
|
|
17
|
+
import { stripDuplicateTitleHeading } from "./transforms/dom/stripDuplicateTitleHeading.js";
|
|
18
|
+
import { stripEmptyTags } from "./transforms/dom/stripEmptyTags.js";
|
|
12
19
|
import { stripInterBlockBreaks } from "./transforms/dom/stripInterBlockBreaks.js";
|
|
13
20
|
import { stripParagraphBoundaryBreaks } from "./transforms/dom/stripParagraphBoundaryBreaks.js";
|
|
14
21
|
import { stripTrackingParams } from "./transforms/dom/stripTrackingParams.js";
|
|
15
22
|
import { trimPreWhitespace } from "./transforms/dom/trimPreWhitespace.js";
|
|
23
|
+
import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedLists.js";
|
|
16
24
|
import { unwrapRedirectUrls } from "./transforms/dom/unwrapRedirectUrls.js";
|
|
17
|
-
import {
|
|
25
|
+
import { unwrapWrappers } from "./transforms/dom/unwrapWrappers.js";
|
|
18
26
|
import { paragraphizePlainText } from "./transforms/string/paragraphizePlainText.js";
|
|
19
|
-
import {
|
|
20
|
-
import {
|
|
21
|
-
import { unwrapWrappers } from "./transforms/string/unwrapWrappers.js";
|
|
27
|
+
import { stripOversizedBase64Sources } from "./transforms/string/stripOversizedBase64Sources.js";
|
|
28
|
+
import { unwrapCdataComments } from "./transforms/string/unwrapCdataComments.js";
|
|
22
29
|
import { unwrapBing } from "./unwraps/bing.js";
|
|
23
30
|
import { unwrapFacebookShim } from "./unwraps/facebook.js";
|
|
24
31
|
import { unwrapGoogle } from "./unwraps/google.js";
|
|
@@ -34,30 +41,36 @@ import { unwrapYouTube } from "./unwraps/youtube.js";
|
|
|
34
41
|
import { resolveUrl } from "feedcanon";
|
|
35
42
|
//#region src/defaults.ts
|
|
36
43
|
const defaultStringTransforms = [
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
paragraphizePlainText,
|
|
41
|
-
stripEmptyTags
|
|
44
|
+
stripOversizedBase64Sources,
|
|
45
|
+
unwrapCdataComments,
|
|
46
|
+
paragraphizePlainText
|
|
42
47
|
];
|
|
43
48
|
const defaultDomTransforms = [
|
|
49
|
+
decodeDoubleEncodedTags,
|
|
44
50
|
stripComments,
|
|
51
|
+
unwrapDoublyNestedLists,
|
|
52
|
+
stripDuplicateTitleHeading,
|
|
45
53
|
fixLazyImages,
|
|
46
54
|
resolveRelativeUrls,
|
|
47
55
|
unwrapRedirectUrls,
|
|
56
|
+
stripDeadAnchors,
|
|
48
57
|
stripTrackingParams,
|
|
49
58
|
removeTrackingPixels,
|
|
59
|
+
convertBreaksToParagraphs,
|
|
50
60
|
stripInterBlockBreaks,
|
|
51
61
|
stripParagraphBoundaryBreaks,
|
|
62
|
+
mergeFragmentedLists,
|
|
52
63
|
highlightCode,
|
|
53
64
|
mergeConsecutiveOneLinerPres,
|
|
54
65
|
replacePreLineBreaks,
|
|
55
66
|
trimPreWhitespace,
|
|
56
67
|
linkifyUrls,
|
|
57
68
|
replaceEmbedsWithPlaceholders,
|
|
58
|
-
|
|
69
|
+
injectEnclosures,
|
|
70
|
+
proxyAssetUrls,
|
|
71
|
+
unwrapWrappers,
|
|
72
|
+
stripEmptyTags
|
|
59
73
|
];
|
|
60
|
-
const defaultFinalStringTransforms = [stripEmptyTags];
|
|
61
74
|
const defaultEmbedResolvers = [youtubeEmbedResolver];
|
|
62
75
|
const defaultResolveUrlFn = (url, baseUrl) => resolveUrl(url, baseUrl);
|
|
63
76
|
const defaultLazySrcAttributes = [
|
|
@@ -75,9 +88,27 @@ const defaultLazySrcAttributes = [
|
|
|
75
88
|
"data-image-src",
|
|
76
89
|
"data-canonical-src",
|
|
77
90
|
"data-img-url",
|
|
91
|
+
"nitro-lazy-src",
|
|
78
92
|
"data-orig",
|
|
79
93
|
"data-runner-src"
|
|
80
94
|
];
|
|
95
|
+
const defaultLazySrcsetAttributes = [
|
|
96
|
+
"data-srcset",
|
|
97
|
+
"data-tf-srcset",
|
|
98
|
+
"data-lazy-srcset",
|
|
99
|
+
"data-image-srcset",
|
|
100
|
+
"data-modal-srcset",
|
|
101
|
+
"data-splide-lazy-srcset",
|
|
102
|
+
"data-alt-srcset",
|
|
103
|
+
"fifu-data-srcset",
|
|
104
|
+
"data-thumb-srcset",
|
|
105
|
+
"data-vp-popup-img-srcset",
|
|
106
|
+
"data-original-srcset",
|
|
107
|
+
"data-pswp-srcset",
|
|
108
|
+
"data-nectar-img-srcset",
|
|
109
|
+
"nitro-lazy-srcset",
|
|
110
|
+
"data-flickity-lazyload-srcset"
|
|
111
|
+
];
|
|
81
112
|
const defaultTrackingHosts = [
|
|
82
113
|
"feedsportal.com",
|
|
83
114
|
"stats.wordpress.com",
|
|
@@ -102,12 +133,14 @@ const defaultTrackingHosts = [
|
|
|
102
133
|
"quantserve.com",
|
|
103
134
|
"chartbeat.com",
|
|
104
135
|
"moatads.com",
|
|
105
|
-
"sentry.io"
|
|
136
|
+
"sentry.io",
|
|
137
|
+
"hubspot.com"
|
|
106
138
|
];
|
|
107
139
|
const defaultTrackingPathSegments = [
|
|
108
140
|
"pixel",
|
|
109
141
|
"beacon",
|
|
110
|
-
"count"
|
|
142
|
+
"count",
|
|
143
|
+
"impression"
|
|
111
144
|
];
|
|
112
145
|
const defaultUrlUnwrappers = [
|
|
113
146
|
unwrapBing,
|
|
@@ -124,4 +157,4 @@ const defaultUrlUnwrappers = [
|
|
|
124
157
|
unwrapRedditOut
|
|
125
158
|
];
|
|
126
159
|
//#endregion
|
|
127
|
-
export { defaultDomTransforms, defaultEmbedResolvers,
|
|
160
|
+
export { defaultDomTransforms, defaultEmbedResolvers, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
|
package/dist/embeds/youtube.js
CHANGED
|
@@ -32,10 +32,10 @@ const youtubeResolveEmbed = (url) => {
|
|
|
32
32
|
if (!videoId) return;
|
|
33
33
|
return {
|
|
34
34
|
provider: "youtube",
|
|
35
|
+
id: videoId,
|
|
35
36
|
src: `https://www.youtube-nocookie.com/embed/${videoId}`,
|
|
36
37
|
url: `https://www.youtube.com/watch?v=${videoId}`,
|
|
37
|
-
thumbnail: composeThumbnailUrl(videoId)
|
|
38
|
-
type: "iframe"
|
|
38
|
+
thumbnail: composeThumbnailUrl(videoId)
|
|
39
39
|
};
|
|
40
40
|
};
|
|
41
41
|
const youtubeEmbedResolver = {
|
package/dist/index.d.ts
CHANGED
|
@@ -1,28 +1,35 @@
|
|
|
1
|
-
import { DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext } from "./types.js";
|
|
1
|
+
import { AssetProxyFn, AssetType, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ParseHtmlFn, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext } from "./types.js";
|
|
2
2
|
import { defaultResolveUrlFn } from "./defaults.js";
|
|
3
|
-
import { applyDomTransforms, applyStringTransforms, createEmbedPlaceholder
|
|
3
|
+
import { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder } from "./common.js";
|
|
4
4
|
import { composeThumbnailUrl, extractVideoId, youtubeEmbedResolver, youtubeResolveEmbed } from "./embeds/youtube.js";
|
|
5
|
+
import { convertBreaksToParagraphs } from "./transforms/dom/convertBreaksToParagraphs.js";
|
|
6
|
+
import { decodeDoubleEncodedTags } from "./transforms/dom/decodeDoubleEncodedTags.js";
|
|
7
|
+
import { enrichEmbedPlaceholders } from "./transforms/dom/enrichEmbedPlaceholders.js";
|
|
5
8
|
import { fixLazyImages } from "./transforms/dom/fixLazyImages.js";
|
|
6
9
|
import { detectLanguage, highlightCode } from "./transforms/dom/highlightCode.js";
|
|
7
|
-
import {
|
|
10
|
+
import { injectEnclosures } from "./transforms/dom/injectEnclosures.js";
|
|
8
11
|
import { linkifyUrls } from "./transforms/dom/linkifyUrls.js";
|
|
9
12
|
import { mergeConsecutiveOneLinerPres } from "./transforms/dom/mergeConsecutiveOneLinerPres.js";
|
|
13
|
+
import { mergeFragmentedLists } from "./transforms/dom/mergeFragmentedLists.js";
|
|
14
|
+
import { proxyAssetUrls } from "./transforms/dom/proxyAssetUrls.js";
|
|
10
15
|
import { removeTrackingPixels } from "./transforms/dom/removeTrackingPixels.js";
|
|
11
16
|
import { replaceEmbedsWithPlaceholders } from "./transforms/dom/replaceEmbedsWithPlaceholders.js";
|
|
12
17
|
import { replacePreLineBreaks } from "./transforms/dom/replacePreLineBreaks.js";
|
|
13
18
|
import { resolveRelativeUrls } from "./transforms/dom/resolveRelativeUrls.js";
|
|
14
|
-
import { simplifyFigures } from "./transforms/dom/simplifyFigures.js";
|
|
15
19
|
import { stripComments } from "./transforms/dom/stripComments.js";
|
|
20
|
+
import { stripDeadAnchors } from "./transforms/dom/stripDeadAnchors.js";
|
|
21
|
+
import { stripDuplicateTitleHeading } from "./transforms/dom/stripDuplicateTitleHeading.js";
|
|
22
|
+
import { stripEmptyTags } from "./transforms/dom/stripEmptyTags.js";
|
|
16
23
|
import { stripInterBlockBreaks } from "./transforms/dom/stripInterBlockBreaks.js";
|
|
17
24
|
import { stripParagraphBoundaryBreaks } from "./transforms/dom/stripParagraphBoundaryBreaks.js";
|
|
18
25
|
import { stripTrackingParams } from "./transforms/dom/stripTrackingParams.js";
|
|
19
26
|
import { trimPreWhitespace } from "./transforms/dom/trimPreWhitespace.js";
|
|
27
|
+
import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedLists.js";
|
|
20
28
|
import { extractRedirectTarget, unwrapRedirectUrls } from "./transforms/dom/unwrapRedirectUrls.js";
|
|
21
|
-
import {
|
|
29
|
+
import { unwrapWrappers } from "./transforms/dom/unwrapWrappers.js";
|
|
22
30
|
import { paragraphizePlainText } from "./transforms/string/paragraphizePlainText.js";
|
|
23
|
-
import {
|
|
24
|
-
import {
|
|
25
|
-
import { unwrapWrappers } from "./transforms/string/unwrapWrappers.js";
|
|
31
|
+
import { stripOversizedBase64Sources } from "./transforms/string/stripOversizedBase64Sources.js";
|
|
32
|
+
import { unwrapCdataComments } from "./transforms/string/unwrapCdataComments.js";
|
|
26
33
|
import { unwrapAceml } from "./unwraps/aceml.js";
|
|
27
34
|
import { unwrapAdjust } from "./unwraps/adjust.js";
|
|
28
35
|
import { unwrapAmazonAffiliate } from "./unwraps/amazonAffiliate.js";
|
|
@@ -100,6 +107,6 @@ import { unwrapZhihu } from "./unwraps/zhihu.js";
|
|
|
100
107
|
import { ParamExtractorConfig, chooseBaseUrl, coerceNumber, createParamExtractor } from "./utils.js";
|
|
101
108
|
|
|
102
109
|
//#region src/index.d.ts
|
|
103
|
-
declare const transformContent: (html: string, options
|
|
110
|
+
declare const transformContent: (html: string, options: TransformContentOptions) => Promise<string>;
|
|
104
111
|
//#endregion
|
|
105
|
-
export { type DomTransform, type EmbedResolver, type EmbedResolverResult, type Enclosure, type ParamExtractorConfig, type ResolveUrlFn, type StringTransform, type TransformContentOptions, type TransformContext, applyDomTransforms, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode,
|
|
112
|
+
export { type AssetProxyFn, type AssetType, type DomTransform, type EmbedResolver, type EmbedResolverResult, type Enclosure, type EnrichEmbedFn, type MaybePromise, type ParamExtractorConfig, type ParseHtmlFn, type ResolveUrlFn, type StringTransform, type TransformContentOptions, type TransformContext, applyDomTransforms, applyEmbedMetadata, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBreaksToParagraphs, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode, injectEnclosures, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, paragraphizePlainText, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInterBlockBreaks, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, transformContent, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, 
unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, youtubeEmbedResolver, youtubeResolveEmbed };
|
package/dist/index.js
CHANGED
|
@@ -1,26 +1,33 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { chooseBaseUrl, coerceNumber, createParamExtractor } from "./utils.js";
|
|
2
|
+
import { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder } from "./common.js";
|
|
2
3
|
import { composeThumbnailUrl, extractVideoId, youtubeEmbedResolver, youtubeResolveEmbed } from "./embeds/youtube.js";
|
|
4
|
+
import { convertBreaksToParagraphs } from "./transforms/dom/convertBreaksToParagraphs.js";
|
|
5
|
+
import { decodeDoubleEncodedTags } from "./transforms/dom/decodeDoubleEncodedTags.js";
|
|
3
6
|
import { fixLazyImages } from "./transforms/dom/fixLazyImages.js";
|
|
4
7
|
import { detectLanguage, highlightCode } from "./transforms/dom/highlightCode.js";
|
|
5
|
-
import {
|
|
8
|
+
import { injectEnclosures } from "./transforms/dom/injectEnclosures.js";
|
|
6
9
|
import { linkifyUrls } from "./transforms/dom/linkifyUrls.js";
|
|
7
10
|
import { mergeConsecutiveOneLinerPres } from "./transforms/dom/mergeConsecutiveOneLinerPres.js";
|
|
11
|
+
import { mergeFragmentedLists } from "./transforms/dom/mergeFragmentedLists.js";
|
|
12
|
+
import { proxyAssetUrls } from "./transforms/dom/proxyAssetUrls.js";
|
|
8
13
|
import { removeTrackingPixels } from "./transforms/dom/removeTrackingPixels.js";
|
|
9
|
-
import { chooseBaseUrl, coerceNumber, createParamExtractor } from "./utils.js";
|
|
10
14
|
import { replaceEmbedsWithPlaceholders } from "./transforms/dom/replaceEmbedsWithPlaceholders.js";
|
|
11
15
|
import { replacePreLineBreaks } from "./transforms/dom/replacePreLineBreaks.js";
|
|
12
16
|
import { resolveRelativeUrls } from "./transforms/dom/resolveRelativeUrls.js";
|
|
13
17
|
import { stripComments } from "./transforms/dom/stripComments.js";
|
|
18
|
+
import { stripDeadAnchors } from "./transforms/dom/stripDeadAnchors.js";
|
|
19
|
+
import { stripDuplicateTitleHeading } from "./transforms/dom/stripDuplicateTitleHeading.js";
|
|
20
|
+
import { stripEmptyTags } from "./transforms/dom/stripEmptyTags.js";
|
|
14
21
|
import { stripInterBlockBreaks } from "./transforms/dom/stripInterBlockBreaks.js";
|
|
15
22
|
import { stripParagraphBoundaryBreaks } from "./transforms/dom/stripParagraphBoundaryBreaks.js";
|
|
16
23
|
import { stripTrackingParams } from "./transforms/dom/stripTrackingParams.js";
|
|
17
24
|
import { trimPreWhitespace } from "./transforms/dom/trimPreWhitespace.js";
|
|
25
|
+
import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedLists.js";
|
|
18
26
|
import { extractRedirectTarget, unwrapRedirectUrls } from "./transforms/dom/unwrapRedirectUrls.js";
|
|
19
|
-
import {
|
|
27
|
+
import { unwrapWrappers } from "./transforms/dom/unwrapWrappers.js";
|
|
20
28
|
import { paragraphizePlainText } from "./transforms/string/paragraphizePlainText.js";
|
|
21
|
-
import {
|
|
22
|
-
import {
|
|
23
|
-
import { unwrapWrappers } from "./transforms/string/unwrapWrappers.js";
|
|
29
|
+
import { stripOversizedBase64Sources } from "./transforms/string/stripOversizedBase64Sources.js";
|
|
30
|
+
import { unwrapCdataComments } from "./transforms/string/unwrapCdataComments.js";
|
|
24
31
|
import { unwrapBing } from "./unwraps/bing.js";
|
|
25
32
|
import { unwrapFacebookShim } from "./unwraps/facebook.js";
|
|
26
33
|
import { unwrapGoogle } from "./unwraps/google.js";
|
|
@@ -33,8 +40,8 @@ import { unwrapRedditOut } from "./unwraps/redditOut.js";
|
|
|
33
40
|
import { unwrapVkAway } from "./unwraps/vkAway.js";
|
|
34
41
|
import { unwrapYahooSearch } from "./unwraps/yahooSearch.js";
|
|
35
42
|
import { unwrapYouTube } from "./unwraps/youtube.js";
|
|
36
|
-
import { defaultDomTransforms, defaultEmbedResolvers,
|
|
37
|
-
import {
|
|
43
|
+
import { defaultDomTransforms, defaultEmbedResolvers, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers } from "./defaults.js";
|
|
44
|
+
import { enrichEmbedPlaceholders } from "./transforms/dom/enrichEmbedPlaceholders.js";
|
|
38
45
|
import { unwrapAceml } from "./unwraps/aceml.js";
|
|
39
46
|
import { unwrapAdjust } from "./unwraps/adjust.js";
|
|
40
47
|
import { unwrapAmazonAffiliate } from "./unwraps/amazonAffiliate.js";
|
|
@@ -98,21 +105,25 @@ import { unwrapWebArchive } from "./unwraps/webArchive.js";
|
|
|
98
105
|
import { unwrapYandexTurbo } from "./unwraps/yandexTurbo.js";
|
|
99
106
|
import { unwrapZhihu } from "./unwraps/zhihu.js";
|
|
100
107
|
//#region src/index.ts
|
|
101
|
-
const transformContent = (html, options
|
|
108
|
+
const transformContent = async (html, options) => {
|
|
102
109
|
const context = {
|
|
103
110
|
baseUrl: options.baseUrl,
|
|
104
111
|
enclosures: options.enclosures,
|
|
105
112
|
embedResolvers: options.embedResolvers ?? defaultEmbedResolvers,
|
|
106
113
|
lazySrcAttributes: options.lazySrcAttributes ?? defaultLazySrcAttributes,
|
|
114
|
+
lazySrcsetAttributes: options.lazySrcsetAttributes ?? defaultLazySrcsetAttributes,
|
|
107
115
|
trackingHosts: options.trackingHosts ?? defaultTrackingHosts,
|
|
108
116
|
trackingPathSegments: options.trackingPathSegments ?? defaultTrackingPathSegments,
|
|
109
117
|
urlUnwrappers: options.urlUnwrappers ?? defaultUrlUnwrappers,
|
|
110
|
-
resolveUrlFn: options.resolveUrlFn ?? defaultResolveUrlFn
|
|
118
|
+
resolveUrlFn: options.resolveUrlFn ?? defaultResolveUrlFn,
|
|
119
|
+
assetProxyFn: options.assetProxyFn,
|
|
120
|
+
enrichEmbedFn: options.enrichEmbedFn,
|
|
121
|
+
articleTitle: options.articleTitle
|
|
111
122
|
};
|
|
112
123
|
const stringFns = options.stringTransforms ?? defaultStringTransforms;
|
|
113
124
|
const domFns = options.domTransforms ?? defaultDomTransforms;
|
|
114
|
-
const
|
|
115
|
-
return
|
|
125
|
+
const afterString = await applyStringTransforms(html, stringFns.map((transform) => transform(context)));
|
|
126
|
+
return await applyDomTransforms(await options.parseHtmlFn(afterString), domFns.map((transform) => transform(context)));
|
|
116
127
|
};
|
|
117
128
|
//#endregion
|
|
118
|
-
export { applyDomTransforms, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode,
|
|
129
|
+
export { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBreaksToParagraphs, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode, injectEnclosures, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, paragraphizePlainText, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInterBlockBreaks, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, transformContent, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, 
unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, youtubeEmbedResolver, youtubeResolveEmbed };
|