feedsweep 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +46 -2
- package/dist/common.d.ts +2 -6
- package/dist/common.js +2 -50
- package/dist/defaults.d.ts +1 -2
- package/dist/defaults.js +7 -3
- package/dist/index.d.ts +5 -4
- package/dist/index.js +7 -6
- package/dist/parsers/linkedom.d.ts +4 -0
- package/dist/parsers/linkedom.js +38 -0
- package/dist/transforms/dom/fixLazyImages.js +0 -4
- package/dist/transforms/string/stripOversizedBase64Sources.d.ts +6 -0
- package/dist/transforms/string/stripOversizedBase64Sources.js +13 -0
- package/dist/types.d.ts +3 -2
- package/package.json +14 -4
package/README.md
CHANGED
|
@@ -11,15 +11,19 @@ Feedsweep takes raw feed item HTML and runs it through a pipeline that genuinely
|
|
|
11
11
|
## Installation
|
|
12
12
|
|
|
13
13
|
```bash
|
|
14
|
-
npm install feedsweep
|
|
14
|
+
npm install feedsweep linkedom
|
|
15
15
|
```
|
|
16
16
|
|
|
17
|
+
`linkedom` is an optional peer dependency. You only need it if you use the bundled `parseHtml` helper — see [DOM library](#dom-library) for jsdom / happy-dom / browser-native alternatives.
|
|
18
|
+
|
|
17
19
|
## Quick Start
|
|
18
20
|
|
|
19
21
|
```typescript
|
|
20
22
|
import { transformContent } from 'feedsweep'
|
|
23
|
+
import { parseHtml } from 'feedsweep/linkedom'
|
|
21
24
|
|
|
22
25
|
const result = await transformContent('<p>Check <img data-src="photo.jpg"> and visit /about</p>', {
|
|
26
|
+
parseHtmlFn: parseHtml,
|
|
23
27
|
baseUrl: 'https://example.com/post/1',
|
|
24
28
|
})
|
|
25
29
|
```
|
|
@@ -51,6 +55,7 @@ Inventory of every transform exported from the package. Most are enabled by defa
|
|
|
51
55
|
| `unwrapDoublyNestedLists` | Unwrap `<ul>`/`<ol>` that wrap a single `<li>` containing a same-type list |
|
|
52
56
|
| `mergeFragmentedLists` | Merge consecutive sibling `<ul>` / `<ol>` lists with matching attributes |
|
|
53
57
|
| `paragraphizePlainText` | Wrap plain text in `<p>` tags |
|
|
58
|
+
| `stripOversizedBase64Sources` | Drop base64 `src`/`srcset`/`poster` payloads larger than 50 KB before parsing |
|
|
54
59
|
| `linkifyUrls` | Wrap bare URLs in `<a>` tags |
|
|
55
60
|
| `trimPreWhitespace` | Remove common leading indentation from `<pre>` |
|
|
56
61
|
| `highlightCode` | Syntax-highlight `<code>` blocks with highlight.js |
|
|
@@ -62,8 +67,11 @@ Inventory of every transform exported from the package. Most are enabled by defa
|
|
|
62
67
|
|
|
63
68
|
```typescript
|
|
64
69
|
import { fixLazyImages, resolveRelativeUrls, transformContent } from 'feedsweep'
|
|
70
|
+
import { parseHtml } from 'feedsweep/linkedom'
|
|
65
71
|
|
|
66
72
|
const result = await transformContent(html, {
|
|
73
|
+
// Required: function that turns an HTML string into a `Document`. See "DOM library".
|
|
74
|
+
parseHtmlFn: parseHtml,
|
|
67
75
|
// Base URL for resolving relative URLs.
|
|
68
76
|
baseUrl: 'https://example.com/post/1',
|
|
69
77
|
// Feed item enclosures (audio/video).
|
|
@@ -79,4 +87,40 @@ const result = transformContent(html, {
|
|
|
79
87
|
})
|
|
80
88
|
```
|
|
81
89
|
|
|
82
|
-
The `stringTransforms
|
|
90
|
+
The `stringTransforms` and `domTransforms` options each fully replace the corresponding default phase when provided. Every transform is also exported individually from `feedsweep`, so you can compose any pipeline — list them explicitly to build from scratch, or spread `defaultDomTransforms` (etc.) from `feedsweep/defaults` to extend or filter the defaults.
|
|
91
|
+
|
|
92
|
+
## DOM library
|
|
93
|
+
|
|
94
|
+
Feedsweep is parser-agnostic. You provide `parseHtmlFn` — a function that turns an HTML string into a `Document`. Use any DOM library that produces a standards-compliant `Document`.
|
|
95
|
+
|
|
96
|
+
```typescript
|
|
97
|
+
// linkedom (recommended default)
|
|
98
|
+
import { transformContent } from 'feedsweep'
|
|
99
|
+
import { parseHtml } from 'feedsweep/linkedom'
|
|
100
|
+
|
|
101
|
+
await transformContent(html, { parseHtmlFn: parseHtml, baseUrl })
|
|
102
|
+
|
|
103
|
+
// jsdom
|
|
104
|
+
import { transformContent } from 'feedsweep'
|
|
105
|
+
import { JSDOM } from 'jsdom'
|
|
106
|
+
|
|
107
|
+
await transformContent(html, {
|
|
108
|
+
parseHtmlFn: (raw) => new JSDOM(`<!doctype html><body>${raw}</body>`).window.document,
|
|
109
|
+
baseUrl,
|
|
110
|
+
})
|
|
111
|
+
|
|
112
|
+
// happy-dom
|
|
113
|
+
import { transformContent } from 'feedsweep'
|
|
114
|
+
import { Window } from 'happy-dom'
|
|
115
|
+
|
|
116
|
+
await transformContent(html, {
|
|
117
|
+
parseHtmlFn: (raw) => {
|
|
118
|
+
const window = new Window()
|
|
119
|
+
window.document.body.innerHTML = raw
|
|
120
|
+
return window.document
|
|
121
|
+
},
|
|
122
|
+
baseUrl,
|
|
123
|
+
})
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
The bundled `feedsweep/linkedom` parser bakes in two workarounds for linkedom-specific spec violations (attribute case-folding and SVG XML mode). jsdom and happy-dom do not need them.
|
package/dist/common.d.ts
CHANGED
|
@@ -1,15 +1,11 @@
|
|
|
1
1
|
import { EmbedResolverResult, MaybePromise } from "./types.js";
|
|
2
2
|
|
|
3
3
|
//#region src/common.d.ts
|
|
4
|
-
declare const
|
|
5
|
-
declare const expandSvgSelfClose: (html: string) => string;
|
|
6
|
-
declare const parseFragment: (html: string) => Document;
|
|
7
|
-
declare const transformHtml: (html: string, transform: (document: Document) => MaybePromise<void>) => Promise<string>;
|
|
8
|
-
declare const applyDomTransforms: (html: string, transforms: Array<(document: Document) => MaybePromise<void>>) => Promise<string>;
|
|
4
|
+
declare const applyDomTransforms: (document: Document, transforms: Array<(document: Document) => MaybePromise<void>>) => Promise<string>;
|
|
9
5
|
declare const applyStringTransforms: (html: string, transforms: Array<(html: string) => MaybePromise<string>>) => Promise<string>;
|
|
10
6
|
declare const applyEmbedMetadata: (element: HTMLElement, metadata: Partial<EmbedResolverResult>, options?: {
|
|
11
7
|
setIfMissing?: boolean;
|
|
12
8
|
}) => void;
|
|
13
9
|
declare const createEmbedPlaceholder: (document: Document, src: string, metadata?: Partial<EmbedResolverResult>) => HTMLElement;
|
|
14
10
|
//#endregion
|
|
15
|
-
export { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder
|
|
11
|
+
export { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder };
|
package/dist/common.js
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import { coerceNumber } from "./utils.js";
|
|
2
2
|
import { resolveUrl } from "feedcanon";
|
|
3
|
-
import { parseHTML } from "linkedom";
|
|
4
3
|
//#region src/common.ts
|
|
5
4
|
const Node = {
|
|
6
5
|
ELEMENT_NODE: 1,
|
|
@@ -12,58 +11,11 @@ const NodeFilter = {
|
|
|
12
11
|
SHOW_TEXT: 4,
|
|
13
12
|
SHOW_COMMENT: 128
|
|
14
13
|
};
|
|
15
|
-
const base64SrcRegex = /((?:src|srcset|poster)=["'])data:[^"']*;base64,[^"']*(["'])/g;
|
|
16
14
|
const safeThumbnailDataUrlRegex = /^data:image\/(png|jpe?g|gif|webp|avif);/i;
|
|
17
15
|
const isSafeThumbnailUrl = (url) => {
|
|
18
16
|
return resolveUrl(url) !== void 0 || safeThumbnailDataUrlRegex.test(url);
|
|
19
17
|
};
|
|
20
|
-
const stripOversizedBase64Sources = (html, maxSize) => {
|
|
21
|
-
return html.replace(base64SrcRegex, (match, prefix, suffix) => {
|
|
22
|
-
if (match.length < maxSize) return match;
|
|
23
|
-
return `${prefix}${suffix}`;
|
|
24
|
-
});
|
|
25
|
-
};
|
|
26
|
-
const normalizeAttributeCase = (document) => {
|
|
27
|
-
for (const element of document.querySelectorAll("*")) {
|
|
28
|
-
const original = Array.from(element.attributes).map((attribute) => ({
|
|
29
|
-
name: attribute.name,
|
|
30
|
-
value: attribute.value
|
|
31
|
-
}));
|
|
32
|
-
const final = /* @__PURE__ */ new Map();
|
|
33
|
-
let needsRewrite = false;
|
|
34
|
-
for (const { name, value } of original) {
|
|
35
|
-
const lower = name.toLowerCase();
|
|
36
|
-
if (lower !== name) needsRewrite = true;
|
|
37
|
-
if (final.has(lower)) {
|
|
38
|
-
needsRewrite = true;
|
|
39
|
-
continue;
|
|
40
|
-
}
|
|
41
|
-
final.set(lower, value);
|
|
42
|
-
}
|
|
43
|
-
if (!needsRewrite) continue;
|
|
44
|
-
for (const { name } of original) element.removeAttribute(name);
|
|
45
|
-
for (const [name, value] of final) element.setAttribute(name, value);
|
|
46
|
-
}
|
|
47
|
-
};
|
|
48
|
-
const svgRegionRegex = /<svg\b[^>]*>[\s\S]*?<\/svg>/gi;
|
|
49
|
-
const svgSelfCloseRegex = /<([a-z][a-z0-9-]*)((?:\s[^>]*)?)\s*\/>/gi;
|
|
50
|
-
const expandSvgSelfClose = (html) => {
|
|
51
|
-
return html.replace(svgRegionRegex, (svgBlock) => {
|
|
52
|
-
return svgBlock.replace(svgSelfCloseRegex, "<$1$2></$1>");
|
|
53
|
-
});
|
|
54
|
-
};
|
|
55
|
-
const parseFragment = (html) => {
|
|
56
|
-
const { document } = parseHTML(`<!doctype html><html><head></head><body>${expandSvgSelfClose(html)}</body></html>`);
|
|
57
|
-
normalizeAttributeCase(document);
|
|
58
|
-
return document;
|
|
59
|
-
};
|
|
60
|
-
const transformHtml = async (html, transform) => {
|
|
61
|
-
const document = parseFragment(html);
|
|
62
|
-
await transform(document);
|
|
63
|
-
return document.body.innerHTML;
|
|
64
|
-
};
|
|
65
|
-
const applyDomTransforms = async (html, transforms) => {
|
|
66
|
-
const document = parseFragment(stripOversizedBase64Sources(html, 50 * 1024));
|
|
18
|
+
const applyDomTransforms = async (document, transforms) => {
|
|
67
19
|
for (const transform of transforms) await transform(document);
|
|
68
20
|
return document.body.innerHTML;
|
|
69
21
|
};
|
|
@@ -184,4 +136,4 @@ const createEmbedPlaceholder = (document, src, metadata) => {
|
|
|
184
136
|
return element;
|
|
185
137
|
};
|
|
186
138
|
//#endregion
|
|
187
|
-
export { Node, NodeFilter, applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder,
|
|
139
|
+
export { Node, NodeFilter, applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder, getDimensions, hasAncestorWithTagName, isBlockElement, isBr, isSkippable, isWhitespaceText };
|
package/dist/defaults.d.ts
CHANGED
|
@@ -3,7 +3,6 @@ import { DomTransform, EmbedResolver, ResolveUrlFn, StringTransform, UrlUnwrappe
|
|
|
3
3
|
//#region src/defaults.d.ts
|
|
4
4
|
declare const defaultStringTransforms: Array<StringTransform>;
|
|
5
5
|
declare const defaultDomTransforms: Array<DomTransform>;
|
|
6
|
-
declare const defaultFinalStringTransforms: Array<StringTransform>;
|
|
7
6
|
declare const defaultEmbedResolvers: Array<EmbedResolver>;
|
|
8
7
|
declare const defaultResolveUrlFn: ResolveUrlFn;
|
|
9
8
|
declare const defaultLazySrcAttributes: string[];
|
|
@@ -12,4 +11,4 @@ declare const defaultTrackingHosts: string[];
|
|
|
12
11
|
declare const defaultTrackingPathSegments: string[];
|
|
13
12
|
declare const defaultUrlUnwrappers: Array<UrlUnwrapper>;
|
|
14
13
|
//#endregion
|
|
15
|
-
export { defaultDomTransforms, defaultEmbedResolvers,
|
|
14
|
+
export { defaultDomTransforms, defaultEmbedResolvers, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
|
package/dist/defaults.js
CHANGED
|
@@ -24,6 +24,7 @@ import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedList
|
|
|
24
24
|
import { unwrapRedirectUrls } from "./transforms/dom/unwrapRedirectUrls.js";
|
|
25
25
|
import { unwrapWrappers } from "./transforms/dom/unwrapWrappers.js";
|
|
26
26
|
import { paragraphizePlainText } from "./transforms/string/paragraphizePlainText.js";
|
|
27
|
+
import { stripOversizedBase64Sources } from "./transforms/string/stripOversizedBase64Sources.js";
|
|
27
28
|
import { unwrapCdataComments } from "./transforms/string/unwrapCdataComments.js";
|
|
28
29
|
import { unwrapBing } from "./unwraps/bing.js";
|
|
29
30
|
import { unwrapFacebookShim } from "./unwraps/facebook.js";
|
|
@@ -39,7 +40,11 @@ import { unwrapYahooSearch } from "./unwraps/yahooSearch.js";
|
|
|
39
40
|
import { unwrapYouTube } from "./unwraps/youtube.js";
|
|
40
41
|
import { resolveUrl } from "feedcanon";
|
|
41
42
|
//#region src/defaults.ts
|
|
42
|
-
const defaultStringTransforms = [
|
|
43
|
+
const defaultStringTransforms = [
|
|
44
|
+
stripOversizedBase64Sources,
|
|
45
|
+
unwrapCdataComments,
|
|
46
|
+
paragraphizePlainText
|
|
47
|
+
];
|
|
43
48
|
const defaultDomTransforms = [
|
|
44
49
|
decodeDoubleEncodedTags,
|
|
45
50
|
stripComments,
|
|
@@ -66,7 +71,6 @@ const defaultDomTransforms = [
|
|
|
66
71
|
unwrapWrappers,
|
|
67
72
|
stripEmptyTags
|
|
68
73
|
];
|
|
69
|
-
const defaultFinalStringTransforms = [];
|
|
70
74
|
const defaultEmbedResolvers = [youtubeEmbedResolver];
|
|
71
75
|
const defaultResolveUrlFn = (url, baseUrl) => resolveUrl(url, baseUrl);
|
|
72
76
|
const defaultLazySrcAttributes = [
|
|
@@ -153,4 +157,4 @@ const defaultUrlUnwrappers = [
|
|
|
153
157
|
unwrapRedditOut
|
|
154
158
|
];
|
|
155
159
|
//#endregion
|
|
156
|
-
export { defaultDomTransforms, defaultEmbedResolvers,
|
|
160
|
+
export { defaultDomTransforms, defaultEmbedResolvers, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { AssetProxyFn, AssetType, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext } from "./types.js";
|
|
1
|
+
import { AssetProxyFn, AssetType, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ParseHtmlFn, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext } from "./types.js";
|
|
2
2
|
import { defaultResolveUrlFn } from "./defaults.js";
|
|
3
|
-
import { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder
|
|
3
|
+
import { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder } from "./common.js";
|
|
4
4
|
import { composeThumbnailUrl, extractVideoId, youtubeEmbedResolver, youtubeResolveEmbed } from "./embeds/youtube.js";
|
|
5
5
|
import { convertBreaksToParagraphs } from "./transforms/dom/convertBreaksToParagraphs.js";
|
|
6
6
|
import { decodeDoubleEncodedTags } from "./transforms/dom/decodeDoubleEncodedTags.js";
|
|
@@ -28,6 +28,7 @@ import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedList
|
|
|
28
28
|
import { extractRedirectTarget, unwrapRedirectUrls } from "./transforms/dom/unwrapRedirectUrls.js";
|
|
29
29
|
import { unwrapWrappers } from "./transforms/dom/unwrapWrappers.js";
|
|
30
30
|
import { paragraphizePlainText } from "./transforms/string/paragraphizePlainText.js";
|
|
31
|
+
import { stripOversizedBase64Sources } from "./transforms/string/stripOversizedBase64Sources.js";
|
|
31
32
|
import { unwrapCdataComments } from "./transforms/string/unwrapCdataComments.js";
|
|
32
33
|
import { unwrapAceml } from "./unwraps/aceml.js";
|
|
33
34
|
import { unwrapAdjust } from "./unwraps/adjust.js";
|
|
@@ -106,6 +107,6 @@ import { unwrapZhihu } from "./unwraps/zhihu.js";
|
|
|
106
107
|
import { ParamExtractorConfig, chooseBaseUrl, coerceNumber, createParamExtractor } from "./utils.js";
|
|
107
108
|
|
|
108
109
|
//#region src/index.d.ts
|
|
109
|
-
declare const transformContent: (html: string, options?: TransformContentOptions) => Promise<string>;
|
|
110
|
+
declare const transformContent: (html: string, options: TransformContentOptions) => Promise<string>;
|
|
110
111
|
//#endregion
|
|
111
|
-
export { type AssetProxyFn, type AssetType, type DomTransform, type EmbedResolver, type EmbedResolverResult, type Enclosure, type EnrichEmbedFn, type MaybePromise, type ParamExtractorConfig, type ResolveUrlFn, type StringTransform, type TransformContentOptions, type TransformContext, applyDomTransforms, applyEmbedMetadata, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBreaksToParagraphs, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, enrichEmbedPlaceholders,
|
|
112
|
+
export { type AssetProxyFn, type AssetType, type DomTransform, type EmbedResolver, type EmbedResolverResult, type Enclosure, type EnrichEmbedFn, type MaybePromise, type ParamExtractorConfig, type ParseHtmlFn, type ResolveUrlFn, type StringTransform, type TransformContentOptions, type TransformContext, applyDomTransforms, applyEmbedMetadata, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBreaksToParagraphs, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode, injectEnclosures, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, paragraphizePlainText, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInterBlockBreaks, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, transformContent, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, 
unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, youtubeEmbedResolver, youtubeResolveEmbed };
|
package/dist/index.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { chooseBaseUrl, coerceNumber, createParamExtractor } from "./utils.js";
|
|
2
|
-
import { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder
|
|
2
|
+
import { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder } from "./common.js";
|
|
3
3
|
import { composeThumbnailUrl, extractVideoId, youtubeEmbedResolver, youtubeResolveEmbed } from "./embeds/youtube.js";
|
|
4
4
|
import { convertBreaksToParagraphs } from "./transforms/dom/convertBreaksToParagraphs.js";
|
|
5
5
|
import { decodeDoubleEncodedTags } from "./transforms/dom/decodeDoubleEncodedTags.js";
|
|
@@ -26,6 +26,7 @@ import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedList
|
|
|
26
26
|
import { extractRedirectTarget, unwrapRedirectUrls } from "./transforms/dom/unwrapRedirectUrls.js";
|
|
27
27
|
import { unwrapWrappers } from "./transforms/dom/unwrapWrappers.js";
|
|
28
28
|
import { paragraphizePlainText } from "./transforms/string/paragraphizePlainText.js";
|
|
29
|
+
import { stripOversizedBase64Sources } from "./transforms/string/stripOversizedBase64Sources.js";
|
|
29
30
|
import { unwrapCdataComments } from "./transforms/string/unwrapCdataComments.js";
|
|
30
31
|
import { unwrapBing } from "./unwraps/bing.js";
|
|
31
32
|
import { unwrapFacebookShim } from "./unwraps/facebook.js";
|
|
@@ -39,7 +40,7 @@ import { unwrapRedditOut } from "./unwraps/redditOut.js";
|
|
|
39
40
|
import { unwrapVkAway } from "./unwraps/vkAway.js";
|
|
40
41
|
import { unwrapYahooSearch } from "./unwraps/yahooSearch.js";
|
|
41
42
|
import { unwrapYouTube } from "./unwraps/youtube.js";
|
|
42
|
-
import { defaultDomTransforms, defaultEmbedResolvers,
|
|
43
|
+
import { defaultDomTransforms, defaultEmbedResolvers, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers } from "./defaults.js";
|
|
43
44
|
import { enrichEmbedPlaceholders } from "./transforms/dom/enrichEmbedPlaceholders.js";
|
|
44
45
|
import { unwrapAceml } from "./unwraps/aceml.js";
|
|
45
46
|
import { unwrapAdjust } from "./unwraps/adjust.js";
|
|
@@ -104,7 +105,7 @@ import { unwrapWebArchive } from "./unwraps/webArchive.js";
|
|
|
104
105
|
import { unwrapYandexTurbo } from "./unwraps/yandexTurbo.js";
|
|
105
106
|
import { unwrapZhihu } from "./unwraps/zhihu.js";
|
|
106
107
|
//#region src/index.ts
|
|
107
|
-
const transformContent = async (html, options = {}) => {
|
|
108
|
+
const transformContent = async (html, options) => {
|
|
108
109
|
const context = {
|
|
109
110
|
baseUrl: options.baseUrl,
|
|
110
111
|
enclosures: options.enclosures,
|
|
@@ -121,8 +122,8 @@ const transformContent = async (html, options = {}) => {
|
|
|
121
122
|
};
|
|
122
123
|
const stringFns = options.stringTransforms ?? defaultStringTransforms;
|
|
123
124
|
const domFns = options.domTransforms ?? defaultDomTransforms;
|
|
124
|
-
const
|
|
125
|
-
return await
|
|
125
|
+
const afterString = await applyStringTransforms(html, stringFns.map((transform) => transform(context)));
|
|
126
|
+
return await applyDomTransforms(await options.parseHtmlFn(afterString), domFns.map((transform) => transform(context)));
|
|
126
127
|
};
|
|
127
128
|
//#endregion
|
|
128
|
-
export { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBreaksToParagraphs, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, enrichEmbedPlaceholders,
|
|
129
|
+
export { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBreaksToParagraphs, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode, injectEnclosures, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, paragraphizePlainText, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInterBlockBreaks, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, transformContent, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, 
unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, youtubeEmbedResolver, youtubeResolveEmbed };
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import { parseHTML } from "linkedom";
|
|
2
|
+
//#region src/parsers/linkedom.ts
|
|
3
|
+
const normalizeAttributeCase = (document) => {
|
|
4
|
+
for (const element of document.querySelectorAll("*")) {
|
|
5
|
+
const original = Array.from(element.attributes).map((attribute) => ({
|
|
6
|
+
name: attribute.name,
|
|
7
|
+
value: attribute.value
|
|
8
|
+
}));
|
|
9
|
+
const final = /* @__PURE__ */ new Map();
|
|
10
|
+
let needsRewrite = false;
|
|
11
|
+
for (const { name, value } of original) {
|
|
12
|
+
const lower = name.toLowerCase();
|
|
13
|
+
if (lower !== name) needsRewrite = true;
|
|
14
|
+
if (final.has(lower)) {
|
|
15
|
+
needsRewrite = true;
|
|
16
|
+
continue;
|
|
17
|
+
}
|
|
18
|
+
final.set(lower, value);
|
|
19
|
+
}
|
|
20
|
+
if (!needsRewrite) continue;
|
|
21
|
+
for (const { name } of original) element.removeAttribute(name);
|
|
22
|
+
for (const [name, value] of final) element.setAttribute(name, value);
|
|
23
|
+
}
|
|
24
|
+
};
|
|
25
|
+
const svgRegionRegex = /<svg\b[^>]*>[\s\S]*?<\/svg>/gi;
|
|
26
|
+
const svgSelfCloseRegex = /<([a-z][a-z0-9-]*)((?:\s[^>]*)?)\s*\/>/gi;
|
|
27
|
+
const expandSvgSelfClose = (html) => {
|
|
28
|
+
return html.replace(svgRegionRegex, (svgBlock) => {
|
|
29
|
+
return svgBlock.replace(svgSelfCloseRegex, "<$1$2></$1>");
|
|
30
|
+
});
|
|
31
|
+
};
|
|
32
|
+
const parseHtml = (html) => {
|
|
33
|
+
const { document } = parseHTML(`<!doctype html><html><head></head><body>${expandSvgSelfClose(html)}</body></html>`);
|
|
34
|
+
normalizeAttributeCase(document);
|
|
35
|
+
return document;
|
|
36
|
+
};
|
|
37
|
+
//#endregion
|
|
38
|
+
export { parseHtml };
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import { normalizeAttributeCase } from "../../common.js";
|
|
2
1
|
//#region src/transforms/dom/fixLazyImages.ts
|
|
3
2
|
const imgPattern = /<img\s/i;
|
|
4
3
|
const urlShapeRegex = /[:/.]/;
|
|
@@ -45,7 +44,6 @@ const fixLazyImages = (context) => {
|
|
|
45
44
|
}
|
|
46
45
|
}
|
|
47
46
|
const noscripts = document.querySelectorAll("noscript");
|
|
48
|
-
let replacedNoscript = false;
|
|
49
47
|
for (const noscript of noscripts) {
|
|
50
48
|
const sibling = noscript.previousElementSibling;
|
|
51
49
|
if (sibling?.localName !== "img") continue;
|
|
@@ -53,9 +51,7 @@ const fixLazyImages = (context) => {
|
|
|
53
51
|
if (!imgPattern.test(inner)) continue;
|
|
54
52
|
sibling.remove();
|
|
55
53
|
noscript.outerHTML = inner;
|
|
56
|
-
replacedNoscript = true;
|
|
57
54
|
}
|
|
58
|
-
if (replacedNoscript) normalizeAttributeCase(document);
|
|
59
55
|
};
|
|
60
56
|
};
|
|
61
57
|
//#endregion
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
//#region src/transforms/string/stripOversizedBase64Sources.ts
|
|
2
|
+
const base64SrcRegex = /((?:src|srcset|poster)=["'])data:[^"']*;base64,[^"']*(["'])/g;
|
|
3
|
+
const maxBase64Size = 50 * 1024;
|
|
4
|
+
const stripOversizedBase64Sources = () => {
|
|
5
|
+
return (html) => {
|
|
6
|
+
return html.replace(base64SrcRegex, (match, prefix, suffix) => {
|
|
7
|
+
if (match.length < maxBase64Size) return match;
|
|
8
|
+
return `${prefix}${suffix}`;
|
|
9
|
+
});
|
|
10
|
+
};
|
|
11
|
+
};
|
|
12
|
+
//#endregion
|
|
13
|
+
export { stripOversizedBase64Sources };
|
package/dist/types.d.ts
CHANGED
|
@@ -60,7 +60,9 @@ type TransformContext = {
|
|
|
60
60
|
};
|
|
61
61
|
type DomTransform = (context: TransformContext) => (document: Document) => MaybePromise<void>;
|
|
62
62
|
type StringTransform = (context: TransformContext) => (html: string) => MaybePromise<string>;
|
|
63
|
+
type ParseHtmlFn = (html: string) => MaybePromise<Document>;
|
|
63
64
|
type TransformContentOptions = {
|
|
65
|
+
parseHtmlFn: ParseHtmlFn;
|
|
64
66
|
baseUrl?: string;
|
|
65
67
|
enclosures?: Array<Enclosure>;
|
|
66
68
|
embedResolvers?: Array<EmbedResolver>;
|
|
@@ -75,7 +77,6 @@ type TransformContentOptions = {
|
|
|
75
77
|
articleTitle?: string;
|
|
76
78
|
stringTransforms?: Array<StringTransform>;
|
|
77
79
|
domTransforms?: Array<DomTransform>;
|
|
78
|
-
finalStringTransforms?: Array<StringTransform>;
|
|
79
80
|
};
|
|
80
81
|
//#endregion
|
|
81
|
-
export { AssetProxyFn, AssetType, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext, UrlUnwrapper };
|
|
82
|
+
export { AssetProxyFn, AssetType, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ParseHtmlFn, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext, UrlUnwrapper };
|
package/package.json
CHANGED
|
@@ -29,6 +29,10 @@
|
|
|
29
29
|
"./defaults": {
|
|
30
30
|
"types": "./dist/defaults.d.ts",
|
|
31
31
|
"default": "./dist/defaults.js"
|
|
32
|
+
},
|
|
33
|
+
"./linkedom": {
|
|
34
|
+
"types": "./dist/parsers/linkedom.d.ts",
|
|
35
|
+
"default": "./dist/parsers/linkedom.js"
|
|
32
36
|
}
|
|
33
37
|
},
|
|
34
38
|
"files": [
|
|
@@ -36,23 +40,29 @@
|
|
|
36
40
|
],
|
|
37
41
|
"scripts": {
|
|
38
42
|
"prepare": "lefthook install",
|
|
39
|
-
"build": "tsdown src/index.ts src/defaults.ts --format esm --dts --clean --unbundle --no-fixed-extension"
|
|
43
|
+
"build": "tsdown src/index.ts src/defaults.ts src/parsers/linkedom.ts --format esm --dts --clean --unbundle --no-fixed-extension"
|
|
40
44
|
},
|
|
41
45
|
"dependencies": {
|
|
42
46
|
"@wordpress/autop": "^4.46.0",
|
|
43
47
|
"highlight.js": "^11.11.1",
|
|
44
|
-
"linkedom": "^0.18.12",
|
|
45
48
|
"linkifyjs": "^4.3.2",
|
|
46
49
|
"srcset": "^5.0.3"
|
|
47
50
|
},
|
|
48
51
|
"peerDependencies": {
|
|
49
52
|
"feedcanon": "^2.0.0-next.3",
|
|
50
|
-
"feedscout": "^2.0.0-next.2"
|
|
53
|
+
"feedscout": "^2.0.0-next.2",
|
|
54
|
+
"linkedom": "^0.18.12"
|
|
55
|
+
},
|
|
56
|
+
"peerDependenciesMeta": {
|
|
57
|
+
"linkedom": {
|
|
58
|
+
"optional": true
|
|
59
|
+
}
|
|
51
60
|
},
|
|
52
61
|
"devDependencies": {
|
|
53
62
|
"@types/bun": "^1.3.13",
|
|
54
63
|
"kvalita": "^1.13.0",
|
|
64
|
+
"linkedom": "^0.18.12",
|
|
55
65
|
"tsdown": "^0.22.0"
|
|
56
66
|
},
|
|
57
|
-
"version": "1.1.0"
|
|
67
|
+
"version": "1.2.0"
|
|
58
68
|
}
|