feedsweep 1.1.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -11,15 +11,19 @@ Feedsweep takes raw feed item HTML and runs it through a pipeline that genuinely
11
11
  ## Installation
12
12
 
13
13
  ```bash
14
- npm install feedsweep
14
+ npm install feedsweep linkedom
15
15
  ```
16
16
 
17
+ `linkedom` is an optional peer dependency. You only need it if you use the bundled `parseHtml` helper — see [DOM library](#dom-library) for jsdom / happy-dom / browser-native alternatives.
18
+
17
19
  ## Quick Start
18
20
 
19
21
  ```typescript
20
22
  import { transformContent } from 'feedsweep'
23
+ import { parseHtml } from 'feedsweep/linkedom'
21
24
 
22
25
  const result = await transformContent('<p>Check <img data-src="photo.jpg"> and visit /about</p>', {
26
+ parseHtmlFn: parseHtml,
23
27
  baseUrl: 'https://example.com/post/1',
24
28
  })
25
29
  ```
@@ -51,6 +55,7 @@ Inventory of every transform exported from the package. Most are enabled by defa
51
55
  | `unwrapDoublyNestedLists` | Unwrap `<ul>`/`<ol>` that wrap a single `<li>` containing a same-type list |
52
56
  | `mergeFragmentedLists` | Merge consecutive sibling `<ul>` / `<ol>` lists with matching attributes |
53
57
  | `paragraphizePlainText` | Wrap plain text in `<p>` tags |
58
+ | `stripOversizedBase64Sources` | Drop base64 `src`/`srcset`/`poster` data URLs whose encoded attribute text is 50 KB or longer, before parsing |
54
59
  | `linkifyUrls` | Wrap bare URLs in `<a>` tags |
55
60
  | `trimPreWhitespace` | Remove common leading indentation from `<pre>` |
56
61
  | `highlightCode` | Syntax-highlight `<code>` blocks with highlight.js |
@@ -62,8 +67,11 @@ Inventory of every transform exported from the package. Most are enabled by defa
62
67
 
63
68
  ```typescript
64
69
  import { fixLazyImages, resolveRelativeUrls, transformContent } from 'feedsweep'
70
+ import { parseHtml } from 'feedsweep/linkedom'
65
71
 
66
72
  const result = await transformContent(html, {
73
+ // Required: function that turns an HTML string into a `Document`. See "DOM library".
74
+ parseHtmlFn: parseHtml,
67
75
  // Base URL for resolving relative URLs.
68
76
  baseUrl: 'https://example.com/post/1',
69
77
  // Feed item enclosures (audio/video).
@@ -79,4 +87,40 @@ const result = transformContent(html, {
79
87
  })
80
88
  ```
81
89
 
82
- The `stringTransforms`, `domTransforms`, and `finalStringTransforms` options each fully replace the corresponding default phase when provided. Every transform is also exported individually from `feedsweep`, so you can compose any pipeline — list them explicitly to build from scratch, or spread `defaultDomTransforms` (etc.) from `feedsweep/defaults` to extend or filter the defaults.
90
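+ The `stringTransforms` and `domTransforms` options each fully replace the corresponding default phase when provided. Every transform is also exported individually from `feedsweep`, so you can compose any pipeline — list them explicitly to build from scratch, or spread `defaultDomTransforms` (etc.) from `feedsweep/defaults` to extend or filter the defaults. Note that `stripOversizedBase64Sources` is now part of the default string phase, so a custom `stringTransforms` list must include it explicitly if oversized base64 payloads should still be removed.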
91
+
92
+ ## DOM library
93
+
94
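+ Feedsweep is parser-agnostic. You provide `parseHtmlFn` — a function that turns an HTML string into a `Document` (or a promise of one). Use any DOM library that produces a standards-compliant `Document`. The result is read back from `document.body.innerHTML`, so the parsed markup must end up inside `<body>`.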
95
+
96
+ ```typescript
97
+ // linkedom (recommended default)
98
+ import { transformContent } from 'feedsweep'
99
+ import { parseHtml } from 'feedsweep/linkedom'
100
+
101
+ await transformContent(html, { parseHtmlFn: parseHtml, baseUrl })
102
+
103
+ // jsdom
104
+ import { transformContent } from 'feedsweep'
105
+ import { JSDOM } from 'jsdom'
106
+
107
+ await transformContent(html, {
108
+ parseHtmlFn: (raw) => new JSDOM(`<!doctype html><body>${raw}</body>`).window.document,
109
+ baseUrl,
110
+ })
111
+
112
+ // happy-dom
113
+ import { transformContent } from 'feedsweep'
114
+ import { Window } from 'happy-dom'
115
+
116
+ await transformContent(html, {
117
+ parseHtmlFn: (raw) => {
118
+ const window = new Window()
119
+ window.document.body.innerHTML = raw
120
+ return window.document
121
+ },
122
+ baseUrl,
123
+ })
124
+ ```
125
+
126
+ The bundled `feedsweep/linkedom` parser bakes in two workarounds for linkedom-specific spec violations (attribute case-folding and SVG XML mode). jsdom and happy-dom do not need them.
package/dist/common.d.ts CHANGED
@@ -1,15 +1,11 @@
1
1
  import { EmbedResolverResult, MaybePromise } from "./types.js";
2
2
 
3
3
  //#region src/common.d.ts
4
- declare const stripOversizedBase64Sources: (html: string, maxSize: number) => string;
5
- declare const expandSvgSelfClose: (html: string) => string;
6
- declare const parseFragment: (html: string) => Document;
7
- declare const transformHtml: (html: string, transform: (document: Document) => MaybePromise<void>) => Promise<string>;
8
- declare const applyDomTransforms: (html: string, transforms: Array<(document: Document) => MaybePromise<void>>) => Promise<string>;
4
+ declare const applyDomTransforms: (document: Document, transforms: Array<(document: Document) => MaybePromise<void>>) => Promise<string>;
9
5
  declare const applyStringTransforms: (html: string, transforms: Array<(html: string) => MaybePromise<string>>) => Promise<string>;
10
6
  declare const applyEmbedMetadata: (element: HTMLElement, metadata: Partial<EmbedResolverResult>, options?: {
11
7
  setIfMissing?: boolean;
12
8
  }) => void;
13
9
  declare const createEmbedPlaceholder: (document: Document, src: string, metadata?: Partial<EmbedResolverResult>) => HTMLElement;
14
10
  //#endregion
15
- export { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder, expandSvgSelfClose, parseFragment, stripOversizedBase64Sources, transformHtml };
11
+ export { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder };
package/dist/common.js CHANGED
@@ -1,6 +1,5 @@
1
1
  import { coerceNumber } from "./utils.js";
2
2
  import { resolveUrl } from "feedcanon";
3
- import { parseHTML } from "linkedom";
4
3
  //#region src/common.ts
5
4
  const Node = {
6
5
  ELEMENT_NODE: 1,
@@ -12,58 +11,11 @@ const NodeFilter = {
12
11
  SHOW_TEXT: 4,
13
12
  SHOW_COMMENT: 128
14
13
  };
15
- const base64SrcRegex = /((?:src|srcset|poster)=["'])data:[^"']*;base64,[^"']*(["'])/g;
16
14
  const safeThumbnailDataUrlRegex = /^data:image\/(png|jpe?g|gif|webp|avif);/i;
17
15
  const isSafeThumbnailUrl = (url) => {
18
16
  return resolveUrl(url) !== void 0 || safeThumbnailDataUrlRegex.test(url);
19
17
  };
20
- const stripOversizedBase64Sources = (html, maxSize) => {
21
- return html.replace(base64SrcRegex, (match, prefix, suffix) => {
22
- if (match.length < maxSize) return match;
23
- return `${prefix}${suffix}`;
24
- });
25
- };
26
- const normalizeAttributeCase = (document) => {
27
- for (const element of document.querySelectorAll("*")) {
28
- const original = Array.from(element.attributes).map((attribute) => ({
29
- name: attribute.name,
30
- value: attribute.value
31
- }));
32
- const final = /* @__PURE__ */ new Map();
33
- let needsRewrite = false;
34
- for (const { name, value } of original) {
35
- const lower = name.toLowerCase();
36
- if (lower !== name) needsRewrite = true;
37
- if (final.has(lower)) {
38
- needsRewrite = true;
39
- continue;
40
- }
41
- final.set(lower, value);
42
- }
43
- if (!needsRewrite) continue;
44
- for (const { name } of original) element.removeAttribute(name);
45
- for (const [name, value] of final) element.setAttribute(name, value);
46
- }
47
- };
48
- const svgRegionRegex = /<svg\b[^>]*>[\s\S]*?<\/svg>/gi;
49
- const svgSelfCloseRegex = /<([a-z][a-z0-9-]*)((?:\s[^>]*)?)\s*\/>/gi;
50
- const expandSvgSelfClose = (html) => {
51
- return html.replace(svgRegionRegex, (svgBlock) => {
52
- return svgBlock.replace(svgSelfCloseRegex, "<$1$2></$1>");
53
- });
54
- };
55
- const parseFragment = (html) => {
56
- const { document } = parseHTML(`<!doctype html><html><head></head><body>${expandSvgSelfClose(html)}</body></html>`);
57
- normalizeAttributeCase(document);
58
- return document;
59
- };
60
- const transformHtml = async (html, transform) => {
61
- const document = parseFragment(html);
62
- await transform(document);
63
- return document.body.innerHTML;
64
- };
65
- const applyDomTransforms = async (html, transforms) => {
66
- const document = parseFragment(stripOversizedBase64Sources(html, 50 * 1024));
18
+ const applyDomTransforms = async (document, transforms) => {
67
19
  for (const transform of transforms) await transform(document);
68
20
  return document.body.innerHTML;
69
21
  };
@@ -184,4 +136,4 @@ const createEmbedPlaceholder = (document, src, metadata) => {
184
136
  return element;
185
137
  };
186
138
  //#endregion
187
- export { Node, NodeFilter, applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder, expandSvgSelfClose, getDimensions, hasAncestorWithTagName, isBlockElement, isBr, isSkippable, isWhitespaceText, normalizeAttributeCase, parseFragment, stripOversizedBase64Sources, transformHtml };
139
+ export { Node, NodeFilter, applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder, getDimensions, hasAncestorWithTagName, isBlockElement, isBr, isSkippable, isWhitespaceText };
@@ -3,7 +3,6 @@ import { DomTransform, EmbedResolver, ResolveUrlFn, StringTransform, UrlUnwrappe
3
3
  //#region src/defaults.d.ts
4
4
  declare const defaultStringTransforms: Array<StringTransform>;
5
5
  declare const defaultDomTransforms: Array<DomTransform>;
6
- declare const defaultFinalStringTransforms: Array<StringTransform>;
7
6
  declare const defaultEmbedResolvers: Array<EmbedResolver>;
8
7
  declare const defaultResolveUrlFn: ResolveUrlFn;
9
8
  declare const defaultLazySrcAttributes: string[];
@@ -12,4 +11,4 @@ declare const defaultTrackingHosts: string[];
12
11
  declare const defaultTrackingPathSegments: string[];
13
12
  declare const defaultUrlUnwrappers: Array<UrlUnwrapper>;
14
13
  //#endregion
15
- export { defaultDomTransforms, defaultEmbedResolvers, defaultFinalStringTransforms, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
14
+ export { defaultDomTransforms, defaultEmbedResolvers, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
package/dist/defaults.js CHANGED
@@ -24,6 +24,7 @@ import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedList
24
24
  import { unwrapRedirectUrls } from "./transforms/dom/unwrapRedirectUrls.js";
25
25
  import { unwrapWrappers } from "./transforms/dom/unwrapWrappers.js";
26
26
  import { paragraphizePlainText } from "./transforms/string/paragraphizePlainText.js";
27
+ import { stripOversizedBase64Sources } from "./transforms/string/stripOversizedBase64Sources.js";
27
28
  import { unwrapCdataComments } from "./transforms/string/unwrapCdataComments.js";
28
29
  import { unwrapBing } from "./unwraps/bing.js";
29
30
  import { unwrapFacebookShim } from "./unwraps/facebook.js";
@@ -39,7 +40,11 @@ import { unwrapYahooSearch } from "./unwraps/yahooSearch.js";
39
40
  import { unwrapYouTube } from "./unwraps/youtube.js";
40
41
  import { resolveUrl } from "feedcanon";
41
42
  //#region src/defaults.ts
42
- const defaultStringTransforms = [unwrapCdataComments, paragraphizePlainText];
43
+ const defaultStringTransforms = [
44
+ stripOversizedBase64Sources,
45
+ unwrapCdataComments,
46
+ paragraphizePlainText
47
+ ];
43
48
  const defaultDomTransforms = [
44
49
  decodeDoubleEncodedTags,
45
50
  stripComments,
@@ -66,7 +71,6 @@ const defaultDomTransforms = [
66
71
  unwrapWrappers,
67
72
  stripEmptyTags
68
73
  ];
69
- const defaultFinalStringTransforms = [];
70
74
  const defaultEmbedResolvers = [youtubeEmbedResolver];
71
75
  const defaultResolveUrlFn = (url, baseUrl) => resolveUrl(url, baseUrl);
72
76
  const defaultLazySrcAttributes = [
@@ -153,4 +157,4 @@ const defaultUrlUnwrappers = [
153
157
  unwrapRedditOut
154
158
  ];
155
159
  //#endregion
156
- export { defaultDomTransforms, defaultEmbedResolvers, defaultFinalStringTransforms, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
160
+ export { defaultDomTransforms, defaultEmbedResolvers, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
package/dist/index.d.ts CHANGED
@@ -1,6 +1,6 @@
1
- import { AssetProxyFn, AssetType, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext } from "./types.js";
1
+ import { AssetProxyFn, AssetType, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ParseHtmlFn, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext } from "./types.js";
2
2
  import { defaultResolveUrlFn } from "./defaults.js";
3
- import { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder, expandSvgSelfClose, parseFragment, stripOversizedBase64Sources, transformHtml } from "./common.js";
3
+ import { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder } from "./common.js";
4
4
  import { composeThumbnailUrl, extractVideoId, youtubeEmbedResolver, youtubeResolveEmbed } from "./embeds/youtube.js";
5
5
  import { convertBreaksToParagraphs } from "./transforms/dom/convertBreaksToParagraphs.js";
6
6
  import { decodeDoubleEncodedTags } from "./transforms/dom/decodeDoubleEncodedTags.js";
@@ -28,6 +28,7 @@ import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedList
28
28
  import { extractRedirectTarget, unwrapRedirectUrls } from "./transforms/dom/unwrapRedirectUrls.js";
29
29
  import { unwrapWrappers } from "./transforms/dom/unwrapWrappers.js";
30
30
  import { paragraphizePlainText } from "./transforms/string/paragraphizePlainText.js";
31
+ import { stripOversizedBase64Sources } from "./transforms/string/stripOversizedBase64Sources.js";
31
32
  import { unwrapCdataComments } from "./transforms/string/unwrapCdataComments.js";
32
33
  import { unwrapAceml } from "./unwraps/aceml.js";
33
34
  import { unwrapAdjust } from "./unwraps/adjust.js";
@@ -106,6 +107,6 @@ import { unwrapZhihu } from "./unwraps/zhihu.js";
106
107
  import { ParamExtractorConfig, chooseBaseUrl, coerceNumber, createParamExtractor } from "./utils.js";
107
108
 
108
109
  //#region src/index.d.ts
109
- declare const transformContent: (html: string, options?: TransformContentOptions) => Promise<string>;
110
+ declare const transformContent: (html: string, options: TransformContentOptions) => Promise<string>;
110
111
  //#endregion
111
- export { type AssetProxyFn, type AssetType, type DomTransform, type EmbedResolver, type EmbedResolverResult, type Enclosure, type EnrichEmbedFn, type MaybePromise, type ParamExtractorConfig, type ResolveUrlFn, type StringTransform, type TransformContentOptions, type TransformContext, applyDomTransforms, applyEmbedMetadata, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBreaksToParagraphs, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, enrichEmbedPlaceholders, expandSvgSelfClose, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode, injectEnclosures, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, paragraphizePlainText, parseFragment, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInterBlockBreaks, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, transformContent, transformHtml, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, 
unwrapProofpointV1, unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, youtubeEmbedResolver, youtubeResolveEmbed };
112
+ export { type AssetProxyFn, type AssetType, type DomTransform, type EmbedResolver, type EmbedResolverResult, type Enclosure, type EnrichEmbedFn, type MaybePromise, type ParamExtractorConfig, type ParseHtmlFn, type ResolveUrlFn, type StringTransform, type TransformContentOptions, type TransformContext, applyDomTransforms, applyEmbedMetadata, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBreaksToParagraphs, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode, injectEnclosures, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, paragraphizePlainText, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInterBlockBreaks, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, transformContent, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, 
unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, youtubeEmbedResolver, youtubeResolveEmbed };
package/dist/index.js CHANGED
@@ -1,5 +1,5 @@
1
1
  import { chooseBaseUrl, coerceNumber, createParamExtractor } from "./utils.js";
2
- import { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder, expandSvgSelfClose, parseFragment, stripOversizedBase64Sources, transformHtml } from "./common.js";
2
+ import { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder } from "./common.js";
3
3
  import { composeThumbnailUrl, extractVideoId, youtubeEmbedResolver, youtubeResolveEmbed } from "./embeds/youtube.js";
4
4
  import { convertBreaksToParagraphs } from "./transforms/dom/convertBreaksToParagraphs.js";
5
5
  import { decodeDoubleEncodedTags } from "./transforms/dom/decodeDoubleEncodedTags.js";
@@ -26,6 +26,7 @@ import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedList
26
26
  import { extractRedirectTarget, unwrapRedirectUrls } from "./transforms/dom/unwrapRedirectUrls.js";
27
27
  import { unwrapWrappers } from "./transforms/dom/unwrapWrappers.js";
28
28
  import { paragraphizePlainText } from "./transforms/string/paragraphizePlainText.js";
29
+ import { stripOversizedBase64Sources } from "./transforms/string/stripOversizedBase64Sources.js";
29
30
  import { unwrapCdataComments } from "./transforms/string/unwrapCdataComments.js";
30
31
  import { unwrapBing } from "./unwraps/bing.js";
31
32
  import { unwrapFacebookShim } from "./unwraps/facebook.js";
@@ -39,7 +40,7 @@ import { unwrapRedditOut } from "./unwraps/redditOut.js";
39
40
  import { unwrapVkAway } from "./unwraps/vkAway.js";
40
41
  import { unwrapYahooSearch } from "./unwraps/yahooSearch.js";
41
42
  import { unwrapYouTube } from "./unwraps/youtube.js";
42
- import { defaultDomTransforms, defaultEmbedResolvers, defaultFinalStringTransforms, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers } from "./defaults.js";
43
+ import { defaultDomTransforms, defaultEmbedResolvers, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers } from "./defaults.js";
43
44
  import { enrichEmbedPlaceholders } from "./transforms/dom/enrichEmbedPlaceholders.js";
44
45
  import { unwrapAceml } from "./unwraps/aceml.js";
45
46
  import { unwrapAdjust } from "./unwraps/adjust.js";
@@ -104,7 +105,7 @@ import { unwrapWebArchive } from "./unwraps/webArchive.js";
104
105
  import { unwrapYandexTurbo } from "./unwraps/yandexTurbo.js";
105
106
  import { unwrapZhihu } from "./unwraps/zhihu.js";
106
107
  //#region src/index.ts
107
- const transformContent = async (html, options = {}) => {
108
+ const transformContent = async (html, options) => {
108
109
  const context = {
109
110
  baseUrl: options.baseUrl,
110
111
  enclosures: options.enclosures,
@@ -121,8 +122,8 @@ const transformContent = async (html, options = {}) => {
121
122
  };
122
123
  const stringFns = options.stringTransforms ?? defaultStringTransforms;
123
124
  const domFns = options.domTransforms ?? defaultDomTransforms;
124
- const finalFns = options.finalStringTransforms ?? defaultFinalStringTransforms;
125
- return await applyStringTransforms(await applyDomTransforms(await applyStringTransforms(html, stringFns.map((transform) => transform(context))), domFns.map((transform) => transform(context))), finalFns.map((transform) => transform(context)));
125
+ const afterString = await applyStringTransforms(html, stringFns.map((transform) => transform(context)));
126
+ return await applyDomTransforms(await options.parseHtmlFn(afterString), domFns.map((transform) => transform(context)));
126
127
  };
127
128
  //#endregion
128
- export { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBreaksToParagraphs, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, enrichEmbedPlaceholders, expandSvgSelfClose, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode, injectEnclosures, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, paragraphizePlainText, parseFragment, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInterBlockBreaks, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, transformContent, transformHtml, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, 
unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, youtubeEmbedResolver, youtubeResolveEmbed };
129
+ export { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBreaksToParagraphs, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode, injectEnclosures, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, paragraphizePlainText, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInterBlockBreaks, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, transformContent, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, 
unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, youtubeEmbedResolver, youtubeResolveEmbed };
@@ -0,0 +1,4 @@
1
+ //#region src/parsers/linkedom.d.ts
2
+ declare const parseHtml: (html: string) => Document;
3
+ //#endregion
4
+ export { parseHtml };
@@ -0,0 +1,38 @@
1
+ import { parseHTML } from "linkedom";
2
+ //#region src/parsers/linkedom.ts
3
+ const normalizeAttributeCase = (document) => {
4
+ for (const element of document.querySelectorAll("*")) {
5
+ const original = Array.from(element.attributes).map((attribute) => ({
6
+ name: attribute.name,
7
+ value: attribute.value
8
+ }));
9
+ const final = /* @__PURE__ */ new Map();
10
+ let needsRewrite = false;
11
+ for (const { name, value } of original) {
12
+ const lower = name.toLowerCase();
13
+ if (lower !== name) needsRewrite = true;
14
+ if (final.has(lower)) {
15
+ needsRewrite = true;
16
+ continue;
17
+ }
18
+ final.set(lower, value);
19
+ }
20
+ if (!needsRewrite) continue;
21
+ for (const { name } of original) element.removeAttribute(name);
22
+ for (const [name, value] of final) element.setAttribute(name, value);
23
+ }
24
+ };
25
+ const svgRegionRegex = /<svg\b[^>]*>[\s\S]*?<\/svg>/gi;
26
+ const svgSelfCloseRegex = /<([a-z][a-z0-9-]*)((?:\s[^>]*)?)\s*\/>/gi;
27
+ const expandSvgSelfClose = (html) => {
28
+ return html.replace(svgRegionRegex, (svgBlock) => {
29
+ return svgBlock.replace(svgSelfCloseRegex, "<$1$2></$1>");
30
+ });
31
+ };
32
+ const parseHtml = (html) => {
33
+ const { document } = parseHTML(`<!doctype html><html><head></head><body>${expandSvgSelfClose(html)}</body></html>`);
34
+ normalizeAttributeCase(document);
35
+ return document;
36
+ };
37
+ //#endregion
38
+ export { parseHtml };
@@ -1,4 +1,3 @@
1
- import { normalizeAttributeCase } from "../../common.js";
2
1
  //#region src/transforms/dom/fixLazyImages.ts
3
2
  const imgPattern = /<img\s/i;
4
3
  const urlShapeRegex = /[:/.]/;
@@ -45,7 +44,6 @@ const fixLazyImages = (context) => {
45
44
  }
46
45
  }
47
46
  const noscripts = document.querySelectorAll("noscript");
48
- let replacedNoscript = false;
49
47
  for (const noscript of noscripts) {
50
48
  const sibling = noscript.previousElementSibling;
51
49
  if (sibling?.localName !== "img") continue;
@@ -53,9 +51,7 @@ const fixLazyImages = (context) => {
53
51
  if (!imgPattern.test(inner)) continue;
54
52
  sibling.remove();
55
53
  noscript.outerHTML = inner;
56
- replacedNoscript = true;
57
54
  }
58
- if (replacedNoscript) normalizeAttributeCase(document);
59
55
  };
60
56
  };
61
57
  //#endregion
@@ -0,0 +1,6 @@
1
+ import { StringTransform } from "../../types.js";
2
+
3
+ //#region src/transforms/string/stripOversizedBase64Sources.d.ts
4
+ declare const stripOversizedBase64Sources: StringTransform;
5
+ //#endregion
6
+ export { stripOversizedBase64Sources };
@@ -0,0 +1,13 @@
1
+ //#region src/transforms/string/stripOversizedBase64Sources.ts
2
+ const base64SrcRegex = /((?:src|srcset|poster)=["'])data:[^"']*;base64,[^"']*(["'])/g;
3
+ const maxBase64Size = 50 * 1024;
4
+ const stripOversizedBase64Sources = () => {
5
+ return (html) => {
6
+ return html.replace(base64SrcRegex, (match, prefix, suffix) => {
7
+ if (match.length < maxBase64Size) return match;
8
+ return `${prefix}${suffix}`;
9
+ });
10
+ };
11
+ };
12
+ //#endregion
13
+ export { stripOversizedBase64Sources };
package/dist/types.d.ts CHANGED
@@ -60,7 +60,9 @@ type TransformContext = {
60
60
  };
61
61
  type DomTransform = (context: TransformContext) => (document: Document) => MaybePromise<void>;
62
62
  type StringTransform = (context: TransformContext) => (html: string) => MaybePromise<string>;
63
+ type ParseHtmlFn = (html: string) => MaybePromise<Document>;
63
64
  type TransformContentOptions = {
65
+ parseHtmlFn: ParseHtmlFn;
64
66
  baseUrl?: string;
65
67
  enclosures?: Array<Enclosure>;
66
68
  embedResolvers?: Array<EmbedResolver>;
@@ -75,7 +77,6 @@ type TransformContentOptions = {
75
77
  articleTitle?: string;
76
78
  stringTransforms?: Array<StringTransform>;
77
79
  domTransforms?: Array<DomTransform>;
78
- finalStringTransforms?: Array<StringTransform>;
79
80
  };
80
81
  //#endregion
81
- export { AssetProxyFn, AssetType, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext, UrlUnwrapper };
82
+ export { AssetProxyFn, AssetType, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ParseHtmlFn, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext, UrlUnwrapper };
package/package.json CHANGED
@@ -29,6 +29,10 @@
29
29
  "./defaults": {
30
30
  "types": "./dist/defaults.d.ts",
31
31
  "default": "./dist/defaults.js"
32
+ },
33
+ "./linkedom": {
34
+ "types": "./dist/parsers/linkedom.d.ts",
35
+ "default": "./dist/parsers/linkedom.js"
32
36
  }
33
37
  },
34
38
  "files": [
@@ -36,23 +40,29 @@
36
40
  ],
37
41
  "scripts": {
38
42
  "prepare": "lefthook install",
39
- "build": "tsdown src/index.ts src/defaults.ts --format esm --dts --clean --unbundle --no-fixed-extension"
43
+ "build": "tsdown src/index.ts src/defaults.ts src/parsers/linkedom.ts --format esm --dts --clean --unbundle --no-fixed-extension"
40
44
  },
41
45
  "dependencies": {
42
46
  "@wordpress/autop": "^4.46.0",
43
47
  "highlight.js": "^11.11.1",
44
- "linkedom": "^0.18.12",
45
48
  "linkifyjs": "^4.3.2",
46
49
  "srcset": "^5.0.3"
47
50
  },
48
51
  "peerDependencies": {
49
52
  "feedcanon": "^2.0.0-next.3",
50
- "feedscout": "^2.0.0-next.2"
53
+ "feedscout": "^2.0.0-next.2",
54
+ "linkedom": "^0.18.12"
55
+ },
56
+ "peerDependenciesMeta": {
57
+ "linkedom": {
58
+ "optional": true
59
+ }
51
60
  },
52
61
  "devDependencies": {
53
62
  "@types/bun": "^1.3.13",
54
63
  "kvalita": "^1.13.0",
64
+ "linkedom": "^0.18.12",
55
65
  "tsdown": "^0.22.0"
56
66
  },
57
- "version": "1.1.0"
67
+ "version": "1.2.0"
58
68
  }