feedsweep 1.2.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -41,13 +41,17 @@ Inventory of every transform exported from the package. Most are enabled by defa
41
41
  | `stripInterBlockBreaks` | Remove `<br>` tags between block elements |
42
42
  | `stripParagraphBoundaryBreaks` | Remove `<br>` tags adjacent to paragraph boundaries |
43
43
  | `stripDuplicateTitleHeading` | Remove first `<h1>`–`<h6>` matching article title |
44
+ | `demoteHeadings` | Shift every heading down by one level (`<h1>`→`<h2>`, …, `<h5>`→`<h6>`) when the body contains an `<h1>`, so that the article's headings sit below the reader's own page title |
44
45
  | `unwrapRedirectUrls` | Remove Google/Bing/Facebook/etc. redirect wrappers |
45
46
  | `stripDeadAnchors` | Unwrap `<a>` with empty, `#`, or `javascript:` href |
47
+ | `stripInertElements` | Remove platform chrome and dead placeholders — subscribe widgets, share buttons, related-posts widgets, ad slots (AdSense / AdThrive), author bio blocks, email preheaders, Substack image controls, and Drupal `<drupal-render-placeholder>` tags. Pass `inertSelectors` to extend or replace |
46
48
  | `removeTrackingPixels` | Strip 1×1 tracking pixel images |
49
+ | `unwrapEmojiImages` | Replace WordPress/Facebook/Twitter/GitHub emoji `<img>` tags with their alt-text glyph |
47
50
  | `stripTrackingParams` | Remove UTM and other tracking parameters |
48
51
  | `convertBreaksToParagraphs` | Convert `<br><br>` runs into semantic `<p>` blocks |
49
52
  | `injectEnclosures` | Inject feed enclosures into content as native `<audio>`/`<video>` or iframe placeholders |
50
53
  | `replaceEmbedsWithPlaceholders` | Convert `<iframe>` to embed placeholders |
54
+ | `convertBookmarkCards` | Convert link-preview cards into `data-bookmark-*` placeholders via a registry of per-provider `BookmarkResolver`s (`defaultBookmarkResolvers`: Ghost `kg-bookmark-card`, Substack `embedded-publication-wrap`). Extend via `bookmarkResolvers` |
51
55
  | `enrichEmbedPlaceholders` | Populate placeholder metadata (`title`, `description`, `duration`, etc.) via a caller-supplied async fn. Opt-in; not in defaults |
52
56
  | `proxyAssetUrls` | Rewrite image, video, and audio URLs through a caller-supplied proxy |
53
57
  | `resolveRelativeUrls` | Convert relative URLs to absolute using base URL |
@@ -62,6 +66,7 @@ Inventory of every transform exported from the package. Most are enabled by defa
62
66
  | `stripEmptyTags` | Remove empty `<p>`, `<div>`, `<span>` and other tags |
63
67
  | `stripComments` | Remove HTML `<!-- comments -->` |
64
68
  | `unwrapCdataComments` | Strip malformed `<!--[CDATA[ … ]]-->` wrappers before parsing so the wrapped article reaches the DOM as real HTML |
69
+ | `stripControlChars` | Strip rendering-hostile control characters (NUL, BEL, ESC, DEL, C1 range) before parsing. Preserves tab / LF / CR |
65
70
 
66
71
  ## Options
67
72
 
@@ -91,7 +96,7 @@ The `stringTransforms` and `domTransforms` options each fully replace the corres
91
96
 
92
97
  ## DOM library
93
98
 
94
- Feedsweep is parser-agnostic. You provide `parseHtmlFn` — a function that turns an HTML string into a `Document`. Use any DOM library that produces a standards-compliant `Document`.
99
+ Feedsweep is parser-agnostic. You provide `parseHtmlFn` — a function that turns an HTML string into a `Document`. Use any DOM library that produces a standards-compliant `Document`. The test suite runs the full pipeline against both linkedom and jsdom.
95
100
 
96
101
  ```typescript
97
102
  // linkedom (recommended default)
@@ -0,0 +1,6 @@
1
+ import { BookmarkResolver } from "../types.js";
2
+
3
+ //#region src/bookmarks/ghost.d.ts
4
+ declare const ghostBookmarkResolver: BookmarkResolver;
5
+ //#endregion
6
+ export { ghostBookmarkResolver };
@@ -0,0 +1,21 @@
1
+ //#region src/bookmarks/ghost.ts
2
+ const ghostBookmarkResolver = {
3
+ selector: ".kg-bookmark-card",
4
+ extract: (element) => {
5
+ const url = element.querySelector("a.kg-bookmark-container")?.getAttribute("href") ?? void 0;
6
+ const title = element.querySelector(".kg-bookmark-title")?.textContent?.trim();
7
+ if (!url || !title) return;
8
+ return {
9
+ provider: "ghost",
10
+ url,
11
+ title,
12
+ description: element.querySelector(".kg-bookmark-description")?.textContent?.trim(),
13
+ author: element.querySelector(".kg-bookmark-author")?.textContent?.trim(),
14
+ publisher: element.querySelector(".kg-bookmark-publisher")?.textContent?.trim(),
15
+ icon: element.querySelector("img.kg-bookmark-icon")?.getAttribute("src") ?? void 0,
16
+ thumbnail: element.querySelector(".kg-bookmark-thumbnail img")?.getAttribute("src") ?? void 0
17
+ };
18
+ }
19
+ };
20
+ //#endregion
21
+ export { ghostBookmarkResolver };
@@ -0,0 +1,6 @@
1
+ import { BookmarkResolver } from "../types.js";
2
+
3
+ //#region src/bookmarks/substack.d.ts
4
+ declare const substackBookmarkResolver: BookmarkResolver;
5
+ //#endregion
6
+ export { substackBookmarkResolver };
@@ -0,0 +1,26 @@
1
+ //#region src/bookmarks/substack.ts
2
+ const parsePublicationAttrs = (raw) => {
3
+ if (!raw) return;
4
+ try {
5
+ return JSON.parse(raw);
6
+ } catch {}
7
+ };
8
+ const substackBookmarkResolver = {
9
+ selector: ".embedded-publication-wrap",
10
+ extract: (element) => {
11
+ const attrs = parsePublicationAttrs(element.getAttribute("data-attrs"));
12
+ const url = attrs?.base_url;
13
+ const title = attrs?.name?.trim();
14
+ if (!url || !title) return;
15
+ return {
16
+ provider: "substack",
17
+ url,
18
+ title,
19
+ description: attrs.hero_text?.trim(),
20
+ author: attrs.author_name?.trim(),
21
+ icon: attrs.logo_url
22
+ };
23
+ }
24
+ };
25
+ //#endregion
26
+ export { substackBookmarkResolver };
package/dist/common.d.ts CHANGED
@@ -1,11 +1,13 @@
1
- import { EmbedResolverResult, MaybePromise } from "./types.js";
1
+ import { BookmarkResolverResult, EmbedResolverResult, MaybePromise } from "./types.js";
2
2
 
3
3
  //#region src/common.d.ts
4
+ declare const isSafeThumbnailUrl: (url: string) => boolean;
4
5
  declare const applyDomTransforms: (document: Document, transforms: Array<(document: Document) => MaybePromise<void>>) => Promise<string>;
5
6
  declare const applyStringTransforms: (html: string, transforms: Array<(html: string) => MaybePromise<string>>) => Promise<string>;
6
- declare const applyEmbedMetadata: (element: HTMLElement, metadata: Partial<EmbedResolverResult>, options?: {
7
- setIfMissing?: boolean;
8
- }) => void;
7
+ declare const createPlaceholder: <Type extends object>(document: Document, type: string, fields: Type) => HTMLElement;
8
+ declare const normalizeEmbedFields: (metadata: Partial<EmbedResolverResult>) => Record<string, string | undefined>;
9
+ declare const updateEmbedPlaceholder: (element: HTMLElement, metadata: Partial<EmbedResolverResult>) => void;
9
10
  declare const createEmbedPlaceholder: (document: Document, src: string, metadata?: Partial<EmbedResolverResult>) => HTMLElement;
11
+ declare const createBookmarkPlaceholder: (document: Document, result: BookmarkResolverResult) => HTMLElement;
10
12
  //#endregion
11
- export { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder };
13
+ export { applyDomTransforms, applyStringTransforms, createBookmarkPlaceholder, createEmbedPlaceholder, createPlaceholder, isSafeThumbnailUrl, normalizeEmbedFields, updateEmbedPlaceholder };
package/dist/common.js CHANGED
@@ -1,5 +1,5 @@
1
1
  import { coerceNumber } from "./utils.js";
2
- import { resolveUrl } from "feedcanon";
2
+ import { resolveUrl, upgradeProtocol } from "feedcanon";
3
3
  //#region src/common.ts
4
4
  const Node = {
5
5
  ELEMENT_NODE: 1,
@@ -104,36 +104,61 @@ const getDimensions = (element) => {
104
104
  height: height ?? fromStyle(styleHeightRegex)
105
105
  };
106
106
  };
107
- const applyEmbedMetadata = (element, metadata, options) => {
108
- const setIfMissing = options?.setIfMissing ?? false;
109
- const set = (name, value) => {
110
- if (setIfMissing && element.hasAttribute(name)) return;
111
- element.setAttribute(name, value);
107
+ const createPlaceholder = (document, type, fields) => {
108
+ const element = document.createElement("div");
109
+ for (const [key, value] of Object.entries(fields)) if (value) element.setAttribute(`data-${type}-${key}`, value);
110
+ return element;
111
+ };
112
+ const normalizeEmbedFields = (metadata) => {
113
+ return {
114
+ src: metadata.src ? upgradeProtocol(metadata.src) : void 0,
115
+ provider: metadata.provider,
116
+ id: metadata.id,
117
+ url: metadata.url ? upgradeProtocol(metadata.url) : void 0,
118
+ thumbnail: metadata.thumbnail && isSafeThumbnailUrl(metadata.thumbnail) ? metadata.thumbnail : void 0,
119
+ width: metadata.width ? String(metadata.width) : void 0,
120
+ height: metadata.height ? String(metadata.height) : void 0,
121
+ title: metadata.title,
122
+ description: metadata.description,
123
+ author: metadata.author,
124
+ avatar: metadata.avatar && isSafeThumbnailUrl(metadata.avatar) ? metadata.avatar : void 0,
125
+ duration: metadata.duration ? String(metadata.duration) : void 0
112
126
  };
113
- if (metadata.provider) set("data-embed-provider", metadata.provider);
114
- if (metadata.id) set("data-embed-id", metadata.id);
115
- if (metadata.src) set("data-embed-src", metadata.src);
116
- if (metadata.url) set("data-embed-url", metadata.url);
117
- if (metadata.thumbnail && isSafeThumbnailUrl(metadata.thumbnail)) set("data-embed-thumbnail", metadata.thumbnail);
118
- if (metadata.width) set("data-embed-width", String(metadata.width));
119
- if (metadata.height) set("data-embed-height", String(metadata.height));
120
- if (metadata.title) set("data-embed-title", metadata.title);
121
- if (metadata.description) set("data-embed-description", metadata.description);
122
- if (metadata.author) set("data-embed-author", metadata.author);
123
- if (metadata.avatar && isSafeThumbnailUrl(metadata.avatar)) set("data-embed-avatar", metadata.avatar);
124
- if (metadata.duration) set("data-embed-duration", String(metadata.duration));
127
+ };
128
+ const updateEmbedPlaceholder = (element, metadata) => {
129
+ for (const [key, value] of Object.entries(normalizeEmbedFields(metadata))) {
130
+ const name = `data-embed-${key}`;
131
+ if (value && !element.hasAttribute(name)) element.setAttribute(name, value);
132
+ }
125
133
  };
126
134
  const createEmbedPlaceholder = (document, src, metadata) => {
127
- const element = document.createElement("div");
128
- element.setAttribute("data-embed", "iframe");
129
- element.setAttribute("data-embed-src", metadata?.src ?? src);
130
- if (metadata) applyEmbedMetadata(element, metadata);
131
- const fallbackUrl = metadata?.url ?? metadata?.src ?? src;
135
+ const element = createPlaceholder(document, "embed", normalizeEmbedFields({
136
+ ...metadata,
137
+ src: metadata?.src ?? src
138
+ }));
139
+ const fallbackUrl = upgradeProtocol(metadata?.url ?? metadata?.src ?? src);
132
140
  const link = document.createElement("a");
133
141
  link.setAttribute("href", fallbackUrl);
134
142
  link.textContent = fallbackUrl;
135
143
  element.appendChild(link);
136
144
  return element;
137
145
  };
146
/**
 * Turns a bookmark resolver result into a `data-bookmark-*` placeholder.
 *
 * The card URL is passed through `upgradeProtocol`; `icon` and `thumbnail`
 * are kept only when `isSafeThumbnailUrl` accepts them (and are then passed
 * through `upgradeProtocol` as well). Any additional fields on `result` are
 * forwarded unchanged as attributes. A fallback `<a>` pointing at the card
 * URL, labelled with the title, is appended as the placeholder's only child.
 *
 * @param document Document used to create the elements.
 * @param result Resolver output describing the bookmark card.
 * @returns The placeholder element, not yet attached to the document.
 */
const createBookmarkPlaceholder = (document, result) => {
  const { provider, title, url, icon, thumbnail, ...extraFields } = result;
  const href = upgradeProtocol(url);

  // Image URLs are dropped entirely unless they pass the safety check.
  const sanitizeImage = (candidate) => {
    if (!candidate || !isSafeThumbnailUrl(candidate)) return void 0;
    return upgradeProtocol(candidate);
  };
  const safeIcon = sanitizeImage(icon);
  const safeThumbnail = sanitizeImage(thumbnail);

  // Key order is kept as provider, extras, url, title, icon, thumbnail,
  // because it determines the order of the emitted attributes.
  const placeholder = createPlaceholder(document, "bookmark", {
    provider,
    ...extraFields,
    url: href,
    title,
    icon: safeIcon,
    thumbnail: safeThumbnail
  });

  const anchor = document.createElement("a");
  anchor.setAttribute("href", href);
  anchor.textContent = title;
  placeholder.appendChild(anchor);
  return placeholder;
};
138
163
  //#endregion
139
- export { Node, NodeFilter, applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder, getDimensions, hasAncestorWithTagName, isBlockElement, isBr, isSkippable, isWhitespaceText };
164
+ export { Node, NodeFilter, applyDomTransforms, applyStringTransforms, createBookmarkPlaceholder, createEmbedPlaceholder, createPlaceholder, getDimensions, hasAncestorWithTagName, isBlockElement, isBr, isSafeThumbnailUrl, isSkippable, isWhitespaceText, normalizeEmbedFields, updateEmbedPlaceholder };
@@ -1,14 +1,18 @@
1
- import { DomTransform, EmbedResolver, ResolveUrlFn, StringTransform, UrlUnwrapper } from "./types.js";
1
+ import { BookmarkResolver, DomTransform, EmbedResolver, ResolveUrlFn, StringTransform, UrlUnwrapper } from "./types.js";
2
2
 
3
3
  //#region src/defaults.d.ts
4
4
  declare const defaultStringTransforms: Array<StringTransform>;
5
5
  declare const defaultDomTransforms: Array<DomTransform>;
6
6
  declare const defaultEmbedResolvers: Array<EmbedResolver>;
7
+ declare const defaultBookmarkResolvers: Array<BookmarkResolver>;
7
8
  declare const defaultResolveUrlFn: ResolveUrlFn;
8
9
  declare const defaultLazySrcAttributes: string[];
9
10
  declare const defaultLazySrcsetAttributes: string[];
10
11
  declare const defaultTrackingHosts: string[];
11
12
  declare const defaultTrackingPathSegments: string[];
13
+ declare const defaultEmojiImageHosts: string[];
14
+ declare const defaultPreservedPreClasses: string[];
15
+ declare const defaultInertSelectors: string[];
12
16
  declare const defaultUrlUnwrappers: Array<UrlUnwrapper>;
13
17
  //#endregion
14
- export { defaultDomTransforms, defaultEmbedResolvers, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
18
+ export { defaultBookmarkResolvers, defaultDomTransforms, defaultEmbedResolvers, defaultEmojiImageHosts, defaultInertSelectors, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultPreservedPreClasses, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
package/dist/defaults.js CHANGED
@@ -1,6 +1,10 @@
1
+ import { ghostBookmarkResolver } from "./bookmarks/ghost.js";
2
+ import { substackBookmarkResolver } from "./bookmarks/substack.js";
1
3
  import { youtubeEmbedResolver } from "./embeds/youtube.js";
4
+ import { convertBookmarkCards } from "./transforms/dom/convertBookmarkCards.js";
2
5
  import { convertBreaksToParagraphs } from "./transforms/dom/convertBreaksToParagraphs.js";
3
6
  import { decodeDoubleEncodedTags } from "./transforms/dom/decodeDoubleEncodedTags.js";
7
+ import { demoteHeadings } from "./transforms/dom/demoteHeadings.js";
4
8
  import { fixLazyImages } from "./transforms/dom/fixLazyImages.js";
5
9
  import { highlightCode } from "./transforms/dom/highlightCode.js";
6
10
  import { injectEnclosures } from "./transforms/dom/injectEnclosures.js";
@@ -16,14 +20,17 @@ import { stripComments } from "./transforms/dom/stripComments.js";
16
20
  import { stripDeadAnchors } from "./transforms/dom/stripDeadAnchors.js";
17
21
  import { stripDuplicateTitleHeading } from "./transforms/dom/stripDuplicateTitleHeading.js";
18
22
  import { stripEmptyTags } from "./transforms/dom/stripEmptyTags.js";
23
+ import { stripInertElements } from "./transforms/dom/stripInertElements.js";
19
24
  import { stripInterBlockBreaks } from "./transforms/dom/stripInterBlockBreaks.js";
20
25
  import { stripParagraphBoundaryBreaks } from "./transforms/dom/stripParagraphBoundaryBreaks.js";
21
26
  import { stripTrackingParams } from "./transforms/dom/stripTrackingParams.js";
22
27
  import { trimPreWhitespace } from "./transforms/dom/trimPreWhitespace.js";
23
28
  import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedLists.js";
29
+ import { unwrapEmojiImages } from "./transforms/dom/unwrapEmojiImages.js";
24
30
  import { unwrapRedirectUrls } from "./transforms/dom/unwrapRedirectUrls.js";
25
31
  import { unwrapWrappers } from "./transforms/dom/unwrapWrappers.js";
26
32
  import { paragraphizePlainText } from "./transforms/string/paragraphizePlainText.js";
33
+ import { stripControlChars } from "./transforms/string/stripControlChars.js";
27
34
  import { stripOversizedBase64Sources } from "./transforms/string/stripOversizedBase64Sources.js";
28
35
  import { unwrapCdataComments } from "./transforms/string/unwrapCdataComments.js";
29
36
  import { unwrapBing } from "./unwraps/bing.js";
@@ -41,6 +48,7 @@ import { unwrapYouTube } from "./unwraps/youtube.js";
41
48
  import { resolveUrl } from "feedcanon";
42
49
  //#region src/defaults.ts
43
50
  const defaultStringTransforms = [
51
+ stripControlChars,
44
52
  stripOversizedBase64Sources,
45
53
  unwrapCdataComments,
46
54
  paragraphizePlainText
@@ -50,12 +58,16 @@ const defaultDomTransforms = [
50
58
  stripComments,
51
59
  unwrapDoublyNestedLists,
52
60
  stripDuplicateTitleHeading,
61
+ demoteHeadings,
53
62
  fixLazyImages,
63
+ stripInertElements,
54
64
  resolveRelativeUrls,
55
65
  unwrapRedirectUrls,
56
66
  stripDeadAnchors,
57
67
  stripTrackingParams,
68
+ convertBookmarkCards,
58
69
  removeTrackingPixels,
70
+ unwrapEmojiImages,
59
71
  convertBreaksToParagraphs,
60
72
  stripInterBlockBreaks,
61
73
  stripParagraphBoundaryBreaks,
@@ -72,6 +84,7 @@ const defaultDomTransforms = [
72
84
  stripEmptyTags
73
85
  ];
74
86
  const defaultEmbedResolvers = [youtubeEmbedResolver];
87
// Bookmark resolvers used when the caller does not pass `bookmarkResolvers`
// (transformContent falls back to this list): Ghost and Substack cards.
+ const defaultBookmarkResolvers = [ghostBookmarkResolver, substackBookmarkResolver];
75
88
  const defaultResolveUrlFn = (url, baseUrl) => resolveUrl(url, baseUrl);
76
89
  const defaultLazySrcAttributes = [
77
90
  "data-src",
@@ -142,6 +155,40 @@ const defaultTrackingPathSegments = [
142
155
  "count",
143
156
  "impression"
144
157
  ];
158
// Fallback for the `emojiImageHosts` option (transformContent uses
// `options.emojiImageHosts ?? defaultEmojiImageHosts`).
// Despite the name, every entry is a host followed by a path prefix.
// NOTE(review): how entries are matched against image URLs is decided by
// unwrapEmojiImages, which is not shown here — confirm before adding entries.
+ const defaultEmojiImageHosts = [
159
+ "s.w.org/images/core/emoji/",
160
+ "s0.wp.com/wp-content/mu-plugins/wpcom-smileys/",
161
+ "fbcdn.net/images/emoji.php/",
162
+ "abs.twimg.com/emoji/",
163
+ "githubassets.com/images/icons/emoji/"
164
+ ];
165
// Fallback for the `preservedPreClasses` option (transformContent uses
// `options.preservedPreClasses ?? defaultPreservedPreClasses`).
// NOTE(review): presumably class names of <pre> blocks that pre-related
// transforms must leave untouched; the consumer is not visible here — confirm.
+ const defaultPreservedPreClasses = ["wp-block-verse", "wp-block-preformatted"];
166
// Fallback for the `inertSelectors` option (transformContent uses
// `options.inertSelectors ?? defaultInertSelectors`). Each entry is a CSS
// selector.
// NOTE(review): `.embedded-publication-wrap` is also the selector of
// substackBookmarkResolver, and defaultDomTransforms lists stripInertElements
// before convertBookmarkCards. If stripInertElements removes every match,
// the default pipeline presumably drops Substack publication cards before
// they can be converted into bookmark placeholders — confirm against
// stripInertElements and either reorder the transforms or drop the selector.
+ const defaultInertSelectors = [
167
+ ".image-link-expand",
168
+ "[data-component-name=\"SubscribeWidget\"]",
169
+ ".subscription-widget-wrap-editor",
170
+ "drupal-render-placeholder",
171
+ ".adsbygoogle",
172
+ ".embedded-publication-wrap",
173
+ ".yarpp-related",
174
+ ".sharethis-inline-share-buttons",
175
+ ".sharedaddy",
176
+ ".wp-block-jetpack-subscriptions",
177
+ ".wp-block-post-author",
178
+ ".kg-signup-card",
179
+ ".mc4wp-form",
180
+ ".formkit-form",
181
+ ".mcnPreviewText",
182
+ ".saboxplugin-wrap",
183
+ ".addtoany_share_save_container",
184
+ "iframe[src*=\"embeds.beehiiv.com\"]",
185
+ ".jp-relatedposts",
186
+ ".adthrive-ad",
187
+ ".jetpack_subscription_widget",
188
+ ".crp_related",
189
+ "form[action*=\"buttondown.email\"]",
190
+ ".sqs-block-newsletter"
191
+ ];
145
192
  const defaultUrlUnwrappers = [
146
193
  unwrapBing,
147
194
  unwrapGoogle,
@@ -157,4 +204,4 @@ const defaultUrlUnwrappers = [
157
204
  unwrapRedditOut
158
205
  ];
159
206
  //#endregion
160
- export { defaultDomTransforms, defaultEmbedResolvers, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
207
+ export { defaultBookmarkResolvers, defaultDomTransforms, defaultEmbedResolvers, defaultEmojiImageHosts, defaultInertSelectors, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultPreservedPreClasses, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers };
@@ -33,7 +33,7 @@ const youtubeResolveEmbed = (url) => {
33
33
  return {
34
34
  provider: "youtube",
35
35
  id: videoId,
36
- src: `https://www.youtube-nocookie.com/embed/${videoId}`,
36
+ src: `https://www.youtube.com/embed/${videoId}`,
37
37
  url: `https://www.youtube.com/watch?v=${videoId}`,
38
38
  thumbnail: composeThumbnailUrl(videoId)
39
39
  };
package/dist/index.d.ts CHANGED
@@ -1,9 +1,13 @@
1
- import { AssetProxyFn, AssetType, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ParseHtmlFn, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext } from "./types.js";
1
+ import { AssetProxyFn, AssetType, BookmarkResolver, BookmarkResolverResult, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ParseHtmlFn, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext } from "./types.js";
2
2
  import { defaultResolveUrlFn } from "./defaults.js";
3
- import { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder } from "./common.js";
3
+ import { ghostBookmarkResolver } from "./bookmarks/ghost.js";
4
+ import { substackBookmarkResolver } from "./bookmarks/substack.js";
5
+ import { applyDomTransforms, applyStringTransforms, createBookmarkPlaceholder, createEmbedPlaceholder, createPlaceholder, isSafeThumbnailUrl, normalizeEmbedFields, updateEmbedPlaceholder } from "./common.js";
4
6
  import { composeThumbnailUrl, extractVideoId, youtubeEmbedResolver, youtubeResolveEmbed } from "./embeds/youtube.js";
7
+ import { convertBookmarkCards } from "./transforms/dom/convertBookmarkCards.js";
5
8
  import { convertBreaksToParagraphs } from "./transforms/dom/convertBreaksToParagraphs.js";
6
9
  import { decodeDoubleEncodedTags } from "./transforms/dom/decodeDoubleEncodedTags.js";
10
+ import { demoteHeadings } from "./transforms/dom/demoteHeadings.js";
7
11
  import { enrichEmbedPlaceholders } from "./transforms/dom/enrichEmbedPlaceholders.js";
8
12
  import { fixLazyImages } from "./transforms/dom/fixLazyImages.js";
9
13
  import { detectLanguage, highlightCode } from "./transforms/dom/highlightCode.js";
@@ -20,14 +24,17 @@ import { stripComments } from "./transforms/dom/stripComments.js";
20
24
  import { stripDeadAnchors } from "./transforms/dom/stripDeadAnchors.js";
21
25
  import { stripDuplicateTitleHeading } from "./transforms/dom/stripDuplicateTitleHeading.js";
22
26
  import { stripEmptyTags } from "./transforms/dom/stripEmptyTags.js";
27
+ import { stripInertElements } from "./transforms/dom/stripInertElements.js";
23
28
  import { stripInterBlockBreaks } from "./transforms/dom/stripInterBlockBreaks.js";
24
29
  import { stripParagraphBoundaryBreaks } from "./transforms/dom/stripParagraphBoundaryBreaks.js";
25
30
  import { stripTrackingParams } from "./transforms/dom/stripTrackingParams.js";
26
31
  import { trimPreWhitespace } from "./transforms/dom/trimPreWhitespace.js";
27
32
  import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedLists.js";
33
+ import { unwrapEmojiImages } from "./transforms/dom/unwrapEmojiImages.js";
28
34
  import { extractRedirectTarget, unwrapRedirectUrls } from "./transforms/dom/unwrapRedirectUrls.js";
29
35
  import { unwrapWrappers } from "./transforms/dom/unwrapWrappers.js";
30
36
  import { paragraphizePlainText } from "./transforms/string/paragraphizePlainText.js";
37
+ import { stripControlChars } from "./transforms/string/stripControlChars.js";
31
38
  import { stripOversizedBase64Sources } from "./transforms/string/stripOversizedBase64Sources.js";
32
39
  import { unwrapCdataComments } from "./transforms/string/unwrapCdataComments.js";
33
40
  import { unwrapAceml } from "./unwraps/aceml.js";
@@ -109,4 +116,4 @@ import { ParamExtractorConfig, chooseBaseUrl, coerceNumber, createParamExtractor
109
116
  //#region src/index.d.ts
110
117
  declare const transformContent: (html: string, options: TransformContentOptions) => Promise<string>;
111
118
  //#endregion
112
- export { type AssetProxyFn, type AssetType, type DomTransform, type EmbedResolver, type EmbedResolverResult, type Enclosure, type EnrichEmbedFn, type MaybePromise, type ParamExtractorConfig, type ParseHtmlFn, type ResolveUrlFn, type StringTransform, type TransformContentOptions, type TransformContext, applyDomTransforms, applyEmbedMetadata, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBreaksToParagraphs, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode, injectEnclosures, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, paragraphizePlainText, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInterBlockBreaks, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, transformContent, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, 
unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, youtubeEmbedResolver, youtubeResolveEmbed };
119
+ export { type AssetProxyFn, type AssetType, type BookmarkResolver, type BookmarkResolverResult, type DomTransform, type EmbedResolver, type EmbedResolverResult, type Enclosure, type EnrichEmbedFn, type MaybePromise, type ParamExtractorConfig, type ParseHtmlFn, type ResolveUrlFn, type StringTransform, type TransformContentOptions, type TransformContext, applyDomTransforms, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBookmarkCards, convertBreaksToParagraphs, createBookmarkPlaceholder, createEmbedPlaceholder, createParamExtractor, createPlaceholder, decodeDoubleEncodedTags, defaultResolveUrlFn, demoteHeadings, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, ghostBookmarkResolver, highlightCode, injectEnclosures, isSafeThumbnailUrl, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, normalizeEmbedFields, paragraphizePlainText, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripControlChars, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInertElements, stripInterBlockBreaks, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, substackBookmarkResolver, transformContent, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapEmojiImages, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, 
unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, updateEmbedPlaceholder, youtubeEmbedResolver, youtubeResolveEmbed };
package/dist/index.js CHANGED
@@ -1,8 +1,12 @@
1
1
  import { chooseBaseUrl, coerceNumber, createParamExtractor } from "./utils.js";
2
- import { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, createEmbedPlaceholder } from "./common.js";
2
+ import { applyDomTransforms, applyStringTransforms, createBookmarkPlaceholder, createEmbedPlaceholder, createPlaceholder, isSafeThumbnailUrl, normalizeEmbedFields, updateEmbedPlaceholder } from "./common.js";
3
+ import { ghostBookmarkResolver } from "./bookmarks/ghost.js";
4
+ import { substackBookmarkResolver } from "./bookmarks/substack.js";
3
5
  import { composeThumbnailUrl, extractVideoId, youtubeEmbedResolver, youtubeResolveEmbed } from "./embeds/youtube.js";
6
+ import { convertBookmarkCards } from "./transforms/dom/convertBookmarkCards.js";
4
7
  import { convertBreaksToParagraphs } from "./transforms/dom/convertBreaksToParagraphs.js";
5
8
  import { decodeDoubleEncodedTags } from "./transforms/dom/decodeDoubleEncodedTags.js";
9
+ import { demoteHeadings } from "./transforms/dom/demoteHeadings.js";
6
10
  import { fixLazyImages } from "./transforms/dom/fixLazyImages.js";
7
11
  import { detectLanguage, highlightCode } from "./transforms/dom/highlightCode.js";
8
12
  import { injectEnclosures } from "./transforms/dom/injectEnclosures.js";
@@ -18,14 +22,17 @@ import { stripComments } from "./transforms/dom/stripComments.js";
18
22
  import { stripDeadAnchors } from "./transforms/dom/stripDeadAnchors.js";
19
23
  import { stripDuplicateTitleHeading } from "./transforms/dom/stripDuplicateTitleHeading.js";
20
24
  import { stripEmptyTags } from "./transforms/dom/stripEmptyTags.js";
25
+ import { stripInertElements } from "./transforms/dom/stripInertElements.js";
21
26
  import { stripInterBlockBreaks } from "./transforms/dom/stripInterBlockBreaks.js";
22
27
  import { stripParagraphBoundaryBreaks } from "./transforms/dom/stripParagraphBoundaryBreaks.js";
23
28
  import { stripTrackingParams } from "./transforms/dom/stripTrackingParams.js";
24
29
  import { trimPreWhitespace } from "./transforms/dom/trimPreWhitespace.js";
25
30
  import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedLists.js";
31
+ import { unwrapEmojiImages } from "./transforms/dom/unwrapEmojiImages.js";
26
32
  import { extractRedirectTarget, unwrapRedirectUrls } from "./transforms/dom/unwrapRedirectUrls.js";
27
33
  import { unwrapWrappers } from "./transforms/dom/unwrapWrappers.js";
28
34
  import { paragraphizePlainText } from "./transforms/string/paragraphizePlainText.js";
35
+ import { stripControlChars } from "./transforms/string/stripControlChars.js";
29
36
  import { stripOversizedBase64Sources } from "./transforms/string/stripOversizedBase64Sources.js";
30
37
  import { unwrapCdataComments } from "./transforms/string/unwrapCdataComments.js";
31
38
  import { unwrapBing } from "./unwraps/bing.js";
@@ -40,7 +47,7 @@ import { unwrapRedditOut } from "./unwraps/redditOut.js";
40
47
  import { unwrapVkAway } from "./unwraps/vkAway.js";
41
48
  import { unwrapYahooSearch } from "./unwraps/yahooSearch.js";
42
49
  import { unwrapYouTube } from "./unwraps/youtube.js";
43
- import { defaultDomTransforms, defaultEmbedResolvers, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers } from "./defaults.js";
50
+ import { defaultBookmarkResolvers, defaultDomTransforms, defaultEmbedResolvers, defaultEmojiImageHosts, defaultInertSelectors, defaultLazySrcAttributes, defaultLazySrcsetAttributes, defaultPreservedPreClasses, defaultResolveUrlFn, defaultStringTransforms, defaultTrackingHosts, defaultTrackingPathSegments, defaultUrlUnwrappers } from "./defaults.js";
44
51
  import { enrichEmbedPlaceholders } from "./transforms/dom/enrichEmbedPlaceholders.js";
45
52
  import { unwrapAceml } from "./unwraps/aceml.js";
46
53
  import { unwrapAdjust } from "./unwraps/adjust.js";
@@ -110,10 +117,14 @@ const transformContent = async (html, options) => {
110
117
  baseUrl: options.baseUrl,
111
118
  enclosures: options.enclosures,
112
119
  embedResolvers: options.embedResolvers ?? defaultEmbedResolvers,
120
+ bookmarkResolvers: options.bookmarkResolvers ?? defaultBookmarkResolvers,
113
121
  lazySrcAttributes: options.lazySrcAttributes ?? defaultLazySrcAttributes,
114
122
  lazySrcsetAttributes: options.lazySrcsetAttributes ?? defaultLazySrcsetAttributes,
115
123
  trackingHosts: options.trackingHosts ?? defaultTrackingHosts,
116
124
  trackingPathSegments: options.trackingPathSegments ?? defaultTrackingPathSegments,
125
+ emojiImageHosts: options.emojiImageHosts ?? defaultEmojiImageHosts,
126
+ inertSelectors: options.inertSelectors ?? defaultInertSelectors,
127
+ preservedPreClasses: options.preservedPreClasses ?? defaultPreservedPreClasses,
117
128
  urlUnwrappers: options.urlUnwrappers ?? defaultUrlUnwrappers,
118
129
  resolveUrlFn: options.resolveUrlFn ?? defaultResolveUrlFn,
119
130
  assetProxyFn: options.assetProxyFn,
@@ -126,4 +137,4 @@ const transformContent = async (html, options) => {
126
137
  return await applyDomTransforms(await options.parseHtmlFn(afterString), domFns.map((transform) => transform(context)));
127
138
  };
128
139
  //#endregion
129
- export { applyDomTransforms, applyEmbedMetadata, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBreaksToParagraphs, createEmbedPlaceholder, createParamExtractor, decodeDoubleEncodedTags, defaultResolveUrlFn, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, highlightCode, injectEnclosures, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, paragraphizePlainText, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInterBlockBreaks, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, transformContent, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, 
unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, youtubeEmbedResolver, youtubeResolveEmbed };
140
+ export { applyDomTransforms, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBookmarkCards, convertBreaksToParagraphs, createBookmarkPlaceholder, createEmbedPlaceholder, createParamExtractor, createPlaceholder, decodeDoubleEncodedTags, defaultResolveUrlFn, demoteHeadings, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, ghostBookmarkResolver, highlightCode, injectEnclosures, isSafeThumbnailUrl, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, normalizeEmbedFields, paragraphizePlainText, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripControlChars, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInertElements, stripInterBlockBreaks, stripOversizedBase64Sources, stripParagraphBoundaryBreaks, stripTrackingParams, substackBookmarkResolver, transformContent, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapEmojiImages, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, 
unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, updateEmbedPlaceholder, youtubeEmbedResolver, youtubeResolveEmbed };
@@ -0,0 +1,6 @@
1
+ import { DomTransform } from "../../types.js";
2
+
3
+ //#region src/transforms/dom/convertBookmarkCards.d.ts
4
+ declare const convertBookmarkCards: DomTransform;
5
+ //#endregion
6
+ export { convertBookmarkCards };
@@ -0,0 +1,14 @@
1
+ import { createBookmarkPlaceholder } from "../../common.js";
2
+ //#region src/transforms/dom/convertBookmarkCards.ts
3
/**
 * DOM transform factory that replaces link-preview cards with bookmark placeholders.
 *
 * Resolvers are taken from `context.bookmarkResolvers` and tried in array order.
 * For each resolver, every element matching `resolver.selector` is handed to
 * `resolver.extract`; a truthy result replaces the element with the node returned
 * by `createBookmarkPlaceholder(document, result)`, a falsy result leaves the
 * element where it is.
 *
 * @param context Transform context; only `bookmarkResolvers` is read.
 * @returns An async function that mutates the given document in place.
 */
const convertBookmarkCards = (context) => {
	const resolvers = context.bookmarkResolvers;
	return async (document) => {
		for (const resolver of resolvers) {
			const cards = document.querySelectorAll(resolver.selector);
			for (const card of cards) {
				// `extract` may return a value or a promise; awaiting covers both and
				// keeps the cards processed strictly one after another.
				const extracted = await resolver.extract(card);
				if (extracted) {
					card.replaceWith(createBookmarkPlaceholder(document, extracted));
				}
			}
		}
	};
};
13
+ //#endregion
14
+ export { convertBookmarkCards };
@@ -0,0 +1,6 @@
1
+ import { DomTransform } from "../../types.js";
2
+
3
+ //#region src/transforms/dom/demoteHeadings.d.ts
4
+ declare const demoteHeadings: DomTransform;
5
+ //#endregion
6
+ export { demoteHeadings };
@@ -0,0 +1,20 @@
1
+ //#region src/transforms/dom/demoteHeadings.ts
2
+ const headingSelector = "h1, h2, h3, h4, h5";
3
+ const demoteHeadings = () => {
4
+ return (document) => {
5
+ if (!document.querySelector("h1")) return;
6
+ const headings = document.querySelectorAll(headingSelector);
7
+ for (const heading of headings) {
8
+ const nextTagName = `h${Number(heading.tagName.slice(1)) + 1}`;
9
+ const replacement = document.createElement(nextTagName);
10
+ for (const name of heading.getAttributeNames().reverse()) {
11
+ const value = heading.getAttribute(name);
12
+ if (value !== null) replacement.setAttribute(name, value);
13
+ }
14
+ while (heading.firstChild) replacement.appendChild(heading.firstChild);
15
+ heading.replaceWith(replacement);
16
+ }
17
+ };
18
+ };
19
+ //#endregion
20
+ export { demoteHeadings };
@@ -1,4 +1,4 @@
1
- import { applyEmbedMetadata } from "../../common.js";
1
+ import { updateEmbedPlaceholder } from "../../common.js";
2
2
  //#region src/transforms/dom/enrichEmbedPlaceholders.ts
3
3
  const enrichEmbedPlaceholders = (context) => {
4
4
  const enrichEmbedFn = context.enrichEmbedFn;
@@ -24,7 +24,7 @@ const enrichEmbedPlaceholders = (context) => {
24
24
  for (let i = 0; i < count; i++) {
25
25
  const embed = embeds[i];
26
26
  const data = enriched.get(`${embed.provider}:${embed.id}`);
27
- if (data) applyEmbedMetadata(placeholders[i], data, { setIfMissing: true });
27
+ if (data) updateEmbedPlaceholder(placeholders[i], data);
28
28
  }
29
29
  };
30
30
  };
@@ -1,7 +1,15 @@
1
1
  //#region src/transforms/dom/mergeConsecutiveOneLinerPres.ts
2
2
  const trailingBrRegex = /<br\s*\/?>\s*$/i;
3
3
  const surroundingNewlinesRegex = /^\n+|\n+$/g;
4
- const mergeConsecutiveOneLinerPres = () => {
4
+ const classTokenSeparator = /\s+/;
5
+ const mergeConsecutiveOneLinerPres = ({ preservedPreClasses }) => {
6
+ const preservedSet = new Set(preservedPreClasses);
7
+ const isPreserved = (element) => {
8
+ const classAttribute = element.getAttribute("class");
9
+ if (!classAttribute) return false;
10
+ for (const token of classAttribute.split(classTokenSeparator)) if (preservedSet.has(token)) return true;
11
+ return false;
12
+ };
5
13
  return (document) => {
6
14
  const pres = document.querySelectorAll("pre");
7
15
  for (const pre of pres) {
@@ -20,6 +28,7 @@ const mergeConsecutiveOneLinerPres = () => {
20
28
  sibling = sibling.nextSibling;
21
29
  }
22
30
  if (run.length < 2) continue;
31
+ if (run.some(isPreserved)) continue;
23
32
  const isSingleLine = (element) => {
24
33
  return !element.innerHTML.replace(surroundingNewlinesRegex, "").includes("\n");
25
34
  };
@@ -1,5 +1,17 @@
1
1
  import { parseSrcset, stringifySrcset } from "srcset";
2
2
  //#region src/transforms/dom/proxyAssetUrls.ts
3
+ const proxyableSelectors = [
4
+ "img",
5
+ "video",
6
+ "audio",
7
+ "source",
8
+ "track",
9
+ "image",
10
+ "[data-embed-thumbnail]",
11
+ "[data-embed-avatar]",
12
+ "[data-bookmark-icon]",
13
+ "[data-bookmark-thumbnail]"
14
+ ];
3
15
  const sourceTypeFromParent = (element) => {
4
16
  const parent = element.parentElement?.localName;
5
17
  if (parent === "video") return "video";
@@ -30,7 +42,7 @@ const proxySrcset = (element, type, assetProxyFn) => {
30
42
  const proxyAssetUrls = ({ assetProxyFn }) => {
31
43
  if (!assetProxyFn) return () => {};
32
44
  return (document) => {
33
- const elements = document.querySelectorAll("img, video, audio, source, track, image, [data-embed-thumbnail], [data-embed-avatar]");
45
+ const elements = document.querySelectorAll(proxyableSelectors.join(", "));
34
46
  for (const element of elements) {
35
47
  switch (element.localName) {
36
48
  case "img":
@@ -57,6 +69,8 @@ const proxyAssetUrls = ({ assetProxyFn }) => {
57
69
  }
58
70
  if (element.hasAttribute("data-embed-thumbnail")) proxyAttribute(element, "data-embed-thumbnail", "image", assetProxyFn);
59
71
  if (element.hasAttribute("data-embed-avatar")) proxyAttribute(element, "data-embed-avatar", "image", assetProxyFn);
72
+ if (element.hasAttribute("data-bookmark-icon")) proxyAttribute(element, "data-bookmark-icon", "image", assetProxyFn);
73
+ if (element.hasAttribute("data-bookmark-thumbnail")) proxyAttribute(element, "data-bookmark-thumbnail", "image", assetProxyFn);
60
74
  }
61
75
  };
62
76
  };
@@ -0,0 +1,6 @@
1
+ import { DomTransform } from "../../types.js";
2
+
3
+ //#region src/transforms/dom/stripInertElements.d.ts
4
+ declare const stripInertElements: DomTransform;
5
+ //#endregion
6
+ export { stripInertElements };
@@ -0,0 +1,11 @@
1
+ //#region src/transforms/dom/stripInertElements.ts
2
+ const stripInertElements = ({ inertSelectors }) => {
3
+ const selector = inertSelectors.join(",");
4
+ return (document) => {
5
+ if (!selector) return;
6
+ const elements = document.querySelectorAll(selector);
7
+ for (const element of elements) element.remove();
8
+ };
9
+ };
10
+ //#endregion
11
+ export { stripInertElements };
@@ -0,0 +1,6 @@
1
+ import { DomTransform } from "../../types.js";
2
+
3
+ //#region src/transforms/dom/unwrapEmojiImages.d.ts
4
+ declare const unwrapEmojiImages: DomTransform;
5
+ //#endregion
6
+ export { unwrapEmojiImages };
@@ -0,0 +1,21 @@
1
+ //#region src/transforms/dom/unwrapEmojiImages.ts
2
+ const nonAsciiRegex = /[€-￿]/;
3
+ const asciiLetterRegex = /[a-zA-Z]/;
4
+ const isEmojiShapedAlt = (alt) => {
5
+ return nonAsciiRegex.test(alt) && !asciiLetterRegex.test(alt);
6
+ };
7
+ const unwrapEmojiImages = (context) => {
8
+ const selector = [
9
+ "img.wp-smiley[alt]",
10
+ "img.emoji[alt]",
11
+ ...context.emojiImageHosts.map((host) => `img[alt][src*="${host}"]`)
12
+ ].join(", ");
13
+ return (document) => {
14
+ for (const image of document.querySelectorAll(selector)) {
15
+ const alt = image.getAttribute("alt");
16
+ if (alt && isEmojiShapedAlt(alt)) image.replaceWith(document.createTextNode(alt));
17
+ }
18
+ };
19
+ };
20
+ //#endregion
21
+ export { unwrapEmojiImages };
@@ -7,9 +7,13 @@ const wrapperTags = new Set([
7
7
  "header",
8
8
  "footer"
9
9
  ]);
10
- const hasEmbedAttribute = (element) => {
10
+ const preservedPrefixes = ["data-embed", "data-bookmark"];
11
+ const hasPreservedAttribute = (element) => {
11
12
  const attributes = element.attributes;
12
- for (let i = 0, n = attributes.length; i < n; i++) if (attributes[i].name.startsWith("data-embed")) return true;
13
+ for (let i = 0, n = attributes.length; i < n; i++) {
14
+ const name = attributes[i].name;
15
+ for (const prefix of preservedPrefixes) if (name.startsWith(prefix)) return true;
16
+ }
13
17
  return false;
14
18
  };
15
19
  const unwrapWrappers = () => {
@@ -20,7 +24,7 @@ const unwrapWrappers = () => {
20
24
  if (!wrapperTags.has(element.localName)) continue;
21
25
  const parent = element.parentNode;
22
26
  if (!parent) continue;
23
- if (hasEmbedAttribute(element)) continue;
27
+ if (hasPreservedAttribute(element)) continue;
24
28
  while (element.firstChild) parent.insertBefore(element.firstChild, element);
25
29
  element.remove();
26
30
  }
@@ -0,0 +1,6 @@
1
+ import { StringTransform } from "../../types.js";
2
+
3
+ //#region src/transforms/string/stripControlChars.d.ts
4
+ declare const stripControlChars: StringTransform;
5
+ //#endregion
6
+ export { stripControlChars };
@@ -0,0 +1,21 @@
1
+ //#region src/transforms/string/stripControlChars.ts
2
/**
 * Matches every code point the transform removes:
 *  - C0 controls except tab (U+0009), line feed (U+000A) and carriage return (U+000D),
 *  - DEL and the C1 controls (U+007F–U+009F),
 *  - the noncharacters U+FDD0–U+FDEF, U+FFFE and U+FFFF,
 *  - the last two code points (…FFFE, …FFFF) of each of the planes 1 through 16.
 * Built from strings so the character class never contains raw control characters.
 */
const controlCharRegex = (() => {
	const members = [
		"\\x00-\\x08",
		"\\x0B\\x0C",
		"\\x0E-\\x1F",
		"\\x7F-\\x9F",
		"\\uFDD0-\\uFDEF",
		"\\uFFFE\\uFFFF"
	];
	for (let plane = 1; plane <= 16; plane += 1) {
		const prefix = plane.toString(16).toUpperCase();
		members.push(`\\u{${prefix}FFFE}\\u{${prefix}FFFF}`);
	}
	// The `u` flag is required for the `\u{…}` escapes above.
	return new RegExp("[" + members.join("") + "]", "gu");
})();
/**
 * String transform factory that deletes control characters and Unicode
 * noncharacters from the input while keeping tab, line feed and carriage return.
 *
 * @returns A function mapping an HTML string to the same string with every
 *   match of `controlCharRegex` removed.
 */
const stripControlChars = () => (html) => html.replace(controlCharRegex, "");
20
+ //#endregion
21
+ export { stripControlChars };
package/dist/types.d.ts CHANGED
@@ -41,6 +41,20 @@ type EmbedResolver = {
41
41
  selector: string;
42
42
  extract: (element: Element) => MaybePromise<EmbedResolverResult | undefined>;
43
43
  };
44
+ type BookmarkResolverResult = {
45
+ provider: string;
46
+ url: string;
47
+ title: string;
48
+ description?: string;
49
+ author?: string;
50
+ publisher?: string;
51
+ icon?: string;
52
+ thumbnail?: string;
53
+ };
54
+ type BookmarkResolver = {
55
+ selector: string;
56
+ extract: (element: Element) => MaybePromise<BookmarkResolverResult | undefined>;
57
+ };
44
58
  type UrlUnwrapper = (url: URL) => string | undefined;
45
59
  type AssetType = 'image' | 'video' | 'audio';
46
60
  type AssetProxyFn = (url: string, type: AssetType) => string | undefined;
@@ -48,10 +62,14 @@ type TransformContext = {
48
62
  baseUrl?: string;
49
63
  enclosures?: Array<Enclosure>;
50
64
  embedResolvers: Array<EmbedResolver>;
65
+ bookmarkResolvers: Array<BookmarkResolver>;
51
66
  lazySrcAttributes: Array<string>;
52
67
  lazySrcsetAttributes: Array<string>;
53
68
  trackingHosts: Array<string>;
54
69
  trackingPathSegments: Array<string>;
70
+ emojiImageHosts: Array<string>;
71
+ inertSelectors: Array<string>;
72
+ preservedPreClasses: Array<string>;
55
73
  urlUnwrappers: Array<UrlUnwrapper>;
56
74
  resolveUrlFn: ResolveUrlFn;
57
75
  assetProxyFn?: AssetProxyFn;
@@ -66,10 +84,14 @@ type TransformContentOptions = {
66
84
  baseUrl?: string;
67
85
  enclosures?: Array<Enclosure>;
68
86
  embedResolvers?: Array<EmbedResolver>;
87
+ bookmarkResolvers?: Array<BookmarkResolver>;
69
88
  lazySrcAttributes?: Array<string>;
70
89
  lazySrcsetAttributes?: Array<string>;
71
90
  trackingHosts?: Array<string>;
72
91
  trackingPathSegments?: Array<string>;
92
+ emojiImageHosts?: Array<string>;
93
+ inertSelectors?: Array<string>;
94
+ preservedPreClasses?: Array<string>;
73
95
  urlUnwrappers?: Array<UrlUnwrapper>;
74
96
  resolveUrlFn?: ResolveUrlFn;
75
97
  assetProxyFn?: AssetProxyFn;
@@ -79,4 +101,4 @@ type TransformContentOptions = {
79
101
  domTransforms?: Array<DomTransform>;
80
102
  };
81
103
  //#endregion
82
- export { AssetProxyFn, AssetType, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ParseHtmlFn, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext, UrlUnwrapper };
104
+ export { AssetProxyFn, AssetType, BookmarkResolver, BookmarkResolverResult, DomTransform, EmbedResolver, EmbedResolverResult, Enclosure, EnrichEmbedFn, MaybePromise, ParseHtmlFn, ResolveUrlFn, StringTransform, TransformContentOptions, TransformContext, UrlUnwrapper };
package/package.json CHANGED
@@ -40,16 +40,19 @@
40
40
  ],
41
41
  "scripts": {
42
42
  "prepare": "lefthook install",
43
- "build": "tsdown src/index.ts src/defaults.ts src/parsers/linkedom.ts --format esm --dts --clean --unbundle --no-fixed-extension"
43
+ "build": "tsdown src/index.ts src/defaults.ts src/parsers/linkedom.ts --format esm --dts --clean --unbundle --no-fixed-extension",
44
+ "test": "bun test",
45
+ "test:linkedom": "DOM_LIBRARY=linkedom bun test",
46
+ "test:jsdom": "DOM_LIBRARY=jsdom bun test"
44
47
  },
45
48
  "dependencies": {
46
- "@wordpress/autop": "^4.46.0",
49
+ "@wordpress/autop": "^4.47.0",
47
50
  "highlight.js": "^11.11.1",
48
51
  "linkifyjs": "^4.3.2",
49
52
  "srcset": "^5.0.3"
50
53
  },
51
54
  "peerDependencies": {
52
- "feedcanon": "^2.0.0-next.3",
55
+ "feedcanon": "^2.0.0-next.4",
53
56
  "feedscout": "^2.0.0-next.2",
54
57
  "linkedom": "^0.18.12"
55
58
  },
@@ -60,9 +63,11 @@
60
63
  },
61
64
  "devDependencies": {
62
65
  "@types/bun": "^1.3.13",
66
+ "@types/jsdom": "^28.0.3",
67
+ "jsdom": "^29.1.1",
63
68
  "kvalita": "^1.13.0",
64
69
  "linkedom": "^0.18.12",
65
- "tsdown": "^0.22.0"
70
+ "tsdown": "^0.22.1"
66
71
  },
67
- "version": "1.2.0"
72
+ "version": "2.0.1"
68
73
  }