feedsweep 2.0.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -2
- package/dist/common.js +14 -8
- package/dist/defaults.js +4 -2
- package/dist/embeds/youtube.js +1 -1
- package/dist/index.d.ts +3 -2
- package/dist/index.js +3 -2
- package/dist/transforms/dom/convertBreaksToParagraphs.js +3 -4
- package/dist/transforms/dom/decodeDoubleEncodedTags.js +5 -5
- package/dist/transforms/dom/linkifyUrls.js +4 -4
- package/dist/transforms/dom/markTimestamps.d.ts +7 -0
- package/dist/transforms/dom/markTimestamps.js +64 -0
- package/dist/transforms/dom/mergeConsecutiveOneLinerPres.js +3 -2
- package/dist/transforms/dom/mergeFragmentedLists.js +9 -12
- package/dist/transforms/dom/stripBoundaryBreaks.d.ts +6 -0
- package/dist/transforms/dom/{stripParagraphBoundaryBreaks.js → stripBoundaryBreaks.js} +25 -9
- package/dist/transforms/dom/stripEmptyTags.js +3 -4
- package/dist/transforms/dom/unwrapDoublyNestedLists.js +3 -3
- package/package.json +9 -4
- package/dist/transforms/dom/stripParagraphBoundaryBreaks.d.ts +0 -6
package/README.md
CHANGED
|
@@ -39,7 +39,7 @@ Inventory of every transform exported from the package. Most are enabled by defa
|
|
|
39
39
|
| `mergeConsecutiveOneLinerPres` | Merge consecutive single-line `<pre>` tags |
|
|
40
40
|
| `replacePreLineBreaks` | Replace `<br>` with `\n` inside `<pre>` |
|
|
41
41
|
| `stripInterBlockBreaks` | Remove `<br>` tags between block elements |
|
|
42
|
-
| `
|
|
42
|
+
| `stripBoundaryBreaks` | Remove `<br>` tags adjacent to block-element boundaries (paragraphs, headings, divs, list items, blockquotes, …) |
|
|
43
43
|
| `stripDuplicateTitleHeading` | Remove first `<h1>`–`<h6>` matching article title |
|
|
44
44
|
| `demoteHeadings` | Shift every heading down by one level (`<h1>`→`<h2>`, …, `<h5>`→`<h6>`) when the body contains an `<h1>`, so it sits below the reader's own page title |
|
|
45
45
|
| `unwrapRedirectUrls` | Remove Google/Bing/Facebook/etc. redirect wrappers |
|
|
@@ -61,6 +61,7 @@ Inventory of every transform exported from the package. Most are enabled by defa
|
|
|
61
61
|
| `paragraphizePlainText` | Wrap plain text in `<p>` tags |
|
|
62
62
|
| `stripOversizedBase64Sources` | Drop base64 `src`/`srcset`/`poster` payloads larger than 50 KB before parsing |
|
|
63
63
|
| `linkifyUrls` | Wrap bare URLs in `<a>` tags |
|
|
64
|
+
| `markTimestamps` | Wrap line-leading timestamps (`MM:SS` / `HH:MM:SS`) in `<span data-timestamp="seconds">` so a player can be seeked to that point |
|
|
64
65
|
| `trimPreWhitespace` | Remove common leading indentation from `<pre>` |
|
|
65
66
|
| `highlightCode` | Syntax-highlight `<code>` blocks with highlight.js |
|
|
66
67
|
| `stripEmptyTags` | Remove empty `<p>`, `<div>`, `<span>` and other tags |
|
|
@@ -96,7 +97,7 @@ The `stringTransforms` and `domTransforms` options each fully replace the corres
|
|
|
96
97
|
|
|
97
98
|
## DOM library
|
|
98
99
|
|
|
99
|
-
Feedsweep is parser-agnostic. You provide `parseHtmlFn` — a function that turns an HTML string into a `Document`. Use any DOM library that produces a standards-compliant `Document`.
|
|
100
|
+
Feedsweep is parser-agnostic. You provide `parseHtmlFn` — a function that turns an HTML string into a `Document`. Use any DOM library that produces a standards-compliant `Document`. The test suite runs the full pipeline against both linkedom and jsdom.
|
|
100
101
|
|
|
101
102
|
```typescript
|
|
102
103
|
// linkedom (recommended default)
|
package/dist/common.js
CHANGED
|
@@ -58,25 +58,31 @@ const blockElements = new Set([
|
|
|
58
58
|
"table",
|
|
59
59
|
"ul"
|
|
60
60
|
]);
|
|
61
|
-
const
|
|
62
|
-
return node
|
|
61
|
+
const isElement = (node) => {
|
|
62
|
+
return node?.nodeType === Node.ELEMENT_NODE;
|
|
63
63
|
};
|
|
64
|
-
const
|
|
65
|
-
return node
|
|
64
|
+
const isText = (node) => {
|
|
65
|
+
return node?.nodeType === Node.TEXT_NODE;
|
|
66
66
|
};
|
|
67
67
|
const isComment = (node) => {
|
|
68
|
-
return node
|
|
68
|
+
return node?.nodeType === Node.COMMENT_NODE;
|
|
69
|
+
};
|
|
70
|
+
const isWhitespaceText = (node) => {
|
|
71
|
+
return isText(node) && !node.textContent?.trim();
|
|
72
|
+
};
|
|
73
|
+
const isBr = (node) => {
|
|
74
|
+
return isElement(node) && node.localName === "br";
|
|
69
75
|
};
|
|
70
76
|
const isSkippable = (node) => {
|
|
71
77
|
return isWhitespaceText(node) || isBr(node) || isComment(node);
|
|
72
78
|
};
|
|
73
79
|
const isBlockElement = (node) => {
|
|
74
|
-
return node
|
|
80
|
+
return isElement(node) && blockElements.has(node.localName);
|
|
75
81
|
};
|
|
76
82
|
const hasAncestorWithTagName = (node, tagSet, stopAt) => {
|
|
77
83
|
let ancestor = node.parentNode;
|
|
78
84
|
while (ancestor !== null && ancestor !== stopAt) {
|
|
79
|
-
if (ancestor
|
|
85
|
+
if (isElement(ancestor) && tagSet.has(ancestor.localName)) return true;
|
|
80
86
|
ancestor = ancestor.parentNode;
|
|
81
87
|
}
|
|
82
88
|
return false;
|
|
@@ -161,4 +167,4 @@ const createBookmarkPlaceholder = (document, result) => {
|
|
|
161
167
|
return element;
|
|
162
168
|
};
|
|
163
169
|
//#endregion
|
|
164
|
-
export {
|
|
170
|
+
export { NodeFilter, applyDomTransforms, applyStringTransforms, createBookmarkPlaceholder, createEmbedPlaceholder, createPlaceholder, getDimensions, hasAncestorWithTagName, isBlockElement, isBr, isComment, isElement, isSafeThumbnailUrl, isSkippable, isText, isWhitespaceText, normalizeEmbedFields, updateEmbedPlaceholder };
|
package/dist/defaults.js
CHANGED
|
@@ -9,6 +9,7 @@ import { fixLazyImages } from "./transforms/dom/fixLazyImages.js";
|
|
|
9
9
|
import { highlightCode } from "./transforms/dom/highlightCode.js";
|
|
10
10
|
import { injectEnclosures } from "./transforms/dom/injectEnclosures.js";
|
|
11
11
|
import { linkifyUrls } from "./transforms/dom/linkifyUrls.js";
|
|
12
|
+
import { markTimestamps } from "./transforms/dom/markTimestamps.js";
|
|
12
13
|
import { mergeConsecutiveOneLinerPres } from "./transforms/dom/mergeConsecutiveOneLinerPres.js";
|
|
13
14
|
import { mergeFragmentedLists } from "./transforms/dom/mergeFragmentedLists.js";
|
|
14
15
|
import { proxyAssetUrls } from "./transforms/dom/proxyAssetUrls.js";
|
|
@@ -16,13 +17,13 @@ import { removeTrackingPixels } from "./transforms/dom/removeTrackingPixels.js";
|
|
|
16
17
|
import { replaceEmbedsWithPlaceholders } from "./transforms/dom/replaceEmbedsWithPlaceholders.js";
|
|
17
18
|
import { replacePreLineBreaks } from "./transforms/dom/replacePreLineBreaks.js";
|
|
18
19
|
import { resolveRelativeUrls } from "./transforms/dom/resolveRelativeUrls.js";
|
|
20
|
+
import { stripBoundaryBreaks } from "./transforms/dom/stripBoundaryBreaks.js";
|
|
19
21
|
import { stripComments } from "./transforms/dom/stripComments.js";
|
|
20
22
|
import { stripDeadAnchors } from "./transforms/dom/stripDeadAnchors.js";
|
|
21
23
|
import { stripDuplicateTitleHeading } from "./transforms/dom/stripDuplicateTitleHeading.js";
|
|
22
24
|
import { stripEmptyTags } from "./transforms/dom/stripEmptyTags.js";
|
|
23
25
|
import { stripInertElements } from "./transforms/dom/stripInertElements.js";
|
|
24
26
|
import { stripInterBlockBreaks } from "./transforms/dom/stripInterBlockBreaks.js";
|
|
25
|
-
import { stripParagraphBoundaryBreaks } from "./transforms/dom/stripParagraphBoundaryBreaks.js";
|
|
26
27
|
import { stripTrackingParams } from "./transforms/dom/stripTrackingParams.js";
|
|
27
28
|
import { trimPreWhitespace } from "./transforms/dom/trimPreWhitespace.js";
|
|
28
29
|
import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedLists.js";
|
|
@@ -70,13 +71,14 @@ const defaultDomTransforms = [
|
|
|
70
71
|
unwrapEmojiImages,
|
|
71
72
|
convertBreaksToParagraphs,
|
|
72
73
|
stripInterBlockBreaks,
|
|
73
|
-
|
|
74
|
+
stripBoundaryBreaks,
|
|
74
75
|
mergeFragmentedLists,
|
|
75
76
|
highlightCode,
|
|
76
77
|
mergeConsecutiveOneLinerPres,
|
|
77
78
|
replacePreLineBreaks,
|
|
78
79
|
trimPreWhitespace,
|
|
79
80
|
linkifyUrls,
|
|
81
|
+
markTimestamps,
|
|
80
82
|
replaceEmbedsWithPlaceholders,
|
|
81
83
|
injectEnclosures,
|
|
82
84
|
proxyAssetUrls,
|
package/dist/embeds/youtube.js
CHANGED
|
@@ -33,7 +33,7 @@ const youtubeResolveEmbed = (url) => {
|
|
|
33
33
|
return {
|
|
34
34
|
provider: "youtube",
|
|
35
35
|
id: videoId,
|
|
36
|
-
src: `https://www.youtube
|
|
36
|
+
src: `https://www.youtube.com/embed/${videoId}`,
|
|
37
37
|
url: `https://www.youtube.com/watch?v=${videoId}`,
|
|
38
38
|
thumbnail: composeThumbnailUrl(videoId)
|
|
39
39
|
};
|
package/dist/index.d.ts
CHANGED
|
@@ -13,6 +13,7 @@ import { fixLazyImages } from "./transforms/dom/fixLazyImages.js";
|
|
|
13
13
|
import { detectLanguage, highlightCode } from "./transforms/dom/highlightCode.js";
|
|
14
14
|
import { injectEnclosures } from "./transforms/dom/injectEnclosures.js";
|
|
15
15
|
import { linkifyUrls } from "./transforms/dom/linkifyUrls.js";
|
|
16
|
+
import { markTimestamps, parseTimestampSeconds } from "./transforms/dom/markTimestamps.js";
|
|
16
17
|
import { mergeConsecutiveOneLinerPres } from "./transforms/dom/mergeConsecutiveOneLinerPres.js";
|
|
17
18
|
import { mergeFragmentedLists } from "./transforms/dom/mergeFragmentedLists.js";
|
|
18
19
|
import { proxyAssetUrls } from "./transforms/dom/proxyAssetUrls.js";
|
|
@@ -20,13 +21,13 @@ import { removeTrackingPixels } from "./transforms/dom/removeTrackingPixels.js";
|
|
|
20
21
|
import { replaceEmbedsWithPlaceholders } from "./transforms/dom/replaceEmbedsWithPlaceholders.js";
|
|
21
22
|
import { replacePreLineBreaks } from "./transforms/dom/replacePreLineBreaks.js";
|
|
22
23
|
import { resolveRelativeUrls } from "./transforms/dom/resolveRelativeUrls.js";
|
|
24
|
+
import { stripBoundaryBreaks } from "./transforms/dom/stripBoundaryBreaks.js";
|
|
23
25
|
import { stripComments } from "./transforms/dom/stripComments.js";
|
|
24
26
|
import { stripDeadAnchors } from "./transforms/dom/stripDeadAnchors.js";
|
|
25
27
|
import { stripDuplicateTitleHeading } from "./transforms/dom/stripDuplicateTitleHeading.js";
|
|
26
28
|
import { stripEmptyTags } from "./transforms/dom/stripEmptyTags.js";
|
|
27
29
|
import { stripInertElements } from "./transforms/dom/stripInertElements.js";
|
|
28
30
|
import { stripInterBlockBreaks } from "./transforms/dom/stripInterBlockBreaks.js";
|
|
29
|
-
import { stripParagraphBoundaryBreaks } from "./transforms/dom/stripParagraphBoundaryBreaks.js";
|
|
30
31
|
import { stripTrackingParams } from "./transforms/dom/stripTrackingParams.js";
|
|
31
32
|
import { trimPreWhitespace } from "./transforms/dom/trimPreWhitespace.js";
|
|
32
33
|
import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedLists.js";
|
|
@@ -116,4 +117,4 @@ import { ParamExtractorConfig, chooseBaseUrl, coerceNumber, createParamExtractor
|
|
|
116
117
|
//#region src/index.d.ts
|
|
117
118
|
declare const transformContent: (html: string, options: TransformContentOptions) => Promise<string>;
|
|
118
119
|
//#endregion
|
|
119
|
-
export { type AssetProxyFn, type AssetType, type BookmarkResolver, type BookmarkResolverResult, type DomTransform, type EmbedResolver, type EmbedResolverResult, type Enclosure, type EnrichEmbedFn, type MaybePromise, type ParamExtractorConfig, type ParseHtmlFn, type ResolveUrlFn, type StringTransform, type TransformContentOptions, type TransformContext, applyDomTransforms, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBookmarkCards, convertBreaksToParagraphs, createBookmarkPlaceholder, createEmbedPlaceholder, createParamExtractor, createPlaceholder, decodeDoubleEncodedTags, defaultResolveUrlFn, demoteHeadings, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, ghostBookmarkResolver, highlightCode, injectEnclosures, isSafeThumbnailUrl, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, normalizeEmbedFields, paragraphizePlainText, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripControlChars, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInertElements, stripInterBlockBreaks, stripOversizedBase64Sources,
|
|
120
|
+
export { type AssetProxyFn, type AssetType, type BookmarkResolver, type BookmarkResolverResult, type DomTransform, type EmbedResolver, type EmbedResolverResult, type Enclosure, type EnrichEmbedFn, type MaybePromise, type ParamExtractorConfig, type ParseHtmlFn, type ResolveUrlFn, type StringTransform, type TransformContentOptions, type TransformContext, applyDomTransforms, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBookmarkCards, convertBreaksToParagraphs, createBookmarkPlaceholder, createEmbedPlaceholder, createParamExtractor, createPlaceholder, decodeDoubleEncodedTags, defaultResolveUrlFn, demoteHeadings, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, ghostBookmarkResolver, highlightCode, injectEnclosures, isSafeThumbnailUrl, linkifyUrls, markTimestamps, mergeConsecutiveOneLinerPres, mergeFragmentedLists, normalizeEmbedFields, paragraphizePlainText, parseTimestampSeconds, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripBoundaryBreaks, stripComments, stripControlChars, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInertElements, stripInterBlockBreaks, stripOversizedBase64Sources, stripTrackingParams, substackBookmarkResolver, transformContent, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapEmojiImages, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, updateEmbedPlaceholder, youtubeEmbedResolver, youtubeResolveEmbed };
|
package/dist/index.js
CHANGED
|
@@ -11,6 +11,7 @@ import { fixLazyImages } from "./transforms/dom/fixLazyImages.js";
|
|
|
11
11
|
import { detectLanguage, highlightCode } from "./transforms/dom/highlightCode.js";
|
|
12
12
|
import { injectEnclosures } from "./transforms/dom/injectEnclosures.js";
|
|
13
13
|
import { linkifyUrls } from "./transforms/dom/linkifyUrls.js";
|
|
14
|
+
import { markTimestamps, parseTimestampSeconds } from "./transforms/dom/markTimestamps.js";
|
|
14
15
|
import { mergeConsecutiveOneLinerPres } from "./transforms/dom/mergeConsecutiveOneLinerPres.js";
|
|
15
16
|
import { mergeFragmentedLists } from "./transforms/dom/mergeFragmentedLists.js";
|
|
16
17
|
import { proxyAssetUrls } from "./transforms/dom/proxyAssetUrls.js";
|
|
@@ -18,13 +19,13 @@ import { removeTrackingPixels } from "./transforms/dom/removeTrackingPixels.js";
|
|
|
18
19
|
import { replaceEmbedsWithPlaceholders } from "./transforms/dom/replaceEmbedsWithPlaceholders.js";
|
|
19
20
|
import { replacePreLineBreaks } from "./transforms/dom/replacePreLineBreaks.js";
|
|
20
21
|
import { resolveRelativeUrls } from "./transforms/dom/resolveRelativeUrls.js";
|
|
22
|
+
import { stripBoundaryBreaks } from "./transforms/dom/stripBoundaryBreaks.js";
|
|
21
23
|
import { stripComments } from "./transforms/dom/stripComments.js";
|
|
22
24
|
import { stripDeadAnchors } from "./transforms/dom/stripDeadAnchors.js";
|
|
23
25
|
import { stripDuplicateTitleHeading } from "./transforms/dom/stripDuplicateTitleHeading.js";
|
|
24
26
|
import { stripEmptyTags } from "./transforms/dom/stripEmptyTags.js";
|
|
25
27
|
import { stripInertElements } from "./transforms/dom/stripInertElements.js";
|
|
26
28
|
import { stripInterBlockBreaks } from "./transforms/dom/stripInterBlockBreaks.js";
|
|
27
|
-
import { stripParagraphBoundaryBreaks } from "./transforms/dom/stripParagraphBoundaryBreaks.js";
|
|
28
29
|
import { stripTrackingParams } from "./transforms/dom/stripTrackingParams.js";
|
|
29
30
|
import { trimPreWhitespace } from "./transforms/dom/trimPreWhitespace.js";
|
|
30
31
|
import { unwrapDoublyNestedLists } from "./transforms/dom/unwrapDoublyNestedLists.js";
|
|
@@ -137,4 +138,4 @@ const transformContent = async (html, options) => {
|
|
|
137
138
|
return await applyDomTransforms(await options.parseHtmlFn(afterString), domFns.map((transform) => transform(context)));
|
|
138
139
|
};
|
|
139
140
|
//#endregion
|
|
140
|
-
export { applyDomTransforms, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBookmarkCards, convertBreaksToParagraphs, createBookmarkPlaceholder, createEmbedPlaceholder, createParamExtractor, createPlaceholder, decodeDoubleEncodedTags, defaultResolveUrlFn, demoteHeadings, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, ghostBookmarkResolver, highlightCode, injectEnclosures, isSafeThumbnailUrl, linkifyUrls, mergeConsecutiveOneLinerPres, mergeFragmentedLists, normalizeEmbedFields, paragraphizePlainText, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripComments, stripControlChars, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInertElements, stripInterBlockBreaks, stripOversizedBase64Sources,
|
|
141
|
+
export { applyDomTransforms, applyStringTransforms, chooseBaseUrl, coerceNumber, composeThumbnailUrl, convertBookmarkCards, convertBreaksToParagraphs, createBookmarkPlaceholder, createEmbedPlaceholder, createParamExtractor, createPlaceholder, decodeDoubleEncodedTags, defaultResolveUrlFn, demoteHeadings, detectLanguage, enrichEmbedPlaceholders, extractRedirectTarget, extractVideoId, fixLazyImages, ghostBookmarkResolver, highlightCode, injectEnclosures, isSafeThumbnailUrl, linkifyUrls, markTimestamps, mergeConsecutiveOneLinerPres, mergeFragmentedLists, normalizeEmbedFields, paragraphizePlainText, parseTimestampSeconds, proxyAssetUrls, removeTrackingPixels, replaceEmbedsWithPlaceholders, replacePreLineBreaks, resolveRelativeUrls, stripBoundaryBreaks, stripComments, stripControlChars, stripDeadAnchors, stripDuplicateTitleHeading, stripEmptyTags, stripInertElements, stripInterBlockBreaks, stripOversizedBase64Sources, stripTrackingParams, substackBookmarkResolver, transformContent, trimPreWhitespace, unwrapAceml, unwrapAdjust, unwrapAmazonAffiliate, unwrapAmpCache, unwrapAwin, unwrapBing, unwrapCdataComments, unwrapCjNetwork, unwrapDigidip, unwrapDisqus, unwrapDouban, unwrapDoublyNestedLists, unwrapDuckduckgo, unwrapEbayRover, unwrapEffiliation, unwrapEmbedly, unwrapEmojiImages, unwrapFacebookShim, unwrapFeedsportal, unwrapFirebaseDynamicLinks, unwrapFlipboard, unwrapGateSc, unwrapGeoriot, unwrapGitee, unwrapGoogle, unwrapGoogleAmpViewer, unwrapGoogleNews, unwrapGoogleNewsModern, unwrapGoogleScholar, unwrapGoogleTranslate, unwrapHashnode, unwrapIcptrack, unwrapIdealoPartner, unwrapInstagramShim, unwrapJianshuGo, unwrapJuejin, unwrapLeverAnalytics, unwrapLinksynergy, unwrapMailchimp, unwrapMailpanion, unwrapMailpgn, unwrapMailtrack, unwrapMedium, unwrapMimecast, unwrapMozillaOutgoing, unwrapNarrativ, unwrapNicoMs, unwrapOutlookSafelinks, unwrapPartnerAds, unwrapPocket, unwrapPostmark, unwrapProofpointV1, unwrapProofpointV2, unwrapProofpointV3, unwrapPxf, unwrapRecruitics, unwrapRedditOut, unwrapRedirectUrls, unwrapRedirectingat, unwrapSegmentfault, unwrapShareasale, unwrapSjv, unwrapSkimlinks, unwrapSlack, unwrapSmartredirect, unwrapSspai, unwrapSteamLinkfilter, unwrapTelegramIv, unwrapTradedoubler, unwrapTumblr, unwrapValuecommerce, unwrapViglink, unwrapVkAway, unwrapWebArchive, unwrapWrappers, unwrapYahooSearch, unwrapYandexTurbo, unwrapYouTube, unwrapZhihu, updateEmbedPlaceholder, youtubeEmbedResolver, youtubeResolveEmbed };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { hasAncestorWithTagName, isBlockElement, isBr, isElement, isText, isWhitespaceText } from "../../common.js";
|
|
2
2
|
//#region src/transforms/dom/convertBreaksToParagraphs.ts
|
|
3
3
|
const processContainersSelector = "body, div, blockquote, td, li, article, section, main, header, footer, aside";
|
|
4
4
|
const preOrCodeTags = new Set(["pre", "code"]);
|
|
@@ -49,11 +49,10 @@ const convertBreaksToParagraphs = () => {
|
|
|
49
49
|
i++;
|
|
50
50
|
}
|
|
51
51
|
} else {
|
|
52
|
-
|
|
53
|
-
if (nodeType === Node.ELEMENT_NODE) {
|
|
52
|
+
if (isElement(child)) {
|
|
54
53
|
current.hasContent = true;
|
|
55
54
|
if (isBlockElement(child)) current.hasBlock = true;
|
|
56
|
-
} else if (
|
|
55
|
+
} else if (isText(child)) {
|
|
57
56
|
if (!current.hasContent && child.textContent?.trim()) current.hasContent = true;
|
|
58
57
|
}
|
|
59
58
|
i++;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { NodeFilter, hasAncestorWithTagName } from "../../common.js";
|
|
1
|
+
import { NodeFilter, hasAncestorWithTagName, isText } from "../../common.js";
|
|
2
2
|
//#region src/transforms/dom/decodeDoubleEncodedTags.ts
|
|
3
3
|
const opaqueTags = new Set([
|
|
4
4
|
"code",
|
|
@@ -16,13 +16,13 @@ const decodeDoubleEncodedTags = () => {
|
|
|
16
16
|
const walker = document.createTreeWalker(document.body, NodeFilter.SHOW_TEXT);
|
|
17
17
|
let tempDiv = null;
|
|
18
18
|
for (let node = walker.nextNode(); node !== null; node = walker.nextNode()) {
|
|
19
|
-
|
|
20
|
-
const data =
|
|
19
|
+
if (!isText(node)) continue;
|
|
20
|
+
const data = node.data;
|
|
21
21
|
if (!data.includes("<") || !tagInTextRegex.test(data)) continue;
|
|
22
|
-
if (hasAncestorWithTagName(
|
|
22
|
+
if (hasAncestorWithTagName(node, opaqueTags)) continue;
|
|
23
23
|
if (tempDiv === null) tempDiv = document.createElement("div");
|
|
24
24
|
tempDiv.innerHTML = data;
|
|
25
|
-
|
|
25
|
+
node.replaceWith(...tempDiv.childNodes);
|
|
26
26
|
}
|
|
27
27
|
};
|
|
28
28
|
};
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { isElement, isText } from "../../common.js";
|
|
2
2
|
import { find } from "linkifyjs";
|
|
3
3
|
//#region src/transforms/dom/linkifyUrls.ts
|
|
4
4
|
const urlProtocolRegex = /^https?:\/\//i;
|
|
@@ -13,9 +13,9 @@ const linkifyIgnoreTags = new Set([
|
|
|
13
13
|
"style"
|
|
14
14
|
]);
|
|
15
15
|
const collectTextNodes = (node, result = []) => {
|
|
16
|
-
if (node
|
|
17
|
-
for (const child of node.childNodes) if (child
|
|
18
|
-
else if (child
|
|
16
|
+
if (isElement(node) && linkifyIgnoreTags.has(node.tagName.toLowerCase())) return result;
|
|
17
|
+
for (const child of node.childNodes) if (isText(child)) result.push(child);
|
|
18
|
+
else if (isElement(child) && !linkifyIgnoreTags.has(child.tagName.toLowerCase())) collectTextNodes(child, result);
|
|
19
19
|
return result;
|
|
20
20
|
};
|
|
21
21
|
const linkifyUrls = () => {
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import { DomTransform } from "../../types.js";
|
|
2
|
+
|
|
3
|
+
//#region src/transforms/dom/markTimestamps.d.ts
|
|
4
|
+
declare const parseTimestampSeconds: (timestamp: string) => number | undefined;
|
|
5
|
+
declare const markTimestamps: DomTransform;
|
|
6
|
+
//#endregion
|
|
7
|
+
export { markTimestamps, parseTimestampSeconds };
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { isElement, isText } from "../../common.js";
|
|
2
|
+
//#region src/transforms/dom/markTimestamps.ts
|
|
3
|
+
const timestampIgnoreTags = new Set([
|
|
4
|
+
"a",
|
|
5
|
+
"pre",
|
|
6
|
+
"code",
|
|
7
|
+
"kbd",
|
|
8
|
+
"samp",
|
|
9
|
+
"var",
|
|
10
|
+
"script",
|
|
11
|
+
"style"
|
|
12
|
+
]);
|
|
13
|
+
const lineLeadingTimestampRegex = /(^|\n)([ \t]*)((?:\d{1,2}:)?\d{1,2}:\d{2})/gm;
|
|
14
|
+
const numericPartRegex = /^\d+$/;
|
|
15
|
+
const parseTimestampSeconds = (timestamp) => {
|
|
16
|
+
const parts = timestamp.split(":");
|
|
17
|
+
if (!parts.every((part) => numericPartRegex.test(part))) return;
|
|
18
|
+
if (parts.length === 2) {
|
|
19
|
+
const [minutes, seconds] = parts.map(Number);
|
|
20
|
+
if (seconds > 59) return;
|
|
21
|
+
return minutes * 60 + seconds;
|
|
22
|
+
}
|
|
23
|
+
if (parts.length === 3) {
|
|
24
|
+
const [hours, minutes, seconds] = parts.map(Number);
|
|
25
|
+
if (minutes > 59 || seconds > 59) return;
|
|
26
|
+
return hours * 3600 + minutes * 60 + seconds;
|
|
27
|
+
}
|
|
28
|
+
};
|
|
29
|
+
const shouldSkipElement = (element) => {
|
|
30
|
+
return timestampIgnoreTags.has(element.tagName.toLowerCase()) || element.hasAttribute("data-timestamp");
|
|
31
|
+
};
|
|
32
|
+
const collectTextNodes = (node, result = []) => {
|
|
33
|
+
for (const child of node.childNodes) if (isText(child)) result.push(child);
|
|
34
|
+
else if (isElement(child) && !shouldSkipElement(child)) collectTextNodes(child, result);
|
|
35
|
+
return result;
|
|
36
|
+
};
|
|
37
|
+
const markTimestamps = () => {
|
|
38
|
+
return (document) => {
|
|
39
|
+
const textNodes = collectTextNodes(document);
|
|
40
|
+
for (const node of textNodes) {
|
|
41
|
+
const text = node.textContent;
|
|
42
|
+
if (!text?.includes(":")) continue;
|
|
43
|
+
const parts = [];
|
|
44
|
+
let lastIndex = 0;
|
|
45
|
+
for (const match of text.matchAll(lineLeadingTimestampRegex)) {
|
|
46
|
+
const [, lineStart, leading, token] = match;
|
|
47
|
+
const seconds = parseTimestampSeconds(token);
|
|
48
|
+
if (seconds === void 0) continue;
|
|
49
|
+
const tokenStart = (match.index ?? 0) + lineStart.length + leading.length;
|
|
50
|
+
if (tokenStart > lastIndex) parts.push(document.createTextNode(text.slice(lastIndex, tokenStart)));
|
|
51
|
+
const span = document.createElement("span");
|
|
52
|
+
span.setAttribute("data-timestamp", String(seconds));
|
|
53
|
+
span.textContent = token;
|
|
54
|
+
parts.push(span);
|
|
55
|
+
lastIndex = tokenStart + token.length;
|
|
56
|
+
}
|
|
57
|
+
if (parts.length === 0) continue;
|
|
58
|
+
if (lastIndex < text.length) parts.push(document.createTextNode(text.slice(lastIndex)));
|
|
59
|
+
node.replaceWith(...parts);
|
|
60
|
+
}
|
|
61
|
+
};
|
|
62
|
+
};
|
|
63
|
+
//#endregion
|
|
64
|
+
export { markTimestamps, parseTimestampSeconds };
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { isElement, isText } from "../../common.js";
|
|
1
2
|
//#region src/transforms/dom/mergeConsecutiveOneLinerPres.ts
|
|
2
3
|
const trailingBrRegex = /<br\s*\/?>\s*$/i;
|
|
3
4
|
const surroundingNewlinesRegex = /^\n+|\n+$/g;
|
|
@@ -17,8 +18,8 @@ const mergeConsecutiveOneLinerPres = ({ preservedPreClasses }) => {
|
|
|
17
18
|
const run = [pre];
|
|
18
19
|
let sibling = pre.nextSibling;
|
|
19
20
|
while (sibling) {
|
|
20
|
-
if (sibling
|
|
21
|
-
if (sibling
|
|
21
|
+
if (!isElement(sibling) && !isText(sibling)) break;
|
|
22
|
+
if (isText(sibling)) {
|
|
22
23
|
if (sibling.textContent?.trim() !== "") break;
|
|
23
24
|
sibling = sibling.nextSibling;
|
|
24
25
|
continue;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { isComment, isElement, isText, isWhitespaceText } from "../../common.js";
|
|
2
2
|
//#region src/transforms/dom/mergeFragmentedLists.ts
|
|
3
3
|
const mergeFragmentedLists = () => {
|
|
4
4
|
return (document) => {
|
|
@@ -24,9 +24,8 @@ const mergeFragmentedLists = () => {
|
|
|
24
24
|
let between = target.nextSibling;
|
|
25
25
|
while (between && between !== extra) {
|
|
26
26
|
const next = between.nextSibling;
|
|
27
|
-
|
|
28
|
-
if (
|
|
29
|
-
else if (type === Node.TEXT_NODE) target.appendChild(between);
|
|
27
|
+
if (isComment(between)) between.parentNode?.removeChild(between);
|
|
28
|
+
else if (isText(between)) target.appendChild(between);
|
|
30
29
|
between = next;
|
|
31
30
|
}
|
|
32
31
|
while (extra.firstChild) target.appendChild(extra.firstChild);
|
|
@@ -38,14 +37,13 @@ const mergeFragmentedLists = () => {
|
|
|
38
37
|
const nextMergeableSibling = (from, localName) => {
|
|
39
38
|
let sibling = from.nextSibling;
|
|
40
39
|
while (sibling) {
|
|
41
|
-
|
|
42
|
-
if (
|
|
43
|
-
if (type === Node.TEXT_NODE) {
|
|
40
|
+
if (isElement(sibling)) return sibling.localName === localName ? sibling : void 0;
|
|
41
|
+
if (isText(sibling)) {
|
|
44
42
|
if (!isWhitespaceText(sibling)) return;
|
|
45
43
|
sibling = sibling.nextSibling;
|
|
46
44
|
continue;
|
|
47
45
|
}
|
|
48
|
-
if (
|
|
46
|
+
if (isComment(sibling)) {
|
|
49
47
|
sibling = sibling.nextSibling;
|
|
50
48
|
continue;
|
|
51
49
|
}
|
|
@@ -54,16 +52,15 @@ const nextMergeableSibling = (from, localName) => {
|
|
|
54
52
|
};
|
|
55
53
|
const hasOnlyListItemChildren = (list) => {
|
|
56
54
|
for (let child = list.firstChild; child; child = child.nextSibling) {
|
|
57
|
-
|
|
58
|
-
if (type === Node.ELEMENT_NODE) {
|
|
55
|
+
if (isElement(child)) {
|
|
59
56
|
if (child.localName !== "li") return false;
|
|
60
57
|
continue;
|
|
61
58
|
}
|
|
62
|
-
if (
|
|
59
|
+
if (isText(child)) {
|
|
63
60
|
if (!isWhitespaceText(child)) return false;
|
|
64
61
|
continue;
|
|
65
62
|
}
|
|
66
|
-
if (
|
|
63
|
+
if (!isComment(child)) return false;
|
|
67
64
|
}
|
|
68
65
|
return true;
|
|
69
66
|
};
|
|
@@ -1,10 +1,26 @@
|
|
|
1
1
|
import { isBr, isSkippable } from "../../common.js";
|
|
2
|
-
//#region src/transforms/dom/
|
|
3
|
-
const
|
|
2
|
+
//#region src/transforms/dom/stripBoundaryBreaks.ts
|
|
3
|
+
const boundaryBreakSelectors = [
|
|
4
|
+
"p",
|
|
5
|
+
"h1",
|
|
6
|
+
"h2",
|
|
7
|
+
"h3",
|
|
8
|
+
"h4",
|
|
9
|
+
"h5",
|
|
10
|
+
"h6",
|
|
11
|
+
"div",
|
|
12
|
+
"blockquote",
|
|
13
|
+
"li",
|
|
14
|
+
"ul",
|
|
15
|
+
"ol",
|
|
16
|
+
"figcaption",
|
|
17
|
+
"section"
|
|
18
|
+
];
|
|
19
|
+
const stripBoundaryBreaks = () => {
|
|
4
20
|
return (document) => {
|
|
5
|
-
const
|
|
6
|
-
for (const
|
|
7
|
-
let cursor =
|
|
21
|
+
const elements = document.querySelectorAll(boundaryBreakSelectors.join(", "));
|
|
22
|
+
for (const element of elements) {
|
|
23
|
+
let cursor = element.firstChild;
|
|
8
24
|
let leadingHasBr = false;
|
|
9
25
|
let leadingEnd = null;
|
|
10
26
|
while (cursor && isSkippable(cursor)) {
|
|
@@ -13,7 +29,7 @@ const stripParagraphBoundaryBreaks = () => {
|
|
|
13
29
|
cursor = cursor.nextSibling;
|
|
14
30
|
}
|
|
15
31
|
if (leadingHasBr) {
|
|
16
|
-
let node =
|
|
32
|
+
let node = element.firstChild;
|
|
17
33
|
while (node) {
|
|
18
34
|
const next = node.nextSibling;
|
|
19
35
|
node.remove();
|
|
@@ -21,7 +37,7 @@ const stripParagraphBoundaryBreaks = () => {
|
|
|
21
37
|
node = next;
|
|
22
38
|
}
|
|
23
39
|
}
|
|
24
|
-
cursor =
|
|
40
|
+
cursor = element.lastChild;
|
|
25
41
|
let trailingHasBr = false;
|
|
26
42
|
let trailingEnd = null;
|
|
27
43
|
while (cursor && isSkippable(cursor)) {
|
|
@@ -30,7 +46,7 @@ const stripParagraphBoundaryBreaks = () => {
|
|
|
30
46
|
cursor = cursor.previousSibling;
|
|
31
47
|
}
|
|
32
48
|
if (trailingHasBr) {
|
|
33
|
-
let node =
|
|
49
|
+
let node = element.lastChild;
|
|
34
50
|
while (node) {
|
|
35
51
|
const prev = node.previousSibling;
|
|
36
52
|
node.remove();
|
|
@@ -42,4 +58,4 @@ const stripParagraphBoundaryBreaks = () => {
|
|
|
42
58
|
};
|
|
43
59
|
};
|
|
44
60
|
//#endregion
|
|
45
|
-
export {
|
|
61
|
+
export { stripBoundaryBreaks };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { isElement, isText } from "../../common.js";
|
|
2
2
|
//#region src/transforms/dom/stripEmptyTags.ts
|
|
3
3
|
const preserveWhenEmpty = new Set([
|
|
4
4
|
"iframe",
|
|
@@ -33,12 +33,11 @@ const stripEmptyTags = () => {
|
|
|
33
33
|
let hasContent = false;
|
|
34
34
|
for (let j = 0; j < childCount; j++) {
|
|
35
35
|
const child = childNodes[j];
|
|
36
|
-
|
|
37
|
-
if (nodeType === Node.ELEMENT_NODE) {
|
|
36
|
+
if (isElement(child)) {
|
|
38
37
|
hasContent = true;
|
|
39
38
|
break;
|
|
40
39
|
}
|
|
41
|
-
if (
|
|
40
|
+
if (isText(child) && child.data.trim().length > 0) {
|
|
42
41
|
hasContent = true;
|
|
43
42
|
break;
|
|
44
43
|
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { isText } from "../../common.js";
|
|
2
2
|
//#region src/transforms/dom/unwrapDoublyNestedLists.ts
|
|
3
3
|
const unwrapDoublyNestedLists = () => {
|
|
4
4
|
return (document) => {
|
|
@@ -21,7 +21,7 @@ const unwrapDoublyNestedLists = () => {
|
|
|
21
21
|
}
|
|
22
22
|
if (elementDisqualified || inner === null) continue;
|
|
23
23
|
let textDisqualified = false;
|
|
24
|
-
for (let node = wrapper.firstChild; node !== null; node = node.nextSibling) if (node
|
|
24
|
+
for (let node = wrapper.firstChild; node !== null; node = node.nextSibling) if (isText(node) && node.textContent?.trim()) {
|
|
25
25
|
textDisqualified = true;
|
|
26
26
|
break;
|
|
27
27
|
}
|
|
@@ -30,7 +30,7 @@ const unwrapDoublyNestedLists = () => {
|
|
|
30
30
|
if (parent === null) continue;
|
|
31
31
|
for (let node = wrapper.firstChild; node !== null;) {
|
|
32
32
|
const next = node.nextSibling;
|
|
33
|
-
if (node
|
|
33
|
+
if (isText(node) || node === inner) parent.insertBefore(node, outer);
|
|
34
34
|
node = next;
|
|
35
35
|
}
|
|
36
36
|
outer.remove();
|
package/package.json
CHANGED
|
@@ -40,10 +40,13 @@
|
|
|
40
40
|
],
|
|
41
41
|
"scripts": {
|
|
42
42
|
"prepare": "lefthook install",
|
|
43
|
-
"build": "tsdown src/index.ts src/defaults.ts src/parsers/linkedom.ts --format esm --dts --clean --unbundle --no-fixed-extension"
|
|
43
|
+
"build": "tsdown src/index.ts src/defaults.ts src/parsers/linkedom.ts --format esm --dts --clean --unbundle --no-fixed-extension",
|
|
44
|
+
"test": "bun test",
|
|
45
|
+
"test:linkedom": "DOM_LIBRARY=linkedom bun test",
|
|
46
|
+
"test:jsdom": "DOM_LIBRARY=jsdom bun test"
|
|
44
47
|
},
|
|
45
48
|
"dependencies": {
|
|
46
|
-
"@wordpress/autop": "^4.
|
|
49
|
+
"@wordpress/autop": "^4.47.0",
|
|
47
50
|
"highlight.js": "^11.11.1",
|
|
48
51
|
"linkifyjs": "^4.3.2",
|
|
49
52
|
"srcset": "^5.0.3"
|
|
@@ -60,9 +63,11 @@
|
|
|
60
63
|
},
|
|
61
64
|
"devDependencies": {
|
|
62
65
|
"@types/bun": "^1.3.13",
|
|
66
|
+
"@types/jsdom": "^28.0.3",
|
|
67
|
+
"jsdom": "^29.1.1",
|
|
63
68
|
"kvalita": "^1.13.0",
|
|
64
69
|
"linkedom": "^0.18.12",
|
|
65
|
-
"tsdown": "^0.22.
|
|
70
|
+
"tsdown": "^0.22.1"
|
|
66
71
|
},
|
|
67
|
-
"version": "2.
|
|
72
|
+
"version": "2.1.0"
|
|
68
73
|
}
|