@mz1999/defuddle 0.14.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +371 -0
- package/dist/cli.d.ts +2 -0
- package/dist/cli.js +145 -0
- package/dist/cli.js.map +1 -0
- package/dist/constants.d.ts +24 -0
- package/dist/constants.js +950 -0
- package/dist/constants.js.map +1 -0
- package/dist/defuddle.d.ts +136 -0
- package/dist/defuddle.js +1816 -0
- package/dist/defuddle.js.map +1 -0
- package/dist/elements/callouts.d.ts +6 -0
- package/dist/elements/callouts.js +74 -0
- package/dist/elements/callouts.js.map +1 -0
- package/dist/elements/code.d.ts +5 -0
- package/dist/elements/code.js +346 -0
- package/dist/elements/code.js.map +1 -0
- package/dist/elements/footnotes.d.ts +5 -0
- package/dist/elements/footnotes.js +619 -0
- package/dist/elements/footnotes.js.map +1 -0
- package/dist/elements/headings.d.ts +11 -0
- package/dist/elements/headings.js +100 -0
- package/dist/elements/headings.js.map +1 -0
- package/dist/elements/images.d.ts +8 -0
- package/dist/elements/images.js +877 -0
- package/dist/elements/images.js.map +1 -0
- package/dist/elements/math.base.d.ts +9 -0
- package/dist/elements/math.base.js +195 -0
- package/dist/elements/math.base.js.map +1 -0
- package/dist/elements/math.core.d.ts +7 -0
- package/dist/elements/math.core.js +52 -0
- package/dist/elements/math.core.js.map +1 -0
- package/dist/elements/math.d.ts +2 -0
- package/dist/elements/math.full.d.ts +8 -0
- package/dist/elements/math.js +7 -0
- package/dist/elements/math.js.map +1 -0
- package/dist/extractor-registry.d.ts +16 -0
- package/dist/extractor-registry.js +140 -0
- package/dist/extractor-registry.js.map +1 -0
- package/dist/extractors/_base.d.ts +22 -0
- package/dist/extractors/_base.js +27 -0
- package/dist/extractors/_base.js.map +1 -0
- package/dist/extractors/_conversation.d.ts +9 -0
- package/dist/extractors/_conversation.js +78 -0
- package/dist/extractors/_conversation.js.map +1 -0
- package/dist/extractors/chatgpt.d.ts +14 -0
- package/dist/extractors/chatgpt.js +138 -0
- package/dist/extractors/chatgpt.js.map +1 -0
- package/dist/extractors/claude.d.ts +10 -0
- package/dist/extractors/claude.js +91 -0
- package/dist/extractors/claude.js.map +1 -0
- package/dist/extractors/gemini.d.ts +14 -0
- package/dist/extractors/gemini.js +111 -0
- package/dist/extractors/gemini.js.map +1 -0
- package/dist/extractors/github.d.ts +20 -0
- package/dist/extractors/github.js +251 -0
- package/dist/extractors/github.js.map +1 -0
- package/dist/extractors/grok.d.ts +15 -0
- package/dist/extractors/grok.js +142 -0
- package/dist/extractors/grok.js.map +1 -0
- package/dist/extractors/hackernews.d.ts +21 -0
- package/dist/extractors/hackernews.js +155 -0
- package/dist/extractors/hackernews.js.map +1 -0
- package/dist/extractors/reddit.d.ts +22 -0
- package/dist/extractors/reddit.js +197 -0
- package/dist/extractors/reddit.js.map +1 -0
- package/dist/extractors/twitter.d.ts +16 -0
- package/dist/extractors/twitter.js +204 -0
- package/dist/extractors/twitter.js.map +1 -0
- package/dist/extractors/x-article.d.ts +24 -0
- package/dist/extractors/x-article.js +267 -0
- package/dist/extractors/x-article.js.map +1 -0
- package/dist/extractors/x-oembed.d.ts +20 -0
- package/dist/extractors/x-oembed.js +350 -0
- package/dist/extractors/x-oembed.js.map +1 -0
- package/dist/extractors/youtube.d.ts +87 -0
- package/dist/extractors/youtube.js +869 -0
- package/dist/extractors/youtube.js.map +1 -0
- package/dist/fetch.d.ts +18 -0
- package/dist/fetch.js +265 -0
- package/dist/fetch.js.map +1 -0
- package/dist/index.d.ts +3 -0
- package/dist/index.full.d.ts +12 -0
- package/dist/index.full.js +1 -0
- package/dist/index.js +1 -0
- package/dist/index.js.map +1 -0
- package/dist/markdown.d.ts +30 -0
- package/dist/markdown.js +661 -0
- package/dist/markdown.js.map +1 -0
- package/dist/metadata.d.ts +25 -0
- package/dist/metadata.js +426 -0
- package/dist/metadata.js.map +1 -0
- package/dist/node.d.ts +19 -0
- package/dist/node.js +78 -0
- package/dist/node.js.map +1 -0
- package/dist/scoring.d.ts +31 -0
- package/dist/scoring.js +472 -0
- package/dist/scoring.js.map +1 -0
- package/dist/standardize.d.ts +2 -0
- package/dist/standardize.js +1101 -0
- package/dist/standardize.js.map +1 -0
- package/dist/types/extractors.d.ts +41 -0
- package/dist/types/extractors.js +3 -0
- package/dist/types/extractors.js.map +1 -0
- package/dist/types.d.ts +135 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/comments.d.ts +44 -0
- package/dist/utils/comments.js +103 -0
- package/dist/utils/comments.js.map +1 -0
- package/dist/utils/dom.d.ts +42 -0
- package/dist/utils/dom.js +104 -0
- package/dist/utils/dom.js.map +1 -0
- package/dist/utils/linkedom-compat.d.ts +5 -0
- package/dist/utils/linkedom-compat.js +23 -0
- package/dist/utils/linkedom-compat.js.map +1 -0
- package/dist/utils/transcript.d.ts +37 -0
- package/dist/utils/transcript.js +61 -0
- package/dist/utils/transcript.js.map +1 -0
- package/dist/utils.d.ts +13 -0
- package/dist/utils.js +98 -0
- package/dist/utils.js.map +1 -0
- package/package.json +107 -0
|
@@ -0,0 +1,877 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Standardization rules for handling images
|
|
4
|
+
*/
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.imageRules = void 0;
|
|
7
|
+
const utils_1 = require("../utils");
|
|
8
|
+
const dom_1 = require("../utils/dom");
|
|
9
|
+
const constants_1 = require("../constants");
|
|
10
|
+
// Pre-compile regular expressions once at module load so the rules and helpers can reuse them.

// Matches the `data:image/<subtype>;base64,` prefix of a base64 data URL; group 1 captures the subtype.
const b64DataUrlRegex = /^data:image\/([^;]+);base64,/;
// Matches an image extension followed by whitespace and a digit, i.e. a srcset-style "url descriptor" pair.
const srcsetPattern = /\.(jpg|jpeg|png|webp)\s+\d/;
// Matches a single whitespace-free token (surrounding whitespace allowed) that contains an image extension.
const srcPattern = /^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/;
// Matches a string ending in an image extension, optionally followed by a query string (case-insensitive).
const imageUrlPattern = /\.(jpg|jpeg|png|webp|gif|avif)(\?.*)?$/i;
// Captures the number of a width descriptor such as " 800w".
const widthPattern = /\s(\d+)w/;
// Captures the numeric value of a `dpr=` parameter (integer or decimal).
const dprPattern = /dpr=(\d+(?:\.\d+)?)/;
// Captures the leading run of non-whitespace characters.
const urlPattern = /^([^\s]+)/;
// Matches text that is nothing but a file name or path ending in an image extension (case-insensitive).
const filenamePattern = /^[\w\-\.\/\\]+\.(jpg|jpeg|png|gif|webp|svg)$/i;
// Matches a bare YYYY-MM-DD date.
const datePattern = /^\d{4}-\d{2}-\d{2}$/;
|
|
20
|
+
exports.imageRules = [
|
|
21
|
+
// Handle picture elements first to ensure we get the highest resolution
|
|
22
|
+
{
|
|
23
|
+
selector: 'picture',
|
|
24
|
+
element: 'picture',
|
|
25
|
+
transform: (el, doc) => {
|
|
26
|
+
const sourceElements = el.querySelectorAll('source');
|
|
27
|
+
const imgElement = el.querySelector('img');
|
|
28
|
+
if (!imgElement) {
|
|
29
|
+
console.warn('Picture element without img fallback:', el.outerHTML);
|
|
30
|
+
const bestSource = selectBestSource(sourceElements);
|
|
31
|
+
if (bestSource) {
|
|
32
|
+
const srcset = bestSource.getAttribute('srcset');
|
|
33
|
+
if (srcset) {
|
|
34
|
+
const newImg = doc.createElement('img');
|
|
35
|
+
applySrcsetToImage(srcset, newImg);
|
|
36
|
+
el.replaceChildren(newImg);
|
|
37
|
+
return el;
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
return el;
|
|
41
|
+
}
|
|
42
|
+
let bestSrcset = null;
|
|
43
|
+
let bestSrc = null;
|
|
44
|
+
if (sourceElements.length > 0) {
|
|
45
|
+
const bestSource = selectBestSource(sourceElements);
|
|
46
|
+
if (bestSource) {
|
|
47
|
+
bestSrcset = bestSource.getAttribute('srcset');
|
|
48
|
+
if (bestSrcset) {
|
|
49
|
+
bestSrc = extractFirstUrlFromSrcset(bestSrcset);
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
}
|
|
53
|
+
if (bestSrcset) {
|
|
54
|
+
imgElement.setAttribute('srcset', bestSrcset);
|
|
55
|
+
}
|
|
56
|
+
if (bestSrc && isValidImageUrl(bestSrc)) {
|
|
57
|
+
imgElement.setAttribute('src', bestSrc);
|
|
58
|
+
}
|
|
59
|
+
else if (!imgElement.hasAttribute('src') || !isValidImageUrl(imgElement.getAttribute('src') || '')) {
|
|
60
|
+
const firstUrl = extractFirstUrlFromSrcset(imgElement.getAttribute('srcset') || bestSrcset || '');
|
|
61
|
+
if (firstUrl && isValidImageUrl(firstUrl)) {
|
|
62
|
+
imgElement.setAttribute('src', firstUrl);
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
sourceElements.forEach(source => source.remove());
|
|
66
|
+
return el;
|
|
67
|
+
}
|
|
68
|
+
},
|
|
69
|
+
// Handle custom <uni-image-full-width> elements
|
|
70
|
+
{
|
|
71
|
+
selector: 'uni-image-full-width',
|
|
72
|
+
element: 'figure',
|
|
73
|
+
transform: (el, doc) => {
|
|
74
|
+
const figure = doc.createElement('figure');
|
|
75
|
+
const img = doc.createElement('img');
|
|
76
|
+
// Find the original image element
|
|
77
|
+
const originalImg = el.querySelector('img');
|
|
78
|
+
if (!originalImg) {
|
|
79
|
+
// If no img inside, return an empty figure or maybe just the original element?
|
|
80
|
+
// Returning empty figure for now, as it represents a failed conversion.
|
|
81
|
+
console.warn('uni-image-full-width without img:', el.outerHTML);
|
|
82
|
+
return figure;
|
|
83
|
+
}
|
|
84
|
+
let bestSrc = originalImg.getAttribute('src'); // Default to src
|
|
85
|
+
const dataLoadingAttr = originalImg.getAttribute('data-loading');
|
|
86
|
+
if (dataLoadingAttr) {
|
|
87
|
+
try {
|
|
88
|
+
const dataLoading = JSON.parse(dataLoadingAttr);
|
|
89
|
+
if (dataLoading.desktop && isValidImageUrl(dataLoading.desktop)) {
|
|
90
|
+
bestSrc = dataLoading.desktop; // Prefer desktop URL
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
catch (e) {
|
|
94
|
+
console.warn('Failed to parse data-loading attribute:', dataLoadingAttr, e);
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
if (bestSrc && isValidImageUrl(bestSrc)) {
|
|
98
|
+
img.setAttribute('src', bestSrc);
|
|
99
|
+
}
|
|
100
|
+
else {
|
|
101
|
+
// If no valid src found, maybe skip this image?
|
|
102
|
+
console.warn('Could not find valid src for uni-image-full-width:', el.outerHTML);
|
|
103
|
+
return figure; // Return empty figure
|
|
104
|
+
}
|
|
105
|
+
let altText = originalImg.getAttribute('alt');
|
|
106
|
+
if (!altText) {
|
|
107
|
+
altText = el.getAttribute('alt-text'); // Fallback to parent attribute
|
|
108
|
+
}
|
|
109
|
+
if (altText) {
|
|
110
|
+
img.setAttribute('alt', altText);
|
|
111
|
+
}
|
|
112
|
+
// Append the image to the figure
|
|
113
|
+
figure.appendChild(img);
|
|
114
|
+
// Find and add caption
|
|
115
|
+
const figcaptionEl = el.querySelector('figcaption');
|
|
116
|
+
if (figcaptionEl) {
|
|
117
|
+
// Extract text content, potentially from nested elements like <p>
|
|
118
|
+
const captionText = figcaptionEl.textContent?.trim();
|
|
119
|
+
if (captionText && captionText.length > 5) { // Basic check for meaningful caption
|
|
120
|
+
const figcaption = doc.createElement('figcaption');
|
|
121
|
+
// Try to get cleaner text from specific inner element if possible
|
|
122
|
+
const richTextP = figcaptionEl.querySelector('.rich-text p');
|
|
123
|
+
if (richTextP) {
|
|
124
|
+
(0, dom_1.transferContent)(richTextP, figcaption);
|
|
125
|
+
}
|
|
126
|
+
else {
|
|
127
|
+
figcaption.textContent = captionText;
|
|
128
|
+
}
|
|
129
|
+
figure.appendChild(figcaption);
|
|
130
|
+
}
|
|
131
|
+
}
|
|
132
|
+
return figure;
|
|
133
|
+
}
|
|
134
|
+
},
|
|
135
|
+
// Handle lazy-loaded images
|
|
136
|
+
{
|
|
137
|
+
selector: 'img[data-src], img[data-srcset], img[loading="lazy"], img.lazy, img.lazyload',
|
|
138
|
+
element: 'img',
|
|
139
|
+
transform: (el, doc) => {
|
|
140
|
+
// Check for base64 placeholder images
|
|
141
|
+
const src = el.getAttribute('src') || '';
|
|
142
|
+
const hasBetterSource = hasBetterImageSource(el);
|
|
143
|
+
if (isBase64Placeholder(src) && hasBetterSource) {
|
|
144
|
+
// Remove the placeholder src if we have better alternatives
|
|
145
|
+
el.removeAttribute('src');
|
|
146
|
+
}
|
|
147
|
+
// Handle data-src
|
|
148
|
+
const dataSrc = el.getAttribute('data-src');
|
|
149
|
+
if (dataSrc && !el.getAttribute('src')) {
|
|
150
|
+
el.setAttribute('src', dataSrc);
|
|
151
|
+
}
|
|
152
|
+
// Handle data-srcset
|
|
153
|
+
const dataSrcset = el.getAttribute('data-srcset');
|
|
154
|
+
if (dataSrcset && !el.getAttribute('srcset')) {
|
|
155
|
+
el.setAttribute('srcset', dataSrcset);
|
|
156
|
+
}
|
|
157
|
+
// Check for other attributes that might contain image URLs
|
|
158
|
+
for (let i = 0; i < el.attributes.length; i++) {
|
|
159
|
+
const attr = el.attributes[i];
|
|
160
|
+
if (attr.name === 'src' || attr.name === 'srcset' || attr.name === 'alt') {
|
|
161
|
+
continue; // Skip these attributes
|
|
162
|
+
}
|
|
163
|
+
// Skip JSON-like values (e.g., Substack's data-attrs containing image metadata)
|
|
164
|
+
const firstChar = attr.value.charAt(0);
|
|
165
|
+
if (firstChar === '{' || firstChar === '[') {
|
|
166
|
+
continue;
|
|
167
|
+
}
|
|
168
|
+
// Check if attribute contains an image URL
|
|
169
|
+
if (srcsetPattern.test(attr.value)) {
|
|
170
|
+
// This looks like a srcset value
|
|
171
|
+
el.setAttribute('srcset', attr.value);
|
|
172
|
+
}
|
|
173
|
+
else if (srcPattern.test(attr.value)) {
|
|
174
|
+
// This looks like a src value
|
|
175
|
+
el.setAttribute('src', attr.value);
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
// Remove lazy loading related classes and attributes
|
|
179
|
+
el.classList.remove('lazy', 'lazyload');
|
|
180
|
+
el.removeAttribute('data-ll-status');
|
|
181
|
+
el.removeAttribute('data-src');
|
|
182
|
+
el.removeAttribute('data-srcset');
|
|
183
|
+
el.removeAttribute('loading');
|
|
184
|
+
return el;
|
|
185
|
+
}
|
|
186
|
+
},
|
|
187
|
+
// Handle span elements containing images with captions
|
|
188
|
+
{
|
|
189
|
+
selector: 'span:has(img)',
|
|
190
|
+
element: 'span',
|
|
191
|
+
transform: (el, doc) => {
|
|
192
|
+
try {
|
|
193
|
+
const hasImage = containsImage(el);
|
|
194
|
+
if (!hasImage) {
|
|
195
|
+
return el;
|
|
196
|
+
}
|
|
197
|
+
// Skip spans that are content containers rather than image wrappers.
|
|
198
|
+
// A span with block-level children (p, h1-h6, div, etc.) is a content
|
|
199
|
+
// container that happens to contain images, not an image wrapper.
|
|
200
|
+
for (const child of el.children) {
|
|
201
|
+
if (constants_1.BLOCK_LEVEL_ELEMENTS.has(child.tagName.toLowerCase())) {
|
|
202
|
+
return el;
|
|
203
|
+
}
|
|
204
|
+
}
|
|
205
|
+
const imgElement = findMainImage(el);
|
|
206
|
+
if (!imgElement) {
|
|
207
|
+
return el;
|
|
208
|
+
}
|
|
209
|
+
const caption = findCaption(el);
|
|
210
|
+
// Process the image element (might return the img itself or handle picture/source)
|
|
211
|
+
const processedImg = processImageElement(imgElement, doc);
|
|
212
|
+
if (caption && hasMeaningfulCaption(caption)) {
|
|
213
|
+
const figure = createFigureWithCaption(processedImg, caption, doc);
|
|
214
|
+
// Remove the original caption element from its parent
|
|
215
|
+
// to prevent duplication, as the span itself might remain.
|
|
216
|
+
if (caption.parentNode) {
|
|
217
|
+
caption.parentNode.removeChild(caption);
|
|
218
|
+
}
|
|
219
|
+
return figure; // Replace the span (or its content) with the figure
|
|
220
|
+
}
|
|
221
|
+
else {
|
|
222
|
+
// No meaningful caption, return just the processed image.
|
|
223
|
+
// This might replace the span content or the span itself depending on framework.
|
|
224
|
+
return processedImg;
|
|
225
|
+
}
|
|
226
|
+
}
|
|
227
|
+
catch (error) {
|
|
228
|
+
console.warn('Error processing span with image:', error);
|
|
229
|
+
return el;
|
|
230
|
+
}
|
|
231
|
+
}
|
|
232
|
+
},
|
|
233
|
+
// Standardize complex image elements (figure, picture, source, figcaption)
|
|
234
|
+
{
|
|
235
|
+
selector: 'figure, p:has([class*="caption"])',
|
|
236
|
+
element: 'figure',
|
|
237
|
+
transform: (el, doc) => {
|
|
238
|
+
try {
|
|
239
|
+
const hasImage = containsImage(el);
|
|
240
|
+
if (!hasImage) {
|
|
241
|
+
return el;
|
|
242
|
+
}
|
|
243
|
+
const imgElement = findMainImage(el); // Initial find (might be picture)
|
|
244
|
+
if (!imgElement) {
|
|
245
|
+
return el;
|
|
246
|
+
}
|
|
247
|
+
// Note: Previous rules might have processed the image inside 'el'.
|
|
248
|
+
const caption = findCaption(el);
|
|
249
|
+
if (caption && hasMeaningfulCaption(caption)) {
|
|
250
|
+
// Find the *current* image element inside 'el' again.
|
|
251
|
+
// It might have been modified (e.g., picture rule -> img)
|
|
252
|
+
const currentImg = findMainImage(el);
|
|
253
|
+
let imageToAdd;
|
|
254
|
+
if (currentImg) {
|
|
255
|
+
// We'll clone this inside the helper function
|
|
256
|
+
imageToAdd = currentImg;
|
|
257
|
+
}
|
|
258
|
+
else {
|
|
259
|
+
// Fallback: process the initially found element.
|
|
260
|
+
console.warn("Figure rule couldn't find current image element in:", el.outerHTML);
|
|
261
|
+
// processImageElement will clone if needed
|
|
262
|
+
imageToAdd = processImageElement(imgElement, doc);
|
|
263
|
+
}
|
|
264
|
+
// Use the helper function to create the figure
|
|
265
|
+
// The helper clones the imageToAdd before appending.
|
|
266
|
+
return createFigureWithCaption(imageToAdd, caption, doc);
|
|
267
|
+
}
|
|
268
|
+
else {
|
|
269
|
+
// No meaningful caption found. Return the original element 'el'.
|
|
270
|
+
// Preceding rules should have processed the image content *within* 'el'.
|
|
271
|
+
return el;
|
|
272
|
+
}
|
|
273
|
+
}
|
|
274
|
+
catch (error) {
|
|
275
|
+
console.warn('Error processing complex image element:', error);
|
|
276
|
+
return el;
|
|
277
|
+
}
|
|
278
|
+
}
|
|
279
|
+
},
|
|
280
|
+
];
|
|
281
|
+
/**
 * Builds a <figure> whose children are a deep clone of the given image
 * element followed by a <figcaption>.
 *
 * The caption markup comes from extractUniqueCaptionContent and is turned
 * into nodes with dom_1.parseHTML.
 *
 * @param imageElement   Element to clone into the figure; it is not moved.
 * @param captionElement Element whose content becomes the caption.
 * @param doc            Document used to create the new nodes.
 * @returns The newly created <figure>.
 */
function createFigureWithCaption(imageElement, captionElement, doc) {
    // Clone so the caller's node stays where it is.
    const imageCopy = imageElement.cloneNode(true);

    const captionMarkup = extractUniqueCaptionContent(captionElement);
    const figcaption = doc.createElement('figcaption');
    figcaption.appendChild((0, dom_1.parseHTML)(doc, captionMarkup));

    const figure = doc.createElement('figure');
    figure.appendChild(imageCopy);
    figure.appendChild(figcaption);
    return figure;
}
|
|
295
|
+
/**
 * Sets `srcset` on an image and, when the first URL in that srcset passes
 * isValidImageUrl, also uses it as `src`.
 *
 * @param srcset The srcset string to apply.
 * @param img    The image element to update.
 */
function applySrcsetToImage(srcset, img) {
    img.setAttribute('srcset', srcset);

    const leadingUrl = extractFirstUrlFromSrcset(srcset);
    if (!leadingUrl || !isValidImageUrl(leadingUrl)) {
        return;
    }
    img.setAttribute('src', leadingUrl);
}
|
|
306
|
+
/**
 * Copies every attribute of `source` onto `target`, skipping any attribute
 * whose name appears in `excludeAttrs`.
 *
 * @param source       Element whose attributes are read.
 * @param target       Element that receives the attributes.
 * @param excludeAttrs Attribute names that must not be copied.
 */
function copyAttributesExcept(source, target, excludeAttrs) {
    const attrs = source.attributes;
    let index = 0;
    while (index < attrs.length) {
        const { name, value } = attrs[index];
        index += 1;
        if (excludeAttrs.includes(name)) {
            continue;
        }
        target.setAttribute(name, value);
    }
}
|
|
317
|
+
/**
 * Tells whether `src` is a small base64 image data URL, i.e. most likely a
 * lazy-loading placeholder.
 *
 * SVG data URLs are never treated as placeholders because a small SVG can
 * still be meaningful.
 *
 * @param src The image src to inspect.
 * @returns true when src is a non-SVG base64 data URL with a payload shorter than 133 characters.
 */
function isBase64Placeholder(src) {
    const prefixMatch = src.match(b64DataUrlRegex);
    if (!prefixMatch) {
        return false;
    }

    const [prefix, subtype] = prefixMatch;
    if (subtype === 'svg+xml') {
        return false;
    }

    // 133 base64 characters encode roughly 100 bytes; anything shorter is
    // too small to be a real image.
    const payloadLength = src.length - prefix.length;
    return payloadLength < 133;
}
|
|
336
|
+
/**
 * Tells whether `src` is a data URL carrying an SVG image.
 *
 * @param src The image src to inspect.
 * @returns true when src begins with the SVG data-URL prefix.
 */
function isSvgDataUrl(src) {
    const svgDataPrefix = 'data:image/svg+xml';
    return src.startsWith(svgDataPrefix);
}
|
|
342
|
+
/**
 * Tells whether `src` looks like a usable (non-inline) image URL.
 *
 * A value qualifies when it is a non-blank string, is not a `data:` URL,
 * and either ends in a known image extension (per imageUrlPattern) or
 * contains one of the hints "image", "img" or "photo".
 *
 * @param src Candidate URL; non-string values are rejected rather than throwing.
 * @returns true when src should be treated as an image URL.
 */
function isValidImageUrl(src) {
    // Reject missing, non-string and blank values before calling any string
    // method, so null/undefined yield false instead of a TypeError.
    if (typeof src !== 'string' || src.trim() === '') {
        return false;
    }

    // Skip data URLs (both base64 and SVG)
    if (src.startsWith('data:')) {
        return false;
    }

    if (imageUrlPattern.test(src)) {
        return true;
    }
    return ['image', 'img', 'photo'].some(hint => src.includes(hint));
}
|
|
360
|
+
/**
 * Tells whether an element carries an image source other than its `src`.
 *
 * That is the case when it has a data-src or data-srcset attribute, or when
 * any attribute other than `src` has a value ending in an image extension
 * (optionally followed by a query string).
 *
 * @param element The element to inspect.
 * @returns true when an alternative image source is present.
 */
function hasBetterImageSource(element) {
    if (element.hasAttribute('data-src') || element.hasAttribute('data-srcset')) {
        return true;
    }

    // One pattern covers data-* and regular attributes alike; it is built
    // once instead of on every iteration.
    const imageFilePattern = /\.(jpg|jpeg|png|webp|gif)(\?.*)?$/i;
    for (let i = 0; i < element.attributes.length; i++) {
        const attr = element.attributes[i];
        if (attr.name !== 'src' && imageFilePattern.test(attr.value)) {
            return true;
        }
    }
    return false;
}
|
|
385
|
+
/**
 * Tells whether an element is, or contains, an image-like element
 * (img, video, picture or source).
 *
 * @param element The element to inspect.
 * @returns true when the element itself or any descendant is image-like.
 */
function containsImage(element) {
    return isImageElement(element)
        || element.querySelectorAll('img, video, picture, source').length > 0;
}
|
|
397
|
+
/**
 * Tells whether an element is one of the image-like tags handled by this
 * module: img, video, picture or source (compared case-insensitively).
 *
 * @param element The element to inspect.
 * @returns true for image-like elements, false otherwise.
 */
function isImageElement(element) {
    switch (element.tagName.toLowerCase()) {
        case 'img':
        case 'video':
        case 'picture':
        case 'source':
            return true;
        default:
            return false;
    }
}
|
|
404
|
+
/**
 * Finds the element that best represents the main image of a container.
 *
 * Order of preference:
 *   1. the container itself, when it is image-like;
 *   2. the first <picture> (returned whole so its sources can be processed);
 *   3. the first non-placeholder <img> that has alt text;
 *   4. the first non-placeholder <img> without alt text;
 *   5. the first <video>, then the first <source>;
 *   6. the first <img> of any kind, even a placeholder.
 *
 * Placeholders are SVG data URLs and small base64 data URLs
 * (see isBase64Placeholder).
 *
 * @param element The container to search.
 * @returns The chosen element, or null when the container holds no image.
 */
function findMainImage(element) {
    if (isImageElement(element)) {
        return element;
    }

    const pictures = element.querySelectorAll('picture');
    if (pictures.length > 0) {
        return pictures[0];
    }

    const imgs = element.querySelectorAll('img');
    let firstRealImg = null;
    for (let i = 0; i < imgs.length; i++) {
        const img = imgs[i];
        const src = img.getAttribute('src') || '';
        if (src.includes('data:image/svg+xml') || isBase64Placeholder(src)) {
            continue;
        }
        const alt = img.getAttribute('alt') || '';
        if (alt.trim()) {
            // A real image with alt text beats any alt-less (often decorative) one.
            return img;
        }
        if (firstRealImg === null) {
            firstRealImg = img;
        }
    }
    // No image carries alt text, so an alt-less real image is still the best
    // candidate. It must not lose to a placeholder picked by the fallback.
    if (firstRealImg) {
        return firstRealImg;
    }

    const videos = element.querySelectorAll('video');
    if (videos.length > 0) {
        return videos[0];
    }

    const sources = element.querySelectorAll('source');
    if (sources.length > 0) {
        return sources[0];
    }

    // Only placeholder imgs (if anything) remain at this point.
    return imgs.length > 0 ? imgs[0] : null;
}
|
|
463
|
+
/**
 * Finds the element that holds the caption for the image(s) in a container.
 *
 * Candidates are tried in this order, and the first hit is returned:
 *   1. a <figcaption> descendant;
 *   2. the first non-image descendant with text whose class or attributes
 *      suggest a caption;
 *   3. the alt text of the first <img>, wrapped in a detached <div>;
 *   4. a sibling of the container whose class suggests a caption;
 *   5. an inline text element (em, strong, span, ...) following an <img>;
 *   6. any inline text element inside an <img>'s parent.
 *
 * The document is not modified; the <div> from step 3 is never attached.
 *
 * @param element The container to search.
 * @returns The caption element, or null when none is found.
 */
function findCaption(element) {
    // True when the node has non-whitespace text.
    const hasText = (node) => Boolean(node.textContent?.trim());

    const figcaption = element.querySelector('figcaption');
    if (figcaption) {
        return figcaption;
    }

    // Elements with caption-related classes or attributes.
    const captionSelectors = [
        '[class*="caption"]',
        '[class*="description"]',
        '[class*="alt"]',
        '[class*="title"]',
        '[class*="credit"]',
        '[class*="text"]',
        '[class*="post-thumbnail-text"]',
        '[class*="image-caption"]',
        '[class*="photo-caption"]',
        '[aria-label]',
        '[title]'
    ];
    const captionElements = element.querySelectorAll(captionSelectors.join(', '));
    for (let i = 0; i < captionElements.length; i++) {
        const candidate = captionElements[i];
        // The image itself may match (e.g. via [title]); it is not a caption.
        if (!isImageElement(candidate) && hasText(candidate)) {
            return candidate;
        }
    }

    // Fall back to the alt text of the first image.
    const firstImg = element.querySelector('img');
    const altText = firstImg ? firstImg.getAttribute('alt') : null;
    if (altText && altText.trim().length > 0) {
        const altCaption = element.ownerDocument.createElement('div');
        altCaption.textContent = altText;
        return altCaption;
    }

    // Siblings of the container whose class name hints at a caption.
    const container = element.parentElement;
    if (container) {
        const classHints = ['caption', 'credit', 'text', 'description'];
        const siblings = container.children;
        for (let i = 0; i < siblings.length; i++) {
            const sibling = siblings[i];
            if (sibling === element) {
                continue;
            }
            const looksLikeCaption = Array.from(sibling.classList)
                .some(cls => classHints.some(hint => cls.includes(hint)));
            if (looksLikeCaption && hasText(sibling)) {
                return sibling;
            }
        }
    }

    const imgElements = element.querySelectorAll('img');

    // Inline text that follows an image, e.g. <p><img><em>caption</em></p>.
    const inlineTextTags = ['EM', 'STRONG', 'SPAN', 'I', 'B', 'SMALL', 'CITE'];
    for (let i = 0; i < imgElements.length; i++) {
        const img = imgElements[i];
        if (!img.parentElement) {
            continue;
        }
        for (let next = img.nextElementSibling; next; next = next.nextElementSibling) {
            if (inlineTextTags.includes(next.tagName) && hasText(next)) {
                return next;
            }
        }
    }

    // Any inline text element inside the image's parent,
    // e.g. <span><img><em>caption</em></span>.
    for (let i = 0; i < imgElements.length; i++) {
        const imgParent = imgElements[i].parentElement;
        if (!imgParent) {
            continue;
        }
        const textElements = imgParent.querySelectorAll('em, strong, span, i, b, small, cite');
        for (let j = 0; j < textElements.length; j++) {
            if (hasText(textElements[j])) {
                return textElements[j];
            }
        }
    }

    return null;
}
|
|
583
|
+
/**
 * Produces the HTML for a caption with repeated text removed.
 *
 * Every text node below `caption` is trimmed; each distinct non-empty text
 * is kept once, in document order, and the pieces are joined with single
 * spaces. The pieces are HTML-escaped because the result is an HTML string:
 * decoded text such as "a < b" or "AT&T" must not be read back as markup.
 * When the caption has no text at all, its serialized HTML is returned.
 *
 * @param caption The caption element.
 * @returns An HTML string for the caption content.
 */
function extractUniqueCaptionContent(caption) {
    // Escape the characters that are significant in HTML text content.
    const escapeText = (text) => text
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');

    const fragments = [];
    const seenTexts = new Set();

    // Depth-first walk collecting each distinct trimmed text once.
    const collectFrom = (parent) => {
        const children = parent.childNodes;
        for (let i = 0; i < children.length; i++) {
            const node = children[i];
            if ((0, utils_1.isTextNode)(node)) {
                const text = node.textContent?.trim() || '';
                if (text && !seenTexts.has(text)) {
                    seenTexts.add(text);
                    fragments.push(escapeText(text));
                }
            }
            else if ((0, utils_1.isElement)(node)) {
                collectFrom(node);
            }
        }
    };
    collectFrom(caption);

    if (fragments.length > 0) {
        return fragments.join(' ');
    }
    // No text found: fall back to the caption's own markup.
    return (0, dom_1.serializeHTML)(caption);
}
|
|
620
|
+
/**
 * Decides whether a caption says enough to justify wrapping its image in a
 * <figure>.
 *
 * A caption is rejected when its trimmed text is shorter than 10 characters,
 * starts with http:// or https://, is just a file name/path ending in an
 * image extension, is made of digits only, or is a bare YYYY-MM-DD date.
 *
 * @param caption The caption element.
 * @returns true when the caption text is considered meaningful.
 */
function hasMeaningfulCaption(caption) {
    const text = caption.textContent?.trim() || '';

    const isTooShort = text.length < 10;
    const isBareUrl = text.startsWith('http://') || text.startsWith('https://');
    if (isTooShort || isBareUrl) {
        return false;
    }

    if (filenamePattern.test(text)) {
        return false;
    }

    const isNumberOrDate = /^\d+$/.test(text) || datePattern.test(text);
    return !isNumberOrDate;
}
|
|
642
|
+
/**
 * Dispatch an image-related element to the matching processor.
 *
 * - `img`     -> `processImgElement`
 * - `picture` -> `processImgElement` on the first nested `img`, or a deep
 *                clone of the picture when it contains no `img`
 * - `source`  -> `processSourceElement`
 * - otherwise -> a deep clone of the element
 *
 * @param element - The element to process.
 * @param doc - Document passed through to the per-tag processors.
 * @returns The processed (or cloned) node.
 */
function processImageElement(element, doc) {
    switch (element.tagName.toLowerCase()) {
        case 'img':
            return processImgElement(element, doc);
        case 'picture': {
            // A picture may reach this function after the picture rule has
            // already updated its inner img (for example via rules such as
            // 'span:has(img)' or 'figure'), so the img inside is what gets
            // processed here.
            const innerImg = element.querySelector('img');
            if (innerImg) {
                return processImgElement(innerImg, doc);
            }
            return element.cloneNode(true);
        }
        case 'source':
            return processSourceElement(element, doc);
        default:
            // Unknown tag: hand back an untouched copy.
            return element.cloneNode(true);
    }
}
|
|
664
|
+
/**
 * Process an `img` element, replacing placeholder images where possible.
 *
 * When the `src` is judged a placeholder by `isBase64Placeholder` or
 * `isSvgDataUrl`, and the parent element contains at least one `source`
 * with a non-empty `data-srcset`, a new `img` is created. Its `src` is taken
 * from the original's `data-src` (unless that is itself an SVG data URL) and
 * the remaining attributes are copied with `copyAttributesExcept`, skipping
 * `src`. In every other case a deep clone of the original is returned.
 *
 * @param element - The `img` element to process.
 * @param doc - Document used to create the replacement element.
 * @returns A new `img` element or a clone of `element`.
 */
function processImgElement(element, doc) {
    const currentSrc = element.getAttribute('src') || '';
    const isPlaceholder = isBase64Placeholder(currentSrc) || isSvgDataUrl(currentSrc);
    const container = isPlaceholder ? element.parentElement : null;
    if (!container) {
        return element.cloneNode(true);
    }
    // Only the existence of a lazy-loaded source matters, not which one.
    const hasLazySource = Array.from(container.querySelectorAll('source')).some((source) => source.hasAttribute('data-srcset') && source.getAttribute('data-srcset') !== '');
    if (!hasLazySource) {
        return element.cloneNode(true);
    }
    const replacement = doc.createElement('img');
    const lazySrc = element.getAttribute('data-src');
    if (lazySrc && !isSvgDataUrl(lazySrc)) {
        replacement.setAttribute('src', lazySrc);
    }
    // Carry over everything except the placeholder src.
    copyAttributesExcept(element, replacement, ['src']);
    return replacement;
}
|
|
699
|
+
/**
 * Collapse a `picture` element into a single new `img` element.
 *
 * One `source` is chosen (the only one, or the result of `selectBestSource`
 * when there are several) and its `srcset`, if present, is handed to
 * `applySrcsetToImage` together with the new img. Attributes of the nested
 * original `img` (all but `srcset`) are then copied over, and the original
 * `src`, when non-empty, is written last so it takes precedence.
 *
 * @param element - The `picture` element.
 * @param doc - Document used to create the new `img`.
 * @returns The newly created `img` element.
 */
function processPictureElement(element, doc) {
    const newImg = doc.createElement('img');
    const sourceElements = element.querySelectorAll('source');
    // Pick the source to read the srcset from.
    let chosenSource = null;
    if (sourceElements.length === 1) {
        chosenSource = sourceElements[0];
    }
    else if (sourceElements.length > 1) {
        chosenSource = selectBestSource(sourceElements);
    }
    const chosenSrcset = chosenSource ? chosenSource.getAttribute('srcset') : null;
    if (chosenSrcset) {
        applySrcsetToImage(chosenSrcset, newImg);
    }
    const originalImg = element.querySelector('img');
    if (!originalImg) {
        return newImg;
    }
    // Everything except srcset comes from the original img.
    copyAttributesExcept(originalImg, newImg, ['srcset']);
    // The original src is always applied directly when it exists.
    const originalSrc = originalImg.getAttribute('src');
    if (originalSrc) {
        newImg.setAttribute('src', originalSrc);
    }
    return newImg;
}
|
|
740
|
+
/**
 * Turn a `source` element into a new `img` element.
 *
 * The source's `srcset`, if present, is passed to `applySrcsetToImage` with
 * the new img. The parent is then searched for a sibling `img` to borrow
 * attributes from:
 *   1. the first `img` whose `src` is non-empty and is not rejected by
 *      `isBase64Placeholder` / `isSvgDataUrl`; its `src` is used when the
 *      new img still lacks a valid one;
 *   2. otherwise the first `img[data-src]`; its `data-src` is used when the
 *      new img still lacks a valid src.
 * In both cases attributes other than `src` and `srcset` are copied.
 *
 * @param element - The `source` element.
 * @param doc - Document used to create the new `img`.
 * @returns The newly created `img` element.
 */
function processSourceElement(element, doc) {
    const newImg = doc.createElement('img');
    const srcset = element.getAttribute('srcset');
    if (srcset) {
        applySrcsetToImage(srcset, newImg);
    }
    const parent = element.parentElement;
    if (!parent) {
        return newImg;
    }
    // True while the new img has no src, or one that fails isValidImageUrl.
    const needsSrc = () => !newImg.hasAttribute('src') || !isValidImageUrl(newImg.getAttribute('src') || '');
    const usableImgs = Array.from(parent.querySelectorAll('img')).filter((img) => {
        const src = img.getAttribute('src') || '';
        return !isBase64Placeholder(src) && !isSvgDataUrl(src) && src !== '';
    });
    if (usableImgs.length > 0) {
        const donor = usableImgs[0];
        copyAttributesExcept(donor, newImg, ['src', 'srcset']);
        if (needsSrc()) {
            const donorSrc = donor.getAttribute('src');
            if (donorSrc && isValidImageUrl(donorSrc)) {
                newImg.setAttribute('src', donorSrc);
            }
        }
        return newImg;
    }
    // No img with a real src: fall back to a lazy-loaded one.
    const lazyImg = parent.querySelector('img[data-src]');
    if (lazyImg) {
        copyAttributesExcept(lazyImg, newImg, ['src', 'srcset']);
        if (needsSrc()) {
            const lazySrc = lazyImg.getAttribute('data-src');
            if (lazySrc && isValidImageUrl(lazySrc)) {
                newImg.setAttribute('src', lazySrc);
            }
        }
    }
    return newImg;
}
|
|
790
|
+
/**
 * Extract the first usable URL from a srcset attribute.
 *
 * Entries are located by their trailing width ("424w") or density ("2x")
 * descriptor instead of by splitting on commas, so URLs that contain commas
 * (e.g. Substack CDN URLs like
 * https://substackcdn.com/image/fetch/$s_!YemM!,w_424,c_limit,f_webp/...)
 * stay intact.
 *
 * A candidate without a descriptor may precede one that has a descriptor
 * (e.g. "small.jpg, large.jpg 2x"). In that case the text captured in front
 * of the descriptor holds several candidates separated by comma+whitespace;
 * they are split apart so the first real URL is returned rather than the
 * whole "small.jpg, large.jpg" string. `data:` URLs are never split because
 * their payload may legitimately contain commas and whitespace.
 *
 * URLs for which `isSvgDataUrl` returns true are skipped.
 *
 * @param {string} srcset - Raw srcset attribute value.
 * @returns {string|null} The first usable URL, or null when none is found.
 */
function extractFirstUrlFromSrcset(srcset) {
    if (!srcset || !srcset.trim())
        return null;
    const trimmed = srcset.trim();
    // Each match is "<text> <descriptor>"; the text is everything before the
    // whitespace that precedes the descriptor.
    const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?[wx])/g;
    let match;
    let isFirstEntry = true;
    while ((match = entryPattern.exec(trimmed)) !== null) {
        let entry = match[1].trim();
        if (!isFirstEntry) {
            // Later matches start at the comma that closed the previous entry.
            entry = entry.replace(/^,\s*/, '');
        }
        isFirstEntry = false;
        // An ordinary URL cannot contain whitespace, so comma+whitespace inside
        // the captured text separates descriptor-less candidates.
        const candidates = /^data:/i.test(entry) ? [entry] : entry.split(/,\s+/);
        for (const candidate of candidates) {
            const url = candidate.trim();
            if (url && !isSvgDataUrl(url)) {
                return url;
            }
        }
    }
    // Fallback: no descriptor anywhere (e.g. "a.jpg" or "a.jpg, b.jpg").
    const urlMatch = trimmed.match(urlPattern);
    if (urlMatch && urlMatch[1]) {
        // A trailing comma is the candidate separator, not part of the URL.
        const url = urlMatch[1].replace(/,+$/, '');
        if (url && !isSvgDataUrl(url)) {
            return url;
        }
    }
    return null;
}
|
|
829
|
+
/**
 * Choose one `source` element out of a list.
 *
 * Selection order:
 *   1. no sources -> null; exactly one -> that one;
 *   2. the first source that has no `media` attribute;
 *   3. the source with the largest width * DPR, where width and DPR are the
 *      first capture groups of `widthPattern` / `dprPattern` applied to its
 *      `srcset` (DPR defaults to 1; sources without a width match are
 *      ignored);
 *   4. the first source when nothing scored above zero.
 *
 * @param sources - Array-like collection of `source` elements.
 * @returns The selected source, or null for an empty collection.
 */
function selectBestSource(sources) {
    const total = sources.length;
    if (total === 0) {
        return null;
    }
    if (total === 1) {
        return sources[0];
    }
    const candidates = Array.from(sources);
    // A source without a media query acts as the default choice.
    const unconditional = candidates.find((candidate) => !candidate.hasAttribute('media'));
    if (unconditional) {
        return unconditional;
    }
    // Otherwise rank by effective resolution (width * DPR).
    let winner = null;
    let winnerScore = 0;
    for (const candidate of candidates) {
        const srcset = candidate.getAttribute('srcset');
        if (!srcset) {
            continue;
        }
        const widthMatch = srcset.match(widthPattern);
        const dprMatch = srcset.match(dprPattern);
        if (!widthMatch || !widthMatch[1]) {
            continue;
        }
        const dpr = dprMatch ? parseFloat(dprMatch[1]) : 1;
        const score = parseInt(widthMatch[1], 10) * dpr;
        if (score > winnerScore) {
            winnerScore = score;
            winner = candidate;
        }
    }
    // Nothing could be ranked: default to the first source.
    return winner || sources[0];
}
|
|
877
|
+
//# sourceMappingURL=images.js.map
|