defuddle 0.5.4 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/constants.d.ts +0 -1
- package/dist/constants.js +87 -28
- package/dist/constants.js.map +1 -1
- package/dist/defuddle.d.ts +1 -14
- package/dist/defuddle.js +23 -907
- package/dist/defuddle.js.map +1 -1
- package/dist/elements/images.d.ts +8 -0
- package/dist/elements/images.js +779 -0
- package/dist/elements/images.js.map +1 -0
- package/dist/index.full.js +1 -1
- package/dist/index.js +1 -1
- package/dist/scoring.d.ts +17 -0
- package/dist/scoring.js +208 -0
- package/dist/scoring.js.map +1 -1
- package/dist/standardize.d.ts +2 -0
- package/dist/standardize.js +830 -0
- package/dist/standardize.js.map +1 -0
- package/dist/utils.d.ts +4 -0
- package/dist/utils.js +38 -0
- package/dist/utils.js.map +1 -0
- package/package.json +1 -1
|
@@ -0,0 +1,779 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Standardization rules for handling images
|
|
4
|
+
*/
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.imageRules = void 0;
|
|
7
|
+
// Pre-compile regular expressions
// Prefix of a base64 image data URL; capture group 1 is the image subtype (e.g. "png", "svg+xml")
const b64DataUrlRegex = /^data:image\/([^;]+);base64,/;
// A jpg/jpeg/png/webp extension followed by whitespace and a digit (case-sensitive)
const srcsetPattern = /\.(jpg|jpeg|png|webp)\s+\d/;
// One whitespace-free token containing a jpg/jpeg/png/webp extension, optionally padded by whitespace (case-sensitive)
const srcPattern = /^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/;
// A string ending in a known image extension, optionally followed by a query string (case-insensitive)
const imageUrlPattern = /\.(jpg|jpeg|png|webp|gif|avif)(\?.*)?$/i;
// A width descriptor such as " 800w"; capture group 1 is the number
const widthPattern = /\s(\d+)w/;
// A "dpr=<number>" fragment; capture group 1 is the integer or decimal value
const dprPattern = /dpr=(\d+(?:\.\d+)?)/;
// The leading run of non-whitespace characters; capture group 1 is that run
const urlPattern = /^([^\s]+)/;
// Text consisting only of a file name or path that ends in an image extension (case-insensitive)
const filenamePattern = /^[\w\-\.\/\\]+\.(jpg|jpeg|png|gif|webp|svg)$/i;
// A date written as YYYY-MM-DD
const datePattern = /^\d{4}-\d{2}-\d{2}$/;
|
|
17
|
+
exports.imageRules = [
|
|
18
|
+
// Handle picture elements first to ensure we get the highest resolution
|
|
19
|
+
{
|
|
20
|
+
selector: 'picture',
|
|
21
|
+
element: 'img',
|
|
22
|
+
transform: (el, doc) => {
|
|
23
|
+
// Get all source elements
|
|
24
|
+
const sourceElements = el.querySelectorAll('source');
|
|
25
|
+
// Create a new img element
|
|
26
|
+
const newImg = doc.createElement('img');
|
|
27
|
+
// If we have multiple sources, try to select the best one
|
|
28
|
+
if (sourceElements.length > 1) {
|
|
29
|
+
// Find the best source based on media queries and srcset
|
|
30
|
+
const bestSource = selectBestSource(sourceElements);
|
|
31
|
+
if (bestSource) {
|
|
32
|
+
// Get the srcset from the best source
|
|
33
|
+
const srcset = bestSource.getAttribute('srcset');
|
|
34
|
+
if (srcset) {
|
|
35
|
+
applySrcsetToImage(srcset, newImg);
|
|
36
|
+
}
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
else if (sourceElements.length === 1) {
|
|
40
|
+
// If only one source, use it
|
|
41
|
+
const srcset = sourceElements[0].getAttribute('srcset');
|
|
42
|
+
if (srcset) {
|
|
43
|
+
applySrcsetToImage(srcset, newImg);
|
|
44
|
+
}
|
|
45
|
+
}
|
|
46
|
+
// Copy other attributes from the original img if it exists
|
|
47
|
+
const originalImg = el.querySelector('img');
|
|
48
|
+
if (originalImg) {
|
|
49
|
+
// Copy all attributes except srcset
|
|
50
|
+
copyAttributesExcept(originalImg, newImg, ['srcset']);
|
|
51
|
+
// Always set the src attribute directly from the original img
|
|
52
|
+
const originalSrc = originalImg.getAttribute('src');
|
|
53
|
+
// Always use the original src if it exists
|
|
54
|
+
if (originalSrc) {
|
|
55
|
+
newImg.setAttribute('src', originalSrc);
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
if (!newImg.hasAttribute('src') && originalImg) {
|
|
59
|
+
const originalSrc = originalImg.getAttribute('src');
|
|
60
|
+
if (originalSrc) {
|
|
61
|
+
newImg.setAttribute('src', originalSrc);
|
|
62
|
+
}
|
|
63
|
+
}
|
|
64
|
+
return newImg;
|
|
65
|
+
}
|
|
66
|
+
},
|
|
67
|
+
// Handle lazy-loaded images
|
|
68
|
+
{
|
|
69
|
+
selector: 'img[data-src], img[data-srcset], img[loading="lazy"], img.lazy, img.lazyload',
|
|
70
|
+
element: 'img',
|
|
71
|
+
transform: (el, doc) => {
|
|
72
|
+
// Check for base64 placeholder images
|
|
73
|
+
const src = el.getAttribute('src') || '';
|
|
74
|
+
const hasBetterSource = hasBetterImageSource(el);
|
|
75
|
+
if (isBase64Placeholder(src) && hasBetterSource) {
|
|
76
|
+
// Remove the placeholder src if we have better alternatives
|
|
77
|
+
el.removeAttribute('src');
|
|
78
|
+
}
|
|
79
|
+
// Handle data-src
|
|
80
|
+
const dataSrc = el.getAttribute('data-src');
|
|
81
|
+
if (dataSrc && !el.getAttribute('src')) {
|
|
82
|
+
el.setAttribute('src', dataSrc);
|
|
83
|
+
}
|
|
84
|
+
// Handle data-srcset
|
|
85
|
+
const dataSrcset = el.getAttribute('data-srcset');
|
|
86
|
+
if (dataSrcset && !el.getAttribute('srcset')) {
|
|
87
|
+
el.setAttribute('srcset', dataSrcset);
|
|
88
|
+
}
|
|
89
|
+
// Check for other attributes that might contain image URLs
|
|
90
|
+
for (let i = 0; i < el.attributes.length; i++) {
|
|
91
|
+
const attr = el.attributes[i];
|
|
92
|
+
if (attr.name === 'src' || attr.name === 'srcset' || attr.name === 'alt') {
|
|
93
|
+
continue; // Skip these attributes
|
|
94
|
+
}
|
|
95
|
+
// Check if attribute contains an image URL
|
|
96
|
+
if (srcsetPattern.test(attr.value)) {
|
|
97
|
+
// This looks like a srcset value
|
|
98
|
+
el.setAttribute('srcset', attr.value);
|
|
99
|
+
}
|
|
100
|
+
else if (srcPattern.test(attr.value)) {
|
|
101
|
+
// This looks like a src value
|
|
102
|
+
el.setAttribute('src', attr.value);
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
// Remove lazy loading related classes and attributes
|
|
106
|
+
el.classList.remove('lazy', 'lazyload');
|
|
107
|
+
el.removeAttribute('data-ll-status');
|
|
108
|
+
el.removeAttribute('data-src');
|
|
109
|
+
el.removeAttribute('data-srcset');
|
|
110
|
+
el.removeAttribute('loading');
|
|
111
|
+
return el;
|
|
112
|
+
}
|
|
113
|
+
},
|
|
114
|
+
// Handle span elements containing images with captions
|
|
115
|
+
{
|
|
116
|
+
selector: 'span:has(img)',
|
|
117
|
+
element: 'span',
|
|
118
|
+
transform: (el, doc) => {
|
|
119
|
+
try {
|
|
120
|
+
// Check if this element contains an image
|
|
121
|
+
const hasImage = containsImage(el);
|
|
122
|
+
if (!hasImage) {
|
|
123
|
+
return el; // Not an image element, return as is
|
|
124
|
+
}
|
|
125
|
+
// Find the main image element
|
|
126
|
+
const imgElement = findMainImage(el);
|
|
127
|
+
if (!imgElement) {
|
|
128
|
+
return el; // No image found, return as is
|
|
129
|
+
}
|
|
130
|
+
// Find any caption
|
|
131
|
+
const caption = findCaption(el);
|
|
132
|
+
// Process the image element
|
|
133
|
+
const processedImg = processImageElement(imgElement, doc);
|
|
134
|
+
// If there's a meaningful caption, wrap in a figure
|
|
135
|
+
if (caption && hasMeaningfulCaption(caption)) {
|
|
136
|
+
// Create a new figure element
|
|
137
|
+
const figure = doc.createElement('figure');
|
|
138
|
+
// Add the processed image to the figure
|
|
139
|
+
figure.appendChild(processedImg);
|
|
140
|
+
// Add caption - ensure we don't duplicate content
|
|
141
|
+
const figcaption = doc.createElement('figcaption');
|
|
142
|
+
// Extract unique caption content
|
|
143
|
+
const uniqueCaptionContent = extractUniqueCaptionContent(caption);
|
|
144
|
+
figcaption.innerHTML = uniqueCaptionContent;
|
|
145
|
+
figure.appendChild(figcaption);
|
|
146
|
+
// Remove the original caption element to prevent duplication
|
|
147
|
+
if (caption.parentNode) {
|
|
148
|
+
caption.parentNode.removeChild(caption);
|
|
149
|
+
}
|
|
150
|
+
return figure;
|
|
151
|
+
}
|
|
152
|
+
else {
|
|
153
|
+
// No meaningful caption, just return the image
|
|
154
|
+
return processedImg;
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
catch (error) {
|
|
158
|
+
console.warn('Error processing span with image:', error);
|
|
159
|
+
return el; // Return original element on error
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
},
|
|
163
|
+
// Standardize complex image elements (figure, picture, source, figcaption)
|
|
164
|
+
{
|
|
165
|
+
selector: 'figure, [class*="figure"], [class*="image"], [class*="img"], [class*="photo"], [class*="picture"], [class*="media"], [class*="caption"]',
|
|
166
|
+
element: 'figure',
|
|
167
|
+
transform: (el, doc) => {
|
|
168
|
+
try {
|
|
169
|
+
// Check if this element or its children contain an image
|
|
170
|
+
const hasImage = containsImage(el);
|
|
171
|
+
if (!hasImage) {
|
|
172
|
+
return el; // Not an image element, return as is
|
|
173
|
+
}
|
|
174
|
+
// Find the main image element
|
|
175
|
+
const imgElement = findMainImage(el);
|
|
176
|
+
if (!imgElement) {
|
|
177
|
+
return el; // No image found, return as is
|
|
178
|
+
}
|
|
179
|
+
// Find any caption
|
|
180
|
+
const caption = findCaption(el);
|
|
181
|
+
// Process the image element
|
|
182
|
+
const processedImg = processImageElement(imgElement, doc);
|
|
183
|
+
// If there's a meaningful caption, wrap in a figure
|
|
184
|
+
if (caption && hasMeaningfulCaption(caption)) {
|
|
185
|
+
// Create a new figure element
|
|
186
|
+
const figure = doc.createElement('figure');
|
|
187
|
+
// Add the processed image to the figure
|
|
188
|
+
figure.appendChild(processedImg);
|
|
189
|
+
// Add caption - ensure we don't duplicate content
|
|
190
|
+
const figcaption = doc.createElement('figcaption');
|
|
191
|
+
// Extract unique caption content
|
|
192
|
+
const uniqueCaptionContent = extractUniqueCaptionContent(caption);
|
|
193
|
+
figcaption.innerHTML = uniqueCaptionContent;
|
|
194
|
+
figure.appendChild(figcaption);
|
|
195
|
+
// Remove the original caption element to prevent duplication
|
|
196
|
+
if (caption.parentNode) {
|
|
197
|
+
caption.parentNode.removeChild(caption);
|
|
198
|
+
}
|
|
199
|
+
return figure;
|
|
200
|
+
}
|
|
201
|
+
else {
|
|
202
|
+
// No meaningful caption, just return the image
|
|
203
|
+
return processedImg;
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
catch (error) {
|
|
207
|
+
console.warn('Error processing complex image element:', error);
|
|
208
|
+
return el; // Return original element on error
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
},
|
|
212
|
+
];
|
|
213
|
+
/**
 * Set `srcset` on an image and, when the first URL extracted from that
 * srcset passes isValidImageUrl, use it as `src` too.
 *
 * @param srcset - srcset string to apply
 * @param img - element that receives the attributes
 */
function applySrcsetToImage(srcset, img) {
    img.setAttribute('srcset', srcset);
    const candidate = extractFirstUrlFromSrcset(srcset);
    if (!candidate) {
        return;
    }
    if (!isValidImageUrl(candidate)) {
        return;
    }
    img.setAttribute('src', candidate);
}
|
|
224
|
+
/**
 * Copy every attribute of `source` onto `target`, skipping the names listed
 * in `excludeAttrs`.
 *
 * @param source - element whose `attributes` collection is read
 * @param target - element that receives the attributes via setAttribute
 * @param excludeAttrs - array of attribute names that must not be copied
 */
function copyAttributesExcept(source, target, excludeAttrs) {
    const attributes = source.attributes;
    let index = 0;
    while (index < attributes.length) {
        const { name, value } = attributes[index];
        index += 1;
        if (excludeAttrs.includes(name)) {
            continue;
        }
        target.setAttribute(name, value);
    }
}
|
|
235
|
+
/**
 * Decide whether `src` is a base64 image data URL small enough to be a
 * placeholder.
 *
 * @param src - candidate src string
 * @returns true for a non-SVG base64 image data URL whose payload is shorter
 *          than 133 characters, false otherwise
 */
function isBase64Placeholder(src) {
    const header = src.match(b64DataUrlRegex);
    // Not a base64 image data URL, or an SVG (which can be meaningful even when small)
    if (!header || header[1] === 'svg+xml') {
        return false;
    }
    // Length of everything after the "data:image/...;base64," prefix
    const payloadLength = src.length - header[0].length;
    return payloadLength < 133;
}
|
|
254
|
+
/**
 * Tell whether `src` begins with the SVG data URL prefix.
 *
 * @param src - candidate src string
 * @returns true when the string starts with "data:image/svg+xml"
 */
function isSvgDataUrl(src) {
    const svgPrefix = 'data:image/svg+xml';
    return src.startsWith(svgPrefix);
}
|
|
260
|
+
/**
 * Heuristic check that a string looks like the URL of an image.
 *
 * The missing/blank guard runs before any string method is called, so a null
 * or undefined src yields false instead of throwing.
 *
 * @param src - candidate URL
 * @returns false for non-strings, blank strings and data URLs; otherwise true
 *          when the value matches imageUrlPattern or contains "image", "img"
 *          or "photo"
 */
function isValidImageUrl(src) {
    // Skip missing, non-string or blank values
    if (typeof src !== 'string' || src.trim() === '') {
        return false;
    }
    // Skip data URLs (both base64 and SVG)
    if (src.startsWith('data:')) {
        return false;
    }
    return imageUrlPattern.test(src) ||
        src.includes('image') ||
        src.includes('img') ||
        src.includes('photo');
}
|
|
278
|
+
/**
 * Report whether an element carries an image reference other than `src`.
 *
 * @param element - element to inspect
 * @returns true when it has a data-src or data-srcset attribute, or when any
 *          attribute other than `src` has a value containing a
 *          .jpg/.jpeg/.png/.webp extension (case-insensitive)
 */
function hasBetterImageSource(element) {
    const hasLazyAttribute = element.hasAttribute('data-src') || element.hasAttribute('data-srcset');
    if (hasLazyAttribute) {
        return true;
    }
    return Array.from(element.attributes).some((attr) => attr.name !== 'src' && /\.(jpg|jpeg|png|webp)/i.test(attr.value));
}
|
|
298
|
+
/**
 * Report whether an element is itself an image-like element or has an
 * img, video, picture or source descendant.
 *
 * @param element - element to inspect
 * @returns true when an image-like element is found
 */
function containsImage(element) {
    return isImageElement(element) ||
        element.querySelectorAll('img, video, picture, source').length > 0;
}
|
|
310
|
+
/**
 * Report whether an element's tag is one of img, video, picture or source.
 *
 * @param element - element whose tagName is checked (case-insensitively)
 * @returns true for the four image-like tags
 */
function isImageElement(element) {
    const imageTags = ['img', 'video', 'picture', 'source'];
    return imageTags.includes(element.tagName.toLowerCase());
}
|
|
317
|
+
/**
 * Find the main image element in a container.
 *
 * Search order: the element itself if it is image-like, the first <picture>,
 * the best <img>, the first <video>, the first <source>, then any image-like
 * descendant.
 *
 * Among <img> elements, SVG data URLs and base64 placeholders are ignored.
 * An image with alt text is preferred, but images without alt text are only
 * passed over when a described image actually exists, so a container whose
 * images all lack alt text still yields its first real image.
 *
 * @param element - container (or image) element to search
 * @returns the chosen element, or null when nothing image-like is found
 */
function findMainImage(element) {
    // If element itself is an image, return it
    if (isImageElement(element)) {
        return element;
    }
    // Picture elements come first; return the picture itself so that all of
    // its sources can be processed
    const pictureElements = element.querySelectorAll('picture');
    if (pictureElements.length > 0) {
        return pictureElements[0];
    }
    // Keep only img elements whose src is not a placeholder
    const realImages = Array.from(element.querySelectorAll('img')).filter((img) => {
        const src = img.getAttribute('src') || '';
        return !src.includes('data:image/svg+xml') && !isBase64Placeholder(src);
    });
    // Empty alt text often marks a decorative image, so prefer a described one
    const described = realImages.find((img) => (img.getAttribute('alt') || '').trim() !== '');
    if (described) {
        return described;
    }
    if (realImages.length > 0) {
        return realImages[0];
    }
    // Look for video elements next
    const videoElements = element.querySelectorAll('video');
    if (videoElements.length > 0) {
        return videoElements[0];
    }
    // Look for any source elements as a last resort
    const anySourceElements = element.querySelectorAll('source');
    if (anySourceElements.length > 0) {
        return anySourceElements[0];
    }
    // Final fallback: accept any image-like descendant, placeholders included
    const allImages = element.querySelectorAll('img, picture, source, video');
    if (allImages.length > 0) {
        return allImages[0];
    }
    return null;
}
|
|
376
|
+
/**
 * Find the element that best represents a caption for the image(s) inside
 * `element`. The first strategy that yields non-blank text wins:
 *
 *  1. a descendant <figcaption> (returned even when empty);
 *  2. a non-image descendant matching a caption-like class, [aria-label] or [title];
 *  3. the alt text of the first <img>, wrapped in a new detached <div>;
 *  4. a sibling of `element` whose class list mentions caption/credit/text/description;
 *  5. an inline text element (em, strong, span, i, b, small, cite) that
 *     follows an <img> among its later siblings;
 *  6. any such inline text element inside an <img>'s parent.
 *
 * @param element - image container to search
 * @returns the caption element, or null when none is found
 */
function findCaption(element) {
    // True when the node carries non-blank text
    const hasText = (node) => {
        const text = node.textContent?.trim();
        return Boolean(text && text.length > 0);
    };
    // 1. Existing figcaption
    const figcaption = element.querySelector('figcaption');
    if (figcaption) {
        return figcaption;
    }
    // 2. Elements with caption-related classes or attributes, in document order
    const captionSelector = [
        '[class*="caption"]',
        '[class*="description"]',
        '[class*="alt"]',
        '[class*="title"]',
        '[class*="credit"]',
        '[class*="text"]',
        '[class*="post-thumbnail-text"]',
        '[class*="image-caption"]',
        '[class*="photo-caption"]',
        '[aria-label]',
        '[title]'
    ].join(', ');
    for (const candidate of Array.from(element.querySelectorAll(captionSelector))) {
        // Image elements match [title]/[aria-label] too; they are not captions
        if (!isImageElement(candidate) && hasText(candidate)) {
            return candidate;
        }
    }
    // 3. Alt attribute of the first image
    const firstImg = element.querySelector('img');
    if (firstImg && firstImg.hasAttribute('alt')) {
        const altText = firstImg.getAttribute('alt');
        if (altText && altText.trim().length > 0) {
            // Create a new element for the alt text
            const altCaption = element.ownerDocument.createElement('div');
            altCaption.textContent = altText;
            return altCaption;
        }
    }
    // 4. Sibling elements that might contain captions
    const parent = element.parentElement;
    if (parent) {
        const captionWords = ['caption', 'credit', 'text', 'description'];
        for (const sibling of Array.from(parent.children)) {
            if (sibling === element) {
                continue;
            }
            const hasCaptionClass = Array.from(sibling.classList).some((cls) => captionWords.some((word) => cls.includes(word)));
            if (hasCaptionClass && hasText(sibling)) {
                return sibling;
            }
        }
    }
    // 5. Text elements that follow an image, e.g. <p><img><em>caption</em></p>
    const inlineTextTags = ['EM', 'STRONG', 'SPAN', 'I', 'B', 'SMALL', 'CITE'];
    const imgElements = Array.from(element.querySelectorAll('img'));
    for (const img of imgElements) {
        if (!img.parentElement) {
            continue;
        }
        for (let next = img.nextElementSibling; next; next = next.nextElementSibling) {
            if (inlineTextTags.includes(next.tagName) && hasText(next)) {
                return next;
            }
        }
    }
    // 6. Text elements anywhere inside the image's parent,
    //    e.g. <span><em>caption</em><img></span>
    for (const img of imgElements) {
        const imgParent = img.parentElement;
        if (!imgParent) {
            continue;
        }
        for (const textEl of Array.from(imgParent.querySelectorAll('em, strong, span, i, b, small, cite'))) {
            if (hasText(textEl)) {
                return textEl;
            }
        }
    }
    return null;
}
|
|
496
|
+
/**
 * Build the HTML content for a caption with repeated text removed.
 *
 * Walks every text node below `caption` in document order, trims it, and
 * keeps the first occurrence of each distinct string. The kept strings are
 * joined with single spaces.
 *
 * The result is meant to be assigned to `innerHTML`, so the collected text is
 * HTML-escaped: textContent is already entity-decoded, and returning it raw
 * would let caption text such as "<img onerror=...>" become live markup.
 *
 * Node types are compared against their numeric values rather than the
 * global `Node`, which is not defined outside a browser.
 *
 * @param caption - caption element to read
 * @returns escaped, de-duplicated caption text, or `caption.innerHTML` when
 *          the caption holds no non-blank text
 */
function extractUniqueCaptionContent(caption) {
    const ELEMENT_NODE = 1; // Node.ELEMENT_NODE
    const TEXT_NODE = 3; // Node.TEXT_NODE
    const escapeHtml = (text) => text
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
    const uniqueTexts = [];
    const seenTexts = new Set();
    // Depth-first walk collecting each distinct trimmed text once
    const collect = (node) => {
        if (node.nodeType === TEXT_NODE) {
            const text = node.textContent?.trim() || '';
            if (text && !seenTexts.has(text)) {
                seenTexts.add(text);
                uniqueTexts.push(text);
            }
        }
        else if (node.nodeType === ELEMENT_NODE) {
            for (const child of Array.from(node.childNodes)) {
                collect(child);
            }
        }
    };
    for (const child of Array.from(caption.childNodes)) {
        collect(child);
    }
    if (uniqueTexts.length > 0) {
        return escapeHtml(uniqueTexts.join(' '));
    }
    // No text at all: fall back to the caption's own markup
    return caption.innerHTML;
}
|
|
534
|
+
/**
 * Decide whether a caption carries enough content to justify a figure.
 *
 * @param caption - caption element; only its trimmed textContent is examined
 * @returns false when the text is shorter than 10 characters, starts with
 *          http:// or https://, is only a file name or path, is all digits,
 *          or is a YYYY-MM-DD date; true otherwise
 */
function hasMeaningfulCaption(caption) {
    const text = caption.textContent?.trim() || '';
    // Each check names one kind of caption that is not worth keeping
    const rejections = [
        () => text.length < 10,
        () => text.startsWith('http://'),
        () => text.startsWith('https://'),
        () => filenamePattern.test(text),
        () => Boolean(text.match(/^\d+$/)),
        () => datePattern.test(text),
    ];
    return !rejections.some((rejects) => rejects());
}
|
|
556
|
+
/**
 * Dispatch an image-like element to the processor for its tag.
 *
 * @param element - img, picture, source or other element
 * @param doc - document passed through to the tag-specific processor
 * @returns the processor's result for img/picture/source, otherwise a deep
 *          clone of `element`
 */
function processImageElement(element, doc) {
    switch (element.tagName.toLowerCase()) {
        case 'img':
            return processImgElement(element, doc);
        case 'picture':
            return processPictureElement(element, doc);
        case 'source':
            return processSourceElement(element, doc);
        default:
            // Anything else (e.g. video) is kept as a copy
            return element.cloneNode(true);
    }
}
|
|
574
|
+
/**
 * Process an img element.
 *
 * When the src is a placeholder (small base64 or SVG data URL) and the
 * parent holds at least one <source> with a non-empty data-srcset, a fresh
 * <img> is built: its src comes from the element's data-src (unless that is
 * an SVG data URL) and every other attribute except src is copied over.
 * In all other cases the element is deep-cloned.
 *
 * @param element - img element to process
 * @param doc - document used to create the replacement img
 * @returns the rebuilt img or a clone of `element`
 */
function processImgElement(element, doc) {
    const currentSrc = element.getAttribute('src') || '';
    const isPlaceholder = isBase64Placeholder(currentSrc) || isSvgDataUrl(currentSrc);
    // Only a placeholder's parent is worth searching for lazy sources
    const container = isPlaceholder ? element.parentElement : null;
    const hasLazySource = Boolean(container) &&
        Array.from(container.querySelectorAll('source')).some((source) => source.hasAttribute('data-srcset') && source.getAttribute('data-srcset') !== '');
    if (!hasLazySource) {
        return element.cloneNode(true);
    }
    const rebuilt = doc.createElement('img');
    const lazySrc = element.getAttribute('data-src');
    if (lazySrc && !isSvgDataUrl(lazySrc)) {
        rebuilt.setAttribute('src', lazySrc);
    }
    // Copy other attributes, leaving the placeholder src behind
    copyAttributesExcept(element, rebuilt, ['src']);
    return rebuilt;
}
|
|
609
|
+
/**
 * Flatten a picture element into a single new img.
 *
 * The srcset of the chosen <source> (the result of selectBestSource when
 * there are several, the only one otherwise) is applied first. Attributes
 * of the picture's own <img>, except srcset, are then copied on top, and
 * that img's src, when present, is set last.
 *
 * @param element - picture element to flatten
 * @param doc - document used to create the new img
 * @returns the newly created img
 */
function processPictureElement(element, doc) {
    const flattened = doc.createElement('img');
    const sources = element.querySelectorAll('source');
    let chosen = null;
    if (sources.length > 1) {
        chosen = selectBestSource(sources);
    }
    else if (sources.length === 1) {
        chosen = sources[0];
    }
    const srcset = chosen ? chosen.getAttribute('srcset') : null;
    if (srcset) {
        applySrcsetToImage(srcset, flattened);
    }
    // Carry over the fallback img's attributes (alt, class, ...) but not its srcset
    const fallbackImg = element.querySelector('img');
    if (fallbackImg) {
        copyAttributesExcept(fallbackImg, flattened, ['srcset']);
        const fallbackSrc = fallbackImg.getAttribute('src');
        if (fallbackSrc) {
            flattened.setAttribute('src', fallbackSrc);
        }
    }
    return flattened;
}
|
|
650
|
+
/**
 * Turn a source element into a new img.
 *
 * The source's srcset is applied first. Remaining attributes come from a
 * "donor" img under the same parent: the first img whose src is non-empty
 * and not a placeholder, or failing that the first img[data-src]. If the new
 * img still lacks a valid src afterwards, the donor's src (or data-src, for
 * the lazy donor) is used when it passes isValidImageUrl.
 *
 * @param element - source element to convert
 * @param doc - document used to create the new img
 * @returns the newly created img
 */
function processSourceElement(element, doc) {
    const converted = doc.createElement('img');
    const srcset = element.getAttribute('srcset');
    if (srcset) {
        applySrcsetToImage(srcset, converted);
    }
    const parent = element.parentElement;
    if (!parent) {
        return converted;
    }
    // First sibling img with a real (non-empty, non-placeholder) src
    const loadedImg = Array.from(parent.querySelectorAll('img')).find((img) => {
        const src = img.getAttribute('src') || '';
        return src !== '' && !isBase64Placeholder(src) && !isSvgDataUrl(src);
    });
    // If no good img found, look for one with data-src
    const donor = loadedImg || parent.querySelector('img[data-src]');
    if (!donor) {
        return converted;
    }
    const fallbackAttribute = loadedImg ? 'src' : 'data-src';
    copyAttributesExcept(donor, converted, ['src', 'srcset']);
    const hasValidSrc = converted.hasAttribute('src') &&
        isValidImageUrl(converted.getAttribute('src') || '');
    if (!hasValidSrc) {
        const fallbackSrc = donor.getAttribute(fallbackAttribute);
        if (fallbackSrc && isValidImageUrl(fallbackSrc)) {
            converted.setAttribute('src', fallbackSrc);
        }
    }
    return converted;
}
|
|
700
|
+
/**
 * Extract the first usable URL from a srcset attribute.
 *
 * Candidates are read the way the HTML srcset grammar defines them: a URL is
 * a run of non-whitespace characters, and a comma only separates candidates
 * when it ends that run or follows the descriptors. Splitting the whole
 * string on commas would truncate URLs that contain commas themselves, such
 * as CDN transform paths ("/w_424,c_limit/a.jpg") and data URLs.
 *
 * SVG data URLs are skipped in favor of the next candidate.
 *
 * @param srcset - srcset attribute value
 * @returns the first URL that is not an SVG data URL, or null when there is
 *          none (including for an empty or missing srcset)
 */
function extractFirstUrlFromSrcset(srcset) {
    if (!srcset) {
        return null;
    }
    const separators = /^[\s,]+/;
    let remaining = srcset.replace(separators, '');
    while (remaining !== '') {
        const urlMatch = remaining.match(urlPattern);
        if (!urlMatch || !urlMatch[1]) {
            return null;
        }
        const token = urlMatch[1];
        remaining = remaining.slice(token.length);
        // A trailing comma ends the candidate and is not part of the URL
        const url = token.replace(/,+$/, '');
        if (url.length === token.length) {
            // Descriptors ("2x", "800w") follow; skip to the next candidate
            const nextComma = remaining.indexOf(',');
            remaining = nextComma === -1 ? '' : remaining.slice(nextComma + 1);
        }
        if (url && !isSvgDataUrl(url)) {
            return url;
        }
        remaining = remaining.replace(separators, '');
    }
    return null;
}
|
|
731
|
+
/**
 * Select the best source element from a list of sources
 * based on media queries and srcset values.
 *
 * Order of preference:
 *  1. the first source without a `media` attribute (the default source);
 *  2. the source with the highest resolution, computed as the largest width
 *     descriptor found anywhere in its srcset multiplied by a "dpr=" value
 *     if one appears (otherwise 1);
 *  3. the first source.
 *
 * Every width descriptor in a srcset is considered, not only the first, so a
 * source listing "320w, 1280w" ranks by 1280.
 *
 * @param sources - array-like list of source elements
 * @returns the chosen source, or null for an empty list
 */
function selectBestSource(sources) {
    if (sources.length === 0) {
        return null;
    }
    // If only one source, return it
    if (sources.length === 1) {
        return sources[0];
    }
    // First, try to find a source without media queries (default)
    for (let i = 0; i < sources.length; i++) {
        if (!sources[i].hasAttribute('media')) {
            return sources[i];
        }
    }
    // Global copy of the width pattern so every descriptor can be scanned
    const everyWidth = new RegExp(widthPattern.source, 'g');
    let bestSource = null;
    let maxResolution = 0;
    for (let i = 0; i < sources.length; i++) {
        const source = sources[i];
        const srcset = source.getAttribute('srcset');
        if (!srcset) {
            continue;
        }
        const widths = Array.from(srcset.matchAll(everyWidth), (match) => parseInt(match[1], 10));
        if (widths.length === 0) {
            continue;
        }
        const dprMatch = srcset.match(dprPattern);
        const dpr = dprMatch ? parseFloat(dprMatch[1]) : 1;
        // Effective resolution (largest width * DPR)
        const resolution = Math.max(...widths) * dpr;
        if (resolution > maxResolution) {
            maxResolution = resolution;
            bestSource = source;
        }
    }
    // If no resolution could be determined, fall back to the first source
    return bestSource || sources[0];
}
|
|
779
|
+
//# sourceMappingURL=images.js.map
|