defuddle 0.5.4 → 0.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,779 @@
1
+ "use strict";
2
+ /**
3
+ * Standardization rules for handling images
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.imageRules = void 0;
7
+ // Pre-compile regular expressions
8
+ const b64DataUrlRegex = /^data:image\/([^;]+);base64,/; // prefix of a base64 image data URL; group 1 captures the image subtype
9
+ const srcsetPattern = /\.(jpg|jpeg|png|webp)\s+\d/; // image extension followed by whitespace and a digit (case-sensitive)
10
+ const srcPattern = /^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/; // whole value is one whitespace-free token containing an image extension (case-sensitive)
11
+ const imageUrlPattern = /\.(jpg|jpeg|png|webp|gif|avif)(\?.*)?$/i; // ends with an image extension, optionally followed by a query string
12
+ const widthPattern = /\s(\d+)w/; // first "<digits>w" preceded by whitespace; group 1 captures the digits
13
+ const dprPattern = /dpr=(\d+(?:\.\d+)?)/; // "dpr=<number>"; group 1 captures the integer or decimal value
14
+ const urlPattern = /^([^\s]+)/; // leading run of non-whitespace characters
15
+ const filenamePattern = /^[\w\-\.\/\\]+\.(jpg|jpeg|png|gif|webp|svg)$/i; // whole string is word/dash/dot/slash characters ending in an image extension
16
+ const datePattern = /^\d{4}-\d{2}-\d{2}$/; // digits shaped like YYYY-MM-DD; the value itself is not validated
17
+ exports.imageRules = [
18
+ // Handle picture elements first to ensure we get the highest resolution
19
+ {
20
+ selector: 'picture',
21
+ element: 'img',
22
+ transform: (el, doc) => {
23
+ // Get all source elements
24
+ const sourceElements = el.querySelectorAll('source');
25
+ // Create a new img element
26
+ const newImg = doc.createElement('img');
27
+ // If we have multiple sources, try to select the best one
28
+ if (sourceElements.length > 1) {
29
+ // Find the best source based on media queries and srcset
30
+ const bestSource = selectBestSource(sourceElements);
31
+ if (bestSource) {
32
+ // Get the srcset from the best source
33
+ const srcset = bestSource.getAttribute('srcset');
34
+ if (srcset) {
35
+ applySrcsetToImage(srcset, newImg);
36
+ }
37
+ }
38
+ }
39
+ else if (sourceElements.length === 1) {
40
+ // If only one source, use it
41
+ const srcset = sourceElements[0].getAttribute('srcset');
42
+ if (srcset) {
43
+ applySrcsetToImage(srcset, newImg);
44
+ }
45
+ }
46
+ // Copy other attributes from the original img if it exists
47
+ const originalImg = el.querySelector('img');
48
+ if (originalImg) {
49
+ // Copy all attributes except srcset
50
+ copyAttributesExcept(originalImg, newImg, ['srcset']);
51
+ // Always set the src attribute directly from the original img
52
+ const originalSrc = originalImg.getAttribute('src');
53
+ // Always use the original src if it exists
54
+ if (originalSrc) {
55
+ newImg.setAttribute('src', originalSrc);
56
+ }
57
+ }
58
+ if (!newImg.hasAttribute('src') && originalImg) {
59
+ const originalSrc = originalImg.getAttribute('src');
60
+ if (originalSrc) {
61
+ newImg.setAttribute('src', originalSrc);
62
+ }
63
+ }
64
+ return newImg;
65
+ }
66
+ },
67
+ // Handle lazy-loaded images
68
+ {
69
+ selector: 'img[data-src], img[data-srcset], img[loading="lazy"], img.lazy, img.lazyload',
70
+ element: 'img',
71
+ transform: (el, doc) => {
72
+ // Check for base64 placeholder images
73
+ const src = el.getAttribute('src') || '';
74
+ const hasBetterSource = hasBetterImageSource(el);
75
+ if (isBase64Placeholder(src) && hasBetterSource) {
76
+ // Remove the placeholder src if we have better alternatives
77
+ el.removeAttribute('src');
78
+ }
79
+ // Handle data-src
80
+ const dataSrc = el.getAttribute('data-src');
81
+ if (dataSrc && !el.getAttribute('src')) {
82
+ el.setAttribute('src', dataSrc);
83
+ }
84
+ // Handle data-srcset
85
+ const dataSrcset = el.getAttribute('data-srcset');
86
+ if (dataSrcset && !el.getAttribute('srcset')) {
87
+ el.setAttribute('srcset', dataSrcset);
88
+ }
89
+ // Check for other attributes that might contain image URLs
90
+ for (let i = 0; i < el.attributes.length; i++) {
91
+ const attr = el.attributes[i];
92
+ if (attr.name === 'src' || attr.name === 'srcset' || attr.name === 'alt') {
93
+ continue; // Skip these attributes
94
+ }
95
+ // Check if attribute contains an image URL
96
+ if (srcsetPattern.test(attr.value)) {
97
+ // This looks like a srcset value
98
+ el.setAttribute('srcset', attr.value);
99
+ }
100
+ else if (srcPattern.test(attr.value)) {
101
+ // This looks like a src value
102
+ el.setAttribute('src', attr.value);
103
+ }
104
+ }
105
+ // Remove lazy loading related classes and attributes
106
+ el.classList.remove('lazy', 'lazyload');
107
+ el.removeAttribute('data-ll-status');
108
+ el.removeAttribute('data-src');
109
+ el.removeAttribute('data-srcset');
110
+ el.removeAttribute('loading');
111
+ return el;
112
+ }
113
+ },
114
+ // Handle span elements containing images with captions
115
+ {
116
+ selector: 'span:has(img)',
117
+ element: 'span',
118
+ transform: (el, doc) => {
119
+ try {
120
+ // Check if this element contains an image
121
+ const hasImage = containsImage(el);
122
+ if (!hasImage) {
123
+ return el; // Not an image element, return as is
124
+ }
125
+ // Find the main image element
126
+ const imgElement = findMainImage(el);
127
+ if (!imgElement) {
128
+ return el; // No image found, return as is
129
+ }
130
+ // Find any caption
131
+ const caption = findCaption(el);
132
+ // Process the image element
133
+ const processedImg = processImageElement(imgElement, doc);
134
+ // If there's a meaningful caption, wrap in a figure
135
+ if (caption && hasMeaningfulCaption(caption)) {
136
+ // Create a new figure element
137
+ const figure = doc.createElement('figure');
138
+ // Add the processed image to the figure
139
+ figure.appendChild(processedImg);
140
+ // Add caption - ensure we don't duplicate content
141
+ const figcaption = doc.createElement('figcaption');
142
+ // Extract unique caption content
143
+ const uniqueCaptionContent = extractUniqueCaptionContent(caption);
144
+ figcaption.innerHTML = uniqueCaptionContent;
145
+ figure.appendChild(figcaption);
146
+ // Remove the original caption element to prevent duplication
147
+ if (caption.parentNode) {
148
+ caption.parentNode.removeChild(caption);
149
+ }
150
+ return figure;
151
+ }
152
+ else {
153
+ // No meaningful caption, just return the image
154
+ return processedImg;
155
+ }
156
+ }
157
+ catch (error) {
158
+ console.warn('Error processing span with image:', error);
159
+ return el; // Return original element on error
160
+ }
161
+ }
162
+ },
163
+ // Standardize complex image elements (figure, picture, source, figcaption)
164
+ {
165
+ selector: 'figure, [class*="figure"], [class*="image"], [class*="img"], [class*="photo"], [class*="picture"], [class*="media"], [class*="caption"]',
166
+ element: 'figure',
167
+ transform: (el, doc) => {
168
+ try {
169
+ // Check if this element or its children contain an image
170
+ const hasImage = containsImage(el);
171
+ if (!hasImage) {
172
+ return el; // Not an image element, return as is
173
+ }
174
+ // Find the main image element
175
+ const imgElement = findMainImage(el);
176
+ if (!imgElement) {
177
+ return el; // No image found, return as is
178
+ }
179
+ // Find any caption
180
+ const caption = findCaption(el);
181
+ // Process the image element
182
+ const processedImg = processImageElement(imgElement, doc);
183
+ // If there's a meaningful caption, wrap in a figure
184
+ if (caption && hasMeaningfulCaption(caption)) {
185
+ // Create a new figure element
186
+ const figure = doc.createElement('figure');
187
+ // Add the processed image to the figure
188
+ figure.appendChild(processedImg);
189
+ // Add caption - ensure we don't duplicate content
190
+ const figcaption = doc.createElement('figcaption');
191
+ // Extract unique caption content
192
+ const uniqueCaptionContent = extractUniqueCaptionContent(caption);
193
+ figcaption.innerHTML = uniqueCaptionContent;
194
+ figure.appendChild(figcaption);
195
+ // Remove the original caption element to prevent duplication
196
+ if (caption.parentNode) {
197
+ caption.parentNode.removeChild(caption);
198
+ }
199
+ return figure;
200
+ }
201
+ else {
202
+ // No meaningful caption, just return the image
203
+ return processedImg;
204
+ }
205
+ }
206
+ catch (error) {
207
+ console.warn('Error processing complex image element:', error);
208
+ return el; // Return original element on error
209
+ }
210
+ }
211
+ },
212
+ ];
213
/**
 * Set `srcset` on an image and, when possible, derive its `src` from it.
 *
 * The `srcset` attribute is always written. `src` is only written when the
 * first URL extracted from the srcset passes isValidImageUrl.
 *
 * @param srcset srcset string to apply
 * @param img image element to update
 */
function applySrcsetToImage(srcset, img) {
    img.setAttribute('srcset', srcset);
    const candidate = extractFirstUrlFromSrcset(srcset);
    if (!candidate || !isValidImageUrl(candidate)) {
        return;
    }
    img.setAttribute('src', candidate);
}
224
/**
 * Copy every attribute of `source` onto `target`, skipping the names listed
 * in `excludeAttrs`.
 *
 * @param source element whose attributes are read
 * @param target element that receives the attributes via setAttribute
 * @param excludeAttrs array of attribute names that must not be copied
 */
function copyAttributesExcept(source, target, excludeAttrs) {
    Array.from(source.attributes)
        .filter(({ name }) => !excludeAttrs.includes(name))
        .forEach(({ name, value }) => {
            target.setAttribute(name, value);
        });
}
235
/**
 * Decide whether `src` is a small base64 image data URL that is most likely
 * a loading placeholder.
 *
 * Returns false for anything that is not a base64 image data URL and for
 * SVG data URLs (small SVGs can still be meaningful). Otherwise returns
 * true when the part after the `;base64,` prefix is shorter than 133
 * characters.
 *
 * @param src value of an image `src` attribute
 */
function isBase64Placeholder(src) {
    const header = src.match(b64DataUrlRegex);
    if (!header) {
        return false;
    }
    const [prefix, subtype] = header;
    if (subtype === 'svg+xml') {
        return false;
    }
    // 133 base64 characters correspond to roughly 100 bytes of image data.
    const payloadLength = src.length - prefix.length;
    return payloadLength < 133;
}
254
/**
 * Tell whether `src` begins with the SVG data-URL prefix.
 *
 * @param src URL string to inspect
 */
function isSvgDataUrl(src) {
    const svgDataPrefix = 'data:image/svg+xml';
    return src.startsWith(svgDataPrefix);
}
260
/**
 * Heuristically decide whether `src` looks like a usable image URL.
 *
 * Returns false for non-string, empty or whitespace-only input and for any
 * `data:` URL. Otherwise returns true when the URL ends in a known image
 * extension (optionally followed by a query string) or contains one of the
 * substrings "image", "img" or "photo".
 *
 * @param src candidate URL; may be null or undefined
 * @returns true when the value looks like an image URL
 */
function isValidImageUrl(src) {
    // The type/emptiness guard must come first: calling startsWith on
    // null or undefined would throw before the guard could reject it.
    if (typeof src !== 'string' || src.trim() === '') {
        return false;
    }
    // Skip data URLs (both base64 and SVG)
    if (src.startsWith('data:')) {
        return false;
    }
    return imageUrlPattern.test(src) ||
        src.includes('image') ||
        src.includes('img') ||
        src.includes('photo');
}
278
/**
 * Report whether an element carries an image reference other than its
 * current `src`.
 *
 * True when the element has a `data-src` or `data-srcset` attribute, or when
 * any attribute other than `src` has a value containing a
 * .jpg/.jpeg/.png/.webp extension (case-insensitive).
 *
 * @param element element to inspect
 */
function hasBetterImageSource(element) {
    if (element.hasAttribute('data-src') || element.hasAttribute('data-srcset')) {
        return true;
    }
    const imageExtension = /\.(jpg|jpeg|png|webp)/i;
    return Array.from(element.attributes)
        .some(({ name, value }) => name !== 'src' && imageExtension.test(value));
}
298
/**
 * Report whether `element` is an image-like element itself or has at least
 * one img, video, picture or source descendant.
 *
 * @param element element to inspect
 */
function containsImage(element) {
    return isImageElement(element) ||
        element.querySelectorAll('img, video, picture, source').length > 0;
}
310
/**
 * Report whether the element's tag is one of img, video, picture or source
 * (the tag name is compared case-insensitively).
 *
 * @param element element to inspect
 */
function isImageElement(element) {
    const imageTags = ['img', 'video', 'picture', 'source'];
    return imageTags.includes(element.tagName.toLowerCase());
}
317
/**
 * Pick the main image-like element inside a container.
 *
 * Preference order:
 *   1. the element itself when it is image-like,
 *   2. the first <picture> descendant (returned whole so its sources can be
 *      processed),
 *   3. the first <img> that is not an SVG data URL, not a base64
 *      placeholder and, when the container holds more than one <img>, has
 *      non-blank alt text,
 *   4. the first <video>, then the first <source>,
 *   5. the first match of a combined img/picture/source/video query.
 *
 * @param element container to search
 * @returns the chosen element, or null when nothing matches
 */
function findMainImage(element) {
    if (isImageElement(element)) {
        return element;
    }
    const pictures = element.querySelectorAll('picture');
    if (pictures.length > 0) {
        return pictures[0];
    }
    const imgs = Array.from(element.querySelectorAll('img'));
    const hasSeveralImgs = imgs.length > 1;
    const realImg = imgs.find((img) => {
        const src = img.getAttribute('src') || '';
        const alt = img.getAttribute('alt') || '';
        if (src.includes('data:image/svg+xml') || isBase64Placeholder(src)) {
            return false;
        }
        // Blank alt text is only held against an image when there are
        // several images to choose from.
        return Boolean(alt.trim()) || !hasSeveralImgs;
    });
    if (realImg) {
        return realImg;
    }
    // Remaining fallbacks, tried in order; the last one is the broad
    // catch-all query.
    const fallbackSelectors = ['video', 'source', 'img, picture, source, video'];
    for (const selector of fallbackSelectors) {
        const matches = element.querySelectorAll(selector);
        if (matches.length > 0) {
            return matches[0];
        }
    }
    return null;
}
376
/**
 * Locate an element that can serve as the caption for the image inside
 * `element`.
 *
 * Strategies are tried in order and the first hit is returned:
 *   1. a <figcaption> descendant,
 *   2. the first non-image descendant with non-blank text that matches a
 *      caption-like class substring, or has an aria-label or title attribute,
 *   3. a newly created <div> holding the first image's non-blank alt text,
 *   4. a sibling of `element` with a caption-like class and non-blank text,
 *   5. an inline text element (em, strong, span, i, b, small, cite) that
 *      follows an image as a later sibling,
 *   6. any such inline element inside an image's parent.
 *
 * @param element container to search
 * @returns the caption element, or null when none is found
 */
function findCaption(element) {
    // True when the node's trimmed text content is non-empty.
    const hasText = (node) => {
        const text = node.textContent?.trim();
        return Boolean(text && text.length > 0);
    };
    // 1. Existing figcaption
    const nativeCaption = element.querySelector('figcaption');
    if (nativeCaption) {
        return nativeCaption;
    }
    // 2. Caption-like classes or attributes, combined into a single query
    const captionQuery = [
        '[class*="caption"]',
        '[class*="description"]',
        '[class*="alt"]',
        '[class*="title"]',
        '[class*="credit"]',
        '[class*="text"]',
        '[class*="post-thumbnail-text"]',
        '[class*="image-caption"]',
        '[class*="photo-caption"]',
        '[aria-label]',
        '[title]'
    ].join(', ');
    for (const candidate of Array.from(element.querySelectorAll(captionQuery))) {
        // The image itself may match (e.g. via its title attribute); skip it.
        if (!isImageElement(candidate) && hasText(candidate)) {
            return candidate;
        }
    }
    // 3. Alt text of the first image, wrapped in a fresh element
    const firstImg = element.querySelector('img');
    if (firstImg && firstImg.hasAttribute('alt')) {
        const altText = firstImg.getAttribute('alt');
        if (altText && altText.trim().length > 0) {
            const altCaption = element.ownerDocument.createElement('div');
            altCaption.textContent = altText;
            return altCaption;
        }
    }
    // 4. Siblings of the container with caption-like class names
    const container = element.parentElement;
    if (container) {
        const classHints = ['caption', 'credit', 'text', 'description'];
        for (const sibling of Array.from(container.children)) {
            if (sibling === element) {
                continue;
            }
            const looksLikeCaption = Array.from(sibling.classList)
                .some((cls) => classHints.some((hint) => cls.includes(hint)));
            if (looksLikeCaption && hasText(sibling)) {
                return sibling;
            }
        }
    }
    // 5. Inline text elements following an image, e.g. <p><img><em>caption</em></p>
    const imgs = Array.from(element.querySelectorAll('img'));
    const inlineTags = ['EM', 'STRONG', 'SPAN', 'I', 'B', 'SMALL', 'CITE'];
    for (const img of imgs) {
        if (!img.parentElement) {
            continue;
        }
        for (let next = img.nextElementSibling; next; next = next.nextElementSibling) {
            if (inlineTags.includes(next.tagName) && hasText(next)) {
                return next;
            }
        }
    }
    // 6. Any inline text element inside an image's parent,
    //    e.g. <span><img><em>caption</em></span>
    for (const img of imgs) {
        const imgParent = img.parentElement;
        if (!imgParent) {
            continue;
        }
        const inlineElements = imgParent.querySelectorAll('em, strong, span, i, b, small, cite');
        for (const inline of Array.from(inlineElements)) {
            if (inline !== img && hasText(inline)) {
                return inline;
            }
        }
    }
    return null;
}
496
/**
 * Build the HTML content for a figcaption from a caption element, dropping
 * repeated text.
 *
 * Text nodes are collected depth-first, trimmed, and de-duplicated by their
 * trimmed text. The unique fragments are HTML-escaped and joined with a
 * single space. When the caption contains no non-blank text nodes, the
 * caption's innerHTML is returned unchanged.
 *
 * The result is meant to be assigned to `innerHTML`, which is why the
 * collected text is escaped: raw text containing `<` or `&` would otherwise
 * be re-parsed as markup.
 *
 * @param caption caption element to read
 * @returns HTML string for the new figcaption
 */
function extractUniqueCaptionContent(caption) {
    // Numeric nodeType values from the DOM standard. The global `Node`
    // constructor is not available in plain Node.js, so it is not used here.
    const ELEMENT_NODE = 1;
    const TEXT_NODE = 3;
    const escapeHtml = (text) => text
        .replace(/&/g, '&amp;')
        .replace(/</g, '&lt;')
        .replace(/>/g, '&gt;');
    const fragments = [];
    const seen = new Set();
    // Depth-first walk that records each distinct trimmed text once.
    const collect = (node) => {
        if (node.nodeType === TEXT_NODE) {
            const text = node.textContent?.trim() || '';
            if (text && !seen.has(text)) {
                seen.add(text);
                fragments.push(text);
            }
        }
        else if (node.nodeType === ELEMENT_NODE) {
            for (const child of Array.from(node.childNodes)) {
                collect(child);
            }
        }
    };
    for (const child of Array.from(caption.childNodes)) {
        collect(child);
    }
    if (fragments.length > 0) {
        return fragments.map(escapeHtml).join(' ');
    }
    // No text found: fall back to the caption's own markup.
    return caption.innerHTML;
}
534
/**
 * Decide whether a caption's text is substantial enough to justify wrapping
 * the image in a <figure>.
 *
 * The trimmed text is rejected when it is shorter than 10 characters, starts
 * with http:// or https://, is only a filename/path ending in an image
 * extension, consists solely of digits, or has the shape YYYY-MM-DD.
 *
 * @param caption caption element whose textContent is examined
 */
function hasMeaningfulCaption(caption) {
    const text = caption.textContent?.trim() || '';
    const isTooShort = text.length < 10;
    const isBareUrl = text.startsWith('http://') || text.startsWith('https://');
    if (isTooShort || isBareUrl) {
        return false;
    }
    if (filenamePattern.test(text)) {
        return false;
    }
    const isNumberOrDate = /^\d+$/.test(text) || datePattern.test(text);
    return !isNumberOrDate;
}
556
/**
 * Dispatch an image-like element to the processor for its tag.
 *
 * img, picture and source elements go to their dedicated processors; any
 * other element is returned as a deep clone.
 *
 * @param element image-like element to process
 * @param doc document used by the processors to create elements
 */
function processImageElement(element, doc) {
    switch (element.tagName.toLowerCase()) {
        case 'img':
            return processImgElement(element, doc);
        case 'picture':
            return processPictureElement(element, doc);
        case 'source':
            return processSourceElement(element, doc);
        default:
            return element.cloneNode(true);
    }
}
574
/**
 * Process a single <img>.
 *
 * When the img's src is a base64 placeholder or an SVG data URL, and its
 * parent contains at least one <source> with a non-empty data-srcset, a new
 * <img> is built: its src comes from the original's data-src (unless that is
 * itself an SVG data URL) and all other attributes except src are copied
 * over. In every other case a deep clone of the original is returned.
 *
 * @param element img element to process
 * @param doc document used to create the replacement img
 */
function processImgElement(element, doc) {
    const src = element.getAttribute('src') || '';
    const isPlaceholder = isBase64Placeholder(src) || isSvgDataUrl(src);
    // Only placeholders trigger the search for lazy sources in the parent.
    const container = isPlaceholder ? element.parentElement : null;
    if (container) {
        const hasLazySource = Array.from(container.querySelectorAll('source'))
            .some((source) => source.hasAttribute('data-srcset') && source.getAttribute('data-srcset') !== '');
        if (hasLazySource) {
            const replacement = doc.createElement('img');
            const lazySrc = element.getAttribute('data-src');
            if (lazySrc && !isSvgDataUrl(lazySrc)) {
                replacement.setAttribute('src', lazySrc);
            }
            copyAttributesExcept(element, replacement, ['src']);
            return replacement;
        }
    }
    return element.cloneNode(true);
}
609
/**
 * Collapse a <picture> into a single new <img>.
 *
 * A <source> is chosen (the only one, or the result of selectBestSource when
 * there are several) and its srcset is applied to the new img. Attributes of
 * the picture's own <img>, if present, are then copied over except for
 * srcset, and a non-empty src on that img is written to the new img last.
 *
 * @param element picture element to process
 * @param doc document used to create the new img
 */
function processPictureElement(element, doc) {
    const newImg = doc.createElement('img');
    const sources = element.querySelectorAll('source');
    let chosenSource = null;
    if (sources.length === 1) {
        chosenSource = sources[0];
    }
    else if (sources.length > 1) {
        chosenSource = selectBestSource(sources);
    }
    const srcset = chosenSource ? chosenSource.getAttribute('srcset') : null;
    if (srcset) {
        applySrcsetToImage(srcset, newImg);
    }
    const fallbackImg = element.querySelector('img');
    if (fallbackImg) {
        copyAttributesExcept(fallbackImg, newImg, ['srcset']);
        // The fallback img's src is written after the srcset-derived one.
        const fallbackSrc = fallbackImg.getAttribute('src');
        if (fallbackSrc) {
            newImg.setAttribute('src', fallbackSrc);
        }
    }
    return newImg;
}
650
/**
 * Turn a <source> into a new <img>.
 *
 * The source's srcset, if any, is applied to the new img. Then, within the
 * source's parent, a donor <img> supplies the remaining attributes (never
 * src or srcset): the first img whose src is non-empty and neither a base64
 * placeholder nor an SVG data URL, or failing that the first img with a
 * data-src attribute. If the new img still lacks a valid src afterwards, the
 * donor's src (or data-src for the second kind of donor) is used when it
 * passes isValidImageUrl.
 *
 * @param element source element to process
 * @param doc document used to create the new img
 */
function processSourceElement(element, doc) {
    const newImg = doc.createElement('img');
    const srcset = element.getAttribute('srcset');
    if (srcset) {
        applySrcsetToImage(srcset, newImg);
    }
    // Copy the donor's attributes and, if needed, borrow a src from it.
    const adoptFrom = (donor, srcAttribute) => {
        copyAttributesExcept(donor, newImg, ['src', 'srcset']);
        const hasUsableSrc = newImg.hasAttribute('src') &&
            isValidImageUrl(newImg.getAttribute('src') || '');
        if (hasUsableSrc) {
            return;
        }
        const borrowedSrc = donor.getAttribute(srcAttribute);
        if (borrowedSrc && isValidImageUrl(borrowedSrc)) {
            newImg.setAttribute('src', borrowedSrc);
        }
    };
    const parent = element.parentElement;
    if (!parent) {
        return newImg;
    }
    const realImg = Array.from(parent.querySelectorAll('img')).find((img) => {
        const src = img.getAttribute('src') || '';
        return !isBase64Placeholder(src) && !isSvgDataUrl(src) && src !== '';
    });
    if (realImg) {
        adoptFrom(realImg, 'src');
        return newImg;
    }
    // No usable img: fall back to one that advertises a data-src.
    const lazyImg = parent.querySelector('img[data-src]');
    if (lazyImg) {
        adoptFrom(lazyImg, 'data-src');
    }
    return newImg;
}
700
/**
 * Return the first URL in a srcset that is not an SVG data URL.
 *
 * The srcset is scanned the way the HTML standard describes image candidate
 * strings: a URL is a run of non-whitespace characters, trailing commas on
 * that run end the candidate, and otherwise everything up to the next comma
 * is treated as descriptors and skipped. Commas inside a URL (data URLs,
 * CDN transformation paths such as `w_100,h_100`) therefore stay part of the
 * URL instead of splitting it. A comma-joined list with no whitespace at all
 * (`a.jpg,b.jpg`) is a single URL under these rules.
 *
 * Parenthesised descriptors are not given special treatment.
 *
 * @param srcset value of a srcset attribute
 * @returns the first usable URL, or null when there is none or the input is
 *          not a string
 */
function extractFirstUrlFromSrcset(srcset) {
    if (typeof srcset !== 'string') {
        return null;
    }
    const isSpace = (ch) => /\s/.test(ch);
    const end = srcset.length;
    let pos = 0;
    while (pos < end) {
        // Skip separators before the next candidate.
        while (pos < end && (isSpace(srcset[pos]) || srcset[pos] === ',')) {
            pos++;
        }
        // Collect the URL token: everything up to the next whitespace.
        const start = pos;
        while (pos < end && !isSpace(srcset[pos])) {
            pos++;
        }
        const token = srcset.slice(start, pos);
        const url = token.replace(/,+$/, '');
        if (url === token) {
            // No trailing comma, so descriptors may follow: skip to the
            // comma that ends this candidate.
            while (pos < end && srcset[pos] !== ',') {
                pos++;
            }
        }
        if (url && !isSvgDataUrl(url)) {
            return url;
        }
    }
    return null;
}
731
/**
 * Choose one <source> from a list.
 *
 * Returns null for an empty list and the sole entry for a one-element list.
 * Otherwise the first source without a `media` attribute wins. Failing that,
 * each source's srcset is scored as (first width descriptor) x (dpr= value,
 * defaulting to 1) and the highest positive score wins, the earliest source
 * winning ties. When no source can be scored, the first source is returned.
 *
 * @param sources array-like list of source elements
 */
function selectBestSource(sources) {
    if (sources.length === 0) {
        return null;
    }
    if (sources.length === 1) {
        return sources[0];
    }
    const candidates = Array.from(sources);
    // A source without a media query is treated as the default.
    const defaultSource = candidates.find((source) => !source.hasAttribute('media'));
    if (defaultSource) {
        return defaultSource;
    }
    let winner = null;
    let winningResolution = 0;
    for (const source of candidates) {
        const srcset = source.getAttribute('srcset');
        const widthMatch = srcset ? srcset.match(widthPattern) : null;
        if (!widthMatch || !widthMatch[1]) {
            continue;
        }
        const dprMatch = srcset.match(dprPattern);
        const dpr = dprMatch ? parseFloat(dprMatch[1]) : 1;
        // Effective resolution = width * DPR
        const resolution = parseInt(widthMatch[1], 10) * dpr;
        if (resolution > winningResolution) {
            winningResolution = resolution;
            winner = source;
        }
    }
    return winner ? winner : sources[0];
}
779
+ //# sourceMappingURL=images.js.map