@mz1999/defuddle 0.14.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +371 -0
  3. package/dist/cli.d.ts +2 -0
  4. package/dist/cli.js +145 -0
  5. package/dist/cli.js.map +1 -0
  6. package/dist/constants.d.ts +24 -0
  7. package/dist/constants.js +950 -0
  8. package/dist/constants.js.map +1 -0
  9. package/dist/defuddle.d.ts +136 -0
  10. package/dist/defuddle.js +1816 -0
  11. package/dist/defuddle.js.map +1 -0
  12. package/dist/elements/callouts.d.ts +6 -0
  13. package/dist/elements/callouts.js +74 -0
  14. package/dist/elements/callouts.js.map +1 -0
  15. package/dist/elements/code.d.ts +5 -0
  16. package/dist/elements/code.js +346 -0
  17. package/dist/elements/code.js.map +1 -0
  18. package/dist/elements/footnotes.d.ts +5 -0
  19. package/dist/elements/footnotes.js +619 -0
  20. package/dist/elements/footnotes.js.map +1 -0
  21. package/dist/elements/headings.d.ts +11 -0
  22. package/dist/elements/headings.js +100 -0
  23. package/dist/elements/headings.js.map +1 -0
  24. package/dist/elements/images.d.ts +8 -0
  25. package/dist/elements/images.js +877 -0
  26. package/dist/elements/images.js.map +1 -0
  27. package/dist/elements/math.base.d.ts +9 -0
  28. package/dist/elements/math.base.js +195 -0
  29. package/dist/elements/math.base.js.map +1 -0
  30. package/dist/elements/math.core.d.ts +7 -0
  31. package/dist/elements/math.core.js +52 -0
  32. package/dist/elements/math.core.js.map +1 -0
  33. package/dist/elements/math.d.ts +2 -0
  34. package/dist/elements/math.full.d.ts +8 -0
  35. package/dist/elements/math.js +7 -0
  36. package/dist/elements/math.js.map +1 -0
  37. package/dist/extractor-registry.d.ts +16 -0
  38. package/dist/extractor-registry.js +140 -0
  39. package/dist/extractor-registry.js.map +1 -0
  40. package/dist/extractors/_base.d.ts +22 -0
  41. package/dist/extractors/_base.js +27 -0
  42. package/dist/extractors/_base.js.map +1 -0
  43. package/dist/extractors/_conversation.d.ts +9 -0
  44. package/dist/extractors/_conversation.js +78 -0
  45. package/dist/extractors/_conversation.js.map +1 -0
  46. package/dist/extractors/chatgpt.d.ts +14 -0
  47. package/dist/extractors/chatgpt.js +138 -0
  48. package/dist/extractors/chatgpt.js.map +1 -0
  49. package/dist/extractors/claude.d.ts +10 -0
  50. package/dist/extractors/claude.js +91 -0
  51. package/dist/extractors/claude.js.map +1 -0
  52. package/dist/extractors/gemini.d.ts +14 -0
  53. package/dist/extractors/gemini.js +111 -0
  54. package/dist/extractors/gemini.js.map +1 -0
  55. package/dist/extractors/github.d.ts +20 -0
  56. package/dist/extractors/github.js +251 -0
  57. package/dist/extractors/github.js.map +1 -0
  58. package/dist/extractors/grok.d.ts +15 -0
  59. package/dist/extractors/grok.js +142 -0
  60. package/dist/extractors/grok.js.map +1 -0
  61. package/dist/extractors/hackernews.d.ts +21 -0
  62. package/dist/extractors/hackernews.js +155 -0
  63. package/dist/extractors/hackernews.js.map +1 -0
  64. package/dist/extractors/reddit.d.ts +22 -0
  65. package/dist/extractors/reddit.js +197 -0
  66. package/dist/extractors/reddit.js.map +1 -0
  67. package/dist/extractors/twitter.d.ts +16 -0
  68. package/dist/extractors/twitter.js +204 -0
  69. package/dist/extractors/twitter.js.map +1 -0
  70. package/dist/extractors/x-article.d.ts +24 -0
  71. package/dist/extractors/x-article.js +267 -0
  72. package/dist/extractors/x-article.js.map +1 -0
  73. package/dist/extractors/x-oembed.d.ts +20 -0
  74. package/dist/extractors/x-oembed.js +350 -0
  75. package/dist/extractors/x-oembed.js.map +1 -0
  76. package/dist/extractors/youtube.d.ts +87 -0
  77. package/dist/extractors/youtube.js +869 -0
  78. package/dist/extractors/youtube.js.map +1 -0
  79. package/dist/fetch.d.ts +18 -0
  80. package/dist/fetch.js +265 -0
  81. package/dist/fetch.js.map +1 -0
  82. package/dist/index.d.ts +3 -0
  83. package/dist/index.full.d.ts +12 -0
  84. package/dist/index.full.js +1 -0
  85. package/dist/index.js +1 -0
  86. package/dist/index.js.map +1 -0
  87. package/dist/markdown.d.ts +30 -0
  88. package/dist/markdown.js +661 -0
  89. package/dist/markdown.js.map +1 -0
  90. package/dist/metadata.d.ts +25 -0
  91. package/dist/metadata.js +426 -0
  92. package/dist/metadata.js.map +1 -0
  93. package/dist/node.d.ts +19 -0
  94. package/dist/node.js +78 -0
  95. package/dist/node.js.map +1 -0
  96. package/dist/scoring.d.ts +31 -0
  97. package/dist/scoring.js +472 -0
  98. package/dist/scoring.js.map +1 -0
  99. package/dist/standardize.d.ts +2 -0
  100. package/dist/standardize.js +1101 -0
  101. package/dist/standardize.js.map +1 -0
  102. package/dist/types/extractors.d.ts +41 -0
  103. package/dist/types/extractors.js +3 -0
  104. package/dist/types/extractors.js.map +1 -0
  105. package/dist/types.d.ts +135 -0
  106. package/dist/types.js +3 -0
  107. package/dist/types.js.map +1 -0
  108. package/dist/utils/comments.d.ts +44 -0
  109. package/dist/utils/comments.js +103 -0
  110. package/dist/utils/comments.js.map +1 -0
  111. package/dist/utils/dom.d.ts +42 -0
  112. package/dist/utils/dom.js +104 -0
  113. package/dist/utils/dom.js.map +1 -0
  114. package/dist/utils/linkedom-compat.d.ts +5 -0
  115. package/dist/utils/linkedom-compat.js +23 -0
  116. package/dist/utils/linkedom-compat.js.map +1 -0
  117. package/dist/utils/transcript.d.ts +37 -0
  118. package/dist/utils/transcript.js +61 -0
  119. package/dist/utils/transcript.js.map +1 -0
  120. package/dist/utils.d.ts +13 -0
  121. package/dist/utils.js +98 -0
  122. package/dist/utils.js.map +1 -0
  123. package/package.json +107 -0
@@ -0,0 +1,877 @@
1
+ "use strict";
2
+ /**
3
+ * Standardization rules for handling images
4
+ */
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.imageRules = void 0;
7
+ const utils_1 = require("../utils");
8
+ const dom_1 = require("../utils/dom");
9
+ const constants_1 = require("../constants");
10
// Pre-compile regular expressions
// Matches the prefix of a base64 image data URL; capture group 1 is the
// image subtype that follows "data:image/" (e.g. "png" or "svg+xml").
const b64DataUrlRegex = /^data:image\/([^;]+);base64,/;
// Matches text containing an image extension followed by whitespace and a
// digit, i.e. something shaped like a srcset candidate ("a.jpg 2x").
const srcsetPattern = /\.(jpg|jpeg|png|webp)\s+\d/;
// Matches a single whitespace-free token (optionally padded with whitespace)
// that contains an image extension.
const srcPattern = /^\s*\S+\.(jpg|jpeg|png|webp)\S*\s*$/;
// Matches a string that ends with an image extension, optionally followed by
// a query string. Case-insensitive.
const imageUrlPattern = /\.(jpg|jpeg|png|webp|gif|avif)(\?.*)?$/i;
// Captures the digits of a " <digits>w" token.
const widthPattern = /\s(\d+)w/;
// Captures the (possibly fractional) number of a "dpr=<number>" token.
const dprPattern = /dpr=(\d+(?:\.\d+)?)/;
// Captures the leading run of non-whitespace characters.
const urlPattern = /^([^\s]+)/;
// Matches a string that is nothing but a file name or path ending in an
// image extension. Case-insensitive.
const filenamePattern = /^[\w\-\.\/\\]+\.(jpg|jpeg|png|gif|webp|svg)$/i;
// Matches a string that is exactly four digits, two digits, two digits
// separated by hyphens (YYYY-MM-DD shape).
const datePattern = /^\d{4}-\d{2}-\d{2}$/;
20
+ exports.imageRules = [
21
+ // Handle picture elements first to ensure we get the highest resolution
22
+ {
23
+ selector: 'picture',
24
+ element: 'picture',
25
+ transform: (el, doc) => {
26
+ const sourceElements = el.querySelectorAll('source');
27
+ const imgElement = el.querySelector('img');
28
+ if (!imgElement) {
29
+ console.warn('Picture element without img fallback:', el.outerHTML);
30
+ const bestSource = selectBestSource(sourceElements);
31
+ if (bestSource) {
32
+ const srcset = bestSource.getAttribute('srcset');
33
+ if (srcset) {
34
+ const newImg = doc.createElement('img');
35
+ applySrcsetToImage(srcset, newImg);
36
+ el.replaceChildren(newImg);
37
+ return el;
38
+ }
39
+ }
40
+ return el;
41
+ }
42
+ let bestSrcset = null;
43
+ let bestSrc = null;
44
+ if (sourceElements.length > 0) {
45
+ const bestSource = selectBestSource(sourceElements);
46
+ if (bestSource) {
47
+ bestSrcset = bestSource.getAttribute('srcset');
48
+ if (bestSrcset) {
49
+ bestSrc = extractFirstUrlFromSrcset(bestSrcset);
50
+ }
51
+ }
52
+ }
53
+ if (bestSrcset) {
54
+ imgElement.setAttribute('srcset', bestSrcset);
55
+ }
56
+ if (bestSrc && isValidImageUrl(bestSrc)) {
57
+ imgElement.setAttribute('src', bestSrc);
58
+ }
59
+ else if (!imgElement.hasAttribute('src') || !isValidImageUrl(imgElement.getAttribute('src') || '')) {
60
+ const firstUrl = extractFirstUrlFromSrcset(imgElement.getAttribute('srcset') || bestSrcset || '');
61
+ if (firstUrl && isValidImageUrl(firstUrl)) {
62
+ imgElement.setAttribute('src', firstUrl);
63
+ }
64
+ }
65
+ sourceElements.forEach(source => source.remove());
66
+ return el;
67
+ }
68
+ },
69
+ // Handle custom <uni-image-full-width> elements
70
+ {
71
+ selector: 'uni-image-full-width',
72
+ element: 'figure',
73
+ transform: (el, doc) => {
74
+ const figure = doc.createElement('figure');
75
+ const img = doc.createElement('img');
76
+ // Find the original image element
77
+ const originalImg = el.querySelector('img');
78
+ if (!originalImg) {
79
+ // If no img inside, return an empty figure or maybe just the original element?
80
+ // Returning empty figure for now, as it represents a failed conversion.
81
+ console.warn('uni-image-full-width without img:', el.outerHTML);
82
+ return figure;
83
+ }
84
+ let bestSrc = originalImg.getAttribute('src'); // Default to src
85
+ const dataLoadingAttr = originalImg.getAttribute('data-loading');
86
+ if (dataLoadingAttr) {
87
+ try {
88
+ const dataLoading = JSON.parse(dataLoadingAttr);
89
+ if (dataLoading.desktop && isValidImageUrl(dataLoading.desktop)) {
90
+ bestSrc = dataLoading.desktop; // Prefer desktop URL
91
+ }
92
+ }
93
+ catch (e) {
94
+ console.warn('Failed to parse data-loading attribute:', dataLoadingAttr, e);
95
+ }
96
+ }
97
+ if (bestSrc && isValidImageUrl(bestSrc)) {
98
+ img.setAttribute('src', bestSrc);
99
+ }
100
+ else {
101
+ // If no valid src found, maybe skip this image?
102
+ console.warn('Could not find valid src for uni-image-full-width:', el.outerHTML);
103
+ return figure; // Return empty figure
104
+ }
105
+ let altText = originalImg.getAttribute('alt');
106
+ if (!altText) {
107
+ altText = el.getAttribute('alt-text'); // Fallback to parent attribute
108
+ }
109
+ if (altText) {
110
+ img.setAttribute('alt', altText);
111
+ }
112
+ // Append the image to the figure
113
+ figure.appendChild(img);
114
+ // Find and add caption
115
+ const figcaptionEl = el.querySelector('figcaption');
116
+ if (figcaptionEl) {
117
+ // Extract text content, potentially from nested elements like <p>
118
+ const captionText = figcaptionEl.textContent?.trim();
119
+ if (captionText && captionText.length > 5) { // Basic check for meaningful caption
120
+ const figcaption = doc.createElement('figcaption');
121
+ // Try to get cleaner text from specific inner element if possible
122
+ const richTextP = figcaptionEl.querySelector('.rich-text p');
123
+ if (richTextP) {
124
+ (0, dom_1.transferContent)(richTextP, figcaption);
125
+ }
126
+ else {
127
+ figcaption.textContent = captionText;
128
+ }
129
+ figure.appendChild(figcaption);
130
+ }
131
+ }
132
+ return figure;
133
+ }
134
+ },
135
+ // Handle lazy-loaded images
136
+ {
137
+ selector: 'img[data-src], img[data-srcset], img[loading="lazy"], img.lazy, img.lazyload',
138
+ element: 'img',
139
+ transform: (el, doc) => {
140
+ // Check for base64 placeholder images
141
+ const src = el.getAttribute('src') || '';
142
+ const hasBetterSource = hasBetterImageSource(el);
143
+ if (isBase64Placeholder(src) && hasBetterSource) {
144
+ // Remove the placeholder src if we have better alternatives
145
+ el.removeAttribute('src');
146
+ }
147
+ // Handle data-src
148
+ const dataSrc = el.getAttribute('data-src');
149
+ if (dataSrc && !el.getAttribute('src')) {
150
+ el.setAttribute('src', dataSrc);
151
+ }
152
+ // Handle data-srcset
153
+ const dataSrcset = el.getAttribute('data-srcset');
154
+ if (dataSrcset && !el.getAttribute('srcset')) {
155
+ el.setAttribute('srcset', dataSrcset);
156
+ }
157
+ // Check for other attributes that might contain image URLs
158
+ for (let i = 0; i < el.attributes.length; i++) {
159
+ const attr = el.attributes[i];
160
+ if (attr.name === 'src' || attr.name === 'srcset' || attr.name === 'alt') {
161
+ continue; // Skip these attributes
162
+ }
163
+ // Skip JSON-like values (e.g., Substack's data-attrs containing image metadata)
164
+ const firstChar = attr.value.charAt(0);
165
+ if (firstChar === '{' || firstChar === '[') {
166
+ continue;
167
+ }
168
+ // Check if attribute contains an image URL
169
+ if (srcsetPattern.test(attr.value)) {
170
+ // This looks like a srcset value
171
+ el.setAttribute('srcset', attr.value);
172
+ }
173
+ else if (srcPattern.test(attr.value)) {
174
+ // This looks like a src value
175
+ el.setAttribute('src', attr.value);
176
+ }
177
+ }
178
+ // Remove lazy loading related classes and attributes
179
+ el.classList.remove('lazy', 'lazyload');
180
+ el.removeAttribute('data-ll-status');
181
+ el.removeAttribute('data-src');
182
+ el.removeAttribute('data-srcset');
183
+ el.removeAttribute('loading');
184
+ return el;
185
+ }
186
+ },
187
+ // Handle span elements containing images with captions
188
+ {
189
+ selector: 'span:has(img)',
190
+ element: 'span',
191
+ transform: (el, doc) => {
192
+ try {
193
+ const hasImage = containsImage(el);
194
+ if (!hasImage) {
195
+ return el;
196
+ }
197
+ // Skip spans that are content containers rather than image wrappers.
198
+ // A span with block-level children (p, h1-h6, div, etc.) is a content
199
+ // container that happens to contain images, not an image wrapper.
200
+ for (const child of el.children) {
201
+ if (constants_1.BLOCK_LEVEL_ELEMENTS.has(child.tagName.toLowerCase())) {
202
+ return el;
203
+ }
204
+ }
205
+ const imgElement = findMainImage(el);
206
+ if (!imgElement) {
207
+ return el;
208
+ }
209
+ const caption = findCaption(el);
210
+ // Process the image element (might return the img itself or handle picture/source)
211
+ const processedImg = processImageElement(imgElement, doc);
212
+ if (caption && hasMeaningfulCaption(caption)) {
213
+ const figure = createFigureWithCaption(processedImg, caption, doc);
214
+ // Remove the original caption element from its parent
215
+ // to prevent duplication, as the span itself might remain.
216
+ if (caption.parentNode) {
217
+ caption.parentNode.removeChild(caption);
218
+ }
219
+ return figure; // Replace the span (or its content) with the figure
220
+ }
221
+ else {
222
+ // No meaningful caption, return just the processed image.
223
+ // This might replace the span content or the span itself depending on framework.
224
+ return processedImg;
225
+ }
226
+ }
227
+ catch (error) {
228
+ console.warn('Error processing span with image:', error);
229
+ return el;
230
+ }
231
+ }
232
+ },
233
+ // Standardize complex image elements (figure, picture, source, figcaption)
234
+ {
235
+ selector: 'figure, p:has([class*="caption"])',
236
+ element: 'figure',
237
+ transform: (el, doc) => {
238
+ try {
239
+ const hasImage = containsImage(el);
240
+ if (!hasImage) {
241
+ return el;
242
+ }
243
+ const imgElement = findMainImage(el); // Initial find (might be picture)
244
+ if (!imgElement) {
245
+ return el;
246
+ }
247
+ // Note: Previous rules might have processed the image inside 'el'.
248
+ const caption = findCaption(el);
249
+ if (caption && hasMeaningfulCaption(caption)) {
250
+ // Find the *current* image element inside 'el' again.
251
+ // It might have been modified (e.g., picture rule -> img)
252
+ const currentImg = findMainImage(el);
253
+ let imageToAdd;
254
+ if (currentImg) {
255
+ // We'll clone this inside the helper function
256
+ imageToAdd = currentImg;
257
+ }
258
+ else {
259
+ // Fallback: process the initially found element.
260
+ console.warn("Figure rule couldn't find current image element in:", el.outerHTML);
261
+ // processImageElement will clone if needed
262
+ imageToAdd = processImageElement(imgElement, doc);
263
+ }
264
+ // Use the helper function to create the figure
265
+ // The helper clones the imageToAdd before appending.
266
+ return createFigureWithCaption(imageToAdd, caption, doc);
267
+ }
268
+ else {
269
+ // No meaningful caption found. Return the original element 'el'.
270
+ // Preceding rules should have processed the image content *within* 'el'.
271
+ return el;
272
+ }
273
+ }
274
+ catch (error) {
275
+ console.warn('Error processing complex image element:', error);
276
+ return el;
277
+ }
278
+ }
279
+ },
280
+ ];
281
/**
 * Build a new <figure> made of a deep clone of the given image element and a
 * <figcaption> derived from the caption element.
 *
 * The caption string from extractUniqueCaptionContent is handed to parseHTML.
 * NOTE(review): that string can be plain text content, so characters such as
 * "<" or "&" in a caption may be re-interpreted as markup. Confirm against
 * the parseHTML implementation.
 *
 * @param imageElement   Element to clone into the figure.
 * @param captionElement Element the caption text is taken from.
 * @param doc            Document used to create the new elements.
 * @returns The newly created <figure> element.
 */
function createFigureWithCaption(imageElement, captionElement, doc) {
    const wrapper = doc.createElement('figure');
    const imageCopy = imageElement.cloneNode(true);
    wrapper.appendChild(imageCopy);

    const captionNode = doc.createElement('figcaption');
    const captionMarkup = extractUniqueCaptionContent(captionElement);
    const captionFragment = (0, dom_1.parseHTML)(doc, captionMarkup);
    captionNode.appendChild(captionFragment);
    wrapper.appendChild(captionNode);

    return wrapper;
}
295
/**
 * Set `srcset` on an image and, when the first URL in that srcset looks like
 * an image URL, use it as the image's `src` as well.
 *
 * @param srcset The srcset string to apply.
 * @param img    The image element to update in place.
 */
function applySrcsetToImage(srcset, img) {
    img.setAttribute('srcset', srcset);
    const leadingUrl = extractFirstUrlFromSrcset(srcset);
    if (!leadingUrl || !isValidImageUrl(leadingUrl)) {
        return;
    }
    img.setAttribute('src', leadingUrl);
}
306
/**
 * Copy every attribute of `source` onto `target`, skipping the attribute
 * names listed in `excludeAttrs`.
 *
 * @param source       Element whose `attributes` collection is read.
 * @param target       Element that receives the attributes via setAttribute.
 * @param excludeAttrs Attribute names that must not be copied.
 */
function copyAttributesExcept(source, target, excludeAttrs) {
    let index = 0;
    while (index < source.attributes.length) {
        const { name, value } = source.attributes[index];
        const isExcluded = excludeAttrs.includes(name);
        if (!isExcluded) {
            target.setAttribute(name, value);
        }
        index += 1;
    }
}
317
/**
 * Decide whether a src value is a tiny base64 image, i.e. most likely a
 * lazy-loading placeholder rather than real content.
 *
 * @param src The src attribute value to inspect.
 * @returns true for a non-SVG base64 image data URL whose payload is shorter
 *          than 133 characters; false otherwise.
 */
function isBase64Placeholder(src) {
    const prefixMatch = src.match(b64DataUrlRegex);
    // Not a base64 image data URL at all.
    if (!prefixMatch) {
        return false;
    }
    const [prefix, subtype] = prefixMatch;
    // SVGs can be meaningful even when they are small.
    if (subtype === 'svg+xml') {
        return false;
    }
    // 133 base64 characters encode roughly 100 bytes; anything shorter is
    // assumed to be a placeholder.
    const payloadLength = src.length - prefix.length;
    return payloadLength < 133;
}
336
/**
 * Tell whether a src value is an SVG data URL.
 *
 * @param src The src attribute value to inspect.
 * @returns true when the string begins with "data:image/svg+xml".
 */
function isSvgDataUrl(src) {
    const isSvg = src.startsWith('data:image/svg+xml');
    return isSvg;
}
342
/**
 * Check whether a string plausibly points at an image.
 *
 * A value is accepted when it is a non-blank string, is not a data URL, and
 * either ends in a known image extension (optionally followed by a query
 * string) or contains one of the substrings "image", "img" or "photo".
 *
 * @param src Candidate URL; non-string values (null, undefined, ...) are rejected.
 * @returns true when the value looks like an image URL, false otherwise.
 */
function isValidImageUrl(src) {
    // Reject non-strings before calling any string method, so null or
    // undefined returns false instead of throwing.
    if (typeof src !== 'string') {
        return false;
    }
    const trimmed = src.trim();
    if (trimmed === '') {
        return false;
    }
    // Data URLs (base64 and SVG) are never usable here. The check runs on the
    // trimmed value so leading whitespace cannot hide the scheme and let the
    // "image" substring heuristic accept it.
    if (trimmed.startsWith('data:')) {
        return false;
    }
    return imageUrlPattern.test(src) ||
        src.includes('image') ||
        src.includes('img') ||
        src.includes('photo');
}
360
/**
 * Report whether an element carries an image source other than its `src`.
 *
 * @param element Element to inspect.
 * @returns true when the element has a data-src or data-srcset attribute, or
 *          when any attribute other than `src` has a value ending in an image
 *          extension (optionally followed by a query string).
 */
function hasBetterImageSource(element) {
    const hasLazyAttribute = element.hasAttribute('data-src') || element.hasAttribute('data-srcset');
    if (hasLazyAttribute) {
        return true;
    }
    // The same extension test applies to data-* and ordinary attributes alike.
    const imageExtension = /\.(jpg|jpeg|png|webp|gif)(\?.*)?$/i;
    return Array.from(element.attributes).some(({ name, value }) => name !== 'src' && imageExtension.test(value));
}
385
/**
 * Tell whether an element is, or has a descendant that is, one of
 * img / video / picture / source.
 *
 * @param element Element to inspect.
 * @returns true when the element itself or any descendant matches.
 */
function containsImage(element) {
    if (isImageElement(element)) {
        return true;
    }
    const mediaDescendants = element.querySelectorAll('img, video, picture, source');
    return mediaDescendants.length > 0;
}
397
/**
 * Tell whether an element's tag is one of img, video, picture or source.
 * The tag name is compared case-insensitively.
 *
 * @param element Element to inspect.
 * @returns true for the four media tags, false for anything else.
 */
function isImageElement(element) {
    const tag = element.tagName.toLowerCase();
    return ['img', 'video', 'picture', 'source'].includes(tag);
}
404
/**
 * Pick the most relevant media element inside a container.
 *
 * Order of preference: the element itself when it is a media element, the
 * first <picture>, the first <img> that is not an obvious placeholder, the
 * first <video>, the first <source>, and finally the first of any
 * img/picture/source/video descendant (which can be an <img> that was
 * skipped as a placeholder).
 *
 * @param element Container to search.
 * @returns The chosen element, or null when nothing is found.
 */
function findMainImage(element) {
    if (isImageElement(element)) {
        return element;
    }

    // A <picture> is returned whole so that all of its sources can be processed.
    const pictures = element.querySelectorAll('picture');
    if (pictures.length > 0) {
        return pictures[0];
    }

    const imgs = element.querySelectorAll('img');
    const hasSeveralImgs = imgs.length > 1;
    for (let index = 0; index < imgs.length; index += 1) {
        const candidate = imgs[index];
        const src = candidate.getAttribute('src') || '';
        const alt = candidate.getAttribute('alt') || '';
        // SVG data URLs and tiny base64 payloads are treated as placeholders.
        const isPlaceholder = src.includes('data:image/svg+xml') || isBase64Placeholder(src);
        // An alt-less image is skipped only when it is not the sole <img>.
        const isLikelyDecorative = !alt.trim() && hasSeveralImgs;
        if (!isPlaceholder && !isLikelyDecorative) {
            return candidate;
        }
    }

    const videos = element.querySelectorAll('video');
    if (videos.length > 0) {
        return videos[0];
    }

    const sources = element.querySelectorAll('source');
    if (sources.length > 0) {
        return sources[0];
    }

    // Last resort: accept any media descendant, including a skipped <img>.
    const anyMedia = element.querySelectorAll('img, picture, source, video');
    return anyMedia.length > 0 ? anyMedia[0] : null;
}
463
/**
 * Locate the element that most likely holds the caption for the image(s)
 * inside `element`.
 *
 * Strategies are tried in order and the first hit wins:
 *   1. a <figcaption> descendant;
 *   2. the first non-media descendant with non-empty text that matches a
 *      caption-like class or carries aria-label/title;
 *   3. the alt text of the first <img>, wrapped in a new detached <div>;
 *   4. a sibling of `element` whose class list looks caption-like;
 *   5. an inline text element that follows an <img> among its siblings;
 *   6. any inline text element inside the parent of an <img>.
 *
 * @param element Container to search.
 * @returns The caption element (possibly a newly created <div>), or null.
 */
function findCaption(element) {
    // True when the node has text content that is not just whitespace.
    const hasText = (node) => {
        const text = node.textContent?.trim();
        return Boolean(text && text.length > 0);
    };

    // 1. Existing figcaption
    const figcaption = element.querySelector('figcaption');
    if (figcaption) {
        return figcaption;
    }

    // 2. Descendants with caption-related classes or attributes
    const captionSelectors = [
        '[class*="caption"]',
        '[class*="description"]',
        '[class*="alt"]',
        '[class*="title"]',
        '[class*="credit"]',
        '[class*="text"]',
        '[class*="post-thumbnail-text"]',
        '[class*="image-caption"]',
        '[class*="photo-caption"]',
        '[aria-label]',
        '[title]'
    ];
    const captionElements = element.querySelectorAll(captionSelectors.join(', '));
    for (let i = 0; i < captionElements.length; i++) {
        const captionEl = captionElements[i];
        // The image itself can match (e.g. via its title attribute); skip it.
        if (isImageElement(captionEl)) {
            continue;
        }
        if (hasText(captionEl)) {
            return captionEl;
        }
    }

    // 3. Alt text of the first image, wrapped in a new detached element
    const imgElement = element.querySelector('img');
    if (imgElement && imgElement.hasAttribute('alt')) {
        const altText = imgElement.getAttribute('alt');
        if (altText && altText.trim().length > 0) {
            const captionEl = element.ownerDocument.createElement('div');
            captionEl.textContent = altText;
            return captionEl;
        }
    }

    // 4. Siblings of the container whose class names look caption-like
    if (element.parentElement) {
        const siblings = element.parentElement.children;
        const captionHints = ['caption', 'credit', 'text', 'description'];
        for (let i = 0; i < siblings.length; i++) {
            const sibling = siblings[i];
            if (sibling === element) {
                continue;
            }
            const hasCaptionClass = Array.from(sibling.classList).some(cls => captionHints.some(hint => cls.includes(hint)));
            if (hasCaptionClass && hasText(sibling)) {
                return sibling;
            }
        }
    }

    const imgElements = element.querySelectorAll('img');
    const inlineTextTags = ['EM', 'STRONG', 'SPAN', 'I', 'B', 'SMALL', 'CITE'];

    // 5. Inline text elements that follow an image, e.g. <p><img><em>caption</em></p>
    for (let i = 0; i < imgElements.length; i++) {
        const img = imgElements[i];
        if (!img.parentElement) {
            continue;
        }
        let nextElement = img.nextElementSibling;
        while (nextElement) {
            if (inlineTextTags.includes(nextElement.tagName) && hasText(nextElement)) {
                return nextElement;
            }
            nextElement = nextElement.nextElementSibling;
        }
    }

    // 6. Any inline text element inside the image's parent,
    //    e.g. <span><img><em>caption</em></span>
    for (let i = 0; i < imgElements.length; i++) {
        const parent = imgElements[i].parentElement;
        if (!parent) {
            continue;
        }
        const textElements = parent.querySelectorAll('em, strong, span, i, b, small, cite');
        for (let j = 0; j < textElements.length; j++) {
            if (hasText(textElements[j])) {
                return textElements[j];
            }
        }
    }

    return null;
}
583
/**
 * Produce the caption string for a caption element, dropping repeated text.
 *
 * All descendant text nodes are visited in document order; each distinct
 * trimmed, non-empty text is kept once and the results are joined with single
 * spaces. When no text is found, the serialized HTML of the caption is
 * returned instead.
 *
 * @param caption The caption element.
 * @returns Space-joined unique text, or the serializeHTML output as a fallback.
 */
function extractUniqueCaptionContent(caption) {
    // A Set keeps insertion order, so it doubles as the ordered list of texts.
    const uniqueTexts = new Set();

    const visitChildren = (parent) => {
        const children = parent.childNodes;
        for (let index = 0; index < children.length; index += 1) {
            const child = children[index];
            if ((0, utils_1.isTextNode)(child)) {
                const text = child.textContent?.trim() || '';
                if (text) {
                    uniqueTexts.add(text);
                }
            }
            else if ((0, utils_1.isElement)(child)) {
                visitChildren(child);
            }
        }
    };
    visitChildren(caption);

    if (uniqueTexts.size > 0) {
        return Array.from(uniqueTexts).join(' ');
    }
    // Nothing textual found: fall back to the caption's serialized HTML.
    return (0, dom_1.serializeHTML)(caption);
}
620
/**
 * Decide whether a caption carries enough information to justify wrapping the
 * image in a <figure>.
 *
 * @param caption Element whose text content is evaluated.
 * @returns false when the trimmed text is shorter than 10 characters, starts
 *          with http:// or https://, is only a file name/path, only digits,
 *          or only a YYYY-MM-DD-shaped string; true otherwise.
 */
function hasMeaningfulCaption(caption) {
    const text = caption.textContent?.trim() || '';

    const isTooShort = text.length < 10;
    const isBareUrl = text.startsWith('http://') || text.startsWith('https://');
    if (isTooShort || isBareUrl) {
        return false;
    }

    // A lone file name or path says nothing about the image.
    if (filenamePattern.test(text)) {
        return false;
    }

    // Neither does a bare number or date.
    const isOnlyDigits = /^\d+$/.test(text);
    if (isOnlyDigits || datePattern.test(text)) {
        return false;
    }

    return true;
}
642
/**
 * Turn a media element into the node that should be placed in the output,
 * dispatching on its tag name.
 *
 * - img: handled by processImgElement.
 * - picture: the inner <img> is handled by processImgElement; a picture
 *   without an <img> is deep-cloned.
 * - source: handled by processSourceElement.
 * - anything else: deep-cloned.
 *
 * @param element The media element to process.
 * @param doc     Document used to create new elements.
 * @returns A new or cloned element; the input is not returned directly.
 */
function processImageElement(element, doc) {
    switch (element.tagName.toLowerCase()) {
        case 'img':
            return processImgElement(element, doc);
        case 'picture': {
            // The picture rule updates the inner <img> in place, so that <img>
            // is what gets extracted here.
            const innerImg = element.querySelector('img');
            if (innerImg) {
                return processImgElement(innerImg, doc);
            }
            return element.cloneNode(true);
        }
        case 'source':
            return processSourceElement(element, doc);
        default:
            return element.cloneNode(true);
    }
}
664
/**
 * Process an <img>, replacing a placeholder with its lazy-loaded source when
 * possible.
 *
 * When the src is a tiny base64 image or an SVG data URL and the parent
 * contains at least one <source> with a non-empty data-srcset, a new <img> is
 * built: its src comes from the original's data-src (unless that is an SVG
 * data URL) and every other attribute except src is copied over. In all other
 * cases a deep clone of the original is returned.
 *
 * NOTE(review): the matching <source> elements only act as a trigger; their
 * data-srcset values are not applied to the new image. Confirm this is intended.
 *
 * @param element The <img> element to process.
 * @param doc     Document used to create the replacement element.
 * @returns A new <img> or a deep clone of `element`.
 */
function processImgElement(element, doc) {
    const currentSrc = element.getAttribute('src') || '';
    const isPlaceholder = isBase64Placeholder(currentSrc) || isSvgDataUrl(currentSrc);
    const container = isPlaceholder ? element.parentElement : null;

    if (container) {
        const hasLazySource = Array.from(container.querySelectorAll('source')).some(source => source.hasAttribute('data-srcset') && source.getAttribute('data-srcset') !== '');
        if (hasLazySource) {
            const replacement = doc.createElement('img');
            const lazySrc = element.getAttribute('data-src');
            if (lazySrc && !isSvgDataUrl(lazySrc)) {
                replacement.setAttribute('src', lazySrc);
            }
            // Carry over everything but the placeholder src.
            copyAttributesExcept(element, replacement, ['src']);
            return replacement;
        }
    }

    return element.cloneNode(true);
}
699
/**
 * Flatten a <picture> into a single new <img>.
 *
 * The srcset of the chosen <source> (the only one, or the one picked by
 * selectBestSource when there are several) is applied first. Then all
 * attributes of the inner <img> except srcset are copied, and its src, when
 * non-empty, is set explicitly so it takes precedence over any src derived
 * from the srcset.
 *
 * @param element The <picture> element.
 * @param doc     Document used to create the new <img>.
 * @returns The newly created <img> element.
 */
function processPictureElement(element, doc) {
    const flattened = doc.createElement('img');
    const sources = element.querySelectorAll('source');

    // Choose the source: best of several, or the single one available.
    let chosenSource = null;
    if (sources.length > 1) {
        chosenSource = selectBestSource(sources);
    }
    else if (sources.length === 1) {
        chosenSource = sources[0];
    }
    if (chosenSource) {
        const chosenSrcset = chosenSource.getAttribute('srcset');
        if (chosenSrcset) {
            applySrcsetToImage(chosenSrcset, flattened);
        }
    }

    const innerImg = element.querySelector('img');
    if (innerImg) {
        copyAttributesExcept(innerImg, flattened, ['srcset']);
        // The original src always wins over a srcset-derived one.
        const innerSrc = innerImg.getAttribute('src');
        if (innerSrc) {
            flattened.setAttribute('src', innerSrc);
        }
    }
    return flattened;
}
740
/**
 * Turn a standalone <source> element into a new <img>.
 *
 * The source's own `srcset`, when present, is passed to applySrcsetToImage.
 * A "donor" <img> is then looked up under the source's parent to supply the
 * remaining attributes: preferably the first <img> whose `src` is non-empty
 * and is neither a base64 placeholder nor an SVG data URL; otherwise the first
 * `img[data-src]`. The donor's attributes (except `src` and `srcset`) are
 * copied, and if the result still has no `src` accepted by isValidImageUrl,
 * the donor's `src` (or `data-src` for the fallback donor) is used when it is
 * itself valid.
 *
 * @param {Element} element - The <source> being processed.
 * @param {Document} doc - Document used to create the resulting <img>.
 * @returns {Element} The newly created <img>.
 */
function processSourceElement(element, doc) {
    const output = doc.createElement('img');
    const ownSrcset = element.getAttribute('srcset');
    if (ownSrcset) {
        applySrcsetToImage(ownSrcset, output);
    }
    const container = element.parentElement;
    if (!container) {
        return output;
    }
    // First choice: an <img> that already points at a real image.
    const siblingImgs = container.querySelectorAll('img');
    let donor = null;
    let fallbackAttr = 'src';
    for (let idx = 0; idx < siblingImgs.length; idx++) {
        const candidateSrc = siblingImgs[idx].getAttribute('src') || '';
        if (isBase64Placeholder(candidateSrc) || isSvgDataUrl(candidateSrc) || candidateSrc === '') {
            continue;
        }
        donor = siblingImgs[idx];
        break;
    }
    // Second choice: a lazy-loaded <img>, whose real URL lives in `data-src`.
    if (donor === null) {
        donor = container.querySelector('img[data-src]');
        fallbackAttr = 'data-src';
    }
    if (!donor) {
        return output;
    }
    copyAttributesExcept(donor, output, ['src', 'srcset']);
    const hasUsableSrc = output.hasAttribute('src')
        && isValidImageUrl(output.getAttribute('src') || '');
    if (!hasUsableSrc) {
        const fallbackUrl = donor.getAttribute(fallbackAttr);
        if (fallbackUrl && isValidImageUrl(fallbackUrl)) {
            output.setAttribute('src', fallbackUrl);
        }
    }
    return output;
}
790
/**
 * Extract the first usable URL from a srcset attribute.
 *
 * Entries are located by their trailing width ("424w") or density ("2x")
 * descriptor rather than by splitting on commas, so URLs that themselves
 * contain commas (e.g. Substack CDN URLs like
 * https://substackcdn.com/image/fetch/$s_!YemM!,w_424,c_limit,f_webp/...)
 * are kept intact. Entries that isSvgDataUrl flags are skipped.
 *
 * A leading candidate that has no descriptor (e.g. "a.jpg, a@2x.jpg 2x") is
 * recognised as well: a URL token that ends in a comma and is followed by
 * whitespace terminates its candidate, so "a.jpg" is returned rather than the
 * merged string "a.jpg, a@2x.jpg".
 *
 * @param {string} srcset - Raw srcset attribute value; may be empty or nullish.
 * @returns {string|null} The first non-SVG URL found, or null when there is none.
 */
function extractFirstUrlFromSrcset(srcset) {
    if (!srcset || !srcset.trim())
        return null;
    const trimmed = srcset.trim();
    // Each match is "<anything, lazily> <whitespace> <descriptor>". The lazy
    // capture stops at the first descriptor, which keeps comma-bearing URLs whole.
    const entryPattern = /(.+?)\s+(\d+(?:\.\d+)?[wx])/g;
    let match;
    let lastIndex = 0;
    while ((match = entryPattern.exec(trimmed)) !== null) {
        let url = match[1].trim();
        if (lastIndex > 0) {
            // Every match after the first begins with the separator left over
            // from the previous entry.
            url = url.replace(/^,\s*/, '');
        }
        lastIndex = entryPattern.lastIndex;
        if (!url)
            continue;
        // Skip SVG data URLs
        if (isSvgDataUrl(url))
            continue;
        // The lazy capture swallows any descriptor-less candidates that come
        // before the first descriptor. A first token ending in comma(s) and
        // followed by whitespace is such a candidate: return it without the
        // trailing commas. URLs with embedded commas but no whitespace never
        // match this pattern and are returned unchanged.
        const bareCandidate = url.match(/^(\S*?),+\s/);
        if (bareCandidate && bareCandidate[1]) {
            return bareCandidate[1];
        }
        return url;
    }
    // Fallback for a srcset with no descriptor at all. `urlPattern` is defined
    // elsewhere in this file; its first capture group is used as the URL.
    const urlMatch = trimmed.match(urlPattern);
    if (urlMatch && urlMatch[1] && !isSvgDataUrl(urlMatch[1])) {
        return urlMatch[1];
    }
    return null;
}
829
/**
 * Choose one <source> out of a list.
 *
 * Selection order:
 *   1. An empty list yields null; a single-item list yields that item.
 *   2. The first source without a `media` attribute wins outright.
 *   3. Otherwise each source with a `srcset` is scored as
 *      width * DPR, where width comes from `widthPattern` and DPR from
 *      `dprPattern` (DPR defaults to 1 when absent). Sources whose srcset has
 *      no width match are not scored. The highest strictly positive score wins.
 *   4. If nothing could be scored, the first source is returned.
 *
 * @param {ArrayLike<Element>} sources - Candidate <source> elements.
 * @returns {Element|null} The selected source, or null for an empty list.
 */
function selectBestSource(sources) {
    const total = sources.length;
    if (total === 0) {
        return null;
    }
    if (total === 1) {
        return sources[0];
    }
    // A source with no media query is treated as the default choice.
    for (let idx = 0; idx < total; idx++) {
        const candidate = sources[idx];
        if (!candidate.hasAttribute('media')) {
            return candidate;
        }
    }
    // Every source is media-conditional: rank them by effective resolution.
    let winner = null;
    let topScore = 0;
    for (let idx = 0; idx < total; idx++) {
        const candidate = sources[idx];
        const candidateSrcset = candidate.getAttribute('srcset');
        if (!candidateSrcset) {
            continue;
        }
        const widthHit = candidateSrcset.match(widthPattern);
        const dprHit = candidateSrcset.match(dprPattern);
        if (!widthHit || !widthHit[1]) {
            continue;
        }
        const pixelWidth = parseInt(widthHit[1], 10);
        const density = dprHit ? parseFloat(dprHit[1]) : 1;
        const score = pixelWidth * density;
        if (score > topScore) {
            topScore = score;
            winner = candidate;
        }
    }
    // Fall back to the first source when no srcset produced a score.
    return winner ? winner : sources[0];
}
877
+ //# sourceMappingURL=images.js.map