@j0hanz/fetch-url-mcp 1.10.13 → 1.10.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"AA6oBA,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAY/D;AAED,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CASR;AA0CD,qEAAqE;AACrE,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CAG5D;AAyTD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAMN;AA4BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR"}
|
package/dist/lib/dom-prep.js
CHANGED
|
@@ -497,10 +497,61 @@ function processUrlElement(el, attr, base, isSrcset) {
|
|
|
497
497
|
el.setAttribute(attr, resolved.href);
|
|
498
498
|
}
|
|
499
499
|
}
|
|
500
|
+
// Hostnames of the WordPress Photon image proxy: "i" + one digit + ".wp.com".
const WP_PHOTON_HOST_PATTERN = /^i\d\.wp\.com$/;

/**
 * Re-point Photon-proxied image sources at the host of the page being processed.
 *
 * A Photon URL is expected to look like `https://i0.wp.com/<host>/<path...>`.
 * For every `<img>` whose `src` matches that shape and which has no (non-empty)
 * `srcset`, the `src` is replaced with `https://<pageHost>/<path...>`.
 * The host segment embedded in the Photon path is only validated (it must
 * contain a dot); it is not reused, and the query string and fragment of the
 * Photon URL are dropped.
 *
 * @param {Document} document - Document whose images are rewritten in place.
 * @param {string} pageHost - Host name to use in the rewritten URLs.
 * @returns {void}
 */
function rewritePhotonSrc(document, pageHost) {
    const candidates = document.querySelectorAll('img[src]');
    for (const image of candidates) {
        const currentSrc = image.getAttribute('src');
        // Relative or otherwise unparsable values yield null and are ignored.
        const photonUrl = currentSrc ? URL.parse(currentSrc) : null;
        const isPhoton = photonUrl !== null && WP_PHOTON_HOST_PATTERN.test(photonUrl.hostname);
        // Images that carry a srcset are skipped here.
        if (!isPhoton || image.getAttribute('srcset')) {
            continue;
        }
        // First non-empty path segment is the proxied host, the rest is the resource path.
        const [proxiedHost, ...pathParts] = photonUrl.pathname
            .split('/')
            .filter((part) => part !== '');
        if (pathParts.length === 0 || !proxiedHost.includes('.')) {
            continue;
        }
        image.setAttribute('src', `https://${pageHost}/${pathParts.join('/')}`);
    }
}
|
|
524
|
+
/**
 * Split a `srcset` attribute value into its candidate URLs.
 *
 * Splitting on every comma is not safe because URLs may themselves contain
 * commas (for example `?resize=300,200`). Following the HTML srcset parsing
 * rules, a URL is a run of non-whitespace characters; a comma only separates
 * candidates when it trails the URL or appears in the descriptor part.
 * Parenthesised descriptors are not given special treatment.
 *
 * @param {string} srcset - Raw attribute value.
 * @returns {string[]} Candidate URLs in document order, descriptors removed.
 */
function splitSrcsetCandidateUrls(srcset) {
    const urls = [];
    let expectingUrl = true;
    for (const token of srcset.split(/\s+/)) {
        let rest = token;
        while (rest.length > 0) {
            if (!expectingUrl) {
                // Descriptor text: a comma ends the candidate, anything after it
                // belongs to the next candidate (e.g. "1x,next.jpg").
                const commaIndex = rest.indexOf(',');
                if (commaIndex === -1) {
                    break;
                }
                rest = rest.slice(commaIndex + 1);
                expectingUrl = true;
                continue;
            }
            // URL text: separator commas may only lead or trail the token.
            let start = 0;
            while (start < rest.length && rest[start] === ',') {
                start += 1;
            }
            let end = rest.length;
            while (end > start && rest[end - 1] === ',') {
                end -= 1;
            }
            if (end > start) {
                urls.push(rest.slice(start, end));
            }
            // A trailing comma (or a token made only of commas) means the next
            // token starts a new candidate; otherwise descriptors follow.
            expectingUrl = end === start || end < rest.length;
            break;
        }
    }
    return urls;
}

/**
 * For images whose `src` is an absolute URL on a different host than the page,
 * replace `src` with the first `srcset` candidate hosted on the page's host.
 *
 * Images whose `src` is empty, unparsable as an absolute URL, or already on the
 * page host are left untouched, as are images without a same-host candidate.
 *
 * @param {Document} document - Document whose images are updated in place.
 * @param {URL} base - Parsed page URL; only its `hostname` is used.
 * @returns {void}
 */
function preferSameDomainSrc(document, base) {
    const pageHost = base.hostname;
    for (const img of document.querySelectorAll('img[src][srcset]')) {
        const src = img.getAttribute('src');
        if (!src) {
            continue;
        }
        const srcParsed = URL.parse(src);
        if (!srcParsed || srcParsed.hostname === pageHost) {
            continue;
        }
        const sameDomainUrl = splitSrcsetCandidateUrls(img.getAttribute('srcset') ?? '')
            .find((url) => URL.parse(url)?.hostname === pageHost);
        if (sameDomainUrl !== undefined) {
            img.setAttribute('src', sameDomainUrl);
        }
    }
}
|
|
500
549
|
function resolveUrls(document, baseUrlStr) {
|
|
501
550
|
const base = URL.parse(baseUrlStr);
|
|
502
551
|
if (!base)
|
|
503
552
|
return;
|
|
553
|
+
rewritePhotonSrc(document, base.hostname);
|
|
554
|
+
preferSameDomainSrc(document, base);
|
|
504
555
|
const elements = document.querySelectorAll('a[href],img[src],source[srcset]');
|
|
505
556
|
for (const el of elements) {
|
|
506
557
|
const tag = el.tagName.toLowerCase();
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AA4eA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
|
|
@@ -122,10 +122,27 @@ function resolveLazySrc(getAttribute) {
|
|
|
122
122
|
}
|
|
123
123
|
return undefined;
|
|
124
124
|
}
|
|
125
|
+
// WordPress Photon serves images from hosts named "i" + one digit + ".wp.com".
// Pages using it may put the proxy URL in img src while srcset still carries
// another URL, so callers need a way to recognise the proxy form.
const WP_PHOTON_HOST_PATTERN = /^i\d\.wp\.com$/;

/**
 * Tell whether a URL string points at a WordPress Photon host.
 *
 * @param {string} src - Candidate URL; must be absolute to be recognised.
 * @returns {boolean} `true` when `src` parses as a URL whose hostname matches
 *   the Photon pattern, `false` otherwise (including unparsable input).
 */
function isWpPhotonUrl(src) {
    const hostname = URL.parse(src)?.hostname;
    if (hostname === undefined) {
        return false;
    }
    return WP_PHOTON_HOST_PATTERN.test(hostname);
}
|
|
125
132
|
function resolveImageSrc(getAttribute) {
|
|
126
133
|
if (!getAttribute)
|
|
127
134
|
return '';
|
|
128
135
|
const srcRaw = getAttribute('src') ?? '';
|
|
136
|
+
// When src is a CDN proxy URL, prefer srcset which usually has the
|
|
137
|
+
// canonical same-domain URL that survives domain migrations.
|
|
138
|
+
if (srcRaw && isWpPhotonUrl(srcRaw)) {
|
|
139
|
+
const srcset = getAttribute('srcset');
|
|
140
|
+
if (srcset) {
|
|
141
|
+
const url = extractNonDataSrcsetUrl(srcset);
|
|
142
|
+
if (url)
|
|
143
|
+
return url;
|
|
144
|
+
}
|
|
145
|
+
}
|
|
129
146
|
if (srcRaw && !isDataUri(srcRaw))
|
|
130
147
|
return srcRaw;
|
|
131
148
|
// First check common lazy-loading attributes that may contain non-data URLs before falling back to the native srcset, as some sites use data URIs in lazy attributes while still providing valid URLs in srcset.
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AA2DA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAmCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA6ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;
|
|
1
|
+
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AA2DA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAmCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA6ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAicD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA4KD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AA0ID,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAgED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAwjBD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAgBzB;AAaD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AA+G1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
|
|
@@ -294,6 +294,21 @@ function resolveCollapsedTextLengthUpTo(text, max) {
|
|
|
294
294
|
}
|
|
295
295
|
return length;
|
|
296
296
|
}
|
|
297
|
+
/**
 * Flatten gallery-like containers into a plain sequence of `<figure>` elements.
 *
 * Every element whose class attribute contains "gallery", "slideshow" or
 * "carousel" and that has at least one descendant `<img>` is replaced by a
 * fragment holding one `<figure>` per image (each wrapping a deep clone of the
 * image). All other content of the container is discarded.
 *
 * `<html>` and `<body>` are never replaced: the substring class match can hit
 * page-level classes (e.g. "has-gallery"), and replacing those elements would
 * discard the whole page. Galleries nested inside them are still matched by
 * the selector and handled individually.
 *
 * @param {Document} doc - Document to modify in place.
 * @returns {void}
 */
function preserveGalleryImages(doc) {
    const galleries = doc.querySelectorAll('[class*="gallery"],[class*="slideshow"],[class*="carousel"]');
    for (const gallery of galleries) {
        const tagName = gallery.tagName.toUpperCase();
        if (tagName === 'HTML' || tagName === 'BODY') {
            continue;
        }
        const images = gallery.querySelectorAll('img');
        if (images.length === 0) {
            continue;
        }
        const fragment = doc.createDocumentFragment();
        for (const img of images) {
            const figure = doc.createElement('figure');
            figure.appendChild(img.cloneNode(true));
            fragment.appendChild(figure);
        }
        gallery.replaceWith(fragment);
    }
}
|
|
297
312
|
function preserveAlertElements(doc) {
|
|
298
313
|
const alerts = doc.querySelectorAll('[role="alert"], .admonition, [class*="callout"]');
|
|
299
314
|
for (const el of alerts) {
|
|
@@ -313,6 +328,7 @@ function preserveCodeLanguageAttributes(doc) {
|
|
|
313
328
|
}
|
|
314
329
|
const STRUCTURAL_SKIP_TAGS = new Set(['HTML', 'BODY']);
|
|
315
330
|
function prepareReadabilityDocument(readabilityDoc) {
|
|
331
|
+
preserveGalleryImages(readabilityDoc);
|
|
316
332
|
preserveAlertElements(readabilityDoc);
|
|
317
333
|
preserveCodeLanguageAttributes(readabilityDoc);
|
|
318
334
|
normalizeTabContent(readabilityDoc);
|
package/package.json
CHANGED