@j0hanz/fetch-url-mcp 1.10.13 → 1.10.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- {"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"AA4lBA,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAY/D;AAED,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CASR;AA0CD,qEAAqE;AACrE,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CAG5D;AAyTD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAMN;AA4BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR"}
1
+ {"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"AA6oBA,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAY/D;AAED,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CASR;AA0CD,qEAAqE;AACrE,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CAG5D;AAyTD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAMN;AA4BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR"}
@@ -497,10 +497,61 @@ function processUrlElement(el, attr, base, isSrcset) {
497
497
  el.setAttribute(attr, resolved.href);
498
498
  }
499
499
  }
// Host names of the WordPress Photon image CDN (i0.wp.com, i1.wp.com, ...).
// Photon URLs are expected to have the shape
//   https://i0.wp.com/<origin-host>/<path-to-image>?<resize-options>
const WP_PHOTON_HOST_PATTERN = /^i\d\.wp\.com$/;
/**
 * Rewrites Photon-proxied `img[src]` URLs so that they point at the page's own host.
 *
 * An image is rewritten only when all of the following hold:
 *   - its `src` parses as an absolute or protocol-relative URL on a Photon host,
 *   - it has no non-empty `srcset` (those images are left for srcset-based handling),
 *   - the first path segment looks like a host name (contains a dot) and is
 *     followed by at least one more segment.
 *
 * The result is always `https://<pageHost>/<remaining path>`. The host embedded in
 * the Photon URL is only validated, never reused, and the Photon query string is
 * dropped. NOTE(review): this presumes the page host serves the same path as the
 * embedded origin host (the domain-migration case) - confirm for third-party images.
 *
 * @param {Document} document - Document whose images are updated in place.
 * @param {string} pageHost - Host name of the page being processed.
 * @returns {void}
 */
function rewritePhotonSrc(document, pageHost) {
  for (const img of document.querySelectorAll('img[src]')) {
    const src = img.getAttribute('src');
    if (!src) continue;

    // Photon URLs are often emitted protocol-relative ("//i0.wp.com/..."), which
    // URL.parse() rejects without a base; give those an explicit https scheme.
    const candidate = src.trim();
    const parsed = URL.parse(
      candidate.startsWith('//') ? `https:${candidate}` : candidate,
    );
    if (!parsed || !WP_PHOTON_HOST_PATTERN.test(parsed.hostname)) continue;

    if (img.getAttribute('srcset')) continue;

    const segments = parsed.pathname.split('/').filter(Boolean);
    if (segments.length < 2) continue;

    // The first segment must look like the embedded origin host name.
    const originHost = segments[0];
    if (!originHost?.includes('.')) continue;

    const resourcePath = `/${segments.slice(1).join('/')}`;
    img.setAttribute('src', `https://${pageHost}${resourcePath}`);
  }
}
/**
 * Extracts the candidate URLs from a `srcset` attribute value.
 *
 * Follows the HTML tokenisation rules rather than splitting on every comma: a URL
 * is a run of non-whitespace characters, so commas inside it (for example
 * `?resize=300,200`) belong to the URL. Only trailing commas, or a comma after the
 * descriptors, end a candidate.
 *
 * @param {string} srcset - Raw `srcset` attribute value.
 * @returns {string[]} Candidate URLs in document order, without descriptors.
 */
function splitSrcsetCandidateUrls(srcset) {
  const urls = [];
  const length = srcset.length;
  let pos = 0;
  while (pos < length) {
    // Skip separators left over from the previous candidate.
    while (pos < length && /[\s,]/.test(srcset[pos])) pos += 1;
    if (pos >= length) break;

    const start = pos;
    while (pos < length && !/\s/.test(srcset[pos])) pos += 1;
    let url = srcset.slice(start, pos);

    if (url.endsWith(',')) {
      // Trailing commas terminate a candidate that has no descriptors.
      url = url.replace(/,+$/, '');
    } else {
      // Skip the descriptors ("2x", "300w") up to the separating comma.
      while (pos < length && srcset[pos] !== ',') pos += 1;
    }
    if (url) urls.push(url);
  }
  return urls;
}
/**
 * For images whose `src` is an absolute URL on a different host than the page,
 * replaces `src` with the first `srcset` candidate that is an absolute URL on the
 * page's host. Images without such a candidate are left unchanged.
 *
 * Relative `src` values and relative `srcset` candidates do not parse without a
 * base and are therefore ignored.
 *
 * @param {Document} document - Document whose images are updated in place.
 * @param {URL} base - Parsed page URL; only its `hostname` is read.
 * @returns {void}
 */
function preferSameDomainSrc(document, base) {
  const pageHost = base.hostname;
  for (const img of document.querySelectorAll('img[src][srcset]')) {
    const src = img.getAttribute('src');
    if (!src) continue;

    const srcParsed = URL.parse(src);
    if (!srcParsed || srcParsed.hostname === pageHost) continue;

    const sameHostUrl = splitSrcsetCandidateUrls(
      img.getAttribute('srcset') ?? '',
    ).find((url) => URL.parse(url)?.hostname === pageHost);
    if (sameHostUrl) img.setAttribute('src', sameHostUrl);
  }
}
500
549
  function resolveUrls(document, baseUrlStr) {
501
550
  const base = URL.parse(baseUrlStr);
502
551
  if (!base)
503
552
  return;
553
+ rewritePhotonSrc(document, base.hostname);
554
+ preferSameDomainSrc(document, base);
504
555
  const elements = document.querySelectorAll('a[href],img[src],source[srcset]');
505
556
  for (const el of elements) {
506
557
  const tag = el.tagName.toLowerCase();
@@ -1 +1 @@
1
- {"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AAwdA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
1
+ {"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AA4eA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
@@ -122,10 +122,27 @@ function resolveLazySrc(getAttribute) {
122
122
  }
123
123
  return undefined;
124
124
  }
// Some sites (notably WordPress with the Photon CDN) put a CDN proxy URL in the
// img `src` while keeping another URL for the same image in `srcset`.
// resolveImageSrc() uses isWpPhotonUrl() to detect that case so it can take the
// image URL from `srcset` instead of the proxy URL.
const WP_PHOTON_HOST_PATTERN = /^i\d\.wp\.com$/;
/**
 * Reports whether `src` is a URL served from a WordPress Photon host
 * (`i<digit>.wp.com`).
 *
 * Accepts absolute and protocol-relative (`//i0.wp.com/...`) URLs. Relative paths
 * and unparsable values yield `false`.
 *
 * @param {string} src - Image source URL to inspect.
 * @returns {boolean} `true` when the URL's host name matches the Photon pattern.
 */
function isWpPhotonUrl(src) {
  const candidate = String(src).trim();
  // URL.parse() rejects protocol-relative input without a base, so supply the
  // scheme explicitly; the host name is all that matters here.
  const parsed = URL.parse(
    candidate.startsWith('//') ? `https:${candidate}` : candidate,
  );
  return parsed !== null && WP_PHOTON_HOST_PATTERN.test(parsed.hostname);
}
125
132
  function resolveImageSrc(getAttribute) {
126
133
  if (!getAttribute)
127
134
  return '';
128
135
  const srcRaw = getAttribute('src') ?? '';
136
+ // When src is a CDN proxy URL, prefer srcset which usually has the
137
+ // canonical same-domain URL that survives domain migrations.
138
+ if (srcRaw && isWpPhotonUrl(srcRaw)) {
139
+ const srcset = getAttribute('srcset');
140
+ if (srcset) {
141
+ const url = extractNonDataSrcsetUrl(srcset);
142
+ if (url)
143
+ return url;
144
+ }
145
+ }
129
146
  if (srcRaw && !isDataUri(srcRaw))
130
147
  return srcRaw;
131
148
  // First check common lazy-loading attributes that may contain non-data URLs before falling back to the native srcset, as some sites use data URIs in lazy attributes while still providing valid URLs in srcset.
@@ -1 +1 @@
1
- {"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AA2DA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAmCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA6ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AA8aD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA4KD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AA0ID,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAgED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAwjBD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAgBzB;AAaD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AA+G1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
1
+ {"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AA2DA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAmCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA6ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAicD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA4KD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AA0ID,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAgED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAwjBD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAgBzB;AAaD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AA+G1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
@@ -294,6 +294,21 @@ function resolveCollapsedTextLengthUpTo(text, max) {
294
294
  }
295
295
  return length;
296
296
  }
/**
 * Flattens gallery-like containers into a plain sequence of `<figure>` elements.
 *
 * Every element whose class attribute contains "gallery", "slideshow" or
 * "carousel" and that holds at least one `<img>` is replaced by a fragment with
 * one `<figure>` per image, each wrapping a deep clone of that image. Containers
 * without images are left untouched.
 *
 * The document's root element and `<body>` are never replaced: the class match is
 * a substring match, so a class such as "page-template-gallery" on `<body>` would
 * otherwise cause the whole page to be swapped for its images.
 *
 * @param {Document} doc - Document that is modified in place.
 * @returns {void}
 */
function preserveGalleryImages(doc) {
  const galleries = doc.querySelectorAll(
    '[class*="gallery"],[class*="slideshow"],[class*="carousel"]',
  );
  for (const gallery of galleries) {
    // Guard against the substring selector matching the page's structural roots.
    if (gallery === doc.documentElement || gallery === doc.body) continue;

    const images = gallery.querySelectorAll('img');
    if (images.length === 0) continue;

    const fragment = doc.createDocumentFragment();
    for (const img of images) {
      const figure = doc.createElement('figure');
      figure.appendChild(img.cloneNode(true));
      fragment.appendChild(figure);
    }
    gallery.replaceWith(fragment);
  }
}
297
312
  function preserveAlertElements(doc) {
298
313
  const alerts = doc.querySelectorAll('[role="alert"], .admonition, [class*="callout"]');
299
314
  for (const el of alerts) {
@@ -313,6 +328,7 @@ function preserveCodeLanguageAttributes(doc) {
313
328
  }
314
329
  const STRUCTURAL_SKIP_TAGS = new Set(['HTML', 'BODY']);
315
330
  function prepareReadabilityDocument(readabilityDoc) {
331
+ preserveGalleryImages(readabilityDoc);
316
332
  preserveAlertElements(readabilityDoc);
317
333
  preserveCodeLanguageAttributes(readabilityDoc);
318
334
  normalizeTabContent(readabilityDoc);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@j0hanz/fetch-url-mcp",
3
- "version": "1.10.13",
3
+ "version": "1.10.15",
4
4
  "mcpName": "io.github.j0hanz/fetch-url-mcp",
5
5
  "description": "A web content fetcher MCP server that converts HTML to clean, AI and human readable markdown.",
6
6
  "type": "module",