@j0hanz/fetch-url-mcp 1.10.14 → 1.10.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
+ export declare function extractNoscriptImages(document: Document): void;
1
2
  export declare function resolveDocumentBody(document: Document): Element;
2
3
  export declare function serializeDocumentForMarkdown(document: Document, fallback: string): string;
3
4
  /** Surface hidden tab panels, then strip unselected tab triggers. */
@@ -1 +1 @@
1
- {"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"AA4lBA,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAY/D;AAED,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CASR;AA0CD,qEAAqE;AACrE,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CAG5D;AAyTD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAMN;AA4BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR"}
1
+ {"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"AAwnBA,wBAAgB,qBAAqB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CA6B9D;AAuBD,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAY/D;AAED,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CASR;AA0CD,qEAAqE;AACrE,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CAG5D;AAyTD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAON;AA4BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR"}
@@ -497,10 +497,90 @@ function processUrlElement(el, attr, base, isSrcset) {
497
497
  el.setAttribute(attr, resolved.href);
498
498
  }
499
499
  }
500
+ // Rewrite WordPress Photon CDN image URLs (i<N>.wp.com/<origin-host>/<path>) to https://<page-host>/<path>: the proxy host, the embedded origin host and any query string are dropped. Images that also carry a srcset are skipped here.
501
+ // This ensures images are correctly resolved when the page is migrated to a new domain but still references the old domain in img src attributes.
502
+ const WP_PHOTON_HOST_PATTERN = /^i\d\.wp\.com$/;
503
+ function rewritePhotonSrc(document, pageHost) {
504
+ for (const img of document.querySelectorAll('img[src]')) {
505
+ const src = img.getAttribute('src');
506
+ if (!src)
507
+ continue;
508
+ const parsed = URL.parse(src);
509
+ if (!parsed || !WP_PHOTON_HOST_PATTERN.test(parsed.hostname))
510
+ continue;
511
+ if (img.getAttribute('srcset'))
512
+ continue;
513
+ const segments = parsed.pathname.split('/').filter(Boolean);
514
+ if (segments.length < 2)
515
+ continue;
516
+ const originHost = segments[0];
517
+ if (!originHost?.includes('.'))
518
+ continue;
519
+ const resourcePath = `/${segments.slice(1).join('/')}`;
520
+ const rewritten = `https://${pageHost}${resourcePath}`;
521
+ img.setAttribute('src', rewritten);
522
+ }
523
+ }
524
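+ // For images whose src points to a different host than the page, look for a srcset entry on the page's own host and use the first such URL as src. Only absolute URLs are compared: a src or srcset entry that does not parse on its own (e.g. a relative path) is skipped.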
525
+ // This can help preserve image loading when migrating content that references an old domain, as srcset entries are often left unchanged while src attributes are updated or removed.
526
+ function preferSameDomainSrc(document, base) {
527
+ const pageHost = base.hostname;
528
+ for (const img of document.querySelectorAll('img[src][srcset]')) {
529
+ const src = img.getAttribute('src');
530
+ if (!src)
531
+ continue;
532
+ const srcParsed = URL.parse(src);
533
+ if (!srcParsed || srcParsed.hostname === pageHost)
534
+ continue;
535
+ const srcset = img.getAttribute('srcset') ?? '';
536
+ const entries = srcset.split(',');
537
+ for (const entry of entries) {
538
+ const url = entry.trim().split(/\s+/)[0];
539
+ if (!url)
540
+ continue;
541
+ const parsed = URL.parse(url);
542
+ if (parsed?.hostname === pageHost) {
543
+ img.setAttribute('src', url);
544
+ break;
545
+ }
546
+ }
547
+ }
548
+ }
549
+ export function extractNoscriptImages(document) {
550
+ for (const noscript of document.querySelectorAll('noscript')) {
551
+ // linkedom may parse noscript children as DOM or raw text — handle both.
552
+ let imgs = Array.from(noscript.querySelectorAll('img'));
553
+ if (imgs.length === 0) {
554
+ const html = noscript.innerHTML || noscript.textContent || '';
555
+ if (!/<img\b/i.test(html))
556
+ continue;
557
+ const { document: fragDoc } = parseHTML(`<body>${html}</body>`);
558
+ imgs = Array.from(fragDoc.querySelectorAll('img'));
559
+ }
560
+ if (imgs.length === 0)
561
+ continue;
562
+ // Skip when the previous sibling is (or contains) an <img> — the
563
+ // lazy-loaded placeholder is already in the DOM and the translators
564
+ // handle data-src / placeholder detection.
565
+ const prev = noscript.previousElementSibling;
566
+ if (prev?.tagName === 'IMG' || prev?.querySelector('img'))
567
+ continue;
568
+ for (const img of imgs) {
569
+ // Skip tracking pixels (commonly 1×1 images placed in noscript by
570
+ // analytics providers).
571
+ if (img.getAttribute('width') === '1' ||
572
+ img.getAttribute('height') === '1')
573
+ continue;
574
+ noscript.before(img.cloneNode(true));
575
+ }
576
+ }
577
+ }
500
578
  function resolveUrls(document, baseUrlStr) {
501
579
  const base = URL.parse(baseUrlStr);
502
580
  if (!base)
503
581
  return;
582
+ rewritePhotonSrc(document, base.hostname);
583
+ preferSameDomainSrc(document, base);
504
584
  const elements = document.querySelectorAll('a[href],img[src],source[srcset]');
505
585
  for (const el of elements) {
506
586
  const tag = el.tagName.toLowerCase();
@@ -813,6 +893,7 @@ function runUrlResolutionPass(document, baseUrl) {
813
893
  // (post-Readability). Some passes (stripTabTriggers, etc.) are no-ops
814
894
  // on Readability output since tabs are already stripped or absent.
815
895
  export function prepareDocumentForMarkdown(document, baseUrl, signal) {
896
+ extractNoscriptImages(document);
816
897
  runDocsControlPass(document);
817
898
  runStructuralNoisePass(document, signal);
818
899
  runCodeExamplePass(document);
@@ -1 +1 @@
1
- {"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AAwdA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
1
+ {"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AAwfA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
@@ -97,12 +97,23 @@ function extractFirstSrcsetUrl(srcset) {
97
97
  const LAZY_SRC_ATTRIBUTES = [
98
98
  'data-src',
99
99
  'data-lazy-src',
100
+ 'data-lazy',
100
101
  'data-original',
102
+ 'data-echo',
101
103
  'data-srcset',
102
104
  ];
103
105
  function isDataUri(value) {
104
106
  return value.startsWith('data:');
105
107
  }
108
+ const PLACEHOLDER_FILENAME_PATTERN = /(?:^|\/)(?:blank|spacer|placeholder|grey|gray|pixel|loading|lazy|transparent|empty|dummy)\.[a-z]{3,4}$/i;
109
+ function isPlaceholderSrc(value) {
110
+ if (isDataUri(value))
111
+ return true;
112
+ const parsed = URL.parse(value) ?? URL.parse(value, 'http://localhost');
113
+ if (!parsed)
114
+ return false;
115
+ return PLACEHOLDER_FILENAME_PATTERN.test(parsed.pathname);
116
+ }
106
117
  function extractNonDataSrcsetUrl(value) {
107
118
  const url = extractFirstSrcsetUrl(value);
108
119
  return url && !isDataUri(url) ? url : undefined;
@@ -122,11 +133,28 @@ function resolveLazySrc(getAttribute) {
122
133
  }
123
134
  return undefined;
124
135
  }
136
+ // Some sites (notably WordPress with Photon CDN) use a CDN proxy URL in img src while keeping the original same-domain URL in srcset.
137
+ // isWpPhotonUrl detects such proxy URLs so that resolveImageSrc can return the first non-data URL from srcset instead, falling through to the normal src handling when srcset yields none; this is intended to keep image links valid after content is migrated to a new domain.
138
+ const WP_PHOTON_HOST_PATTERN = /^i\d\.wp\.com$/;
139
+ function isWpPhotonUrl(src) {
140
+ const parsed = URL.parse(src);
141
+ return parsed !== null && WP_PHOTON_HOST_PATTERN.test(parsed.hostname);
142
+ }
125
143
  function resolveImageSrc(getAttribute) {
126
144
  if (!getAttribute)
127
145
  return '';
128
146
  const srcRaw = getAttribute('src') ?? '';
129
- if (srcRaw && !isDataUri(srcRaw))
147
+ // When src is a CDN proxy URL, prefer srcset which usually has the
148
+ // canonical same-domain URL that survives domain migrations.
149
+ if (srcRaw && isWpPhotonUrl(srcRaw)) {
150
+ const srcset = getAttribute('srcset');
151
+ if (srcset) {
152
+ const url = extractNonDataSrcsetUrl(srcset);
153
+ if (url)
154
+ return url;
155
+ }
156
+ }
157
+ if (srcRaw && !isPlaceholderSrc(srcRaw))
130
158
  return srcRaw;
131
159
  // First check common lazy-loading attributes that may contain non-data URLs before falling back to the native srcset, as some sites use data URIs in lazy attributes while still providing valid URLs in srcset.
132
160
  const lazySrc = resolveLazySrc(getAttribute);
@@ -1 +1 @@
1
- {"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AA2DA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAmCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA6ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAicD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA4KD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AA0ID,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAgED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAwjBD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAgBzB;AAaD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AA+G1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
1
+ {"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AA4DA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAmCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA6ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAkcD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA4KD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AA0ID,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAgED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAwjBD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAgBzB;AAaD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AA+G1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
@@ -5,7 +5,7 @@ import { parseHTML } from 'linkedom';
5
5
  import { extractLanguageFromClassName } from '../lib/code-lang.js';
6
6
  import { config } from '../lib/core.js';
7
7
  import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../lib/core.js';
8
- import { normalizeTabContent, prepareDocumentForMarkdown, removeNoiseFromHtml, resolveDocumentBody, serializeDocumentForMarkdown, } from '../lib/dom-prep.js';
8
+ import { extractNoscriptImages, normalizeTabContent, prepareDocumentForMarkdown, removeNoiseFromHtml, resolveDocumentBody, serializeDocumentForMarkdown, } from '../lib/dom-prep.js';
9
9
  import { isRawTextContentUrl } from '../lib/http.js';
10
10
  import { cleanupMarkdownArtifacts, processFencedContent, } from '../lib/md-cleanup.js';
11
11
  import { addSourceToMarkdown, buildMetadataFooter, extractTitleFromRawMarkdown, isRawTextContent, } from '../lib/md-metadata.js';
@@ -328,6 +328,7 @@ function preserveCodeLanguageAttributes(doc) {
328
328
  }
329
329
  const STRUCTURAL_SKIP_TAGS = new Set(['HTML', 'BODY']);
330
330
  function prepareReadabilityDocument(readabilityDoc) {
331
+ extractNoscriptImages(readabilityDoc);
331
332
  preserveGalleryImages(readabilityDoc);
332
333
  preserveAlertElements(readabilityDoc);
333
334
  preserveCodeLanguageAttributes(readabilityDoc);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@j0hanz/fetch-url-mcp",
3
- "version": "1.10.14",
3
+ "version": "1.10.16",
4
4
  "mcpName": "io.github.j0hanz/fetch-url-mcp",
5
5
  "description": "A web content fetcher MCP server that converts HTML to clean, AI and human readable markdown.",
6
6
  "type": "module",