@j0hanz/fetch-url-mcp 1.10.14 → 1.10.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/dom-prep.d.ts +1 -0
- package/dist/lib/dom-prep.d.ts.map +1 -1
- package/dist/lib/dom-prep.js +81 -0
- package/dist/transform/html-translators.d.ts.map +1 -1
- package/dist/transform/html-translators.js +29 -1
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +2 -1
- package/package.json +1 -1
package/dist/lib/dom-prep.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
export declare function extractNoscriptImages(document: Document): void;
|
|
1
2
|
export declare function resolveDocumentBody(document: Document): Element;
|
|
2
3
|
export declare function serializeDocumentForMarkdown(document: Document, fallback: string): string;
|
|
3
4
|
/** Surface hidden tab panels, then strip unselected tab triggers. */
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"AAwnBA,wBAAgB,qBAAqB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CA6B9D;AAuBD,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAY/D;AAED,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CASR;AA0CD,qEAAqE;AACrE,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CAG5D;AAyTD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAON;AA4BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR"}
|
package/dist/lib/dom-prep.js
CHANGED
|
@@ -497,10 +497,90 @@ function processUrlElement(el, attr, base, isSrcset) {
|
|
|
497
497
|
el.setAttribute(attr, resolved.href);
|
|
498
498
|
}
|
|
499
499
|
}
|
|
500
|
+
// WordPress Photon serves proxied images from hosts matching iN.wp.com, using
// paths shaped like /<origin-host>/<resource-path>.
const WP_PHOTON_HOST_PATTERN = /^i\d\.wp\.com$/;
/**
 * Rewrites Photon-proxied `<img src>` values so they load from the page's own
 * host instead of the CDN proxy.
 *
 * The first path segment of a Photon URL (the origin host) is dropped and the
 * remaining path is re-rooted on `https://<pageHost>`; the Photon query string
 * (resize parameters etc.) is discarded. Images that also carry a `srcset`
 * are left untouched here.
 *
 * NOTE(review): the origin host embedded in the Photon URL is only
 * sanity-checked (must contain a dot), never compared with `pageHost`, so a
 * Photon URL proxying a third-party site is re-rooted on the page host as
 * well — confirm this is intended.
 *
 * @param document DOM document whose `img[src]` elements are updated in place.
 * @param pageHost Host name of the page being processed.
 */
function rewritePhotonSrc(document, pageHost) {
    for (const img of document.querySelectorAll('img[src]')) {
        const src = img.getAttribute('src');
        if (!src)
            continue;
        // Fall back to a dummy base so protocol-relative sources
        // (`//i0.wp.com/...`) are recognised too; plain relative paths then
        // resolve to the dummy host and never match the Photon pattern.
        const parsed = URL.parse(src) ?? URL.parse(src, 'http://localhost');
        if (!parsed || !WP_PHOTON_HOST_PATTERN.test(parsed.hostname))
            continue;
        if (img.getAttribute('srcset'))
            continue;
        const segments = parsed.pathname.split('/').filter(Boolean);
        // Need at least "<origin-host>/<file>" to have something to rewrite.
        if (segments.length < 2)
            continue;
        const originHost = segments[0];
        // A first segment without a dot is not a host name, so this is not
        // the /<origin-host>/<path> layout.
        if (!originHost?.includes('.'))
            continue;
        const resourcePath = `/${segments.slice(1).join('/')}`;
        img.setAttribute('src', `https://${pageHost}${resourcePath}`);
    }
}
|
|
524
|
+
/**
 * Lists the candidate URLs of an `srcset` attribute value, in order.
 *
 * A candidate URL is a run of non-whitespace characters, so commas inside a
 * URL (e.g. `?fit=600,400`) are preserved. Only a trailing comma, or the comma
 * that ends the descriptor list, separates candidates.
 */
function splitSrcsetCandidateUrls(srcset) {
    const urls = [];
    const length = srcset.length;
    let pos = 0;
    while (pos < length) {
        // Skip separators in front of the next candidate.
        while (pos < length && /[\s,]/.test(srcset[pos]))
            pos++;
        if (pos >= length)
            break;
        let end = pos;
        while (end < length && !/\s/.test(srcset[end]))
            end++;
        let url = srcset.slice(pos, end);
        pos = end;
        if (url.endsWith(',')) {
            // "a.jpg, b.jpg": the comma terminates a descriptor-less candidate.
            url = url.replace(/,+$/, '');
        }
        else {
            // Skip the descriptors ("2x", "600w") up to the next comma.
            const comma = srcset.indexOf(',', pos);
            pos = comma === -1 ? length : comma + 1;
        }
        if (url)
            urls.push(url);
    }
    return urls;
}
/**
 * For images whose `src` points at a different host than the page, replaces
 * `src` with the first `srcset` candidate that lives on the page's host.
 *
 * Both `src` and the candidates are resolved against `base` before comparing
 * hosts, so relative and protocol-relative URLs are classified correctly. The
 * chosen candidate is written back exactly as it appears in `srcset`.
 *
 * @param document DOM document whose `img[src][srcset]` elements are updated in place.
 * @param base Parsed URL of the page; its `hostname` defines "same domain".
 */
function preferSameDomainSrc(document, base) {
    const pageHost = base.hostname;
    for (const img of document.querySelectorAll('img[src][srcset]')) {
        const src = img.getAttribute('src');
        if (!src)
            continue;
        const srcParsed = URL.parse(src, base);
        // Unparseable or already same-domain: nothing to improve.
        if (!srcParsed || srcParsed.hostname === pageHost)
            continue;
        const sameDomainUrl = splitSrcsetCandidateUrls(img.getAttribute('srcset') ?? '').find((url) => URL.parse(url, base)?.hostname === pageHost);
        if (sameDomainUrl)
            img.setAttribute('src', sameDomainUrl);
    }
}
|
|
549
|
+
// Width/height values typical of analytics beacons: "0" or "1", optionally
// padded with whitespace or suffixed with "px".
const NOSCRIPT_TRACKING_PIXEL_DIMENSION = /^\s*[01]\s*(?:px)?\s*$/i;
/** Returns true when an image's width or height marks it as a 0/1-pixel tracking beacon. */
function isNoscriptTrackingPixel(img) {
    return [img.getAttribute('width'), img.getAttribute('height')].some((value) => typeof value === 'string' && NOSCRIPT_TRACKING_PIXEL_DIMENSION.test(value));
}
/**
 * Collects the `<img>` elements held by a `<noscript>` element.
 *
 * linkedom may parse noscript children as DOM or as raw text, so when no
 * element children are found the markup is re-parsed from the text content.
 */
function collectNoscriptImages(noscript) {
    const parsedImgs = Array.from(noscript.querySelectorAll('img'));
    if (parsedImgs.length > 0)
        return parsedImgs;
    const html = noscript.innerHTML || noscript.textContent || '';
    // Cheap pre-check to avoid parsing noscript blocks that hold no image.
    if (!/<img\b/i.test(html))
        return [];
    const { document: fragDoc } = parseHTML(`<body>${html}</body>`);
    return Array.from(fragDoc.querySelectorAll('img'));
}
/**
 * Copies images found inside `<noscript>` fallbacks into the live DOM,
 * inserting a clone of each one directly before its `<noscript>` element.
 *
 * A `<noscript>` is skipped when its previous sibling is, or contains, an
 * `<img>`: the lazy-loaded placeholder is then already in the DOM and is
 * handled by the translators. That check also keeps repeated calls on the same
 * document from inserting duplicates. Tracking pixels are never copied.
 *
 * @param document DOM document to update in place.
 */
export function extractNoscriptImages(document) {
    for (const noscript of document.querySelectorAll('noscript')) {
        const imgs = collectNoscriptImages(noscript);
        if (imgs.length === 0)
            continue;
        const prev = noscript.previousElementSibling;
        if (prev?.tagName === 'IMG' || prev?.querySelector('img'))
            continue;
        for (const img of imgs) {
            if (isNoscriptTrackingPixel(img))
                continue;
            noscript.before(img.cloneNode(true));
        }
    }
}
|
|
500
578
|
function resolveUrls(document, baseUrlStr) {
|
|
501
579
|
const base = URL.parse(baseUrlStr);
|
|
502
580
|
if (!base)
|
|
503
581
|
return;
|
|
582
|
+
rewritePhotonSrc(document, base.hostname);
|
|
583
|
+
preferSameDomainSrc(document, base);
|
|
504
584
|
const elements = document.querySelectorAll('a[href],img[src],source[srcset]');
|
|
505
585
|
for (const el of elements) {
|
|
506
586
|
const tag = el.tagName.toLowerCase();
|
|
@@ -813,6 +893,7 @@ function runUrlResolutionPass(document, baseUrl) {
|
|
|
813
893
|
// (post-Readability). Some passes (stripTabTriggers, etc.) are no-ops
|
|
814
894
|
// on Readability output since tabs are already stripped or absent.
|
|
815
895
|
export function prepareDocumentForMarkdown(document, baseUrl, signal) {
|
|
896
|
+
extractNoscriptImages(document);
|
|
816
897
|
runDocsControlPass(document);
|
|
817
898
|
runStructuralNoisePass(document, signal);
|
|
818
899
|
runCodeExamplePass(document);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AAwfA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
|
|
@@ -97,12 +97,23 @@ function extractFirstSrcsetUrl(srcset) {
|
|
|
97
97
|
// Attributes that lazy-loading scripts use to carry the real image URL.
const LAZY_SRC_ATTRIBUTES = [
    'data-src',
    'data-lazy-src',
    'data-lazy',
    'data-original',
    'data-echo',
    'data-srcset',
];
/** Tells whether a value begins with the `data:` scheme prefix. */
function isDataUri(value) {
    return value.slice(0, 5) === 'data:';
}
// File names (any directory, 3-4 letter extension) conventionally used for
// stand-in images that a script later swaps for the real one.
const PLACEHOLDER_FILENAME_PATTERN = /(?:^|\/)(?:blank|spacer|placeholder|grey|gray|pixel|loading|lazy|transparent|empty|dummy)\.[a-z]{3,4}$/i;
/**
 * Tells whether an image source is a placeholder: either an inline data URI
 * or a URL whose path ends in a well-known placeholder file name.
 *
 * Relative values are resolved against a dummy base only so that their path
 * can be inspected; values that cannot be parsed at all are not placeholders.
 */
function isPlaceholderSrc(value) {
    if (isDataUri(value)) {
        return true;
    }
    let location = URL.parse(value);
    if (location == null) {
        location = URL.parse(value, 'http://localhost');
    }
    return location ? PLACEHOLDER_FILENAME_PATTERN.test(location.pathname) : false;
}
|
|
106
117
|
function extractNonDataSrcsetUrl(value) {
|
|
107
118
|
const url = extractFirstSrcsetUrl(value);
|
|
108
119
|
return url && !isDataUri(url) ? url : undefined;
|
|
@@ -122,11 +133,28 @@ function resolveLazySrc(getAttribute) {
|
|
|
122
133
|
}
|
|
123
134
|
return undefined;
|
|
124
135
|
}
|
|
136
|
+
// Some sites (notably WordPress with the Photon CDN) put a CDN proxy URL in
// img src while keeping the original same-domain URL in srcset. Detecting
// that pattern lets the converter take the canonical URL from srcset instead.
const WP_PHOTON_HOST_PATTERN = /^i\d\.wp\.com$/;
/**
 * Tells whether an image source points at a WordPress Photon host (iN.wp.com).
 *
 * Values that are not absolute URLs are resolved against a dummy base, so
 * protocol-relative sources (`//i0.wp.com/...`) are detected as well, while
 * plain relative paths resolve to the dummy host and never match.
 *
 * @param src Raw `src` attribute value.
 * @returns `true` when the URL's host matches the Photon host pattern.
 */
function isWpPhotonUrl(src) {
    const parsed = URL.parse(src) ?? URL.parse(src, 'http://localhost');
    return parsed !== null && WP_PHOTON_HOST_PATTERN.test(parsed.hostname);
}
|
|
125
143
|
function resolveImageSrc(getAttribute) {
|
|
126
144
|
if (!getAttribute)
|
|
127
145
|
return '';
|
|
128
146
|
const srcRaw = getAttribute('src') ?? '';
|
|
129
|
-
|
|
147
|
+
// When src is a CDN proxy URL, prefer srcset which usually has the
|
|
148
|
+
// canonical same-domain URL that survives domain migrations.
|
|
149
|
+
if (srcRaw && isWpPhotonUrl(srcRaw)) {
|
|
150
|
+
const srcset = getAttribute('srcset');
|
|
151
|
+
if (srcset) {
|
|
152
|
+
const url = extractNonDataSrcsetUrl(srcset);
|
|
153
|
+
if (url)
|
|
154
|
+
return url;
|
|
155
|
+
}
|
|
156
|
+
}
|
|
157
|
+
if (srcRaw && !isPlaceholderSrc(srcRaw))
|
|
130
158
|
return srcRaw;
|
|
131
159
|
// First check common lazy-loading attributes that may contain non-data URLs before falling back to the native srcset, as some sites use data URIs in lazy attributes while still providing valid URLs in srcset.
|
|
132
160
|
const lazySrc = resolveLazySrc(getAttribute);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AA4DA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAmCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA6ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAkcD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA4KD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AA0ID,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAgED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAwjBD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAgBzB;AAaD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AA+G1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
|
|
@@ -5,7 +5,7 @@ import { parseHTML } from 'linkedom';
|
|
|
5
5
|
import { extractLanguageFromClassName } from '../lib/code-lang.js';
|
|
6
6
|
import { config } from '../lib/core.js';
|
|
7
7
|
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../lib/core.js';
|
|
8
|
-
import { normalizeTabContent, prepareDocumentForMarkdown, removeNoiseFromHtml, resolveDocumentBody, serializeDocumentForMarkdown, } from '../lib/dom-prep.js';
|
|
8
|
+
import { extractNoscriptImages, normalizeTabContent, prepareDocumentForMarkdown, removeNoiseFromHtml, resolveDocumentBody, serializeDocumentForMarkdown, } from '../lib/dom-prep.js';
|
|
9
9
|
import { isRawTextContentUrl } from '../lib/http.js';
|
|
10
10
|
import { cleanupMarkdownArtifacts, processFencedContent, } from '../lib/md-cleanup.js';
|
|
11
11
|
import { addSourceToMarkdown, buildMetadataFooter, extractTitleFromRawMarkdown, isRawTextContent, } from '../lib/md-metadata.js';
|
|
@@ -328,6 +328,7 @@ function preserveCodeLanguageAttributes(doc) {
|
|
|
328
328
|
}
|
|
329
329
|
const STRUCTURAL_SKIP_TAGS = new Set(['HTML', 'BODY']);
|
|
330
330
|
function prepareReadabilityDocument(readabilityDoc) {
|
|
331
|
+
extractNoscriptImages(readabilityDoc);
|
|
331
332
|
preserveGalleryImages(readabilityDoc);
|
|
332
333
|
preserveAlertElements(readabilityDoc);
|
|
333
334
|
preserveCodeLanguageAttributes(readabilityDoc);
|
package/package.json
CHANGED