@j0hanz/fetch-url-mcp 1.10.15 → 1.10.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/dom-prep.d.ts +1 -0
- package/dist/lib/dom-prep.d.ts.map +1 -1
- package/dist/lib/dom-prep.js +30 -0
- package/dist/transform/html-translators.d.ts.map +1 -1
- package/dist/transform/html-translators.js +12 -1
- package/dist/transform/transform.d.ts.map +1 -1
- package/dist/transform/transform.js +2 -1
- package/package.json +1 -1
package/dist/lib/dom-prep.d.ts
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
export declare function extractNoscriptImages(document: Document): void;
|
|
1
2
|
export declare function resolveDocumentBody(document: Document): Element;
|
|
2
3
|
export declare function serializeDocumentForMarkdown(document: Document, fallback: string): string;
|
|
3
4
|
/** Surface hidden tab panels, then strip unselected tab triggers. */
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"AAwnBA,wBAAgB,qBAAqB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CA6B9D;AAuBD,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAY/D;AAED,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CASR;AA0CD,qEAAqE;AACrE,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CAG5D;AAyTD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAON;AA4BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR"}
|
package/dist/lib/dom-prep.js
CHANGED
|
@@ -546,6 +546,35 @@ function preferSameDomainSrc(document, base) {
|
|
|
546
546
|
}
|
|
547
547
|
}
|
|
548
548
|
}
|
|
549
|
+
export function extractNoscriptImages(document) {
|
|
550
|
+
for (const noscript of document.querySelectorAll('noscript')) {
|
|
551
|
+
// linkedom may parse noscript children as DOM or raw text — handle both.
|
|
552
|
+
let imgs = Array.from(noscript.querySelectorAll('img'));
|
|
553
|
+
if (imgs.length === 0) {
|
|
554
|
+
const html = noscript.innerHTML || noscript.textContent || '';
|
|
555
|
+
if (!/<img\b/i.test(html))
|
|
556
|
+
continue;
|
|
557
|
+
const { document: fragDoc } = parseHTML(`<body>${html}</body>`);
|
|
558
|
+
imgs = Array.from(fragDoc.querySelectorAll('img'));
|
|
559
|
+
}
|
|
560
|
+
if (imgs.length === 0)
|
|
561
|
+
continue;
|
|
562
|
+
// Skip when the previous sibling is (or contains) an <img> — the
|
|
563
|
+
// lazy-loaded placeholder is already in the DOM and the translators
|
|
564
|
+
// handle data-src / placeholder detection.
|
|
565
|
+
const prev = noscript.previousElementSibling;
|
|
566
|
+
if (prev?.tagName === 'IMG' || prev?.querySelector('img'))
|
|
567
|
+
continue;
|
|
568
|
+
for (const img of imgs) {
|
|
569
|
+
// Skip tracking pixels (commonly 1×1 images placed in noscript by
|
|
570
|
+
// analytics providers).
|
|
571
|
+
if (img.getAttribute('width') === '1' ||
|
|
572
|
+
img.getAttribute('height') === '1')
|
|
573
|
+
continue;
|
|
574
|
+
noscript.before(img.cloneNode(true));
|
|
575
|
+
}
|
|
576
|
+
}
|
|
577
|
+
}
|
|
549
578
|
function resolveUrls(document, baseUrlStr) {
|
|
550
579
|
const base = URL.parse(baseUrlStr);
|
|
551
580
|
if (!base)
|
|
@@ -864,6 +893,7 @@ function runUrlResolutionPass(document, baseUrl) {
|
|
|
864
893
|
// (post-Readability). Some passes (stripTabTriggers, etc.) are no-ops
|
|
865
894
|
// on Readability output since tabs are already stripped or absent.
|
|
866
895
|
export function prepareDocumentForMarkdown(document, baseUrl, signal) {
|
|
896
|
+
extractNoscriptImages(document);
|
|
867
897
|
runDocsControlPass(document);
|
|
868
898
|
runStructuralNoisePass(document, signal);
|
|
869
899
|
runCodeExamplePass(document);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AAwfA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
|
|
@@ -97,12 +97,23 @@ function extractFirstSrcsetUrl(srcset) {
|
|
|
97
97
|
const LAZY_SRC_ATTRIBUTES = [
|
|
98
98
|
'data-src',
|
|
99
99
|
'data-lazy-src',
|
|
100
|
+
'data-lazy',
|
|
100
101
|
'data-original',
|
|
102
|
+
'data-echo',
|
|
101
103
|
'data-srcset',
|
|
102
104
|
];
|
|
103
105
|
function isDataUri(value) {
|
|
104
106
|
return value.startsWith('data:');
|
|
105
107
|
}
|
|
108
|
+
const PLACEHOLDER_FILENAME_PATTERN = /(?:^|\/)(?:blank|spacer|placeholder|grey|gray|pixel|loading|lazy|transparent|empty|dummy)\.[a-z]{3,4}$/i;
|
|
109
|
+
function isPlaceholderSrc(value) {
|
|
110
|
+
if (isDataUri(value))
|
|
111
|
+
return true;
|
|
112
|
+
const parsed = URL.parse(value) ?? URL.parse(value, 'http://localhost');
|
|
113
|
+
if (!parsed)
|
|
114
|
+
return false;
|
|
115
|
+
return PLACEHOLDER_FILENAME_PATTERN.test(parsed.pathname);
|
|
116
|
+
}
|
|
106
117
|
function extractNonDataSrcsetUrl(value) {
|
|
107
118
|
const url = extractFirstSrcsetUrl(value);
|
|
108
119
|
return url && !isDataUri(url) ? url : undefined;
|
|
@@ -143,7 +154,7 @@ function resolveImageSrc(getAttribute) {
|
|
|
143
154
|
return url;
|
|
144
155
|
}
|
|
145
156
|
}
|
|
146
|
-
if (srcRaw && !isDataUri(srcRaw))
|
|
157
|
+
if (srcRaw && !isPlaceholderSrc(srcRaw))
|
|
147
158
|
return srcRaw;
|
|
148
159
|
// First check common lazy-loading attributes that may contain non-data URLs before falling back to the native srcset, as some sites use data URIs in lazy attributes while still providing valid URLs in srcset.
|
|
149
160
|
const lazySrc = resolveLazySrc(getAttribute);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AA4DA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAmCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA6ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAkcD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA4KD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AA0ID,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAgED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAwjBD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAgBzB;AAaD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AA+G1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
|
|
@@ -5,7 +5,7 @@ import { parseHTML } from 'linkedom';
|
|
|
5
5
|
import { extractLanguageFromClassName } from '../lib/code-lang.js';
|
|
6
6
|
import { config } from '../lib/core.js';
|
|
7
7
|
import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../lib/core.js';
|
|
8
|
-
import { normalizeTabContent, prepareDocumentForMarkdown, removeNoiseFromHtml, resolveDocumentBody, serializeDocumentForMarkdown, } from '../lib/dom-prep.js';
|
|
8
|
+
import { extractNoscriptImages, normalizeTabContent, prepareDocumentForMarkdown, removeNoiseFromHtml, resolveDocumentBody, serializeDocumentForMarkdown, } from '../lib/dom-prep.js';
|
|
9
9
|
import { isRawTextContentUrl } from '../lib/http.js';
|
|
10
10
|
import { cleanupMarkdownArtifacts, processFencedContent, } from '../lib/md-cleanup.js';
|
|
11
11
|
import { addSourceToMarkdown, buildMetadataFooter, extractTitleFromRawMarkdown, isRawTextContent, } from '../lib/md-metadata.js';
|
|
@@ -328,6 +328,7 @@ function preserveCodeLanguageAttributes(doc) {
|
|
|
328
328
|
}
|
|
329
329
|
const STRUCTURAL_SKIP_TAGS = new Set(['HTML', 'BODY']);
|
|
330
330
|
function prepareReadabilityDocument(readabilityDoc) {
|
|
331
|
+
extractNoscriptImages(readabilityDoc);
|
|
331
332
|
preserveGalleryImages(readabilityDoc);
|
|
332
333
|
preserveAlertElements(readabilityDoc);
|
|
333
334
|
preserveCodeLanguageAttributes(readabilityDoc);
|
package/package.json
CHANGED