@j0hanz/fetch-url-mcp 1.10.15 → 1.10.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
1
+ export declare function extractNoscriptImages(document: Document): void;
1
2
  export declare function resolveDocumentBody(document: Document): Element;
2
3
  export declare function serializeDocumentForMarkdown(document: Document, fallback: string): string;
3
4
  /** Surface hidden tab panels, then strip unselected tab triggers. */
@@ -1 +1 @@
1
- {"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"AA6oBA,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAY/D;AAED,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CASR;AA0CD,qEAAqE;AACrE,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CAG5D;AAyTD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAMN;AA4BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR"}
1
+ {"version":3,"file":"dom-prep.d.ts","sourceRoot":"","sources":["../../src/lib/dom-prep.ts"],"names":[],"mappings":"AAwnBA,wBAAgB,qBAAqB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CA6B9D;AAuBD,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,OAAO,CAY/D;AAED,wBAAgB,4BAA4B,CAC1C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,MAAM,GACf,MAAM,CASR;AA0CD,qEAAqE;AACrE,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,QAAQ,GAAG,IAAI,CAG5D;AAyTD,wBAAgB,0BAA0B,CACxC,QAAQ,EAAE,QAAQ,EAClB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,IAAI,CAON;AA4BD,wBAAgB,mBAAmB,CACjC,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,QAAQ,EACnB,OAAO,CAAC,EAAE,MAAM,EAChB,MAAM,CAAC,EAAE,WAAW,GACnB,MAAM,CAcR"}
@@ -546,6 +546,35 @@ function preferSameDomainSrc(document, base) {
546
546
  }
547
547
  }
548
548
  }
549
/**
 * Copy `<img>` elements out of `<noscript>` fallbacks into the surrounding DOM.
 *
 * For each `<noscript>` in `document`, a deep clone of every image found in it
 * is inserted immediately before the `<noscript>` element; the `<noscript>`
 * itself is left in place. A `<noscript>` is skipped when its previous element
 * sibling is, or contains, an `<img>`. Individual images whose `width` or
 * `height` attribute is exactly `"1"` are not copied.
 *
 * @param document - Document to scan; it is mutated in place.
 */
export function extractNoscriptImages(document) {
    const imgTagPattern = /<img\b/i;
    for (const noscript of document.querySelectorAll('noscript')) {
        // Skip when the previous sibling is (or contains) an <img> — the
        // lazy-loaded placeholder is already in the DOM and the translators
        // handle data-src / placeholder detection. Checked first so skipped
        // elements never pay for the re-parse below.
        const prev = noscript.previousElementSibling;
        if (prev?.tagName === 'IMG' || prev?.querySelector('img'))
            continue;
        // linkedom may parse noscript children as DOM or raw text — handle both.
        let imgs = Array.from(noscript.querySelectorAll('img'));
        if (imgs.length === 0) {
            // Test each candidate for a literal "<img" instead of taking the
            // first non-empty one: `innerHTML || textContent` never consults
            // textContent when innerHTML is non-empty but holds no literal tag
            // (e.g. a serializer that entity-escapes the text as "&lt;img").
            // textContent is only considered when there are no element
            // children, so escaped sample markup nested inside real elements
            // is not mistaken for an image.
            const candidates = noscript.firstElementChild
                ? [noscript.innerHTML]
                : [noscript.innerHTML, noscript.textContent];
            const html = candidates.find((markup) => typeof markup === 'string' && imgTagPattern.test(markup));
            if (!html)
                continue;
            const { document: fragDoc } = parseHTML(`<body>${html}</body>`);
            imgs = Array.from(fragDoc.querySelectorAll('img'));
        }
        if (imgs.length === 0)
            continue;
        for (const img of imgs) {
            // Skip tracking pixels (commonly 1×1 images placed in noscript by
            // analytics providers).
            if (img.getAttribute('width') === '1' ||
                img.getAttribute('height') === '1')
                continue;
            noscript.before(img.cloneNode(true));
        }
    }
}
549
578
  function resolveUrls(document, baseUrlStr) {
550
579
  const base = URL.parse(baseUrlStr);
551
580
  if (!base)
@@ -864,6 +893,7 @@ function runUrlResolutionPass(document, baseUrl) {
864
893
  // (post-Readability). Some passes (stripTabTriggers, etc.) are no-ops
865
894
  // on Readability output since tabs are already stripped or absent.
866
895
  export function prepareDocumentForMarkdown(document, baseUrl, signal) {
896
+ extractNoscriptImages(document);
867
897
  runDocsControlPass(document);
868
898
  runStructuralNoisePass(document, signal);
869
899
  runCodeExamplePass(document);
@@ -1 +1 @@
1
- {"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AA4eA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
1
+ {"version":3,"file":"html-translators.d.ts","sourceRoot":"","sources":["../../src/transform/html-translators.ts"],"names":[],"mappings":"AAwfA,wBAAgB,+BAA+B,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEpE"}
@@ -97,12 +97,23 @@ function extractFirstSrcsetUrl(srcset) {
97
97
  const LAZY_SRC_ATTRIBUTES = [
98
98
  'data-src',
99
99
  'data-lazy-src',
100
+ 'data-lazy',
100
101
  'data-original',
102
+ 'data-echo',
101
103
  'data-srcset',
102
104
  ];
103
105
  function isDataUri(value) {
104
106
  return value.startsWith('data:');
105
107
  }
108
// Matches a path whose final segment is one of a fixed set of stand-in asset
// names (blank, spacer, pixel, …) followed by a 3–4 letter extension.
const PLACEHOLDER_FILENAME_PATTERN = /(?:^|\/)(?:blank|spacer|placeholder|grey|gray|pixel|loading|lazy|transparent|empty|dummy)\.[a-z]{3,4}$/i;
/**
 * Decide whether an image `src` value is a stand-in rather than a real image.
 *
 * Data URIs always count as placeholders. Any other value is parsed as a URL
 * (absolute first, then relative to a dummy `http://localhost` base) and its
 * pathname is tested against PLACEHOLDER_FILENAME_PATTERN. A value that
 * cannot be parsed either way is not treated as a placeholder.
 *
 * @param value - Raw `src` attribute value.
 * @returns `true` when the value looks like a placeholder image.
 */
function isPlaceholderSrc(value) {
    if (!isDataUri(value)) {
        // Keep the two-step parse: resolving against the base is only a
        // fallback for values that do not parse on their own.
        const url = URL.parse(value) ?? URL.parse(value, 'http://localhost');
        return url ? PLACEHOLDER_FILENAME_PATTERN.test(url.pathname) : false;
    }
    return true;
}
106
117
  function extractNonDataSrcsetUrl(value) {
107
118
  const url = extractFirstSrcsetUrl(value);
108
119
  return url && !isDataUri(url) ? url : undefined;
@@ -143,7 +154,7 @@ function resolveImageSrc(getAttribute) {
143
154
  return url;
144
155
  }
145
156
  }
146
- if (srcRaw && !isDataUri(srcRaw))
157
+ if (srcRaw && !isPlaceholderSrc(srcRaw))
147
158
  return srcRaw;
148
159
  // First check common lazy-loading attributes that may contain non-data URLs before falling back to the native srcset, as some sites use data URIs in lazy attributes while still providing valid URLs in srcset.
149
160
  const lazySrc = resolveLazySrc(getAttribute);
@@ -1 +1 @@
1
- {"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AA2DA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAmCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA6ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAicD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA4KD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AA0ID,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAgED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAwjBD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAgBzB;AAaD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AA+G1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
1
+ {"version":3,"file":"transform.d.ts","sourceRoot":"","sources":["../../src/transform/transform.ts"],"names":[],"mappings":"AA4DA,OAAO,KAAK,EACV,gBAAgB,EAChB,iBAAiB,EACjB,gBAAgB,EAChB,uBAAuB,EACvB,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EAEtB,MAAM,YAAY,CAAC;AAmCpB,UAAU,WAAW;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;CACnB;AA6ID,wBAAgB,mBAAmB,CACjC,GAAG,EAAE,MAAM,EACX,KAAK,EAAE,MAAM,EACb,MAAM,CAAC,EAAE,WAAW,GACnB,qBAAqB,GAAG,IAAI,CAE9B;AAED,wBAAgB,iBAAiB,CAC/B,OAAO,EAAE,qBAAqB,GAAG,IAAI,EACrC,OAAO,CAAC,EAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAE,GAChC,MAAM,CAER;AAkcD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,GAAE;IAAE,cAAc,CAAC,EAAE,OAAO,CAAC;IAAC,MAAM,CAAC,EAAE,WAAW,CAAA;CAExD,GACA,gBAAgB,CAGlB;AA4KD,wBAAgB,cAAc,CAC5B,IAAI,EAAE,MAAM,EACZ,QAAQ,CAAC,EAAE,aAAa,EACxB,OAAO,CAAC,EAAE;IACR,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,MAAM,CAAC,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,QAAQ,CAAC;IACpB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,GACA,MAAM,CAsBR;AA0ID,wBAAgB,sBAAsB,CACpC,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,sBAAsB,EAAE,MAAM,GAAG,QAAQ,GACxC,OAAO,CAQT;AAgED,wBAAgB,gCAAgC,CAC9C,OAAO,EAAE,gBAAgB,GAAG,IAAI,GAC/B,OAAO,IAAI,gBAAgB,CAE7B;AAED,wBAAgB,0BAA0B,CACxC,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GAAG,IAAI,EAChC,aAAa,EAAE,iBAAiB,EAChC,wBAAwB,EAAE,OAAO,EACjC,eAAe,EAAE,OAAO,GACvB,aAAa,GAAG,SAAS,CAuB3B;AAwjBD,wBAAgB,gCAAgC,CAC9C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,uBAAuB,CAgBzB;AAaD,UAAU,kBAAkB;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAED,wBAAgB,qBAAqB,IAAI,kBAAkB,GAAG,IAAI,CAEjE;AAED,wBAAsB,2BAA2B,IAAI,OAAO,CAAC,IAAI,CAAC,CAEjE;AAED,KAAK,yBAAyB,GAAG,gBAAgB,GAAG;IAAE,QAAQ,CAAC,EAAE,MAAM,CAAA;CAAE,CAAC;AA+G1E,wBAAsB,uBAAuB,CAC3C,IAAI,EAAE,MAAM,EACZ,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,gBAAgB,GACxB,OAAO,CAAC,uBAAuB,CAAC,CAElC;AAED,wBAAsB,yBAAyB,CAC7C,UAAU,EAAE,UAAU,EACtB,GAAG,EAAE,MAAM,EACX,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,uBAAuB,CAAC,CAElC"}
@@ -5,7 +5,7 @@ import { parseHTML } from 'linkedom';
5
5
  import { extractLanguageFromClassName } from '../lib/code-lang.js';
6
6
  import { config } from '../lib/core.js';
7
7
  import { getOperationId, getRequestId, logDebug, logError, logInfo, logWarn, redactUrl, } from '../lib/core.js';
8
- import { normalizeTabContent, prepareDocumentForMarkdown, removeNoiseFromHtml, resolveDocumentBody, serializeDocumentForMarkdown, } from '../lib/dom-prep.js';
8
+ import { extractNoscriptImages, normalizeTabContent, prepareDocumentForMarkdown, removeNoiseFromHtml, resolveDocumentBody, serializeDocumentForMarkdown, } from '../lib/dom-prep.js';
9
9
  import { isRawTextContentUrl } from '../lib/http.js';
10
10
  import { cleanupMarkdownArtifacts, processFencedContent, } from '../lib/md-cleanup.js';
11
11
  import { addSourceToMarkdown, buildMetadataFooter, extractTitleFromRawMarkdown, isRawTextContent, } from '../lib/md-metadata.js';
@@ -328,6 +328,7 @@ function preserveCodeLanguageAttributes(doc) {
328
328
  }
329
329
  const STRUCTURAL_SKIP_TAGS = new Set(['HTML', 'BODY']);
330
330
  function prepareReadabilityDocument(readabilityDoc) {
331
+ extractNoscriptImages(readabilityDoc);
331
332
  preserveGalleryImages(readabilityDoc);
332
333
  preserveAlertElements(readabilityDoc);
333
334
  preserveCodeLanguageAttributes(readabilityDoc);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@j0hanz/fetch-url-mcp",
3
- "version": "1.10.15",
3
+ "version": "1.10.16",
4
4
  "mcpName": "io.github.j0hanz/fetch-url-mcp",
5
5
  "description": "A web content fetcher MCP server that converts HTML to clean, AI and human readable markdown.",
6
6
  "type": "module",