@j0hanz/superfetch 2.4.13 → 2.5.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/dist/fetch.d.ts CHANGED
@@ -1,4 +1,3 @@
1
- import type { Dispatcher } from 'undici';
2
1
  export interface FetchOptions {
3
2
  signal?: AbortSignal;
4
3
  }
@@ -18,8 +17,6 @@ export declare function validateAndNormalizeUrl(urlString: string): string;
18
17
  /** Backwards-compatible exports */
19
18
  export declare function transformToRawUrl(url: string): TransformResult;
20
19
  export declare function isRawTextContentUrl(url: string): boolean;
21
- export declare const dispatcher: Dispatcher;
22
- export declare function destroyAgents(): void;
23
20
  export interface FetchTelemetryContext {
24
21
  requestId: string;
25
22
  startTime: number;
package/dist/fetch.js CHANGED
@@ -2,9 +2,7 @@ import { randomUUID } from 'node:crypto';
2
2
  import diagnosticsChannel from 'node:diagnostics_channel';
3
3
  import dns from 'node:dns';
4
4
  import { BlockList, isIP } from 'node:net';
5
- import os from 'node:os';
6
5
  import { performance } from 'node:perf_hooks';
7
- import { Agent } from 'undici';
8
6
  import { config } from './config.js';
9
7
  import { createErrorWithCode, FetchError, isSystemError } from './errors.js';
10
8
  import { getOperationId, getRequestId, logDebug, logError, logWarn, redactUrl, } from './observability.js';
@@ -435,21 +433,16 @@ class SafeDnsLookup {
435
433
  }
436
434
  }
437
435
  const safeDns = new SafeDnsLookup();
438
- /* -------------------------------------------------------------------------------------------------
439
- * Dispatcher / Agent lifecycle
440
- * ------------------------------------------------------------------------------------------------- */
441
- function getAgentOptions() {
442
- const cpuCount = os.availableParallelism();
443
- return {
444
- keepAliveTimeout: 60000,
445
- connections: Math.max(cpuCount * 2, 25),
446
- pipelining: 1,
447
- connect: { lookup: safeDns.lookup.bind(safeDns) },
448
- };
449
- }
450
- export const dispatcher = new Agent(getAgentOptions());
451
- export function destroyAgents() {
452
- void dispatcher.close();
436
+ async function assertSafeDnsLookup(hostname) {
437
+ await new Promise((resolve, reject) => {
438
+ safeDns.lookup(hostname, { all: true }, (err) => {
439
+ if (err) {
440
+ reject(err);
441
+ return;
442
+ }
443
+ resolve();
444
+ });
445
+ });
453
446
  }
454
447
  /* -------------------------------------------------------------------------------------------------
455
448
  * Fetch error mapping (request-level)
@@ -834,7 +827,7 @@ function buildRequestSignal(timeoutMs, external) {
834
827
  return external ? AbortSignal.any([external, timeoutSignal]) : timeoutSignal;
835
828
  }
836
829
  function buildRequestInit(headers, signal) {
837
- return { method: 'GET', headers, signal, dispatcher };
830
+ return { method: 'GET', headers, signal };
838
831
  }
839
832
  function resolveResponseError(response, finalUrl) {
840
833
  if (response.status === 429) {
@@ -856,6 +849,8 @@ async function handleFetchResponse(response, finalUrl, ctx, signal) {
856
849
  }
857
850
  class HttpFetcher {
858
851
  async fetchNormalizedUrl(normalizedUrl, options) {
852
+ const { hostname } = new URL(normalizedUrl);
853
+ await assertSafeDnsLookup(hostname);
859
854
  const timeoutMs = config.fetcher.timeout;
860
855
  const headers = buildHeaders();
861
856
  const signal = buildRequestSignal(timeoutMs, options?.signal);
@@ -319,6 +319,8 @@ const detector = new LanguageDetector();
319
319
  * Detect programming language from code content using heuristics.
320
320
  */
321
321
  export function detectLanguageFromCode(code) {
322
+ if (!code || code.trim().length === 0)
323
+ return undefined;
322
324
  return detector.detect(code);
323
325
  }
324
326
  /**
@@ -398,7 +398,7 @@ function hasMarkdownSourceLine(content) {
398
398
  }
399
399
  return false;
400
400
  }
401
- function addSourceToMarkdownMarkdownFormat(content, url) {
401
+ function addSourceToMarkdownAsMarkdown(content, url) {
402
402
  if (hasMarkdownSourceLine(content))
403
403
  return content;
404
404
  const lineEnding = detectLineEnding(content);
@@ -423,7 +423,7 @@ function addSourceToMarkdownMarkdownFormat(content, url) {
423
423
  export function addSourceToMarkdown(content, url) {
424
424
  const fm = frontmatter.find(content);
425
425
  if (config.transform.metadataFormat === 'markdown' && !fm) {
426
- return addSourceToMarkdownMarkdownFormat(content, url);
426
+ return addSourceToMarkdownAsMarkdown(content, url);
427
427
  }
428
428
  if (!fm) {
429
429
  // Preserve existing behavior: always uses LF even if content uses CRLF.
package/dist/mcp.js CHANGED
@@ -5,7 +5,6 @@ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'
5
5
  import { CallToolRequestSchema, ErrorCode, McpError, } from '@modelcontextprotocol/sdk/types.js';
6
6
  import { registerCachedContentResource } from './cache.js';
7
7
  import { config } from './config.js';
8
- import { destroyAgents } from './fetch.js';
9
8
  import { logError, logInfo, setMcpServer } from './observability.js';
10
9
  import { registerConfigResource } from './resources.js';
11
10
  import { taskManager } from './tasks.js';
@@ -415,7 +414,6 @@ function handleShutdownSignal(server, signal) {
415
414
  process.stderr.write(`\n${signal} received, shutting down superFetch MCP server...\n`);
416
415
  Promise.resolve()
417
416
  .then(async () => {
418
- destroyAgents();
419
417
  await shutdownTransformWorkerPool();
420
418
  await server.close();
421
419
  })
package/dist/tools.js CHANGED
@@ -95,29 +95,33 @@ class ToolProgressReporter {
95
95
  }
96
96
  async report(progress, message) {
97
97
  try {
98
- await Promise.race([
99
- this.sendNotification({
100
- method: 'notifications/progress',
101
- params: {
102
- progressToken: this.token,
103
- progress,
104
- total: FETCH_PROGRESS_TOTAL,
105
- message,
106
- ...(this.relatedTaskMeta
107
- ? {
108
- _meta: {
109
- 'io.modelcontextprotocol/related-task': this.relatedTaskMeta,
110
- },
111
- }
112
- : {}),
113
- },
114
- }),
115
- new Promise((_, reject) => {
116
- setTimeout(() => {
117
- reject(new Error('Progress notification timeout'));
118
- }, PROGRESS_NOTIFICATION_TIMEOUT_MS);
119
- }),
120
- ]);
98
+ let timeoutId;
99
+ const timeoutPromise = new Promise((_, reject) => {
100
+ timeoutId = setTimeout(() => {
101
+ reject(new Error('Progress notification timeout'));
102
+ }, PROGRESS_NOTIFICATION_TIMEOUT_MS);
103
+ timeoutId.unref();
104
+ });
105
+ const sendPromise = this.sendNotification({
106
+ method: 'notifications/progress',
107
+ params: {
108
+ progressToken: this.token,
109
+ progress,
110
+ total: FETCH_PROGRESS_TOTAL,
111
+ message,
112
+ ...(this.relatedTaskMeta
113
+ ? {
114
+ _meta: {
115
+ 'io.modelcontextprotocol/related-task': this.relatedTaskMeta,
116
+ },
117
+ }
118
+ : {}),
119
+ },
120
+ }).finally(() => {
121
+ if (timeoutId)
122
+ clearTimeout(timeoutId);
123
+ });
124
+ await Promise.race([sendPromise, timeoutPromise]);
121
125
  }
122
126
  catch (error) {
123
127
  const isTimeout = error instanceof Error &&
package/dist/transform.js CHANGED
@@ -630,6 +630,10 @@ function translateHtmlToMarkdown(params) {
630
630
  }
631
631
  function appendMetadataFooter(content, metadata, url) {
632
632
  const footer = buildMetadataFooter(metadata, url);
633
+ if (!content.trim() && footer) {
634
+ const note = '> **Note:** This page contains no readable content. It may require JavaScript to render.\n\n';
635
+ return `${note}${footer}`;
636
+ }
633
637
  return footer ? `${content}\n\n${footer}` : content;
634
638
  }
635
639
  export function htmlToMarkdown(html, metadata, options) {
@@ -684,12 +688,12 @@ function tryTransformRawContent(params) {
684
688
  /* -------------------------------------------------------------------------------------------------
685
689
  * Quality gates + content source resolution
686
690
  * ------------------------------------------------------------------------------------------------- */
687
- const MIN_CONTENT_RATIO = 0.3;
691
+ const MIN_CONTENT_RATIO = 0.15;
688
692
  const MIN_HTML_LENGTH_FOR_GATE = 100;
689
- const MIN_HEADING_RETENTION_RATIO = 0.7;
690
- const MIN_CODE_BLOCK_RETENTION_RATIO = 0.5;
693
+ const MIN_HEADING_RETENTION_RATIO = 0.3;
694
+ const MIN_CODE_BLOCK_RETENTION_RATIO = 0.15;
691
695
  const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
692
- const MAX_TRUNCATED_LINE_RATIO = 0.5;
696
+ const MAX_TRUNCATED_LINE_RATIO = 0.95;
693
697
  function needsDocumentWrapper(html) {
694
698
  const trimmed = html.trim().toLowerCase();
695
699
  return (!trimmed.startsWith('<!doctype') &&
@@ -780,8 +784,8 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
780
784
  return metadata;
781
785
  }
782
786
  const CONTENT_ROOT_SELECTORS = [
783
- 'main',
784
787
  'article',
788
+ 'main',
785
789
  '[role="main"]',
786
790
  '#content',
787
791
  '#main-content',
@@ -808,10 +812,9 @@ function findContentRoot(document) {
808
812
  }
809
813
  return undefined;
810
814
  }
811
- function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
815
+ function shouldUseArticleContent(article, originalHtmlOrDocument) {
812
816
  const articleLength = article.textContent.length;
813
817
  const originalLength = getVisibleTextLength(originalHtmlOrDocument);
814
- const safeUrl = url.substring(0, 80);
815
818
  let articleDocument = null;
816
819
  const getArticleDocument = () => {
817
820
  if (articleDocument)
@@ -821,69 +824,45 @@ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
821
824
  };
822
825
  if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
823
826
  const ratio = articleLength / originalLength;
824
- if (ratio < MIN_CONTENT_RATIO) {
825
- logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
826
- url: safeUrl,
827
- articleLength,
828
- });
827
+ if (ratio < MIN_CONTENT_RATIO)
829
828
  return false;
830
- }
831
829
  }
832
830
  const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
833
831
  if (originalHeadings > 0) {
834
832
  const articleHeadings = countHeadingsDom(getArticleDocument());
835
833
  const retentionRatio = articleHeadings / originalHeadings;
836
- if (retentionRatio < MIN_HEADING_RETENTION_RATIO) {
837
- logDebug('Quality gate: Readability broke heading structure, using full HTML', {
838
- url: safeUrl,
839
- originalHeadings,
840
- articleHeadings,
841
- });
834
+ if (retentionRatio < MIN_HEADING_RETENTION_RATIO)
842
835
  return false;
843
- }
844
836
  }
845
837
  const originalCodeBlocks = countCodeBlocksDom(originalHtmlOrDocument);
846
838
  if (originalCodeBlocks > 0) {
847
839
  const articleCodeBlocks = countCodeBlocksDom(getArticleDocument());
848
840
  const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
849
- logDebug('Code block retention check', {
850
- url: safeUrl,
851
- originalCodeBlocks,
852
- articleCodeBlocks,
853
- codeRetentionRatio,
854
- });
855
- if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO) {
856
- logDebug('Quality gate: Readability removed code blocks, using full HTML', {
857
- url: safeUrl,
858
- originalCodeBlocks,
859
- articleCodeBlocks,
860
- });
841
+ if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO)
861
842
  return false;
862
- }
863
- }
864
- if (hasTruncatedSentences(article.textContent)) {
865
- logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', {
866
- url: safeUrl,
867
- });
868
- return false;
869
843
  }
870
- return true;
844
+ return !hasTruncatedSentences(article.textContent);
871
845
  }
872
846
  function buildContentSource(params) {
873
847
  const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, } = params;
874
848
  const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
875
849
  if (useArticleContent && article) {
876
- return { sourceHtml: article.content, title: article.title, metadata };
850
+ // Apply noise removal to Readability-extracted content to remove
851
+ // author bylines, social share buttons, and other boilerplate
852
+ // that Readability may have included in the article content
853
+ const cleanedArticleHtml = removeNoiseFromHtml(article.content, undefined, url);
854
+ return {
855
+ sourceHtml: cleanedArticleHtml,
856
+ title: article.title,
857
+ metadata,
858
+ skipNoiseRemoval: true, // Already cleaned
859
+ };
877
860
  }
878
861
  if (document) {
879
- removeNoiseFromHtml(html, document, url);
880
- const cleanedDoc = document;
862
+ const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
863
+ const { document: cleanedDoc } = parseHTML(cleanedHtml);
881
864
  const contentRoot = findContentRoot(cleanedDoc);
882
865
  if (contentRoot) {
883
- logDebug('Using content root fallback instead of full HTML', {
884
- url: url.substring(0, 80),
885
- contentLength: contentRoot.length,
886
- });
887
866
  return {
888
867
  sourceHtml: contentRoot,
889
868
  title: extractedMeta.title,
@@ -905,7 +884,7 @@ function resolveContentSource(params) {
905
884
  ...(params.signal ? { signal: params.signal } : {}),
906
885
  });
907
886
  const useArticleContent = article
908
- ? shouldUseArticleContent(article, document, params.url)
887
+ ? shouldUseArticleContent(article, document)
909
888
  : false;
910
889
  return buildContentSource({
911
890
  html: params.html,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@j0hanz/superfetch",
3
- "version": "2.4.13",
3
+ "version": "2.5.1",
4
4
  "mcpName": "io.github.j0hanz/superfetch",
5
5
  "description": "Intelligent web content fetcher MCP server that converts HTML to clean, AI-readable Markdown",
6
6
  "type": "module",
@@ -60,7 +60,6 @@
60
60
  "@mozilla/readability": "^0.6.0",
61
61
  "linkedom": "^0.18.12",
62
62
  "node-html-markdown": "^2.0.0",
63
- "undici": "^7.19.2",
64
63
  "zod": "^4.3.6"
65
64
  },
66
65
  "devDependencies": {