@j0hanz/superfetch 2.4.13 → 2.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/fetch.d.ts +0 -3
- package/dist/fetch.js +13 -18
- package/dist/language-detection.js +2 -0
- package/dist/markdown-cleanup.js +2 -2
- package/dist/mcp.js +0 -2
- package/dist/tools.js +27 -23
- package/dist/transform.js +27 -48
- package/package.json +1 -2
package/dist/fetch.d.ts
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import type { Dispatcher } from 'undici';
|
|
2
1
|
export interface FetchOptions {
|
|
3
2
|
signal?: AbortSignal;
|
|
4
3
|
}
|
|
@@ -18,8 +17,6 @@ export declare function validateAndNormalizeUrl(urlString: string): string;
|
|
|
18
17
|
/** Backwards-compatible exports */
|
|
19
18
|
export declare function transformToRawUrl(url: string): TransformResult;
|
|
20
19
|
export declare function isRawTextContentUrl(url: string): boolean;
|
|
21
|
-
export declare const dispatcher: Dispatcher;
|
|
22
|
-
export declare function destroyAgents(): void;
|
|
23
20
|
export interface FetchTelemetryContext {
|
|
24
21
|
requestId: string;
|
|
25
22
|
startTime: number;
|
package/dist/fetch.js
CHANGED
|
@@ -2,9 +2,7 @@ import { randomUUID } from 'node:crypto';
|
|
|
2
2
|
import diagnosticsChannel from 'node:diagnostics_channel';
|
|
3
3
|
import dns from 'node:dns';
|
|
4
4
|
import { BlockList, isIP } from 'node:net';
|
|
5
|
-
import os from 'node:os';
|
|
6
5
|
import { performance } from 'node:perf_hooks';
|
|
7
|
-
import { Agent } from 'undici';
|
|
8
6
|
import { config } from './config.js';
|
|
9
7
|
import { createErrorWithCode, FetchError, isSystemError } from './errors.js';
|
|
10
8
|
import { getOperationId, getRequestId, logDebug, logError, logWarn, redactUrl, } from './observability.js';
|
|
@@ -435,21 +433,16 @@ class SafeDnsLookup {
|
|
|
435
433
|
}
|
|
436
434
|
}
|
|
437
435
|
const safeDns = new SafeDnsLookup();
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
};
|
|
449
|
-
}
|
|
450
|
-
export const dispatcher = new Agent(getAgentOptions());
|
|
451
|
-
export function destroyAgents() {
|
|
452
|
-
void dispatcher.close();
|
|
436
|
+
async function assertSafeDnsLookup(hostname) {
|
|
437
|
+
await new Promise((resolve, reject) => {
|
|
438
|
+
safeDns.lookup(hostname, { all: true }, (err) => {
|
|
439
|
+
if (err) {
|
|
440
|
+
reject(err);
|
|
441
|
+
return;
|
|
442
|
+
}
|
|
443
|
+
resolve();
|
|
444
|
+
});
|
|
445
|
+
});
|
|
453
446
|
}
|
|
454
447
|
/* -------------------------------------------------------------------------------------------------
|
|
455
448
|
* Fetch error mapping (request-level)
|
|
@@ -834,7 +827,7 @@ function buildRequestSignal(timeoutMs, external) {
|
|
|
834
827
|
return external ? AbortSignal.any([external, timeoutSignal]) : timeoutSignal;
|
|
835
828
|
}
|
|
836
829
|
function buildRequestInit(headers, signal) {
|
|
837
|
-
return { method: 'GET', headers, signal
|
|
830
|
+
return { method: 'GET', headers, signal };
|
|
838
831
|
}
|
|
839
832
|
function resolveResponseError(response, finalUrl) {
|
|
840
833
|
if (response.status === 429) {
|
|
@@ -856,6 +849,8 @@ async function handleFetchResponse(response, finalUrl, ctx, signal) {
|
|
|
856
849
|
}
|
|
857
850
|
class HttpFetcher {
|
|
858
851
|
async fetchNormalizedUrl(normalizedUrl, options) {
|
|
852
|
+
const { hostname } = new URL(normalizedUrl);
|
|
853
|
+
await assertSafeDnsLookup(hostname);
|
|
859
854
|
const timeoutMs = config.fetcher.timeout;
|
|
860
855
|
const headers = buildHeaders();
|
|
861
856
|
const signal = buildRequestSignal(timeoutMs, options?.signal);
|
|
@@ -319,6 +319,8 @@ const detector = new LanguageDetector();
|
|
|
319
319
|
* Detect programming language from code content using heuristics.
|
|
320
320
|
*/
|
|
321
321
|
export function detectLanguageFromCode(code) {
|
|
322
|
+
if (!code || code.trim().length === 0)
|
|
323
|
+
return undefined;
|
|
322
324
|
return detector.detect(code);
|
|
323
325
|
}
|
|
324
326
|
/**
|
package/dist/markdown-cleanup.js
CHANGED
|
@@ -398,7 +398,7 @@ function hasMarkdownSourceLine(content) {
|
|
|
398
398
|
}
|
|
399
399
|
return false;
|
|
400
400
|
}
|
|
401
|
-
function
|
|
401
|
+
function addSourceToMarkdownAsMarkdown(content, url) {
|
|
402
402
|
if (hasMarkdownSourceLine(content))
|
|
403
403
|
return content;
|
|
404
404
|
const lineEnding = detectLineEnding(content);
|
|
@@ -423,7 +423,7 @@ function addSourceToMarkdownMarkdownFormat(content, url) {
|
|
|
423
423
|
export function addSourceToMarkdown(content, url) {
|
|
424
424
|
const fm = frontmatter.find(content);
|
|
425
425
|
if (config.transform.metadataFormat === 'markdown' && !fm) {
|
|
426
|
-
return
|
|
426
|
+
return addSourceToMarkdownAsMarkdown(content, url);
|
|
427
427
|
}
|
|
428
428
|
if (!fm) {
|
|
429
429
|
// Preserve existing behavior: always uses LF even if content uses CRLF.
|
package/dist/mcp.js
CHANGED
|
@@ -5,7 +5,6 @@ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js'
|
|
|
5
5
|
import { CallToolRequestSchema, ErrorCode, McpError, } from '@modelcontextprotocol/sdk/types.js';
|
|
6
6
|
import { registerCachedContentResource } from './cache.js';
|
|
7
7
|
import { config } from './config.js';
|
|
8
|
-
import { destroyAgents } from './fetch.js';
|
|
9
8
|
import { logError, logInfo, setMcpServer } from './observability.js';
|
|
10
9
|
import { registerConfigResource } from './resources.js';
|
|
11
10
|
import { taskManager } from './tasks.js';
|
|
@@ -415,7 +414,6 @@ function handleShutdownSignal(server, signal) {
|
|
|
415
414
|
process.stderr.write(`\n${signal} received, shutting down superFetch MCP server...\n`);
|
|
416
415
|
Promise.resolve()
|
|
417
416
|
.then(async () => {
|
|
418
|
-
destroyAgents();
|
|
419
417
|
await shutdownTransformWorkerPool();
|
|
420
418
|
await server.close();
|
|
421
419
|
})
|
package/dist/tools.js
CHANGED
|
@@ -95,29 +95,33 @@ class ToolProgressReporter {
|
|
|
95
95
|
}
|
|
96
96
|
async report(progress, message) {
|
|
97
97
|
try {
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
}
|
|
120
|
-
|
|
98
|
+
let timeoutId;
|
|
99
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
100
|
+
timeoutId = setTimeout(() => {
|
|
101
|
+
reject(new Error('Progress notification timeout'));
|
|
102
|
+
}, PROGRESS_NOTIFICATION_TIMEOUT_MS);
|
|
103
|
+
timeoutId.unref();
|
|
104
|
+
});
|
|
105
|
+
const sendPromise = this.sendNotification({
|
|
106
|
+
method: 'notifications/progress',
|
|
107
|
+
params: {
|
|
108
|
+
progressToken: this.token,
|
|
109
|
+
progress,
|
|
110
|
+
total: FETCH_PROGRESS_TOTAL,
|
|
111
|
+
message,
|
|
112
|
+
...(this.relatedTaskMeta
|
|
113
|
+
? {
|
|
114
|
+
_meta: {
|
|
115
|
+
'io.modelcontextprotocol/related-task': this.relatedTaskMeta,
|
|
116
|
+
},
|
|
117
|
+
}
|
|
118
|
+
: {}),
|
|
119
|
+
},
|
|
120
|
+
}).finally(() => {
|
|
121
|
+
if (timeoutId)
|
|
122
|
+
clearTimeout(timeoutId);
|
|
123
|
+
});
|
|
124
|
+
await Promise.race([sendPromise, timeoutPromise]);
|
|
121
125
|
}
|
|
122
126
|
catch (error) {
|
|
123
127
|
const isTimeout = error instanceof Error &&
|
package/dist/transform.js
CHANGED
|
@@ -630,6 +630,10 @@ function translateHtmlToMarkdown(params) {
|
|
|
630
630
|
}
|
|
631
631
|
function appendMetadataFooter(content, metadata, url) {
|
|
632
632
|
const footer = buildMetadataFooter(metadata, url);
|
|
633
|
+
if (!content.trim() && footer) {
|
|
634
|
+
const note = '> **Note:** This page contains no readable content. It may require JavaScript to render.\n\n';
|
|
635
|
+
return `${note}${footer}`;
|
|
636
|
+
}
|
|
633
637
|
return footer ? `${content}\n\n${footer}` : content;
|
|
634
638
|
}
|
|
635
639
|
export function htmlToMarkdown(html, metadata, options) {
|
|
@@ -684,12 +688,12 @@ function tryTransformRawContent(params) {
|
|
|
684
688
|
/* -------------------------------------------------------------------------------------------------
|
|
685
689
|
* Quality gates + content source resolution
|
|
686
690
|
* ------------------------------------------------------------------------------------------------- */
|
|
687
|
-
const MIN_CONTENT_RATIO = 0.
|
|
691
|
+
const MIN_CONTENT_RATIO = 0.15;
|
|
688
692
|
const MIN_HTML_LENGTH_FOR_GATE = 100;
|
|
689
|
-
const MIN_HEADING_RETENTION_RATIO = 0.
|
|
690
|
-
const MIN_CODE_BLOCK_RETENTION_RATIO = 0.
|
|
693
|
+
const MIN_HEADING_RETENTION_RATIO = 0.3;
|
|
694
|
+
const MIN_CODE_BLOCK_RETENTION_RATIO = 0.15;
|
|
691
695
|
const MIN_LINE_LENGTH_FOR_TRUNCATION_CHECK = 20;
|
|
692
|
-
const MAX_TRUNCATED_LINE_RATIO = 0.
|
|
696
|
+
const MAX_TRUNCATED_LINE_RATIO = 0.95;
|
|
693
697
|
function needsDocumentWrapper(html) {
|
|
694
698
|
const trimmed = html.trim().toLowerCase();
|
|
695
699
|
return (!trimmed.startsWith('<!doctype') &&
|
|
@@ -780,8 +784,8 @@ export function createContentMetadataBlock(url, article, extractedMeta, shouldEx
|
|
|
780
784
|
return metadata;
|
|
781
785
|
}
|
|
782
786
|
const CONTENT_ROOT_SELECTORS = [
|
|
783
|
-
'main',
|
|
784
787
|
'article',
|
|
788
|
+
'main',
|
|
785
789
|
'[role="main"]',
|
|
786
790
|
'#content',
|
|
787
791
|
'#main-content',
|
|
@@ -808,10 +812,9 @@ function findContentRoot(document) {
|
|
|
808
812
|
}
|
|
809
813
|
return undefined;
|
|
810
814
|
}
|
|
811
|
-
function shouldUseArticleContent(article, originalHtmlOrDocument
|
|
815
|
+
function shouldUseArticleContent(article, originalHtmlOrDocument) {
|
|
812
816
|
const articleLength = article.textContent.length;
|
|
813
817
|
const originalLength = getVisibleTextLength(originalHtmlOrDocument);
|
|
814
|
-
const safeUrl = url.substring(0, 80);
|
|
815
818
|
let articleDocument = null;
|
|
816
819
|
const getArticleDocument = () => {
|
|
817
820
|
if (articleDocument)
|
|
@@ -821,69 +824,45 @@ function shouldUseArticleContent(article, originalHtmlOrDocument, url) {
|
|
|
821
824
|
};
|
|
822
825
|
if (originalLength >= MIN_HTML_LENGTH_FOR_GATE) {
|
|
823
826
|
const ratio = articleLength / originalLength;
|
|
824
|
-
if (ratio < MIN_CONTENT_RATIO)
|
|
825
|
-
logDebug('Quality gate: Readability extraction below threshold, using full HTML', {
|
|
826
|
-
url: safeUrl,
|
|
827
|
-
articleLength,
|
|
828
|
-
});
|
|
827
|
+
if (ratio < MIN_CONTENT_RATIO)
|
|
829
828
|
return false;
|
|
830
|
-
}
|
|
831
829
|
}
|
|
832
830
|
const originalHeadings = countHeadingsDom(originalHtmlOrDocument);
|
|
833
831
|
if (originalHeadings > 0) {
|
|
834
832
|
const articleHeadings = countHeadingsDom(getArticleDocument());
|
|
835
833
|
const retentionRatio = articleHeadings / originalHeadings;
|
|
836
|
-
if (retentionRatio < MIN_HEADING_RETENTION_RATIO)
|
|
837
|
-
logDebug('Quality gate: Readability broke heading structure, using full HTML', {
|
|
838
|
-
url: safeUrl,
|
|
839
|
-
originalHeadings,
|
|
840
|
-
articleHeadings,
|
|
841
|
-
});
|
|
834
|
+
if (retentionRatio < MIN_HEADING_RETENTION_RATIO)
|
|
842
835
|
return false;
|
|
843
|
-
}
|
|
844
836
|
}
|
|
845
837
|
const originalCodeBlocks = countCodeBlocksDom(originalHtmlOrDocument);
|
|
846
838
|
if (originalCodeBlocks > 0) {
|
|
847
839
|
const articleCodeBlocks = countCodeBlocksDom(getArticleDocument());
|
|
848
840
|
const codeRetentionRatio = articleCodeBlocks / originalCodeBlocks;
|
|
849
|
-
|
|
850
|
-
url: safeUrl,
|
|
851
|
-
originalCodeBlocks,
|
|
852
|
-
articleCodeBlocks,
|
|
853
|
-
codeRetentionRatio,
|
|
854
|
-
});
|
|
855
|
-
if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO) {
|
|
856
|
-
logDebug('Quality gate: Readability removed code blocks, using full HTML', {
|
|
857
|
-
url: safeUrl,
|
|
858
|
-
originalCodeBlocks,
|
|
859
|
-
articleCodeBlocks,
|
|
860
|
-
});
|
|
841
|
+
if (codeRetentionRatio < MIN_CODE_BLOCK_RETENTION_RATIO)
|
|
861
842
|
return false;
|
|
862
|
-
}
|
|
863
|
-
}
|
|
864
|
-
if (hasTruncatedSentences(article.textContent)) {
|
|
865
|
-
logDebug('Quality gate: Extracted text has many truncated sentences, using full HTML', {
|
|
866
|
-
url: safeUrl,
|
|
867
|
-
});
|
|
868
|
-
return false;
|
|
869
843
|
}
|
|
870
|
-
return
|
|
844
|
+
return !hasTruncatedSentences(article.textContent);
|
|
871
845
|
}
|
|
872
846
|
function buildContentSource(params) {
|
|
873
847
|
const { html, url, article, extractedMeta, includeMetadata, useArticleContent, document, } = params;
|
|
874
848
|
const metadata = createContentMetadataBlock(url, article, extractedMeta, useArticleContent, includeMetadata);
|
|
875
849
|
if (useArticleContent && article) {
|
|
876
|
-
|
|
850
|
+
// Apply noise removal to Readability-extracted content to remove
|
|
851
|
+
// author bylines, social share buttons, and other boilerplate
|
|
852
|
+
// that Readability may have included in the article content
|
|
853
|
+
const cleanedArticleHtml = removeNoiseFromHtml(article.content, undefined, url);
|
|
854
|
+
return {
|
|
855
|
+
sourceHtml: cleanedArticleHtml,
|
|
856
|
+
title: article.title,
|
|
857
|
+
metadata,
|
|
858
|
+
skipNoiseRemoval: true, // Already cleaned
|
|
859
|
+
};
|
|
877
860
|
}
|
|
878
861
|
if (document) {
|
|
879
|
-
removeNoiseFromHtml(html,
|
|
880
|
-
const cleanedDoc =
|
|
862
|
+
const cleanedHtml = removeNoiseFromHtml(html, undefined, url);
|
|
863
|
+
const { document: cleanedDoc } = parseHTML(cleanedHtml);
|
|
881
864
|
const contentRoot = findContentRoot(cleanedDoc);
|
|
882
865
|
if (contentRoot) {
|
|
883
|
-
logDebug('Using content root fallback instead of full HTML', {
|
|
884
|
-
url: url.substring(0, 80),
|
|
885
|
-
contentLength: contentRoot.length,
|
|
886
|
-
});
|
|
887
866
|
return {
|
|
888
867
|
sourceHtml: contentRoot,
|
|
889
868
|
title: extractedMeta.title,
|
|
@@ -905,7 +884,7 @@ function resolveContentSource(params) {
|
|
|
905
884
|
...(params.signal ? { signal: params.signal } : {}),
|
|
906
885
|
});
|
|
907
886
|
const useArticleContent = article
|
|
908
|
-
? shouldUseArticleContent(article, document
|
|
887
|
+
? shouldUseArticleContent(article, document)
|
|
909
888
|
: false;
|
|
910
889
|
return buildContentSource({
|
|
911
890
|
html: params.html,
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@j0hanz/superfetch",
|
|
3
|
-
"version": "2.
|
|
3
|
+
"version": "2.5.1",
|
|
4
4
|
"mcpName": "io.github.j0hanz/superfetch",
|
|
5
5
|
"description": "Intelligent web content fetcher MCP server that converts HTML to clean, AI-readable Markdown",
|
|
6
6
|
"type": "module",
|
|
@@ -60,7 +60,6 @@
|
|
|
60
60
|
"@mozilla/readability": "^0.6.0",
|
|
61
61
|
"linkedom": "^0.18.12",
|
|
62
62
|
"node-html-markdown": "^2.0.0",
|
|
63
|
-
"undici": "^7.19.2",
|
|
64
63
|
"zod": "^4.3.6"
|
|
65
64
|
},
|
|
66
65
|
"devDependencies": {
|