@j0hanz/superfetch 1.2.2 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +61 -46
- package/dist/config/formatting.d.ts +1 -1
- package/dist/config/types/content.d.ts +3 -3
- package/dist/config/types/runtime.d.ts +1 -1
- package/dist/config/types/tools.d.ts +12 -12
- package/dist/http/cors.js +23 -23
- package/dist/http/download-routes.js +9 -4
- package/dist/http/mcp-routes.js +2 -13
- package/dist/http/mcp-validation.js +1 -1
- package/dist/http/server-middleware.js +2 -1
- package/dist/http/server.js +2 -0
- package/dist/index.js +5 -0
- package/dist/middleware/error-handler.js +1 -1
- package/dist/resources/cached-content.js +8 -4
- package/dist/server.js +2 -0
- package/dist/services/cache.d.ts +1 -1
- package/dist/services/cache.js +20 -7
- package/dist/services/context.d.ts +2 -4
- package/dist/services/context.js +1 -1
- package/dist/services/extractor.js +26 -21
- package/dist/services/fetcher/interceptors.d.ts +22 -0
- package/dist/services/fetcher/interceptors.js +18 -8
- package/dist/services/fetcher/response.js +32 -24
- package/dist/services/fetcher.d.ts +0 -1
- package/dist/services/fetcher.js +5 -7
- package/dist/services/metadata-collector.d.ts +10 -0
- package/dist/services/metadata-collector.js +11 -0
- package/dist/services/parser.js +26 -25
- package/dist/services/transform-worker-pool.d.ts +14 -0
- package/dist/services/transform-worker-pool.js +167 -0
- package/dist/tools/handlers/fetch-markdown.tool.d.ts +9 -1
- package/dist/tools/handlers/fetch-markdown.tool.js +58 -30
- package/dist/tools/handlers/fetch-single.shared.d.ts +8 -3
- package/dist/tools/handlers/fetch-single.shared.js +42 -17
- package/dist/tools/handlers/fetch-url.tool.js +46 -16
- package/dist/tools/index.js +13 -0
- package/dist/tools/schemas.d.ts +29 -133
- package/dist/tools/schemas.js +22 -32
- package/dist/tools/utils/common.js +20 -16
- package/dist/tools/utils/content-transform-async.d.ts +6 -0
- package/dist/tools/utils/content-transform-async.js +33 -0
- package/dist/tools/utils/content-transform.d.ts +4 -1
- package/dist/tools/utils/content-transform.js +7 -2
- package/dist/tools/utils/fetch-pipeline.js +18 -10
- package/dist/utils/content-cleaner.d.ts +1 -1
- package/dist/utils/download-url.d.ts +9 -1
- package/dist/utils/download-url.js +9 -6
- package/dist/utils/tool-error-handler.d.ts +2 -2
- package/dist/utils/tool-error-handler.js +7 -7
- package/dist/utils/url-validator.js +38 -0
- package/dist/workers/transform-worker.d.ts +1 -0
- package/dist/workers/transform-worker.js +50 -0
- package/package.json +5 -7
|
@@ -102,7 +102,10 @@ export function transformHtmlToMarkdown(html, url, options) {
|
|
|
102
102
|
};
|
|
103
103
|
}
|
|
104
104
|
export function transformHtmlToMarkdownWithBlocks(html, url, options) {
|
|
105
|
-
|
|
105
|
+
const includeContentBlocks = options.includeContentBlocks ?? true;
|
|
106
|
+
if (includeContentBlocks &&
|
|
107
|
+
!options.extractMainContent &&
|
|
108
|
+
options.includeMetadata) {
|
|
106
109
|
const parsed = parseHtmlWithMetadata(html);
|
|
107
110
|
const context = {
|
|
108
111
|
sourceHtml: html,
|
|
@@ -118,7 +121,9 @@ export function transformHtmlToMarkdownWithBlocks(html, url, options) {
|
|
|
118
121
|
};
|
|
119
122
|
}
|
|
120
123
|
const context = resolveContentSource(html, url, options);
|
|
121
|
-
const contentBlocks =
|
|
124
|
+
const contentBlocks = includeContentBlocks
|
|
125
|
+
? parseHtml(context.sourceHtml)
|
|
126
|
+
: [];
|
|
122
127
|
const { content, truncated } = buildMarkdownPayload(context, options.maxContentLength);
|
|
123
128
|
return {
|
|
124
129
|
content,
|
|
@@ -51,7 +51,7 @@ export async function executeFetchPipeline(options) {
|
|
|
51
51
|
const fetchOptions = buildFetchOptions(options);
|
|
52
52
|
logDebug('Fetching URL', { url: normalizedUrl, retries: options.retries });
|
|
53
53
|
const html = await fetchNormalizedUrlWithRetry(normalizedUrl, fetchOptions, options.retries);
|
|
54
|
-
const data = options.transform(html, normalizedUrl);
|
|
54
|
+
const data = await options.transform(html, normalizedUrl);
|
|
55
55
|
if (cache.isEnabled()) {
|
|
56
56
|
persistCache(cacheKey, data, options.serialize, normalizedUrl);
|
|
57
57
|
}
|
|
@@ -62,20 +62,28 @@ function resolveCacheKey(options, normalizedUrl) {
|
|
|
62
62
|
return cache.createCacheKey(options.cacheNamespace, normalizedUrl, cacheVary);
|
|
63
63
|
}
|
|
64
64
|
function buildFetchOptions(options) {
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
65
|
+
const fetchOptions = {};
|
|
66
|
+
if (options.customHeaders !== undefined) {
|
|
67
|
+
fetchOptions.customHeaders = options.customHeaders;
|
|
68
|
+
}
|
|
69
|
+
if (options.signal !== undefined) {
|
|
70
|
+
fetchOptions.signal = options.signal;
|
|
71
|
+
}
|
|
72
|
+
if (options.timeout !== undefined) {
|
|
73
|
+
fetchOptions.timeout = options.timeout;
|
|
74
|
+
}
|
|
75
|
+
return fetchOptions;
|
|
70
76
|
}
|
|
71
77
|
function persistCache(cacheKey, data, serialize, normalizedUrl) {
|
|
72
78
|
if (!cacheKey)
|
|
73
79
|
return;
|
|
74
80
|
const serializer = serialize ?? JSON.stringify;
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
81
|
+
const metadata = { url: normalizedUrl };
|
|
82
|
+
const title = extractTitle(data);
|
|
83
|
+
if (title !== undefined) {
|
|
84
|
+
metadata.title = title;
|
|
85
|
+
}
|
|
86
|
+
cache.set(cacheKey, serializer(data), metadata);
|
|
79
87
|
}
|
|
80
88
|
function extractTitle(value) {
|
|
81
89
|
if (!value || typeof value !== 'object')
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
export declare function cleanParagraph(text: string): string | null;
|
|
2
2
|
export declare function cleanHeading(text: string): string | null;
|
|
3
|
-
export declare function cleanListItems(items: string[]): string[];
|
|
3
|
+
export declare function cleanListItems(items: readonly string[]): readonly string[];
|
|
4
4
|
export declare function cleanCodeBlock(code: string): string | null;
|
|
5
5
|
export declare function removeInlineTimestamps(text: string): string;
|
|
@@ -1,8 +1,16 @@
|
|
|
1
|
+
import { config } from '../config/index.js';
|
|
1
2
|
import type { FileDownloadInfo } from '../config/types/tools.js';
|
|
3
|
+
import * as cache from '../services/cache.js';
|
|
4
|
+
import { generateSafeFilename } from './filename-generator.js';
|
|
2
5
|
interface DownloadInfoOptions {
|
|
3
6
|
cacheKey: string | null;
|
|
4
7
|
url: string;
|
|
5
8
|
title?: string;
|
|
6
9
|
}
|
|
7
|
-
|
|
10
|
+
interface DownloadInfoDeps {
|
|
11
|
+
readonly config?: typeof config;
|
|
12
|
+
readonly cache?: Pick<typeof cache, 'get' | 'parseCacheKey'>;
|
|
13
|
+
readonly generateSafeFilename?: typeof generateSafeFilename;
|
|
14
|
+
}
|
|
15
|
+
export declare function buildFileDownloadInfo(options: DownloadInfoOptions, deps?: DownloadInfoDeps): FileDownloadInfo | null;
|
|
8
16
|
export {};
|
|
@@ -1,22 +1,25 @@
|
|
|
1
1
|
import { config } from '../config/index.js';
|
|
2
2
|
import * as cache from '../services/cache.js';
|
|
3
3
|
import { generateSafeFilename } from './filename-generator.js';
|
|
4
|
-
export function buildFileDownloadInfo(options) {
|
|
5
|
-
|
|
4
|
+
export function buildFileDownloadInfo(options, deps = {}) {
|
|
5
|
+
const resolvedConfig = deps.config ?? config;
|
|
6
|
+
const resolvedCache = deps.cache ?? cache;
|
|
7
|
+
const resolveFilename = deps.generateSafeFilename ?? generateSafeFilename;
|
|
8
|
+
if (!resolvedConfig.runtime.httpMode) {
|
|
6
9
|
return null;
|
|
7
10
|
}
|
|
8
|
-
if (!config.cache.enabled || !options.cacheKey) {
|
|
11
|
+
if (!resolvedConfig.cache.enabled || !options.cacheKey) {
|
|
9
12
|
return null;
|
|
10
13
|
}
|
|
11
|
-
const parts = cache.parseCacheKey(options.cacheKey);
|
|
14
|
+
const parts = resolvedCache.parseCacheKey(options.cacheKey);
|
|
12
15
|
if (!parts)
|
|
13
16
|
return null;
|
|
14
|
-
const cacheEntry = cache.get(options.cacheKey);
|
|
17
|
+
const cacheEntry = resolvedCache.get(options.cacheKey);
|
|
15
18
|
if (!cacheEntry)
|
|
16
19
|
return null;
|
|
17
20
|
const { expiresAt, title, url } = cacheEntry;
|
|
18
21
|
const downloadUrl = buildDownloadUrl(parts.namespace, parts.urlHash);
|
|
19
|
-
const fileName =
|
|
22
|
+
const fileName = resolveFilename(url, title ?? options.title, parts.urlHash, resolveExtension(parts.namespace));
|
|
20
23
|
return { downloadUrl, fileName, expiresAt };
|
|
21
24
|
}
|
|
22
25
|
function buildDownloadUrl(namespace, hash) {
|
|
@@ -1,3 +1,3 @@
|
|
|
1
1
|
import type { ToolErrorResponse } from '../config/types/tools.js';
|
|
2
|
-
export declare function createToolErrorResponse(message: string, url: string, code: string): ToolErrorResponse;
|
|
3
|
-
export declare function handleToolError(error: unknown, url: string, fallbackMessage?: string): ToolErrorResponse;
|
|
2
|
+
export declare function createToolErrorResponse(message: string, url: string, code: string, details?: Record<string, unknown>): ToolErrorResponse;
|
|
3
|
+
export declare function handleToolError(error: unknown, url: string, fallbackMessage?: string, details?: Record<string, unknown>): ToolErrorResponse;
|
|
@@ -22,12 +22,12 @@ function normalizeToolErrorCode(code) {
|
|
|
22
22
|
return String(ErrorCode.InternalError);
|
|
23
23
|
return MCP_ERROR_CODE_MAP[code] ?? code;
|
|
24
24
|
}
|
|
25
|
-
export function createToolErrorResponse(message, url, code) {
|
|
25
|
+
export function createToolErrorResponse(message, url, code, details = {}) {
|
|
26
26
|
const structuredContent = {
|
|
27
|
+
...details,
|
|
27
28
|
error: message,
|
|
28
29
|
url,
|
|
29
30
|
errorCode: normalizeToolErrorCode(code),
|
|
30
|
-
errorType: code,
|
|
31
31
|
};
|
|
32
32
|
return {
|
|
33
33
|
content: [{ type: 'text', text: JSON.stringify(structuredContent) }],
|
|
@@ -42,19 +42,19 @@ function formatErrorMessage(baseMessage, error, fallback) {
|
|
|
42
42
|
}
|
|
43
43
|
return message;
|
|
44
44
|
}
|
|
45
|
-
export function handleToolError(error, url, fallbackMessage = 'Operation failed') {
|
|
45
|
+
export function handleToolError(error, url, fallbackMessage = 'Operation failed', details = {}) {
|
|
46
46
|
if (isValidationError(error)) {
|
|
47
|
-
return createToolErrorResponse(error.message, url, 'VALIDATION_ERROR');
|
|
47
|
+
return createToolErrorResponse(error.message, url, 'VALIDATION_ERROR', details);
|
|
48
48
|
}
|
|
49
49
|
if (error instanceof FetchError) {
|
|
50
50
|
const message = formatErrorMessage(error.message, error);
|
|
51
|
-
return createToolErrorResponse(message, url, error.code);
|
|
51
|
+
return createToolErrorResponse(message, url, error.code, details);
|
|
52
52
|
}
|
|
53
53
|
if (error instanceof Error) {
|
|
54
54
|
const message = formatErrorMessage(error.message, error, fallbackMessage);
|
|
55
|
-
return createToolErrorResponse(message, url, 'UNKNOWN_ERROR');
|
|
55
|
+
return createToolErrorResponse(message, url, 'UNKNOWN_ERROR', details);
|
|
56
56
|
}
|
|
57
|
-
return createToolErrorResponse(`${fallbackMessage}: Unknown error`, url, 'UNKNOWN_ERROR');
|
|
57
|
+
return createToolErrorResponse(`${fallbackMessage}: Unknown error`, url, 'UNKNOWN_ERROR', details);
|
|
58
58
|
}
|
|
59
59
|
function isValidationError(error) {
|
|
60
60
|
return (error instanceof Error &&
|
|
@@ -32,6 +32,35 @@ for (const entry of BLOCKED_IPV6_SUBNETS) {
|
|
|
32
32
|
BLOCK_LIST.addSubnet(entry.subnet, entry.prefix, 'ipv6');
|
|
33
33
|
}
|
|
34
34
|
const DNS_LOOKUP_TIMEOUT_MS = 5000;
|
|
35
|
+
const DNS_DECISION_TTL_MS = 60000;
|
|
36
|
+
const DNS_DECISION_MAX = 1000;
|
|
37
|
+
const dnsDecisionCache = new Map();
|
|
38
|
+
function getCachedDnsDecision(hostname) {
|
|
39
|
+
const cached = dnsDecisionCache.get(hostname);
|
|
40
|
+
if (!cached)
|
|
41
|
+
return null;
|
|
42
|
+
if (cached.expiresAt <= Date.now()) {
|
|
43
|
+
dnsDecisionCache.delete(hostname);
|
|
44
|
+
return null;
|
|
45
|
+
}
|
|
46
|
+
return cached;
|
|
47
|
+
}
|
|
48
|
+
function setCachedDnsDecision(hostname, ok) {
|
|
49
|
+
dnsDecisionCache.set(hostname, {
|
|
50
|
+
ok,
|
|
51
|
+
expiresAt: Date.now() + DNS_DECISION_TTL_MS,
|
|
52
|
+
});
|
|
53
|
+
if (dnsDecisionCache.size <= DNS_DECISION_MAX)
|
|
54
|
+
return;
|
|
55
|
+
const evictCount = Math.ceil(DNS_DECISION_MAX * 0.05);
|
|
56
|
+
const iterator = dnsDecisionCache.keys();
|
|
57
|
+
for (let i = 0; i < evictCount; i++) {
|
|
58
|
+
const { value, done } = iterator.next();
|
|
59
|
+
if (done)
|
|
60
|
+
break;
|
|
61
|
+
dnsDecisionCache.delete(value);
|
|
62
|
+
}
|
|
63
|
+
}
|
|
35
64
|
function matchesBlockedIpPatterns(resolvedIp) {
|
|
36
65
|
for (const pattern of config.security.blockedIpPatterns) {
|
|
37
66
|
if (pattern.test(resolvedIp)) {
|
|
@@ -79,6 +108,13 @@ function lookupWithTimeout(hostname) {
|
|
|
79
108
|
});
|
|
80
109
|
}
|
|
81
110
|
export async function assertResolvedAddressesAllowed(hostname) {
|
|
111
|
+
const cached = getCachedDnsDecision(hostname);
|
|
112
|
+
if (cached) {
|
|
113
|
+
if (!cached.ok) {
|
|
114
|
+
throw createValidationError(`Blocked IP range resolved from hostname: ${hostname}`);
|
|
115
|
+
}
|
|
116
|
+
return;
|
|
117
|
+
}
|
|
82
118
|
try {
|
|
83
119
|
const result = await lookupWithTimeout(hostname);
|
|
84
120
|
const addresses = Array.isArray(result) ? result : [result];
|
|
@@ -87,9 +123,11 @@ export async function assertResolvedAddressesAllowed(hostname) {
|
|
|
87
123
|
}
|
|
88
124
|
for (const { address } of addresses) {
|
|
89
125
|
if (isBlockedIp(address.toLowerCase())) {
|
|
126
|
+
setCachedDnsDecision(hostname, false);
|
|
90
127
|
throw createValidationError(`Blocked IP range resolved from hostname: ${hostname}`);
|
|
91
128
|
}
|
|
92
129
|
}
|
|
130
|
+
setCachedDnsDecision(hostname, true);
|
|
93
131
|
}
|
|
94
132
|
catch (error) {
|
|
95
133
|
const code = error?.code;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { parentPort } from 'node:worker_threads';
|
|
2
|
+
import { transformHtmlToJsonl, transformHtmlToMarkdown, transformHtmlToMarkdownWithBlocks, } from '../tools/utils/content-transform.js';
|
|
3
|
+
function isTransformJob(value) {
|
|
4
|
+
if (!value || typeof value !== 'object')
|
|
5
|
+
return false;
|
|
6
|
+
const record = value;
|
|
7
|
+
return (typeof record.id === 'number' &&
|
|
8
|
+
typeof record.mode === 'string' &&
|
|
9
|
+
typeof record.html === 'string' &&
|
|
10
|
+
typeof record.url === 'string');
|
|
11
|
+
}
|
|
12
|
+
function resolveTransform(job) {
|
|
13
|
+
if (job.mode === 'markdown') {
|
|
14
|
+
return transformHtmlToMarkdown(job.html, job.url, job.options);
|
|
15
|
+
}
|
|
16
|
+
if (job.mode === 'markdown-blocks') {
|
|
17
|
+
return transformHtmlToMarkdownWithBlocks(job.html, job.url, {
|
|
18
|
+
...job.options,
|
|
19
|
+
includeContentBlocks: job.options.includeContentBlocks ?? true,
|
|
20
|
+
});
|
|
21
|
+
}
|
|
22
|
+
return transformHtmlToJsonl(job.html, job.url, job.options);
|
|
23
|
+
}
|
|
24
|
+
function sendResponse(response) {
|
|
25
|
+
if (!parentPort)
|
|
26
|
+
return;
|
|
27
|
+
parentPort.postMessage(response);
|
|
28
|
+
}
|
|
29
|
+
function handleMessage(message) {
|
|
30
|
+
if (!isTransformJob(message)) {
|
|
31
|
+
sendResponse({
|
|
32
|
+
id: -1,
|
|
33
|
+
ok: false,
|
|
34
|
+
error: 'Invalid transform job payload',
|
|
35
|
+
});
|
|
36
|
+
return;
|
|
37
|
+
}
|
|
38
|
+
try {
|
|
39
|
+
const result = resolveTransform(message);
|
|
40
|
+
sendResponse({ id: message.id, ok: true, result });
|
|
41
|
+
}
|
|
42
|
+
catch (error) {
|
|
43
|
+
sendResponse({
|
|
44
|
+
id: message.id,
|
|
45
|
+
ok: false,
|
|
46
|
+
error: error instanceof Error ? error.message : String(error),
|
|
47
|
+
});
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
parentPort?.on('message', handleMessage);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@j0hanz/superfetch",
|
|
3
|
-
"version": "1.2.2",
|
|
3
|
+
"version": "1.2.4",
|
|
4
4
|
"mcpName": "io.github.j0hanz/superfetch",
|
|
5
5
|
"description": "Intelligent web content fetcher MCP server that converts HTML to clean, AI-readable JSONL format",
|
|
6
6
|
"type": "module",
|
|
@@ -44,8 +44,8 @@
|
|
|
44
44
|
"type-check": "tsc --noEmit",
|
|
45
45
|
"lint": "eslint .",
|
|
46
46
|
"lint:fix": "eslint . --fix",
|
|
47
|
-
"test": "
|
|
48
|
-
"test:coverage": "
|
|
47
|
+
"test": "npm run build --silent && node --test --experimental-transform-types",
|
|
48
|
+
"test:coverage": "npm run build --silent && node --test --experimental-transform-types --experimental-test-coverage",
|
|
49
49
|
"bench": "npm run build && node scripts/bench.mjs",
|
|
50
50
|
"knip": "knip",
|
|
51
51
|
"knip:fix": "knip --fix"
|
|
@@ -59,7 +59,7 @@
|
|
|
59
59
|
"linkedom": "^0.18.12",
|
|
60
60
|
"turndown": "^7.2.2",
|
|
61
61
|
"undici": "^6.22.0",
|
|
62
|
-
"zod": "^3.
|
|
62
|
+
"zod": "^4.3.4"
|
|
63
63
|
},
|
|
64
64
|
"devDependencies": {
|
|
65
65
|
"@eslint/js": "^9.39.2",
|
|
@@ -67,7 +67,6 @@
|
|
|
67
67
|
"@types/express": "^5.0.6",
|
|
68
68
|
"@types/node": "^22.19.3",
|
|
69
69
|
"@types/turndown": "^5.0.6",
|
|
70
|
-
"@vitest/coverage-v8": "^2.1.9",
|
|
71
70
|
"eslint": "^9.23.2",
|
|
72
71
|
"eslint-config-prettier": "^10.1.8",
|
|
73
72
|
"eslint-plugin-unused-imports": "^4.3.0",
|
|
@@ -76,8 +75,7 @@
|
|
|
76
75
|
"shx": "^0.4.0",
|
|
77
76
|
"tsx": "^4.21.0",
|
|
78
77
|
"typescript": "^5.9.3",
|
|
79
|
-
"typescript-eslint": "^8.51.0",
|
|
80
|
-
"vitest": "^2.1.9"
|
|
78
|
+
"typescript-eslint": "^8.51.0"
|
|
81
79
|
},
|
|
82
80
|
"engines": {
|
|
83
81
|
"node": ">=20.12.0"
|