@j0hanz/superfetch 2.5.2 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +356 -223
- package/dist/assets/logo.svg +24837 -24835
- package/dist/cache.d.ts +28 -20
- package/dist/cache.js +292 -514
- package/dist/config.d.ts +41 -7
- package/dist/config.js +298 -148
- package/dist/crypto.js +25 -12
- package/dist/dom-noise-removal.js +379 -421
- package/dist/errors.d.ts +2 -2
- package/dist/errors.js +25 -8
- package/dist/fetch.d.ts +18 -16
- package/dist/fetch.js +1132 -526
- package/dist/host-normalization.js +40 -10
- package/dist/http-native.js +628 -287
- package/dist/index.js +67 -7
- package/dist/instructions.md +44 -30
- package/dist/ip-blocklist.d.ts +8 -0
- package/dist/ip-blocklist.js +65 -0
- package/dist/json.js +14 -9
- package/dist/language-detection.d.ts +2 -11
- package/dist/language-detection.js +289 -280
- package/dist/markdown-cleanup.d.ts +0 -1
- package/dist/markdown-cleanup.js +391 -429
- package/dist/mcp-validator.js +4 -2
- package/dist/mcp.js +184 -135
- package/dist/observability.js +89 -21
- package/dist/resources.js +16 -6
- package/dist/server-tuning.d.ts +2 -0
- package/dist/server-tuning.js +25 -23
- package/dist/session.d.ts +1 -0
- package/dist/session.js +41 -33
- package/dist/tasks.d.ts +2 -0
- package/dist/tasks.js +91 -9
- package/dist/timer-utils.d.ts +5 -0
- package/dist/timer-utils.js +20 -0
- package/dist/tools.d.ts +28 -5
- package/dist/tools.js +317 -183
- package/dist/transform-types.d.ts +5 -1
- package/dist/transform.d.ts +3 -2
- package/dist/transform.js +1138 -421
- package/dist/type-guards.d.ts +1 -0
- package/dist/type-guards.js +7 -0
- package/dist/workers/transform-child.d.ts +1 -0
- package/dist/workers/transform-child.js +118 -0
- package/dist/workers/transform-worker.js +87 -78
- package/package.json +21 -13
package/dist/index.js
CHANGED
|
@@ -1,13 +1,59 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
|
+
import process from 'node:process';
|
|
2
3
|
import { parseArgs } from 'node:util';
|
|
4
|
+
import { serverVersion } from './config.js';
|
|
3
5
|
import { startHttpServer } from './http-native.js';
|
|
4
6
|
import { startStdioServer } from './mcp.js';
|
|
5
7
|
import { logError } from './observability.js';
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
8
|
+
function printUsage() {
|
|
9
|
+
process.stdout.write([
|
|
10
|
+
'superfetch MCP server',
|
|
11
|
+
'',
|
|
12
|
+
'Usage:',
|
|
13
|
+
' superfetch [--stdio] [--help] [--version]',
|
|
14
|
+
'',
|
|
15
|
+
'Options:',
|
|
16
|
+
' --stdio Run in stdio mode (no HTTP server).',
|
|
17
|
+
' --help Show this help message.',
|
|
18
|
+
' --version Show server version.',
|
|
19
|
+
'',
|
|
20
|
+
].join('\n'));
|
|
21
|
+
}
|
|
22
|
+
const FORCE_EXIT_TIMEOUT_MS = 10_000;
|
|
23
|
+
let forcedExitTimer;
|
|
24
|
+
function scheduleForcedExit(reason) {
|
|
25
|
+
if (forcedExitTimer)
|
|
26
|
+
return;
|
|
27
|
+
forcedExitTimer = setTimeout(() => {
|
|
28
|
+
process.stderr.write(`${reason}; forcing exit.\n`);
|
|
29
|
+
process.exit(1);
|
|
30
|
+
}, FORCE_EXIT_TIMEOUT_MS);
|
|
31
|
+
forcedExitTimer.unref();
|
|
32
|
+
}
|
|
33
|
+
let values;
|
|
34
|
+
try {
|
|
35
|
+
({ values } = parseArgs({
|
|
36
|
+
options: {
|
|
37
|
+
stdio: { type: 'boolean', default: false },
|
|
38
|
+
help: { type: 'boolean', default: false },
|
|
39
|
+
version: { type: 'boolean', default: false },
|
|
40
|
+
},
|
|
41
|
+
}));
|
|
42
|
+
}
|
|
43
|
+
catch (error) {
|
|
44
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
45
|
+
process.stderr.write(`Invalid arguments: ${message}\n\n`);
|
|
46
|
+
printUsage();
|
|
47
|
+
process.exit(1);
|
|
48
|
+
}
|
|
49
|
+
if (values.help) {
|
|
50
|
+
printUsage();
|
|
51
|
+
process.exit(0);
|
|
52
|
+
}
|
|
53
|
+
if (values.version) {
|
|
54
|
+
process.stdout.write(`${serverVersion}\n`);
|
|
55
|
+
process.exit(0);
|
|
56
|
+
}
|
|
11
57
|
const isStdioMode = values.stdio;
|
|
12
58
|
let isShuttingDown = false;
|
|
13
59
|
const shutdownHandlerRef = {};
|
|
@@ -21,14 +67,26 @@ function attemptShutdown(signal) {
|
|
|
21
67
|
process.stderr.write('Attempting graceful shutdown...\n');
|
|
22
68
|
void shutdownHandlerRef.current(signal);
|
|
23
69
|
}
|
|
70
|
+
function registerHttpSignalHandlers() {
|
|
71
|
+
process.once('SIGINT', () => {
|
|
72
|
+
if (shouldAttemptShutdown())
|
|
73
|
+
attemptShutdown('SIGINT');
|
|
74
|
+
});
|
|
75
|
+
process.once('SIGTERM', () => {
|
|
76
|
+
if (shouldAttemptShutdown())
|
|
77
|
+
attemptShutdown('SIGTERM');
|
|
78
|
+
});
|
|
79
|
+
}
|
|
24
80
|
function handleFatalError(label, error, signal) {
|
|
25
81
|
logError(label, error);
|
|
26
82
|
process.stderr.write(`${label}: ${error.message}\n`);
|
|
83
|
+
process.exitCode = 1;
|
|
27
84
|
if (shouldAttemptShutdown()) {
|
|
28
85
|
attemptShutdown(signal);
|
|
86
|
+
scheduleForcedExit('Graceful shutdown timed out');
|
|
29
87
|
return;
|
|
30
88
|
}
|
|
31
|
-
|
|
89
|
+
scheduleForcedExit('Fatal error without shutdown handler');
|
|
32
90
|
}
|
|
33
91
|
process.on('uncaughtException', (error) => {
|
|
34
92
|
handleFatalError('Uncaught exception', error, 'UNCAUGHT_EXCEPTION');
|
|
@@ -44,11 +102,13 @@ try {
|
|
|
44
102
|
else {
|
|
45
103
|
const { shutdown } = await startHttpServer();
|
|
46
104
|
shutdownHandlerRef.current = shutdown;
|
|
105
|
+
registerHttpSignalHandlers();
|
|
47
106
|
}
|
|
48
107
|
}
|
|
49
108
|
catch (error) {
|
|
50
109
|
logError('Failed to start server', error instanceof Error ? error : undefined);
|
|
51
110
|
const message = error instanceof Error ? error.message : String(error);
|
|
52
111
|
process.stderr.write(`Failed to start server: ${message}\n`);
|
|
53
|
-
process.
|
|
112
|
+
process.exitCode = 1;
|
|
113
|
+
scheduleForcedExit('Startup failure');
|
|
54
114
|
}
|
package/dist/instructions.md
CHANGED
|
@@ -1,44 +1,58 @@
|
|
|
1
|
-
#
|
|
1
|
+
# SUPERFETCH INSTRUCTIONS
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Available as resource (internal://instructions) or prompt (get-help). Load when unsure about tool usage.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
---
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
- **Primary Resources:** Markdown content, cached snapshots (`superfetch://cache/...`).
|
|
9
|
-
- **Tools:** `fetch-url` (**Read-only**; no write tools exist).
|
|
7
|
+
## CORE CAPABILITY
|
|
10
8
|
|
|
11
|
-
|
|
9
|
+
- Domain: Fetch public web pages and convert HTML to clean, LLM-readable Markdown.
|
|
10
|
+
- Primary Resources: Markdown content, cached snapshots (superfetch://cache/...).
|
|
11
|
+
- Tools: fetch-url (READ-ONLY; no write tools exist).
|
|
12
12
|
|
|
13
|
-
|
|
13
|
+
---
|
|
14
14
|
|
|
15
|
-
|
|
16
|
-
2. Read the `markdown` field from `structuredContent`.
|
|
17
|
-
3. **If truncated** (ends with `...[truncated]`): read the `resource_link` URI to get full content.
|
|
18
|
-
> Constraint: Never guess URIs; always use the one returned.
|
|
15
|
+
## THE “GOLDEN PATH” WORKFLOWS (CRITICAL)
|
|
19
16
|
|
|
20
|
-
###
|
|
17
|
+
### WORKFLOW A: STANDARD FETCH
|
|
21
18
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
19
|
+
- Call fetch-url with: { "url": "https://..." }
|
|
20
|
+
- Read the “markdown” field from “structuredContent”.
|
|
21
|
+
- If truncated (ends with "...[truncated]"): read the "resource_link" URI to get full content.
|
|
22
|
+
NOTE: Never guess URIs; always use the one returned.
|
|
25
23
|
|
|
26
|
-
|
|
24
|
+
### WORKFLOW B: ASYNC EXECUTION (LARGE SITES / TIMEOUTS)
|
|
27
25
|
|
|
28
|
-
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
- **Side effects:** None (read-only, idempotent). Populates cache automatically.
|
|
32
|
-
- **Limits:** Inline content capped at 20,000 chars; larger content offloaded to `superfetch://cache/...`.
|
|
33
|
-
- **Blocked targets:** `localhost`, private IPs (`10.x`, `172.16–31.x`, `192.168.x`), cloud metadata endpoints.
|
|
26
|
+
- Call tools/call with task: { ttl: ... } to start a background fetch.
|
|
27
|
+
- Poll tasks/get until status is “completed” or “failed”.
|
|
28
|
+
- Retrieve result via tasks/result.
|
|
34
29
|
|
|
35
|
-
|
|
30
|
+
---
|
|
36
31
|
|
|
37
|
-
|
|
38
|
-
- **`FETCH_ERROR`:** Network/upstream failure. **Retry once** with backoff.
|
|
39
|
-
- **`queue_full`:** Worker pool busy. Wait briefly, then retry or use Task interface.
|
|
32
|
+
## TOOL NUANCES & GOTCHAS
|
|
40
33
|
|
|
41
|
-
|
|
34
|
+
fetch-url
|
|
42
35
|
|
|
43
|
-
-
|
|
44
|
-
-
|
|
36
|
+
- Purpose: Fetch a URL and return Markdown.
|
|
37
|
+
- Input: { "url": "https://..." }
|
|
38
|
+
- Optional: skipNoiseRemoval (bool, keeps nav/footers), forceRefresh (bool, bypasses cache).
|
|
39
|
+
- Side effects: None (read-only, idempotent). Populates cache automatically.
|
|
40
|
+
- Limits: HTML capped at 10 MB (MAX_HTML_BYTES). Inline content unlimited by default; set MAX_INLINE_CONTENT_CHARS to cap.
|
|
41
|
+
- Blocked: localhost, private IPs (10.x, 172.16–31.x, 192.168.x), cloud metadata endpoints.
|
|
42
|
+
- Quality: Varies by HTML structure. Best with articles/docs. Always verify output.
|
|
43
|
+
|
|
44
|
+
---
|
|
45
|
+
|
|
46
|
+
## ERROR HANDLING STRATEGY
|
|
47
|
+
|
|
48
|
+
- VALIDATION_ERROR: URL invalid or blocked. Do not retry.
|
|
49
|
+
- FETCH_ERROR: Network/upstream failure. Retry once with backoff.
|
|
50
|
+
- queue_full: Worker pool busy. Wait briefly, then retry or use Task interface.
|
|
51
|
+
|
|
52
|
+
---
|
|
53
|
+
|
|
54
|
+
## RESOURCES
|
|
55
|
+
|
|
56
|
+
- internal://instructions — This document.
|
|
57
|
+
- internal://config — Current server limits (secrets redacted).
|
|
58
|
+
- superfetch://cache/{key} — Immutable cached snapshots. Re-fetch for fresh content.
|
package/dist/ip-blocklist.js
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import { BlockList, isIP } from 'node:net';
|
|
2
|
+
function buildIpv4(parts) {
|
|
3
|
+
return parts.join('.');
|
|
4
|
+
}
|
|
5
|
+
function buildIpv6(parts) {
|
|
6
|
+
return parts.map(String).join(':');
|
|
7
|
+
}
|
|
8
|
+
const IPV6_ZERO = buildIpv6([0, 0, 0, 0, 0, 0, 0, 0]);
|
|
9
|
+
const IPV6_LOOPBACK = buildIpv6([0, 0, 0, 0, 0, 0, 0, 1]);
|
|
10
|
+
const IPV6_64_FF9B = buildIpv6(['64', 'ff9b', 0, 0, 0, 0, 0, 0]);
|
|
11
|
+
const IPV6_64_FF9B_1 = buildIpv6(['64', 'ff9b', 1, 0, 0, 0, 0, 0]);
|
|
12
|
+
const IPV6_2001 = buildIpv6(['2001', 0, 0, 0, 0, 0, 0, 0]);
|
|
13
|
+
const IPV6_2002 = buildIpv6(['2002', 0, 0, 0, 0, 0, 0, 0]);
|
|
14
|
+
const IPV6_FC00 = buildIpv6(['fc00', 0, 0, 0, 0, 0, 0, 0]);
|
|
15
|
+
const IPV6_FE80 = buildIpv6(['fe80', 0, 0, 0, 0, 0, 0, 0]);
|
|
16
|
+
const IPV6_FF00 = buildIpv6(['ff00', 0, 0, 0, 0, 0, 0, 0]);
|
|
17
|
+
const BLOCKED_SUBNETS = [
|
|
18
|
+
{ subnet: buildIpv4([0, 0, 0, 0]), prefix: 8, family: 'ipv4' },
|
|
19
|
+
{ subnet: buildIpv4([10, 0, 0, 0]), prefix: 8, family: 'ipv4' },
|
|
20
|
+
{ subnet: buildIpv4([100, 64, 0, 0]), prefix: 10, family: 'ipv4' },
|
|
21
|
+
{ subnet: buildIpv4([127, 0, 0, 0]), prefix: 8, family: 'ipv4' },
|
|
22
|
+
{ subnet: buildIpv4([169, 254, 0, 0]), prefix: 16, family: 'ipv4' },
|
|
23
|
+
{ subnet: buildIpv4([172, 16, 0, 0]), prefix: 12, family: 'ipv4' },
|
|
24
|
+
{ subnet: buildIpv4([192, 168, 0, 0]), prefix: 16, family: 'ipv4' },
|
|
25
|
+
{ subnet: buildIpv4([224, 0, 0, 0]), prefix: 4, family: 'ipv4' },
|
|
26
|
+
{ subnet: buildIpv4([240, 0, 0, 0]), prefix: 4, family: 'ipv4' },
|
|
27
|
+
{ subnet: IPV6_ZERO, prefix: 128, family: 'ipv6' },
|
|
28
|
+
{ subnet: IPV6_LOOPBACK, prefix: 128, family: 'ipv6' },
|
|
29
|
+
{ subnet: IPV6_64_FF9B, prefix: 96, family: 'ipv6' },
|
|
30
|
+
{ subnet: IPV6_64_FF9B_1, prefix: 48, family: 'ipv6' },
|
|
31
|
+
{ subnet: IPV6_2001, prefix: 32, family: 'ipv6' },
|
|
32
|
+
{ subnet: IPV6_2002, prefix: 16, family: 'ipv6' },
|
|
33
|
+
{ subnet: IPV6_FC00, prefix: 7, family: 'ipv6' },
|
|
34
|
+
{ subnet: IPV6_FE80, prefix: 10, family: 'ipv6' },
|
|
35
|
+
{ subnet: IPV6_FF00, prefix: 8, family: 'ipv6' },
|
|
36
|
+
];
|
|
37
|
+
export function createDefaultBlockList() {
|
|
38
|
+
const list = new BlockList();
|
|
39
|
+
for (const entry of BLOCKED_SUBNETS) {
|
|
40
|
+
list.addSubnet(entry.subnet, entry.prefix, entry.family);
|
|
41
|
+
}
|
|
42
|
+
return list;
|
|
43
|
+
}
|
|
44
|
+
function extractMappedIpv4(ip) {
|
|
45
|
+
const prefix = '::ffff:';
|
|
46
|
+
if (!ip.startsWith(prefix))
|
|
47
|
+
return null;
|
|
48
|
+
const mapped = ip.slice(prefix.length);
|
|
49
|
+
return isIP(mapped) === 4 ? mapped : null;
|
|
50
|
+
}
|
|
51
|
+
export function normalizeIpForBlockList(input) {
|
|
52
|
+
const trimmed = input.trim().toLowerCase();
|
|
53
|
+
if (!trimmed)
|
|
54
|
+
return null;
|
|
55
|
+
const ipType = isIP(trimmed);
|
|
56
|
+
if (ipType === 4)
|
|
57
|
+
return { ip: trimmed, family: 'ipv4' };
|
|
58
|
+
if (ipType === 6) {
|
|
59
|
+
const mapped = extractMappedIpv4(trimmed);
|
|
60
|
+
if (mapped)
|
|
61
|
+
return { ip: mapped, family: 'ipv4' };
|
|
62
|
+
return { ip: trimmed, family: 'ipv6' };
|
|
63
|
+
}
|
|
64
|
+
return null;
|
|
65
|
+
}
|
package/dist/json.js
CHANGED
|
@@ -7,21 +7,26 @@ function processValue(obj, depth, seen) {
|
|
|
7
7
|
if (depth > MAX_DEPTH) {
|
|
8
8
|
throw new Error(`stableStringify: Max depth (${MAX_DEPTH}) exceeded`);
|
|
9
9
|
}
|
|
10
|
-
// Cycle detection
|
|
10
|
+
// Cycle detection (track active recursion stack only).
|
|
11
11
|
if (seen.has(obj)) {
|
|
12
12
|
throw new Error('stableStringify: Circular reference detected');
|
|
13
13
|
}
|
|
14
14
|
seen.add(obj);
|
|
15
|
-
|
|
16
|
-
|
|
15
|
+
try {
|
|
16
|
+
if (Array.isArray(obj)) {
|
|
17
|
+
return obj.map((item) => processValue(item, depth + 1, seen));
|
|
18
|
+
}
|
|
19
|
+
const keys = Object.keys(obj).sort((a, b) => a.localeCompare(b));
|
|
20
|
+
const record = obj;
|
|
21
|
+
const sortedObj = {};
|
|
22
|
+
for (const key of keys) {
|
|
23
|
+
sortedObj[key] = processValue(record[key], depth + 1, seen);
|
|
24
|
+
}
|
|
25
|
+
return sortedObj;
|
|
17
26
|
}
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
const sortedObj = {};
|
|
21
|
-
for (const key of keys) {
|
|
22
|
-
sortedObj[key] = processValue(record[key], depth + 1, seen);
|
|
27
|
+
finally {
|
|
28
|
+
seen.delete(obj);
|
|
23
29
|
}
|
|
24
|
-
return sortedObj;
|
|
25
30
|
}
|
|
26
31
|
export function stableStringify(obj, depth = 0, seen = new WeakSet()) {
|
|
27
32
|
const processed = processValue(obj, depth, seen);
|
package/dist/language-detection.d.ts
CHANGED
|
@@ -1,12 +1,3 @@
|
|
|
1
|
-
|
|
2
|
-
* Language detection for code blocks.
|
|
3
|
-
* Detects programming languages from code content and HTML attributes.
|
|
4
|
-
*/
|
|
5
|
-
/**
|
|
6
|
-
* Detect programming language from code content using heuristics.
|
|
7
|
-
*/
|
|
8
|
-
export declare function detectLanguageFromCode(code: string): string | undefined;
|
|
9
|
-
/**
|
|
10
|
-
* Resolve language from HTML attributes (class name and data-language).
|
|
11
|
-
*/
|
|
1
|
+
export declare function extractLanguageFromClassName(className: string): string | undefined;
|
|
12
2
|
export declare function resolveLanguageFromAttributes(className: string, dataLang: string): string | undefined;
|
|
3
|
+
export declare function detectLanguageFromCode(code: string): string | undefined;
|