@j0hanz/superfetch 2.5.2 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. package/README.md +356 -223
  2. package/dist/assets/logo.svg +24837 -24835
  3. package/dist/cache.d.ts +28 -20
  4. package/dist/cache.js +292 -514
  5. package/dist/config.d.ts +41 -7
  6. package/dist/config.js +298 -148
  7. package/dist/crypto.js +25 -12
  8. package/dist/dom-noise-removal.js +379 -421
  9. package/dist/errors.d.ts +2 -2
  10. package/dist/errors.js +25 -8
  11. package/dist/fetch.d.ts +18 -16
  12. package/dist/fetch.js +1132 -526
  13. package/dist/host-normalization.js +40 -10
  14. package/dist/http-native.js +628 -287
  15. package/dist/index.js +67 -7
  16. package/dist/instructions.md +44 -30
  17. package/dist/ip-blocklist.d.ts +8 -0
  18. package/dist/ip-blocklist.js +65 -0
  19. package/dist/json.js +14 -9
  20. package/dist/language-detection.d.ts +2 -11
  21. package/dist/language-detection.js +289 -280
  22. package/dist/markdown-cleanup.d.ts +0 -1
  23. package/dist/markdown-cleanup.js +391 -429
  24. package/dist/mcp-validator.js +4 -2
  25. package/dist/mcp.js +184 -135
  26. package/dist/observability.js +89 -21
  27. package/dist/resources.js +16 -6
  28. package/dist/server-tuning.d.ts +2 -0
  29. package/dist/server-tuning.js +25 -23
  30. package/dist/session.d.ts +1 -0
  31. package/dist/session.js +41 -33
  32. package/dist/tasks.d.ts +2 -0
  33. package/dist/tasks.js +91 -9
  34. package/dist/timer-utils.d.ts +5 -0
  35. package/dist/timer-utils.js +20 -0
  36. package/dist/tools.d.ts +28 -5
  37. package/dist/tools.js +317 -183
  38. package/dist/transform-types.d.ts +5 -1
  39. package/dist/transform.d.ts +3 -2
  40. package/dist/transform.js +1138 -421
  41. package/dist/type-guards.d.ts +1 -0
  42. package/dist/type-guards.js +7 -0
  43. package/dist/workers/transform-child.d.ts +1 -0
  44. package/dist/workers/transform-child.js +118 -0
  45. package/dist/workers/transform-worker.js +87 -78
  46. package/package.json +21 -13
package/dist/index.js CHANGED
@@ -1,13 +1,59 @@
1
1
  #!/usr/bin/env node
2
+ import process from 'node:process';
2
3
  import { parseArgs } from 'node:util';
4
+ import { serverVersion } from './config.js';
3
5
  import { startHttpServer } from './http-native.js';
4
6
  import { startStdioServer } from './mcp.js';
5
7
  import { logError } from './observability.js';
6
- const { values } = parseArgs({
7
- options: {
8
- stdio: { type: 'boolean', default: false },
9
- },
10
- });
8
+ function printUsage() {
9
+ process.stdout.write([
10
+ 'superfetch MCP server',
11
+ '',
12
+ 'Usage:',
13
+ ' superfetch [--stdio] [--help] [--version]',
14
+ '',
15
+ 'Options:',
16
+ ' --stdio Run in stdio mode (no HTTP server).',
17
+ ' --help Show this help message.',
18
+ ' --version Show server version.',
19
+ '',
20
+ ].join('\n'));
21
+ }
22
+ const FORCE_EXIT_TIMEOUT_MS = 10_000;
23
+ let forcedExitTimer;
24
+ function scheduleForcedExit(reason) {
25
+ if (forcedExitTimer)
26
+ return;
27
+ forcedExitTimer = setTimeout(() => {
28
+ process.stderr.write(`${reason}; forcing exit.\n`);
29
+ process.exit(1);
30
+ }, FORCE_EXIT_TIMEOUT_MS);
31
+ forcedExitTimer.unref();
32
+ }
33
+ let values;
34
+ try {
35
+ ({ values } = parseArgs({
36
+ options: {
37
+ stdio: { type: 'boolean', default: false },
38
+ help: { type: 'boolean', default: false },
39
+ version: { type: 'boolean', default: false },
40
+ },
41
+ }));
42
+ }
43
+ catch (error) {
44
+ const message = error instanceof Error ? error.message : String(error);
45
+ process.stderr.write(`Invalid arguments: ${message}\n\n`);
46
+ printUsage();
47
+ process.exit(1);
48
+ }
49
+ if (values.help) {
50
+ printUsage();
51
+ process.exit(0);
52
+ }
53
+ if (values.version) {
54
+ process.stdout.write(`${serverVersion}\n`);
55
+ process.exit(0);
56
+ }
11
57
  const isStdioMode = values.stdio;
12
58
  let isShuttingDown = false;
13
59
  const shutdownHandlerRef = {};
@@ -21,14 +67,26 @@ function attemptShutdown(signal) {
21
67
  process.stderr.write('Attempting graceful shutdown...\n');
22
68
  void shutdownHandlerRef.current(signal);
23
69
  }
70
+ function registerHttpSignalHandlers() {
71
+ process.once('SIGINT', () => {
72
+ if (shouldAttemptShutdown())
73
+ attemptShutdown('SIGINT');
74
+ });
75
+ process.once('SIGTERM', () => {
76
+ if (shouldAttemptShutdown())
77
+ attemptShutdown('SIGTERM');
78
+ });
79
+ }
24
80
  function handleFatalError(label, error, signal) {
25
81
  logError(label, error);
26
82
  process.stderr.write(`${label}: ${error.message}\n`);
83
+ process.exitCode = 1;
27
84
  if (shouldAttemptShutdown()) {
28
85
  attemptShutdown(signal);
86
+ scheduleForcedExit('Graceful shutdown timed out');
29
87
  return;
30
88
  }
31
- process.exit(1);
89
+ scheduleForcedExit('Fatal error without shutdown handler');
32
90
  }
33
91
  process.on('uncaughtException', (error) => {
34
92
  handleFatalError('Uncaught exception', error, 'UNCAUGHT_EXCEPTION');
@@ -44,11 +102,13 @@ try {
44
102
  else {
45
103
  const { shutdown } = await startHttpServer();
46
104
  shutdownHandlerRef.current = shutdown;
105
+ registerHttpSignalHandlers();
47
106
  }
48
107
  }
49
108
  catch (error) {
50
109
  logError('Failed to start server', error instanceof Error ? error : undefined);
51
110
  const message = error instanceof Error ? error.message : String(error);
52
111
  process.stderr.write(`Failed to start server: ${message}\n`);
53
- process.exit(1);
112
+ process.exitCode = 1;
113
+ scheduleForcedExit('Startup failure');
54
114
  }
@@ -1,44 +1,58 @@
1
- # superFetch Instructions
1
+ # SUPERFETCH INSTRUCTIONS
2
2
 
3
- > **Guidance for the Agent:** These instructions are available as a resource (`internal://instructions`) or prompt (`get-help`). Load them when unsure about tool usage.
3
+ Available as resource (internal://instructions) or prompt (get-help). Load when unsure about tool usage.
4
4
 
5
- ## 1. Core Capability
5
+ ---
6
6
 
7
- - **Domain:** Fetch public web pages and convert HTML to clean, LLM-readable Markdown.
8
- - **Primary Resources:** Markdown content, cached snapshots (`superfetch://cache/...`).
9
- - **Tools:** `fetch-url` (**Read-only**; no write tools exist).
7
+ ## CORE CAPABILITY
10
8
 
11
- ## 2. The "Golden Path" Workflows (Critical)
9
+ - Domain: Fetch public web pages and convert HTML to clean, LLM-readable Markdown.
10
+ - Primary Resources: Markdown content, cached snapshots (superfetch://cache/...).
11
+ - Tools: fetch-url (READ-ONLY; no write tools exist).
12
12
 
13
- ### Workflow A: Standard Fetch
13
+ ---
14
14
 
15
- 1. Call `fetch-url` with `{ "url": "https://..." }`.
16
- 2. Read the `markdown` field from `structuredContent`.
17
- 3. **If truncated** (ends with `...[truncated]`): read the `resource_link` URI to get full content.
18
- > Constraint: Never guess URIs; always use the one returned.
15
+ ## THE “GOLDEN PATH” WORKFLOWS (CRITICAL)
19
16
 
20
- ### Workflow B: Async Execution (Large Sites / Timeouts)
17
+ ### WORKFLOW A: STANDARD FETCH
21
18
 
22
- 1. Call `tools/call` with `task: { ttl: ... }` to start a background fetch.
23
- 2. Poll `tasks/get` until `status` is `completed` or `failed`.
24
- 3. Retrieve result via `tasks/result`.
19
+ - Call fetch-url with: { "url": "https://..." }
20
+ - Read the “markdown” field from “structuredContent”.
21
+ - If truncated (ends with "...[truncated]"): read the "resource_link" URI to get full content.
22
+ NOTE: Never guess URIs; always use the one returned.
25
23
 
26
- ## 3. Tool Nuances & Gotchas
24
+ ### WORKFLOW B: ASYNC EXECUTION (LARGE SITES / TIMEOUTS)
27
25
 
28
- - **`fetch-url`**
29
- - **Purpose:** Fetch a URL and return Markdown.
30
- - **Inputs:** `url` (required; 1–2048 chars; `https?://` only).
31
- - **Side effects:** None (read-only, idempotent). Populates cache automatically.
32
- - **Limits:** Inline content capped at 20,000 chars; larger content offloaded to `superfetch://cache/...`.
33
- - **Blocked targets:** `localhost`, private IPs (`10.x`, `172.16–31.x`, `192.168.x`), cloud metadata endpoints.
26
+ - Call tools/call with task: { ttl: ... } to start a background fetch.
27
+ - Poll tasks/get until status is “completed” or “failed”.
28
+ - Retrieve result via tasks/result.
34
29
 
35
- ## 4. Error Handling Strategy
30
+ ---
36
31
 
37
- - **`VALIDATION_ERROR`:** URL invalid or blocked. **Do not retry.**
38
- - **`FETCH_ERROR`:** Network/upstream failure. **Retry once** with backoff.
39
- - **`queue_full`:** Worker pool busy. Wait briefly, then retry or use Task interface.
32
+ ## TOOL NUANCES & GOTCHAS
40
33
 
41
- ## 5. Resources
34
+ fetch-url
42
35
 
43
- - `internal://config` Current server limits (secrets redacted).
44
- - `superfetch://cache/{key}` Immutable cached snapshots. Re-fetch for fresh content.
36
+ - Purpose: Fetch a URL and return Markdown.
37
+ - Input: { "url": "https://..." }
38
+ - Optional: skipNoiseRemoval (bool, keeps nav/footers), forceRefresh (bool, bypasses cache).
39
+ - Side effects: None (read-only, idempotent). Populates cache automatically.
40
+ - Limits: HTML capped at 10 MB (MAX_HTML_BYTES). Inline content unlimited by default; set MAX_INLINE_CONTENT_CHARS to cap.
41
+ - Blocked: localhost, private IPs (10.x, 172.16–31.x, 192.168.x), cloud metadata endpoints.
42
+ - Quality: Varies by HTML structure. Best with articles/docs. Always verify output.
43
+
44
+ ---
45
+
46
+ ## ERROR HANDLING STRATEGY
47
+
48
+ - VALIDATION_ERROR: URL invalid or blocked. Do not retry.
49
+ - FETCH_ERROR: Network/upstream failure. Retry once with backoff.
50
+ - queue_full: Worker pool busy. Wait briefly, then retry or use Task interface.
51
+
52
+ ---
53
+
54
+ ## RESOURCES
55
+
56
+ - internal://instructions — This document.
57
+ - internal://config — Current server limits (secrets redacted).
58
+ - superfetch://cache/{key} — Immutable cached snapshots. Re-fetch for fresh content.
@@ -0,0 +1,8 @@
1
+ import { BlockList } from 'node:net';
2
+ type IpFamily = 'ipv4' | 'ipv6';
3
+ export declare function createDefaultBlockList(): BlockList;
4
+ export declare function normalizeIpForBlockList(input: string): {
5
+ ip: string;
6
+ family: IpFamily;
7
+ } | null;
8
+ export {};
@@ -0,0 +1,65 @@
1
+ import { BlockList, isIP } from 'node:net';
2
+ function buildIpv4(parts) {
3
+ return parts.join('.');
4
+ }
5
+ function buildIpv6(parts) {
6
+ return parts.map(String).join(':');
7
+ }
8
+ const IPV6_ZERO = buildIpv6([0, 0, 0, 0, 0, 0, 0, 0]);
9
+ const IPV6_LOOPBACK = buildIpv6([0, 0, 0, 0, 0, 0, 0, 1]);
10
+ const IPV6_64_FF9B = buildIpv6(['64', 'ff9b', 0, 0, 0, 0, 0, 0]);
11
+ const IPV6_64_FF9B_1 = buildIpv6(['64', 'ff9b', 1, 0, 0, 0, 0, 0]);
12
+ const IPV6_2001 = buildIpv6(['2001', 0, 0, 0, 0, 0, 0, 0]);
13
+ const IPV6_2002 = buildIpv6(['2002', 0, 0, 0, 0, 0, 0, 0]);
14
+ const IPV6_FC00 = buildIpv6(['fc00', 0, 0, 0, 0, 0, 0, 0]);
15
+ const IPV6_FE80 = buildIpv6(['fe80', 0, 0, 0, 0, 0, 0, 0]);
16
+ const IPV6_FF00 = buildIpv6(['ff00', 0, 0, 0, 0, 0, 0, 0]);
17
+ const BLOCKED_SUBNETS = [
18
+ { subnet: buildIpv4([0, 0, 0, 0]), prefix: 8, family: 'ipv4' },
19
+ { subnet: buildIpv4([10, 0, 0, 0]), prefix: 8, family: 'ipv4' },
20
+ { subnet: buildIpv4([100, 64, 0, 0]), prefix: 10, family: 'ipv4' },
21
+ { subnet: buildIpv4([127, 0, 0, 0]), prefix: 8, family: 'ipv4' },
22
+ { subnet: buildIpv4([169, 254, 0, 0]), prefix: 16, family: 'ipv4' },
23
+ { subnet: buildIpv4([172, 16, 0, 0]), prefix: 12, family: 'ipv4' },
24
+ { subnet: buildIpv4([192, 168, 0, 0]), prefix: 16, family: 'ipv4' },
25
+ { subnet: buildIpv4([224, 0, 0, 0]), prefix: 4, family: 'ipv4' },
26
+ { subnet: buildIpv4([240, 0, 0, 0]), prefix: 4, family: 'ipv4' },
27
+ { subnet: IPV6_ZERO, prefix: 128, family: 'ipv6' },
28
+ { subnet: IPV6_LOOPBACK, prefix: 128, family: 'ipv6' },
29
+ { subnet: IPV6_64_FF9B, prefix: 96, family: 'ipv6' },
30
+ { subnet: IPV6_64_FF9B_1, prefix: 48, family: 'ipv6' },
31
+ { subnet: IPV6_2001, prefix: 32, family: 'ipv6' },
32
+ { subnet: IPV6_2002, prefix: 16, family: 'ipv6' },
33
+ { subnet: IPV6_FC00, prefix: 7, family: 'ipv6' },
34
+ { subnet: IPV6_FE80, prefix: 10, family: 'ipv6' },
35
+ { subnet: IPV6_FF00, prefix: 8, family: 'ipv6' },
36
+ ];
37
+ export function createDefaultBlockList() {
38
+ const list = new BlockList();
39
+ for (const entry of BLOCKED_SUBNETS) {
40
+ list.addSubnet(entry.subnet, entry.prefix, entry.family);
41
+ }
42
+ return list;
43
+ }
44
+ function extractMappedIpv4(ip) {
45
+ const prefix = '::ffff:';
46
+ if (!ip.startsWith(prefix))
47
+ return null;
48
+ const mapped = ip.slice(prefix.length);
49
+ return isIP(mapped) === 4 ? mapped : null;
50
+ }
51
+ export function normalizeIpForBlockList(input) {
52
+ const trimmed = input.trim().toLowerCase();
53
+ if (!trimmed)
54
+ return null;
55
+ const ipType = isIP(trimmed);
56
+ if (ipType === 4)
57
+ return { ip: trimmed, family: 'ipv4' };
58
+ if (ipType === 6) {
59
+ const mapped = extractMappedIpv4(trimmed);
60
+ if (mapped)
61
+ return { ip: mapped, family: 'ipv4' };
62
+ return { ip: trimmed, family: 'ipv6' };
63
+ }
64
+ return null;
65
+ }
package/dist/json.js CHANGED
@@ -7,21 +7,26 @@ function processValue(obj, depth, seen) {
7
7
  if (depth > MAX_DEPTH) {
8
8
  throw new Error(`stableStringify: Max depth (${MAX_DEPTH}) exceeded`);
9
9
  }
10
- // Cycle detection
10
+ // Cycle detection (track active recursion stack only).
11
11
  if (seen.has(obj)) {
12
12
  throw new Error('stableStringify: Circular reference detected');
13
13
  }
14
14
  seen.add(obj);
15
- if (Array.isArray(obj)) {
16
- return obj.map((item) => processValue(item, depth + 1, seen));
15
+ try {
16
+ if (Array.isArray(obj)) {
17
+ return obj.map((item) => processValue(item, depth + 1, seen));
18
+ }
19
+ const keys = Object.keys(obj).sort((a, b) => a.localeCompare(b));
20
+ const record = obj;
21
+ const sortedObj = {};
22
+ for (const key of keys) {
23
+ sortedObj[key] = processValue(record[key], depth + 1, seen);
24
+ }
25
+ return sortedObj;
17
26
  }
18
- const keys = Object.keys(obj).sort((a, b) => a.localeCompare(b));
19
- const record = obj;
20
- const sortedObj = {};
21
- for (const key of keys) {
22
- sortedObj[key] = processValue(record[key], depth + 1, seen);
27
+ finally {
28
+ seen.delete(obj);
23
29
  }
24
- return sortedObj;
25
30
  }
26
31
  export function stableStringify(obj, depth = 0, seen = new WeakSet()) {
27
32
  const processed = processValue(obj, depth, seen);
@@ -1,12 +1,3 @@
1
- /**
2
- * Language detection for code blocks.
3
- * Detects programming languages from code content and HTML attributes.
4
- */
5
- /**
6
- * Detect programming language from code content using heuristics.
7
- */
8
- export declare function detectLanguageFromCode(code: string): string | undefined;
9
- /**
10
- * Resolve language from HTML attributes (class name and data-language).
11
- */
1
+ export declare function extractLanguageFromClassName(className: string): string | undefined;
12
2
  export declare function resolveLanguageFromAttributes(className: string, dataLang: string): string | undefined;
3
+ export declare function detectLanguageFromCode(code: string): string | undefined;