crawlforge-mcp-server 3.0.18 → 3.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +5 -2
- package/server.js +192 -1277
- package/src/core/ActionExecutor.js +2 -43
- package/src/core/AuthManager.js +127 -14
- package/src/core/BrowserContextPool.js +187 -0
- package/src/core/JobManager.js +7 -5
- package/src/core/LocalizationManager.js +14 -125
- package/src/core/StealthBrowserManager.js +26 -18
- package/src/core/cache/CacheManager.js +4 -1
- package/src/core/crawlers/BFSCrawler.js +19 -5
- package/src/observability/metrics.js +137 -0
- package/src/observability/tracing.js +74 -0
- package/src/server/auth/oauth.js +388 -0
- package/src/server/registerTool.js +41 -0
- package/src/server/schemas/common.js +29 -0
- package/src/server/transports/http.js +22 -0
- package/src/server/transports/stdio.js +16 -0
- package/src/server/transports/streamableHttp.js +226 -0
- package/src/server/withAuth.js +121 -0
- package/src/tools/advanced/BatchScrapeTool.js +12 -1086
- package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
- package/src/tools/advanced/batchScrape/index.js +328 -0
- package/src/tools/advanced/batchScrape/queue.js +91 -0
- package/src/tools/advanced/batchScrape/reporter.js +26 -0
- package/src/tools/advanced/batchScrape/schema.js +37 -0
- package/src/tools/advanced/batchScrape/worker.js +179 -0
- package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
- package/src/tools/basic/_fetch.js +35 -0
- package/src/tools/basic/extractLinks.js +74 -0
- package/src/tools/basic/extractMetadata.js +74 -0
- package/src/tools/basic/extractText.js +46 -0
- package/src/tools/basic/fetchUrl.js +44 -0
- package/src/tools/basic/scrapeStructured.js +58 -0
- package/src/tools/crawl/_sessionContext.js +234 -0
- package/src/tools/crawl/crawlDeep.js +55 -5
- package/src/tools/crawl/mapSite.js +23 -2
- package/src/tools/extract/_fetchAndParse.js +57 -0
- package/src/tools/extract/extractStructured.js +3 -19
- package/src/tools/extract/extractWithLlm.js +365 -0
- package/src/tools/search/providers/searxng.js +126 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
- package/src/tools/search/ranking/ResultRanker.js +17 -10
- package/src/tools/search/ranking/SearchResultCache.js +52 -0
- package/src/tools/search/searchWeb.js +112 -6
- package/src/tools/tracking/trackChanges/differ.js +98 -0
- package/src/tools/tracking/trackChanges/index.js +432 -0
- package/src/tools/tracking/trackChanges/monitor.js +93 -0
- package/src/tools/tracking/trackChanges/notifier.js +105 -0
- package/src/tools/tracking/trackChanges/schema.js +127 -0
- package/src/tools/tracking/trackChanges.js +12 -1374
|
@@ -0,0 +1,226 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Streamable HTTP transport (MCP spec 2025-06-18).
|
|
3
|
+
*
|
|
4
|
+
* Single endpoint at /mcp:
|
|
5
|
+
* - POST /mcp — JSON-RPC request, response as JSON or SSE stream
|
|
6
|
+
* - GET /mcp — SSE stream for server → client notifications
|
|
7
|
+
* - DELETE /mcp — terminate session
|
|
8
|
+
*
|
|
9
|
+
* Session resumption:
|
|
10
|
+
* - Server generates a session id and returns it as `Mcp-Session-Id` on init
|
|
11
|
+
* - Clients re-send `Mcp-Session-Id` on subsequent requests to resume state
|
|
12
|
+
*
|
|
13
|
+
* Auth:
|
|
14
|
+
* - Bearer / X-API-Key required per request (creator mode bypasses)
|
|
15
|
+
* - When OAuth is enabled (CRAWLFORGE_OAUTH_ENABLED=true), OAuth bearer
|
|
16
|
+
* tokens are validated by the OAuth provider and mapped server-side to
|
|
17
|
+
* a CrawlForge API key. See src/server/auth/oauth.js.
|
|
18
|
+
*
|
|
19
|
+
* Observability:
|
|
20
|
+
* - GET /metrics returns Prometheus exposition (when observability enabled)
|
|
21
|
+
* - GET /health returns liveness probe
|
|
22
|
+
*
|
|
23
|
+
* Replaces the legacy stateless http.js. Old /mcp endpoint behavior is
|
|
24
|
+
* preserved when CRAWLFORGE_LEGACY_HTTP=true (one-release deprecation window).
|
|
25
|
+
*/
|
|
26
|
+
|
|
27
|
+
import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js';
import { createServer } from 'node:http';
import { randomUUID, timingSafeEqual } from 'node:crypto';
|
|
30
|
+
|
|
31
|
+
// Version string reported by GET /health, the Smithery server card, and the
// startup banner printed once the HTTP server is listening.
// NOTE(review): the package diff this file ships in is labelled 3.0.18 → 3.4.0;
// confirm whether this literal is meant to stay at 3.2.0.
const SERVER_VERSION = '3.2.0';
|
|
32
|
+
|
|
33
|
+
/**
 * Stateful, session-aware Streamable HTTP transport.
 *
 * Connects `server` to one StreamableHTTPServerTransport and then starts a
 * node:http server that routes, in this order:
 *   - OPTIONS (any path)                  → 204, CORS preflight
 *   - /health                             → liveness JSON
 *   - /metrics                            → Prometheus exposition (404 if no registry)
 *   - /.well-known/mcp/server-card.json   → Smithery discovery card
 *   - whatever `options.oauth.matches()`  → delegated to `options.oauth.handle()`
 *   - /mcp, / and /mcp?…                  → MCP traffic (authenticated unless creator mode)
 *   - anything else                       → 404
 *
 * @param {import('@modelcontextprotocol/sdk/server/mcp.js').McpServer} server
 * @param {import('../../core/AuthManager.js').default} authManager
 * @param {import('../../utils/Logger.js').logger} logger
 * @param {object} [options]
 * @param {number} [options.port=3000]
 * @param {string} [options.host='0.0.0.0']
 * @param {boolean} [options.legacy=false] — if true, run in stateless mode (3.1 behavior)
 * @param {object} [options.oauth] — OAuth provider (see src/server/auth/oauth.js)
 * @param {object} [options.metrics] — Prometheus registry (see src/observability/metrics.js)
 * @returns {Promise<{transport: StreamableHTTPServerTransport, httpServer: import('node:http').Server}>}
 *   Resolves once the HTTP server is listening. Rejects if the socket cannot
 *   be bound (for example EADDRINUSE) or if `server.connect` fails.
 */
export async function connectStreamableHttp(server, authManager, logger, options = {}) {
  const port = options.port ?? 3000;
  const host = options.host ?? '0.0.0.0';
  const legacy = options.legacy === true;
  const oauthProvider = options.oauth ?? null;
  const metrics = options.metrics ?? null;

  // Stateful mode: server generates session ids. Stateless when legacy=true.
  const transport = new StreamableHTTPServerTransport({
    sessionIdGenerator: legacy ? undefined : () => randomUUID()
  });
  await server.connect(transport);

  const mode = legacy ? 'legacy-stateless' : 'streamable-stateful';

  // Routes a single request. May reject; the listener below turns that into a 500.
  const route = async (req, res) => {
    // CORS — Smithery + browser-based MCP clients
    res.setHeader('Access-Control-Allow-Origin', '*');
    res.setHeader('Access-Control-Allow-Methods', 'GET, POST, DELETE, OPTIONS');
    res.setHeader('Access-Control-Allow-Headers', 'Content-Type, Mcp-Session-Id, mcp-session-id, Authorization, X-API-Key');
    res.setHeader('Access-Control-Expose-Headers', 'Mcp-Session-Id, mcp-session-id');

    if (req.method === 'OPTIONS') {
      res.writeHead(204);
      res.end();
      return;
    }

    // Health probe
    if (req.url === '/health') {
      res.writeHead(200, { 'Content-Type': 'application/json' });
      res.end(JSON.stringify({ status: 'ok', version: SERVER_VERSION, mode }));
      return;
    }

    // Prometheus metrics endpoint
    if (req.url === '/metrics') {
      if (!metrics) {
        res.writeHead(404, { 'Content-Type': 'text/plain' });
        res.end('metrics disabled — set OTEL_SDK_DISABLED=false to enable');
        return;
      }
      try {
        const body = await metrics.render();
        res.writeHead(200, { 'Content-Type': metrics.contentType });
        res.end(body);
      } catch (err) {
        logger.error('metrics render failed', { error: err?.message });
        res.writeHead(500, { 'Content-Type': 'text/plain' });
        res.end('metrics error');
      }
      return;
    }

    // Smithery discovery
    if (req.url === '/.well-known/mcp/server-card.json') {
      res.writeHead(200, { 'Content-Type': 'application/json' });
      res.end(JSON.stringify({
        serverInfo: {
          name: 'crawlforge',
          version: SERVER_VERSION,
          description: 'Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.',
          homepage: 'https://www.crawlforge.dev',
          icon: 'https://www.crawlforge.dev/icon.png'
        },
        transport: { type: 'streamable-http', url: '/mcp' },
        configSchema: {
          type: 'object',
          properties: {
            apiKey: {
              type: 'string',
              title: 'CrawlForge API Key',
              description: 'Your CrawlForge API key. Get one free at https://www.crawlforge.dev/signup (includes 1,000 credits)',
              'x-from': { header: 'x-api-key' }
            }
          },
          required: ['apiKey']
        }
      }));
      return;
    }

    // OAuth 2.1 discovery + endpoints (only if OAuth is enabled)
    if (oauthProvider && oauthProvider.handle && oauthProvider.matches(req.url, req.method)) {
      await oauthProvider.handle(req, res);
      return;
    }

    // MCP endpoint
    if (req.url === '/mcp' || req.url === '/' || req.url?.startsWith('/mcp?')) {
      // Per-request auth (bypassed in creator mode)
      if (!authManager.isCreatorMode()) {
        const authResult = await authenticateRequest(req, authManager, oauthProvider);
        if (!authResult.ok) {
          logger.warn('Streamable HTTP request rejected', {
            reason: authResult.reason,
            remoteAddress: req.socket?.remoteAddress
          });
          res.writeHead(authResult.status, { 'Content-Type': 'application/json' });
          res.end(JSON.stringify({
            error: authResult.error,
            message: authResult.message
          }));
          return;
        }
      }

      await transport.handleRequest(req, res);
      return;
    }

    res.writeHead(404);
    res.end('Not Found');
  };

  const httpServer = createServer(async (req, res) => {
    try {
      await route(req, res);
    } catch (err) {
      // node:http ignores the promise returned by an async listener, so a
      // rejection here would otherwise be an unhandled rejection and the
      // client would be left waiting on a response that never comes.
      logger.error('Streamable HTTP request failed', {
        error: err?.message,
        method: req.method
      });
      if (!res.headersSent) {
        res.writeHead(500, { 'Content-Type': 'application/json' });
        res.end(JSON.stringify({
          error: 'Internal Server Error',
          message: 'Request handling failed.'
        }));
      } else if (!res.writableEnded) {
        // Headers already went out (e.g. a started stream): just close it.
        res.end();
      }
    }
  });

  await new Promise((resolve, reject) => {
    // Bind failures are emitted as 'error' and never reach the listen
    // callback; surface them to the caller instead of crashing the process.
    const onListenError = (err) => reject(err);
    httpServer.once('error', onListenError);

    httpServer.listen(port, host, () => {
      // Only startup errors are routed into this promise.
      httpServer.off('error', onListenError);

      const actual = httpServer.address()?.port ?? port;
      console.error(`CrawlForge MCP Server v${SERVER_VERSION} running on Streamable HTTP (${mode}) port ${actual}`);
      console.error(`MCP endpoint: http://localhost:${actual}/mcp`);
      console.error(`Health check: http://localhost:${actual}/health`);
      if (metrics) console.error(`Metrics: http://localhost:${actual}/metrics`);
      if (oauthProvider) console.error(`OAuth: http://localhost:${actual}/.well-known/oauth-authorization-server`);
      resolve();
    });
  });

  return { transport, httpServer };
}
|
|
174
|
+
|
|
175
|
+
/**
 * Validate a request's credentials.
 *
 * Accepts:
 *   - `Authorization: Bearer <crawlforge-api-key>` (legacy static key)
 *   - `X-API-Key: <crawlforge-api-key>` (legacy static key)
 *   - `Authorization: Bearer <oauth-access-token>` if OAuth is enabled —
 *     the OAuth provider validates the token and maps it to the API key.
 *
 * A non-empty bearer token takes precedence; `X-API-Key` is used when there is
 * no bearer token (including a `Bearer` header whose token is blank).
 *
 * @param {{headers: Record<string, string | string[] | undefined>}} req
 *   Incoming request; only `req.headers` is read.
 * @param {{getConfig: () => ({apiKey?: string} | null | undefined)}} authManager
 *   Source of the configured static API key.
 * @param {{validateBearer?: (token: string) => Promise<{ok?: boolean} | null | undefined>} | null} oauthProvider
 *   Optional OAuth provider; consulted only when the static key does not match.
 * @returns {Promise<{ok: true} | {ok: false, status: number, error: string, message: string, reason: string}>}
 */
async function authenticateRequest(req, authManager, oauthProvider) {
  const authHeader = (req.headers['authorization'] || '').toString();
  const apiKeyHeader = (req.headers['x-api-key'] || '').toString();
  const expectedKey = authManager.getConfig()?.apiKey;

  // 'bearer '.length === 7; the scheme name is matched case-insensitively.
  const bearerToken = authHeader.toLowerCase().startsWith('bearer ')
    ? authHeader.slice(7).trim()
    : '';
  // An empty bearer token must not shadow a usable X-API-Key header.
  const providedKey = bearerToken || apiKeyHeader.trim();

  if (!providedKey) {
    return {
      ok: false,
      status: 401,
      error: 'Unauthorized',
      message: 'CrawlForge Streamable HTTP transport requires Authorization: Bearer <api-key-or-oauth-token> (or X-API-Key) on every request.',
      reason: 'missing-credentials'
    };
  }

  // Static API key match (constant-time, see secretsEqual)
  if (typeof expectedKey === 'string' && expectedKey.length > 0 && secretsEqual(providedKey, expectedKey)) {
    return { ok: true };
  }

  // OAuth token path
  if (oauthProvider && typeof oauthProvider.validateBearer === 'function') {
    const result = await oauthProvider.validateBearer(providedKey);
    if (result?.ok) return { ok: true };
  }

  return {
    ok: false,
    status: 401,
    error: 'Unauthorized',
    message: 'Invalid API key or OAuth token.',
    reason: 'invalid-credentials'
  };
}

/**
 * Compare two secret strings without an early exit on the first differing
 * byte, so response timing does not reveal how much of a guess was correct.
 *
 * @param {string} provided
 * @param {string} expected
 * @returns {boolean} true only when both strings encode to identical bytes
 */
function secretsEqual(provided, expected) {
  const providedBytes = Buffer.from(provided, 'utf8');
  const expectedBytes = Buffer.from(expected, 'utf8');
  // timingSafeEqual throws on inputs of different length, so gate on length.
  return providedBytes.length === expectedBytes.length
    && timingSafeEqual(providedBytes, expectedBytes);
}
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* withAuth — wraps a tool handler with authentication, credit tracking,
|
|
3
|
+
* structured invocation logging (audit phase A2), and observability
|
|
4
|
+
* (OpenTelemetry spans + Prometheus counters) added in v3.2.0.
|
|
5
|
+
*
|
|
6
|
+
* Contract:
|
|
7
|
+
* - resolves toolCost once per call
|
|
8
|
+
* - try/finally guarantees a single `tool invocation` log line per call
|
|
9
|
+
* - log payload: { toolName, paramHash, durationMs, outcome, creditCost, creatorMode }
|
|
10
|
+
* - outcome ∈ { 'success' | 'error' | 'insufficient_credits' }
|
|
11
|
+
* - emits an OTel span via src/observability/tracing.js (no-op if disabled)
|
|
12
|
+
* - increments Prometheus counters via src/observability/metrics.js (if registry passed)
|
|
13
|
+
*/
|
|
14
|
+
|
|
15
|
+
import { createHash } from 'node:crypto';
|
|
16
|
+
import { recordToolInvocation } from '../observability/tracing.js';
|
|
17
|
+
|
|
18
|
+
/**
 * Short, non-reversible fingerprint of a tool call's parameters for logs.
 *
 * Serializes `params` (or `{}` when it is null/undefined) with
 * JSON.stringify, hashes the text with SHA-256 and keeps the first 12 hex
 * characters. Anything that cannot be serialized or hashed (circular
 * structures, BigInt values, ...) yields the literal 'unhashable'.
 *
 * @param {unknown} params
 * @returns {string} 12 hex characters, or 'unhashable'
 */
export function hashParams(params) {
  const PREFIX_LENGTH = 12;
  try {
    const serialized = JSON.stringify(params ?? {});
    const hasher = createHash('sha256');
    hasher.update(serialized);
    return hasher.digest('hex').substring(0, PREFIX_LENGTH);
  } catch {
    return 'unhashable';
  }
}
|
|
25
|
+
|
|
26
|
+
/**
 * Build the `withAuth(toolName, handler)` wrapper factory.
 *
 * The wrapped handler, per call:
 *   1. resolves the credit cost once (0 in creator mode),
 *   2. outside creator mode, checks credits and returns an
 *      "Insufficient credits" payload without running the handler when the
 *      check fails,
 *   3. runs the handler and reports usage (status 200, full cost) on success,
 *   4. on a thrown error reports usage (status 500, half cost with a minimum
 *      of 1) and rethrows the original error,
 *   5. always emits one `tool invocation` log line, then the optional
 *      Prometheus metrics and the tracing record.
 *
 * Metrics and tracing failures are deliberately contained so they cannot
 * replace the tool's result or error.
 *
 * @param {object} deps
 * @param {object} deps.authManager
 * @param {object} deps.logger
 * @param {object} [deps.metrics] — optional Prometheus registry (see src/observability/metrics.js)
 * @returns {(toolName: string, handler: (params: object) => Promise<any>) => (params: object) => Promise<any>}
 */
export function makeWithAuth({ authManager, logger, metrics = null }) {
  return function withAuth(toolName, handler) {
    return async (params) => {
      const startTime = Date.now();
      const paramHash = hashParams(params);
      const creatorMode = authManager.isCreatorMode();
      const creditCost = creatorMode ? 0 : authManager.getToolCost(toolName);
      let outcome = 'pending';
      let thrown = null;

      try {
        if (!creatorMode) {
          const hasCredits = await authManager.checkCredits(creditCost);
          if (!hasCredits) {
            outcome = 'insufficient_credits';
            return {
              content: [{
                type: 'text',
                text: JSON.stringify({
                  error: 'Insufficient credits',
                  message: `This operation requires ${creditCost} credits. Please upgrade your plan at https://www.crawlforge.dev/pricing`,
                  creditsRequired: creditCost
                }, null, 2)
              }]
            };
          }
        }

        const result = await handler(params);
        outcome = 'success';

        if (!creatorMode) {
          await authManager.reportUsage(toolName, creditCost, params, 200, Date.now() - startTime);
        }

        return result;
      } catch (error) {
        outcome = 'error';
        thrown = error;
        if (!creatorMode) {
          try {
            await authManager.reportUsage(
              toolName,
              Math.max(1, Math.floor(creditCost * 0.5)),
              params,
              500,
              Date.now() - startTime
            );
          } catch (reportError) {
            // A failed usage report must not replace the error the caller
            // actually needs to see; note it and fall through to the rethrow.
            logger.warn?.('usage report failed', {
              toolName,
              paramHash,
              error: reportError?.message
            });
          }
        }
        throw error;
      } finally {
        const durationMs = Date.now() - startTime;
        logger.info('tool invocation', {
          toolName,
          paramHash,
          durationMs,
          outcome,
          creditCost,
          creatorMode
        });

        // Prometheus (no-op unless registry was supplied)
        if (metrics) {
          try {
            metrics.incCounter('crawlforge_tool_requests_total', { tool: toolName, outcome });
            if (outcome === 'error') {
              metrics.incCounter('crawlforge_tool_errors_total', {
                tool: toolName,
                error_class: thrown?.name ?? 'Error'
              });
            }
            metrics.observeHistogram('crawlforge_tool_duration_ms', { tool: toolName }, durationMs);
            if (outcome === 'success' && creditCost > 0) {
              metrics.incCounter('crawlforge_credits_consumed_total', { tool: toolName }, creditCost);
            }
          } catch {
            // metrics must never break the request path
          }
        }

        // OpenTelemetry (no-op when OTEL_SDK_DISABLED !== 'false')
        try {
          recordToolInvocation(toolName, {
            duration_ms: durationMs,
            outcome,
            credit_cost: creditCost,
            creator_mode: creatorMode
          }, thrown);
        } catch {
          // A throw inside `finally` would override the handler's result or
          // error, so tracing gets the same containment as metrics.
        }
      }
    };
  };
}
|