crawlforge-mcp-server 3.0.18 → 3.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. package/package.json +5 -2
  2. package/server.js +192 -1277
  3. package/src/core/ActionExecutor.js +2 -43
  4. package/src/core/AuthManager.js +127 -14
  5. package/src/core/BrowserContextPool.js +187 -0
  6. package/src/core/JobManager.js +7 -5
  7. package/src/core/LocalizationManager.js +14 -125
  8. package/src/core/StealthBrowserManager.js +26 -18
  9. package/src/core/cache/CacheManager.js +4 -1
  10. package/src/core/crawlers/BFSCrawler.js +19 -5
  11. package/src/observability/metrics.js +137 -0
  12. package/src/observability/tracing.js +74 -0
  13. package/src/server/auth/oauth.js +388 -0
  14. package/src/server/registerTool.js +41 -0
  15. package/src/server/schemas/common.js +29 -0
  16. package/src/server/transports/http.js +22 -0
  17. package/src/server/transports/stdio.js +16 -0
  18. package/src/server/transports/streamableHttp.js +226 -0
  19. package/src/server/withAuth.js +121 -0
  20. package/src/tools/advanced/BatchScrapeTool.js +12 -1086
  21. package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
  22. package/src/tools/advanced/batchScrape/index.js +328 -0
  23. package/src/tools/advanced/batchScrape/queue.js +91 -0
  24. package/src/tools/advanced/batchScrape/reporter.js +26 -0
  25. package/src/tools/advanced/batchScrape/schema.js +37 -0
  26. package/src/tools/advanced/batchScrape/worker.js +179 -0
  27. package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
  28. package/src/tools/basic/_fetch.js +35 -0
  29. package/src/tools/basic/extractLinks.js +74 -0
  30. package/src/tools/basic/extractMetadata.js +74 -0
  31. package/src/tools/basic/extractText.js +46 -0
  32. package/src/tools/basic/fetchUrl.js +44 -0
  33. package/src/tools/basic/scrapeStructured.js +58 -0
  34. package/src/tools/crawl/_sessionContext.js +234 -0
  35. package/src/tools/crawl/crawlDeep.js +55 -5
  36. package/src/tools/crawl/mapSite.js +23 -2
  37. package/src/tools/extract/_fetchAndParse.js +57 -0
  38. package/src/tools/extract/extractStructured.js +3 -19
  39. package/src/tools/extract/extractWithLlm.js +295 -0
  40. package/src/tools/search/providers/searxng.js +126 -0
  41. package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
  42. package/src/tools/search/ranking/ResultRanker.js +17 -10
  43. package/src/tools/search/ranking/SearchResultCache.js +52 -0
  44. package/src/tools/search/searchWeb.js +112 -6
  45. package/src/tools/tracking/trackChanges/differ.js +98 -0
  46. package/src/tools/tracking/trackChanges/index.js +432 -0
  47. package/src/tools/tracking/trackChanges/monitor.js +93 -0
  48. package/src/tools/tracking/trackChanges/notifier.js +105 -0
  49. package/src/tools/tracking/trackChanges/schema.js +127 -0
  50. package/src/tools/tracking/trackChanges.js +12 -1374
@@ -0,0 +1,226 @@
1
+ /**
2
+ * Streamable HTTP transport (MCP spec 2025-06-18).
3
+ *
4
+ * Single endpoint at /mcp:
5
+ * - POST /mcp — JSON-RPC request, response as JSON or SSE stream
6
+ * - GET /mcp — SSE stream for server → client notifications
7
+ * - DELETE /mcp — terminate session
8
+ *
9
+ * Session resumption:
10
+ * - Server generates a session id and returns it as `Mcp-Session-Id` on init
11
+ * - Clients re-send `Mcp-Session-Id` on subsequent requests to resume state
12
+ *
13
+ * Auth:
14
+ * - Bearer / X-API-Key required per request (creator mode bypasses)
15
+ * - When OAuth is enabled (CRAWLFORGE_OAUTH_ENABLED=true), OAuth bearer
16
+ * tokens are validated by the OAuth provider and mapped server-side to
17
+ * a CrawlForge API key. See src/server/auth/oauth.js.
18
+ *
19
+ * Observability:
20
+ * - GET /metrics returns Prometheus exposition (when observability enabled)
21
+ * - GET /health returns liveness probe
22
+ *
23
+ * Replaces the legacy stateless http.js. Old /mcp endpoint behavior is
24
+ * preserved when CRAWLFORGE_LEGACY_HTTP=true (one-release deprecation window).
25
+ */
26
+
27
import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js';
import { createServer } from 'node:http';
import { randomUUID, timingSafeEqual } from 'node:crypto';
30
+
31
+ const SERVER_VERSION = '3.2.0';
32
+
33
/**
 * Stateful, session-aware Streamable HTTP transport.
 *
 * Connects `server` to a StreamableHTTPServerTransport and starts a Node HTTP
 * server with the following routes (matched on the path, ignoring any query
 * string):
 *   - OPTIONS (any path)                → 204 CORS preflight
 *   - /health                           → liveness JSON
 *   - /metrics                          → Prometheus exposition, 404 when no registry
 *   - /.well-known/mcp/server-card.json → Smithery discovery card
 *   - OAuth routes                      → delegated to `options.oauth` when it matches
 *   - /mcp and /                        → authenticated, then handed to the transport
 *   - anything else                     → 404
 *
 * Any error thrown while serving a request is logged and answered with a 500
 * instead of escaping as an unhandled promise rejection.
 *
 * @param {import('@modelcontextprotocol/sdk/server/mcp.js').McpServer} server
 * @param {import('../../core/AuthManager.js').default} authManager
 * @param {import('../../utils/Logger.js').logger} logger
 * @param {object} [options]
 * @param {number} [options.port=3000]
 * @param {string} [options.host='0.0.0.0'] — interface to bind
 * @param {boolean} [options.legacy=false] — if true, run in stateless mode (3.1 behavior)
 * @param {object} [options.oauth] — OAuth provider (see src/server/auth/oauth.js)
 * @param {object} [options.metrics] — Prometheus registry (see src/observability/metrics.js)
 * @returns {Promise<{transport: StreamableHTTPServerTransport, httpServer: import('node:http').Server}>}
 *   resolves once the HTTP server is listening; rejects if it fails to bind.
 */
export async function connectStreamableHttp(server, authManager, logger, options = {}) {
  const port = options.port ?? 3000;
  const host = options.host ?? '0.0.0.0';
  const legacy = options.legacy === true;
  const oauthProvider = options.oauth ?? null;
  const metrics = options.metrics ?? null;

  // Stateful mode: server generates session ids. Stateless when legacy=true.
  const transport = new StreamableHTTPServerTransport({
    sessionIdGenerator: legacy ? undefined : () => randomUUID()
  });
  await server.connect(transport);

  const mode = legacy ? 'legacy-stateless' : 'streamable-stateful';

  const handleRequest = async (req, res) => {
    // CORS — Smithery + browser-based MCP clients
    res.setHeader('Access-Control-Allow-Origin', '*');
    res.setHeader('Access-Control-Allow-Methods', 'GET, POST, DELETE, OPTIONS');
    res.setHeader('Access-Control-Allow-Headers', 'Content-Type, Mcp-Session-Id, mcp-session-id, Authorization, X-API-Key');
    res.setHeader('Access-Control-Expose-Headers', 'Mcp-Session-Id, mcp-session-id');

    if (req.method === 'OPTIONS') {
      res.writeHead(204);
      res.end();
      return;
    }

    // Route on the path only, so probes such as `/health?ts=1` still match.
    const rawUrl = req.url ?? '';
    const queryStart = rawUrl.indexOf('?');
    const pathname = queryStart === -1 ? rawUrl : rawUrl.slice(0, queryStart);

    // Health probe
    if (pathname === '/health') {
      res.writeHead(200, { 'Content-Type': 'application/json' });
      res.end(JSON.stringify({ status: 'ok', version: SERVER_VERSION, mode }));
      return;
    }

    // Prometheus metrics endpoint
    if (pathname === '/metrics') {
      if (!metrics) {
        res.writeHead(404, { 'Content-Type': 'text/plain' });
        res.end('metrics disabled — set OTEL_SDK_DISABLED=false to enable');
        return;
      }
      try {
        const body = await metrics.render();
        res.writeHead(200, { 'Content-Type': metrics.contentType });
        res.end(body);
      } catch (err) {
        logger.error('metrics render failed', { error: err?.message });
        res.writeHead(500, { 'Content-Type': 'text/plain' });
        res.end('metrics error');
      }
      return;
    }

    // Smithery discovery
    if (pathname === '/.well-known/mcp/server-card.json') {
      res.writeHead(200, { 'Content-Type': 'application/json' });
      res.end(JSON.stringify({
        serverInfo: {
          name: 'crawlforge',
          version: SERVER_VERSION,
          description: 'Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.',
          homepage: 'https://www.crawlforge.dev',
          icon: 'https://www.crawlforge.dev/icon.png'
        },
        transport: { type: 'streamable-http', url: '/mcp' },
        configSchema: {
          type: 'object',
          properties: {
            apiKey: {
              type: 'string',
              title: 'CrawlForge API Key',
              description: 'Your CrawlForge API key. Get one free at https://www.crawlforge.dev/signup (includes 1,000 credits)',
              'x-from': { header: 'x-api-key' }
            }
          },
          required: ['apiKey']
        }
      }));
      return;
    }

    // OAuth 2.1 discovery + endpoints (only if OAuth is enabled).
    // The provider receives the raw URL so it can inspect the query string itself.
    if (oauthProvider?.handle && oauthProvider.matches(req.url, req.method)) {
      await oauthProvider.handle(req, res);
      return;
    }

    // MCP endpoint
    if (pathname === '/mcp' || pathname === '/') {
      // Per-request auth (bypassed in creator mode)
      if (!authManager.isCreatorMode()) {
        const authResult = await authenticateRequest(req, authManager, oauthProvider);
        if (!authResult.ok) {
          logger.warn('Streamable HTTP request rejected', {
            reason: authResult.reason,
            remoteAddress: req.socket?.remoteAddress
          });
          res.writeHead(authResult.status, { 'Content-Type': 'application/json' });
          res.end(JSON.stringify({
            error: authResult.error,
            message: authResult.message
          }));
          return;
        }
      }

      await transport.handleRequest(req, res);
      return;
    }

    res.writeHead(404);
    res.end('Not Found');
  };

  const httpServer = createServer((req, res) => {
    // Error boundary: node:http ignores the promise returned by an async
    // listener, so without this a throw would surface as an unhandled
    // rejection and the client would never receive a response.
    handleRequest(req, res).catch((err) => {
      logger.error('Streamable HTTP request failed', {
        error: err?.message,
        method: req.method,
        url: req.url
      });
      if (!res.headersSent) {
        res.writeHead(500, { 'Content-Type': 'application/json' });
        res.end(JSON.stringify({
          error: 'Internal Server Error',
          message: 'Unexpected error while handling the request.'
        }));
      } else if (!res.writableEnded) {
        // Part of the response is already on the wire; just close it.
        res.end();
      }
    });
  });

  await new Promise((resolve, reject) => {
    // Surface bind failures (EADDRINUSE, EACCES, …) to the caller instead of
    // leaving this promise pending forever.
    const onStartupError = (err) => reject(err);
    httpServer.once('error', onStartupError);

    httpServer.listen(port, host, () => {
      httpServer.off('error', onStartupError);
      const actual = httpServer.address()?.port ?? port;
      console.error(`CrawlForge MCP Server v${SERVER_VERSION} running on Streamable HTTP (${mode}) port ${actual}`);
      console.error(`MCP endpoint: http://localhost:${actual}/mcp`);
      console.error(`Health check: http://localhost:${actual}/health`);
      if (metrics) console.error(`Metrics: http://localhost:${actual}/metrics`);
      if (oauthProvider) console.error(`OAuth: http://localhost:${actual}/.well-known/oauth-authorization-server`);
      resolve();
    });
  });

  return { transport, httpServer };
}
174
+
175
/**
 * Compare two secrets without short-circuiting on the first differing byte.
 *
 * `timingSafeEqual` throws when the buffers differ in length, so the length is
 * checked first. That reveals only the length of the expected key, not its
 * content.
 *
 * @param {string} provided
 * @param {string} expected
 * @returns {boolean}
 */
function secretsMatch(provided, expected) {
  const providedBytes = Buffer.from(provided, 'utf8');
  const expectedBytes = Buffer.from(expected, 'utf8');
  return providedBytes.length === expectedBytes.length
    && timingSafeEqual(providedBytes, expectedBytes);
}

/**
 * Validate a request's credentials.
 *
 * Accepts:
 * - `Authorization: Bearer <crawlforge-api-key>` (legacy static key)
 * - `X-API-Key: <crawlforge-api-key>` (legacy static key)
 * - `Authorization: Bearer <oauth-access-token>` if OAuth is enabled —
 *   the OAuth provider validates the token and maps it to the API key.
 *
 * A Bearer `Authorization` header takes precedence; `X-API-Key` is only read
 * when no Bearer header is present. The static key is compared in constant
 * time so response timing does not leak how much of a guess was correct.
 *
 * @param {import('node:http').IncomingMessage} req
 * @param {object} authManager — must expose `getConfig()`; its `apiKey` is the expected static key
 * @param {object|null} oauthProvider — consulted via `validateBearer(token)` when that method exists
 * @returns {Promise<{ok: true} | {ok: false, status: number, error: string, message: string, reason: string}>}
 */
async function authenticateRequest(req, authManager, oauthProvider) {
  const authHeader = (req.headers['authorization'] || '').toString();
  const apiKeyHeader = (req.headers['x-api-key'] || '').toString();
  const expectedKey = authManager.getConfig()?.apiKey;

  let providedKey = '';
  if (authHeader.toLowerCase().startsWith('bearer ')) {
    providedKey = authHeader.slice(7).trim();
  } else if (apiKeyHeader.length > 0) {
    providedKey = apiKeyHeader.trim();
  }

  if (!providedKey) {
    return {
      ok: false,
      status: 401,
      error: 'Unauthorized',
      message: 'CrawlForge Streamable HTTP transport requires Authorization: Bearer <api-key-or-oauth-token> (or X-API-Key) on every request.',
      reason: 'missing-credentials'
    };
  }

  // Static API key match (constant-time). A missing or non-string configured
  // key can never match, same as the strict-equality check it replaces.
  if (typeof expectedKey === 'string' && expectedKey.length > 0 && secretsMatch(providedKey, expectedKey)) {
    return { ok: true };
  }

  // OAuth token path
  if (oauthProvider && typeof oauthProvider.validateBearer === 'function') {
    const result = await oauthProvider.validateBearer(providedKey);
    if (result?.ok) return { ok: true };
  }

  return {
    ok: false,
    status: 401,
    error: 'Unauthorized',
    message: 'Invalid API key or OAuth token.',
    reason: 'invalid-credentials'
  };
}
@@ -0,0 +1,121 @@
1
+ /**
2
+ * withAuth — wraps a tool handler with authentication, credit tracking,
3
+ * structured invocation logging (audit phase A2), and observability
4
+ * (OpenTelemetry spans + Prometheus counters) added in v3.2.0.
5
+ *
6
+ * Contract:
7
+ * - resolves toolCost once per call
8
+ * - try/finally guarantees a single `tool invocation` log line per call
9
+ * - log payload: { toolName, paramHash, durationMs, outcome, creditCost, creatorMode }
10
+ * - outcome ∈ { 'success' | 'error' | 'insufficient_credits' }
11
+ * - emits an OTel span via src/observability/tracing.js (no-op if disabled)
12
+ * - increments Prometheus counters via src/observability/metrics.js (if registry passed)
13
+ */
14
+
15
+ import { createHash } from 'node:crypto';
16
+ import { recordToolInvocation } from '../observability/tracing.js';
17
+
18
/**
 * Short, non-reversible fingerprint of a tool call's parameters for log lines.
 *
 * Serializes `params` (or `{}` when nullish) as JSON and returns the first
 * 12 hex characters of its SHA-256 digest. If the value cannot be serialized
 * or hashed, the literal string 'unhashable' is returned instead of throwing.
 *
 * @param {*} params
 * @returns {string}
 */
export function hashParams(params) {
  try {
    const serialized = JSON.stringify(params ?? {});
    const hasher = createHash('sha256');
    hasher.update(serialized);
    const hexDigest = hasher.digest('hex');
    return hexDigest.substring(0, 12);
  } catch {
    return 'unhashable';
  }
}
25
+
26
/**
 * Build the `withAuth(toolName, handler)` wrapper used to register tools.
 *
 * The wrapped handler, per call:
 *   1. resolves the tool's credit cost (0 in creator mode);
 *   2. outside creator mode, checks credits and returns an
 *      "Insufficient credits" payload without running the handler if the
 *      check fails;
 *   3. runs the handler and reports usage (full cost / status 200 on success,
 *      half cost with a minimum of 1 / status 500 when the handler throws);
 *   4. always emits one `tool invocation` log line, Prometheus samples (when a
 *      registry was supplied) and an OTel record.
 *
 * Failure handling:
 *   - usage is only reported for a handler that actually ran, and only once
 *     per call;
 *   - if the usage report fails after the handler threw, the handler's own
 *     error is still the one rethrown;
 *   - if the usage report fails after the handler succeeded, that error is
 *     rethrown and the call is logged with outcome 'error';
 *   - logging, metrics and tracing failures are swallowed so they can never
 *     replace the tool's result or error.
 *
 * @param {object} deps
 * @param {object} deps.authManager
 * @param {object} deps.logger
 * @param {object} [deps.metrics] — optional Prometheus registry (see src/observability/metrics.js)
 * @returns {(toolName: string, handler: (params: object) => Promise<object>) => (params: object) => Promise<object>}
 */
export function makeWithAuth({ authManager, logger, metrics = null }) {
  // An exception thrown inside `finally` would replace the in-flight return
  // value or error, so every diagnostics call goes through this guard.
  const quietly = (emit) => {
    try {
      emit();
    } catch {
      // diagnostics must never break the request path
    }
  };

  return function withAuth(toolName, handler) {
    return async (params) => {
      const startTime = Date.now();
      const paramHash = hashParams(params);
      const creatorMode = authManager.isCreatorMode();
      const creditCost = creatorMode ? 0 : authManager.getToolCost(toolName);
      let outcome = 'pending';
      let thrown = null;

      try {
        if (!creatorMode) {
          const hasCredits = await authManager.checkCredits(creditCost);
          if (!hasCredits) {
            outcome = 'insufficient_credits';
            return {
              content: [{
                type: 'text',
                text: JSON.stringify({
                  error: 'Insufficient credits',
                  message: `This operation requires ${creditCost} credits. Please upgrade your plan at https://www.crawlforge.dev/pricing`,
                  creditsRequired: creditCost
                }, null, 2)
              }]
            };
          }
        }

        let result;
        try {
          result = await handler(params);
        } catch (handlerError) {
          // Only a handler failure is billed at the reduced rate. Errors from
          // the credit check or from the success-path usage report never
          // reach this branch, so usage is not reported twice.
          if (!creatorMode) {
            try {
              await authManager.reportUsage(
                toolName,
                Math.max(1, Math.floor(creditCost * 0.5)),
                params,
                500,
                Date.now() - startTime
              );
            } catch (reportError) {
              // Keep the handler's error as the one callers see.
              quietly(() => logger.warn?.('usage report failed', {
                toolName,
                paramHash,
                error: reportError?.message
              }));
            }
          }
          throw handlerError;
        }

        outcome = 'success';

        if (!creatorMode) {
          await authManager.reportUsage(toolName, creditCost, params, 200, Date.now() - startTime);
        }

        return result;
      } catch (error) {
        outcome = 'error';
        thrown = error;
        throw error;
      } finally {
        const durationMs = Date.now() - startTime;
        quietly(() => logger.info('tool invocation', {
          toolName,
          paramHash,
          durationMs,
          outcome,
          creditCost,
          creatorMode
        }));

        // Prometheus (no-op unless registry was supplied)
        if (metrics) {
          quietly(() => {
            metrics.incCounter('crawlforge_tool_requests_total', { tool: toolName, outcome });
            if (outcome === 'error') {
              metrics.incCounter('crawlforge_tool_errors_total', {
                tool: toolName,
                error_class: thrown?.name ?? 'Error'
              });
            }
            metrics.observeHistogram('crawlforge_tool_duration_ms', { tool: toolName }, durationMs);
            if (outcome === 'success' && creditCost > 0) {
              metrics.incCounter('crawlforge_credits_consumed_total', { tool: toolName }, creditCost);
            }
          });
        }

        // OpenTelemetry (no-op when OTEL_SDK_DISABLED !== 'false')
        quietly(() => recordToolInvocation(toolName, {
          duration_ms: durationMs,
          outcome,
          credit_cost: creditCost,
          creator_mode: creatorMode
        }, thrown));
      }
    };
  };
}