crawlforge-mcp-server 3.0.17 → 3.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +2 -0
- package/README.md +1 -0
- package/package.json +6 -2
- package/server.js +192 -1277
- package/src/constants/config.js +2 -1
- package/src/core/ActionExecutor.js +2 -43
- package/src/core/AuthManager.js +230 -32
- package/src/core/BrowserContextPool.js +187 -0
- package/src/core/JobManager.js +7 -5
- package/src/core/LocalizationManager.js +14 -125
- package/src/core/ResearchOrchestrator.js +86 -5
- package/src/core/StealthBrowserManager.js +26 -18
- package/src/core/cache/CacheManager.js +4 -1
- package/src/core/crawlers/BFSCrawler.js +19 -5
- package/src/core/endpointGuard.js +37 -0
- package/src/observability/metrics.js +137 -0
- package/src/observability/tracing.js +74 -0
- package/src/server/auth/oauth.js +388 -0
- package/src/server/registerTool.js +41 -0
- package/src/server/schemas/common.js +29 -0
- package/src/server/transports/http.js +22 -0
- package/src/server/transports/stdio.js +16 -0
- package/src/server/transports/streamableHttp.js +226 -0
- package/src/server/withAuth.js +121 -0
- package/src/tools/advanced/BatchScrapeTool.js +12 -1086
- package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
- package/src/tools/advanced/batchScrape/index.js +328 -0
- package/src/tools/advanced/batchScrape/queue.js +91 -0
- package/src/tools/advanced/batchScrape/reporter.js +26 -0
- package/src/tools/advanced/batchScrape/schema.js +37 -0
- package/src/tools/advanced/batchScrape/worker.js +179 -0
- package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
- package/src/tools/basic/_fetch.js +35 -0
- package/src/tools/basic/extractLinks.js +74 -0
- package/src/tools/basic/extractMetadata.js +74 -0
- package/src/tools/basic/extractText.js +46 -0
- package/src/tools/basic/fetchUrl.js +44 -0
- package/src/tools/basic/scrapeStructured.js +58 -0
- package/src/tools/crawl/_sessionContext.js +234 -0
- package/src/tools/crawl/crawlDeep.js +55 -5
- package/src/tools/crawl/mapSite.js +23 -2
- package/src/tools/extract/_fetchAndParse.js +57 -0
- package/src/tools/extract/extractStructured.js +3 -19
- package/src/tools/extract/extractWithLlm.js +295 -0
- package/src/tools/research/deepResearch.js +33 -8
- package/src/tools/search/providers/searxng.js +126 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
- package/src/tools/search/ranking/ResultRanker.js +17 -10
- package/src/tools/search/ranking/SearchResultCache.js +52 -0
- package/src/tools/search/searchWeb.js +112 -6
- package/src/tools/tracking/trackChanges/differ.js +98 -0
- package/src/tools/tracking/trackChanges/index.js +432 -0
- package/src/tools/tracking/trackChanges/monitor.js +93 -0
- package/src/tools/tracking/trackChanges/notifier.js +105 -0
- package/src/tools/tracking/trackChanges/schema.js +127 -0
- package/src/tools/tracking/trackChanges.js +12 -1374
package/server.js
CHANGED
|
@@ -6,12 +6,8 @@ export { isCreatorModeVerified } from './src/core/creatorMode.js';
|
|
|
6
6
|
|
|
7
7
|
// Import everything else
|
|
8
8
|
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
|
|
9
|
-
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
|
|
10
|
-
import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
|
|
11
|
-
import { createServer } from "node:http";
|
|
12
|
-
import { randomUUID } from "node:crypto";
|
|
13
9
|
import { z } from "zod";
|
|
14
|
-
import {
|
|
10
|
+
import { logger } from "./src/utils/Logger.js";
|
|
15
11
|
import { SearchWebTool } from "./src/tools/search/searchWeb.js";
|
|
16
12
|
import { CrawlDeepTool } from "./src/tools/crawl/crawlDeep.js";
|
|
17
13
|
import { MapSiteTool } from "./src/tools/crawl/mapSite.js";
|
|
@@ -19,24 +15,33 @@ import { ExtractContentTool } from "./src/tools/extract/extractContent.js";
|
|
|
19
15
|
import { ProcessDocumentTool } from "./src/tools/extract/processDocument.js";
|
|
20
16
|
import { SummarizeContentTool } from "./src/tools/extract/summarizeContent.js";
|
|
21
17
|
import { AnalyzeContentTool } from "./src/tools/extract/analyzeContent.js";
|
|
22
|
-
// Phase 1: LLM-Powered Structured Extraction
|
|
23
18
|
import { ExtractStructuredTool } from "./src/tools/extract/extractStructured.js";
|
|
24
|
-
|
|
19
|
+
import { ExtractWithLlm } from "./src/tools/extract/extractWithLlm.js";
|
|
25
20
|
import { BatchScrapeTool } from "./src/tools/advanced/BatchScrapeTool.js";
|
|
26
21
|
import { ScrapeWithActionsTool } from "./src/tools/advanced/ScrapeWithActionsTool.js";
|
|
27
|
-
// Deep Research Tool
|
|
28
22
|
import { DeepResearchTool } from "./src/tools/research/deepResearch.js";
|
|
29
|
-
|
|
30
|
-
import { TrackChangesTool } from "./src/tools/tracking/trackChanges.js";
|
|
31
|
-
// LLMs.txt Generator Tool (Phase 2.5)
|
|
23
|
+
import { TrackChangesTool } from "./src/tools/tracking/trackChanges/index.js";
|
|
32
24
|
import { GenerateLLMsTxtTool } from "./src/tools/llmstxt/generateLLMsTxt.js";
|
|
33
|
-
// Wave 3-4 Core Managers
|
|
34
25
|
import { StealthBrowserManager } from "./src/core/StealthBrowserManager.js";
|
|
35
26
|
import { LocalizationManager } from "./src/core/LocalizationManager.js";
|
|
36
27
|
import { memoryMonitor } from "./src/utils/MemoryMonitor.js";
|
|
37
28
|
import { config, validateConfig, getToolConfig } from "./src/constants/config.js";
|
|
38
|
-
// Authentication Manager
|
|
39
29
|
import AuthManager from "./src/core/AuthManager.js";
|
|
30
|
+
import { makeWithAuth } from "./src/server/withAuth.js";
|
|
31
|
+
// Transport helpers
|
|
32
|
+
import { connectStdio } from "./src/server/transports/stdio.js";
|
|
33
|
+
import { connectHttp } from "./src/server/transports/http.js";
|
|
34
|
+
import { connectStreamableHttp } from "./src/server/transports/streamableHttp.js";
|
|
35
|
+
// OAuth 2.1 (HTTP transport only — opt-in via CRAWLFORGE_OAUTH_ENABLED=true)
|
|
36
|
+
import { createOAuthProvider } from "./src/server/auth/oauth.js";
|
|
37
|
+
// Observability (no-op by default — enable via CRAWLFORGE_METRICS / OTEL_SDK_DISABLED)
|
|
38
|
+
import { createMetricsRegistry } from "./src/observability/metrics.js";
|
|
39
|
+
// Basic tool handlers (extracted from server.js)
|
|
40
|
+
import { fetchUrlHandler } from "./src/tools/basic/fetchUrl.js";
|
|
41
|
+
import { extractTextHandler } from "./src/tools/basic/extractText.js";
|
|
42
|
+
import { extractLinksHandler } from "./src/tools/basic/extractLinks.js";
|
|
43
|
+
import { extractMetadataHandler } from "./src/tools/basic/extractMetadata.js";
|
|
44
|
+
import { scrapeStructuredHandler } from "./src/tools/basic/scrapeStructured.js";
|
|
40
45
|
|
|
41
46
|
// Initialize Authentication Manager
|
|
42
47
|
await AuthManager.initialize();
|
|
@@ -84,7 +89,7 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
|
|
|
84
89
|
// Create the server
|
|
85
90
|
const server = new McpServer({
|
|
86
91
|
name: "crawlforge",
|
|
87
|
-
version: "3.0
|
|
92
|
+
version: "3.2.0",
|
|
88
93
|
description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
|
|
89
94
|
homepage: "https://www.crawlforge.dev",
|
|
90
95
|
icon: "https://www.crawlforge.dev/icon.png"
|
|
@@ -99,7 +104,7 @@ server.prompt("getting-started", {
|
|
|
99
104
|
role: "user",
|
|
100
105
|
content: {
|
|
101
106
|
type: "text",
|
|
102
|
-
text: "You have access to CrawlForge MCP with
|
|
107
|
+
text: "You have access to CrawlForge MCP with 21 web scraping tools. Key tools:\n\n" +
|
|
103
108
|
"- fetch_url: Fetch raw HTML/content from any URL\n" +
|
|
104
109
|
"- extract_text: Extract clean text from a webpage\n" +
|
|
105
110
|
"- extract_content: Smart content extraction with readability\n" +
|
|
@@ -111,6 +116,7 @@ server.prompt("getting-started", {
|
|
|
111
116
|
"- deep_research: Multi-source research on any topic\n" +
|
|
112
117
|
"- stealth_mode: Anti-detection browsing for protected sites\n" +
|
|
113
118
|
"- extract_structured: LLM-powered structured data extraction\n" +
|
|
119
|
+
"- extract_with_llm: Natural-language extraction via OpenAI/Anthropic\n" +
|
|
114
120
|
"- track_changes: Monitor website changes over time\n" +
|
|
115
121
|
"- generate_llms_txt: Generate llms.txt for any website\n\n" +
|
|
116
122
|
"Workflow: search_web -> fetch_url -> extract_content -> analyze_content\n\n" +
|
|
@@ -120,528 +126,37 @@ server.prompt("getting-started", {
|
|
|
120
126
|
};
|
|
121
127
|
});
|
|
122
128
|
|
|
123
|
-
//
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
// Skip credit checks in creator mode
|
|
130
|
-
if (!AuthManager.isCreatorMode()) {
|
|
131
|
-
// Check credits before executing
|
|
132
|
-
const creditCost = AuthManager.getToolCost(toolName);
|
|
133
|
-
const hasCredits = await AuthManager.checkCredits(creditCost);
|
|
134
|
-
|
|
135
|
-
if (!hasCredits) {
|
|
136
|
-
return {
|
|
137
|
-
content: [{
|
|
138
|
-
type: "text",
|
|
139
|
-
text: JSON.stringify({
|
|
140
|
-
error: "Insufficient credits",
|
|
141
|
-
message: `This operation requires ${creditCost} credits. Please upgrade your plan at https://www.crawlforge.dev/pricing`,
|
|
142
|
-
creditsRequired: creditCost
|
|
143
|
-
}, null, 2)
|
|
144
|
-
}]
|
|
145
|
-
};
|
|
146
|
-
}
|
|
147
|
-
}
|
|
148
|
-
|
|
149
|
-
// Execute the tool
|
|
150
|
-
const result = await handler(params);
|
|
151
|
-
|
|
152
|
-
// Report usage for successful execution (skip in creator mode)
|
|
153
|
-
const processingTime = Date.now() - startTime;
|
|
154
|
-
if (!AuthManager.isCreatorMode()) {
|
|
155
|
-
const creditCost = AuthManager.getToolCost(toolName);
|
|
156
|
-
await AuthManager.reportUsage(
|
|
157
|
-
toolName,
|
|
158
|
-
creditCost,
|
|
159
|
-
params,
|
|
160
|
-
200,
|
|
161
|
-
processingTime
|
|
162
|
-
);
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
return result;
|
|
166
|
-
} catch (error) {
|
|
167
|
-
// Report usage even for errors (reduced credit cost) - skip in creator mode
|
|
168
|
-
const processingTime = Date.now() - startTime;
|
|
169
|
-
if (!AuthManager.isCreatorMode()) {
|
|
170
|
-
await AuthManager.reportUsage(
|
|
171
|
-
toolName,
|
|
172
|
-
Math.max(1, Math.floor(AuthManager.getToolCost(toolName) * 0.5)), // Half credits for errors
|
|
173
|
-
params,
|
|
174
|
-
500,
|
|
175
|
-
processingTime
|
|
176
|
-
);
|
|
177
|
-
}
|
|
178
|
-
|
|
179
|
-
throw error;
|
|
180
|
-
}
|
|
181
|
-
};
|
|
182
|
-
}
|
|
129
|
+
// Observability registry — only emit metrics in HTTP mode when explicitly enabled.
|
|
130
|
+
// Stdio mode stays silent to match MCP host expectations.
|
|
131
|
+
const metricsEnabled =
|
|
132
|
+
(process.argv.includes('--http') || process.env.MCP_HTTP === 'true') &&
|
|
133
|
+
process.env.CRAWLFORGE_METRICS === 'true';
|
|
134
|
+
const metrics = metricsEnabled ? createMetricsRegistry() : null;
|
|
183
135
|
|
|
184
|
-
//
|
|
136
|
+
// Tool-handler wrapper: auth + credit tracking + structured invocation logging + observability.
|
|
137
|
+
const withAuth = makeWithAuth({ authManager: AuthManager, logger, metrics });
|
|
138
|
+
|
|
139
|
+
// Initialize tools
|
|
185
140
|
const searchWebTool = new SearchWebTool(getToolConfig("search_web"));
|
|
186
141
|
const crawlDeepTool = new CrawlDeepTool(getToolConfig('crawl_deep'));
|
|
187
142
|
const mapSiteTool = new MapSiteTool(getToolConfig('map_site'));
|
|
188
|
-
|
|
189
|
-
// Initialize Phase 3 tools
|
|
190
143
|
const extractContentTool = new ExtractContentTool();
|
|
191
144
|
const processDocumentTool = new ProcessDocumentTool();
|
|
192
145
|
const summarizeContentTool = new SummarizeContentTool();
|
|
193
146
|
const analyzeContentTool = new AnalyzeContentTool();
|
|
194
|
-
|
|
195
|
-
// Phase 1: LLM-Powered Structured Extraction Tool
|
|
196
147
|
const extractStructuredTool = new ExtractStructuredTool();
|
|
197
|
-
|
|
198
|
-
// Initialize Wave 2 Advanced Tools
|
|
148
|
+
const extractWithLlmTool = new ExtractWithLlm();
|
|
199
149
|
const batchScrapeTool = new BatchScrapeTool();
|
|
200
150
|
const scrapeWithActionsTool = new ScrapeWithActionsTool();
|
|
201
|
-
|
|
202
|
-
// Initialize Deep Research Tool
|
|
203
151
|
const deepResearchTool = new DeepResearchTool();
|
|
204
|
-
|
|
205
|
-
// Initialize Change Tracking Tool
|
|
206
152
|
const trackChangesTool = new TrackChangesTool();
|
|
207
|
-
|
|
208
|
-
// Initialize LLMs.txt Generator Tool (Phase 2.5)
|
|
209
153
|
const generateLLMsTxtTool = new GenerateLLMsTxtTool();
|
|
210
|
-
|
|
211
|
-
// Initialize Wave 3-4 Core Managers
|
|
212
154
|
const stealthBrowserManager = new StealthBrowserManager();
|
|
213
155
|
const localizationManager = new LocalizationManager();
|
|
214
156
|
|
|
215
|
-
//
|
|
216
|
-
const FetchUrlSchema = z.object({
|
|
217
|
-
url: z.string().url(),
|
|
218
|
-
headers: z.record(z.string()).optional(),
|
|
219
|
-
timeout: z.number().min(1000).max(30000).optional().default(10000)
|
|
220
|
-
});
|
|
221
|
-
|
|
222
|
-
const ExtractTextSchema = z.object({
|
|
223
|
-
url: z.string().url(),
|
|
224
|
-
remove_scripts: z.boolean().optional().default(true),
|
|
225
|
-
remove_styles: z.boolean().optional().default(true)
|
|
226
|
-
});
|
|
227
|
-
|
|
228
|
-
const ExtractLinksSchema = z.object({
|
|
229
|
-
url: z.string().url(),
|
|
230
|
-
filter_external: z.boolean().optional().default(false),
|
|
231
|
-
base_url: z.string().url().optional()
|
|
232
|
-
});
|
|
233
|
-
|
|
234
|
-
const ExtractMetadataSchema = z.object({
|
|
235
|
-
url: z.string().url()
|
|
236
|
-
});
|
|
237
|
-
|
|
238
|
-
const ScrapeStructuredSchema = z.object({
|
|
239
|
-
url: z.string().url(),
|
|
240
|
-
selectors: z.record(z.string())
|
|
241
|
-
});
|
|
242
|
-
|
|
243
|
-
const SearchWebSchema = z.object({
|
|
244
|
-
query: z.string(),
|
|
245
|
-
limit: z.number().min(1).max(100).optional(),
|
|
246
|
-
offset: z.number().min(0).optional(),
|
|
247
|
-
lang: z.string().optional(),
|
|
248
|
-
safe_search: z.boolean().optional(),
|
|
249
|
-
time_range: z.enum(['day', 'week', 'month', 'year', 'all']).optional(),
|
|
250
|
-
site: z.string().optional(),
|
|
251
|
-
file_type: z.string().optional()
|
|
252
|
-
});
|
|
253
|
-
|
|
254
|
-
const CrawlDeepSchema = z.object({
|
|
255
|
-
url: z.string().url(),
|
|
256
|
-
max_depth: z.number().min(1).max(5).optional(),
|
|
257
|
-
max_pages: z.number().min(1).max(1000).optional(),
|
|
258
|
-
include_patterns: z.array(z.string()).optional(),
|
|
259
|
-
exclude_patterns: z.array(z.string()).optional(),
|
|
260
|
-
follow_external: z.boolean().optional(),
|
|
261
|
-
respect_robots: z.boolean().optional(),
|
|
262
|
-
extract_content: z.boolean().optional(),
|
|
263
|
-
concurrency: z.number().min(1).max(20).optional()
|
|
264
|
-
});
|
|
265
|
-
|
|
266
|
-
const MapSiteSchema = z.object({
|
|
267
|
-
url: z.string().url(),
|
|
268
|
-
include_sitemap: z.boolean().optional(),
|
|
269
|
-
max_urls: z.number().min(1).max(10000).optional(),
|
|
270
|
-
group_by_path: z.boolean().optional(),
|
|
271
|
-
include_metadata: z.boolean().optional()
|
|
272
|
-
});
|
|
273
|
-
|
|
274
|
-
const ExtractContentSchema = z.object({
|
|
275
|
-
url: z.string().url(),
|
|
276
|
-
options: z.object({}).optional()
|
|
277
|
-
});
|
|
278
|
-
|
|
279
|
-
const ProcessDocumentSchema = z.object({
|
|
280
|
-
source: z.string(),
|
|
281
|
-
sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional(),
|
|
282
|
-
options: z.object({}).optional()
|
|
283
|
-
});
|
|
284
|
-
|
|
285
|
-
const SummarizeContentSchema = z.object({
|
|
286
|
-
text: z.string(),
|
|
287
|
-
options: z.object({}).optional()
|
|
288
|
-
});
|
|
289
|
-
|
|
290
|
-
const AnalyzeContentSchema = z.object({
|
|
291
|
-
text: z.string(),
|
|
292
|
-
options: z.object({}).optional()
|
|
293
|
-
});
|
|
294
|
-
|
|
295
|
-
// Wave 2 Advanced Tools Schemas
|
|
296
|
-
const BatchScrapeSchema = z.object({
|
|
297
|
-
urls: z.array(z.union([
|
|
298
|
-
z.string().url(),
|
|
299
|
-
z.object({
|
|
300
|
-
url: z.string().url(),
|
|
301
|
-
selectors: z.record(z.string()).optional(),
|
|
302
|
-
headers: z.record(z.string()).optional(),
|
|
303
|
-
timeout: z.number().min(1000).max(30000).optional(),
|
|
304
|
-
metadata: z.record(z.any()).optional()
|
|
305
|
-
})
|
|
306
|
-
])).min(1).max(50),
|
|
307
|
-
|
|
308
|
-
formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
|
|
309
|
-
mode: z.enum(['sync', 'async']).default('sync'),
|
|
310
|
-
|
|
311
|
-
webhook: z.object({
|
|
312
|
-
url: z.string().url(),
|
|
313
|
-
events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']),
|
|
314
|
-
headers: z.record(z.string()).optional(),
|
|
315
|
-
signingSecret: z.string().optional()
|
|
316
|
-
}).optional(),
|
|
317
|
-
|
|
318
|
-
extractionSchema: z.record(z.string()).optional(),
|
|
319
|
-
maxConcurrency: z.number().min(1).max(20).default(10),
|
|
320
|
-
delayBetweenRequests: z.number().min(0).max(10000).default(100),
|
|
321
|
-
includeMetadata: z.boolean().default(true),
|
|
322
|
-
includeFailed: z.boolean().default(true),
|
|
323
|
-
pageSize: z.number().min(1).max(100).default(25),
|
|
324
|
-
|
|
325
|
-
jobOptions: z.object({
|
|
326
|
-
priority: z.number().default(0),
|
|
327
|
-
ttl: z.number().min(60000).default(24 * 60 * 60 * 1000),
|
|
328
|
-
maxRetries: z.number().min(0).max(5).default(1),
|
|
329
|
-
tags: z.array(z.string()).default([])
|
|
330
|
-
}).optional()
|
|
331
|
-
});
|
|
332
|
-
|
|
333
|
-
const ScrapeWithActionsSchema = z.object({
|
|
334
|
-
url: z.string().url(),
|
|
335
|
-
actions: z.array(z.object({
|
|
336
|
-
type: z.enum(['wait', 'click', 'type', 'press', 'scroll', 'screenshot', 'executeJavaScript']),
|
|
337
|
-
selector: z.string().optional(),
|
|
338
|
-
text: z.string().optional(),
|
|
339
|
-
key: z.string().optional(),
|
|
340
|
-
script: z.string().optional(),
|
|
341
|
-
timeout: z.number().optional(),
|
|
342
|
-
description: z.string().optional(),
|
|
343
|
-
continueOnError: z.boolean().default(false),
|
|
344
|
-
retries: z.number().min(0).max(5).default(0)
|
|
345
|
-
})).min(1).max(20),
|
|
346
|
-
|
|
347
|
-
formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']),
|
|
348
|
-
captureIntermediateStates: z.boolean().default(false),
|
|
349
|
-
captureScreenshots: z.boolean().default(true),
|
|
350
|
-
|
|
351
|
-
formAutoFill: z.object({
|
|
352
|
-
fields: z.array(z.object({
|
|
353
|
-
selector: z.string(),
|
|
354
|
-
value: z.string(),
|
|
355
|
-
type: z.enum(['text', 'select', 'checkbox', 'radio', 'file']).default('text'),
|
|
356
|
-
waitAfter: z.number().min(0).max(5000).default(100)
|
|
357
|
-
})),
|
|
358
|
-
submitSelector: z.string().optional(),
|
|
359
|
-
waitAfterSubmit: z.number().min(0).max(30000).default(2000)
|
|
360
|
-
}).optional(),
|
|
361
|
-
|
|
362
|
-
browserOptions: z.object({
|
|
363
|
-
headless: z.boolean().default(true),
|
|
364
|
-
userAgent: z.string().optional(),
|
|
365
|
-
viewportWidth: z.number().min(800).max(1920).default(1280),
|
|
366
|
-
viewportHeight: z.number().min(600).max(1080).default(720),
|
|
367
|
-
timeout: z.number().min(10000).max(120000).default(30000)
|
|
368
|
-
}).optional(),
|
|
369
|
-
|
|
370
|
-
extractionOptions: z.object({
|
|
371
|
-
selectors: z.record(z.string()).optional(),
|
|
372
|
-
includeMetadata: z.boolean().default(true),
|
|
373
|
-
includeLinks: z.boolean().default(true),
|
|
374
|
-
includeImages: z.boolean().default(true)
|
|
375
|
-
}).optional(),
|
|
376
|
-
|
|
377
|
-
continueOnActionError: z.boolean().default(false),
|
|
378
|
-
maxRetries: z.number().min(0).max(3).default(1),
|
|
379
|
-
screenshotOnError: z.boolean().default(true)
|
|
380
|
-
});
|
|
381
|
-
|
|
382
|
-
// Deep Research Tool Schema
|
|
383
|
-
const DeepResearchSchema = z.object({
|
|
384
|
-
topic: z.string().min(3).max(500),
|
|
385
|
-
maxDepth: z.number().min(1).max(10).optional().default(5),
|
|
386
|
-
maxUrls: z.number().min(1).max(1000).optional().default(50),
|
|
387
|
-
timeLimit: z.number().min(30000).max(300000).optional().default(120000),
|
|
388
|
-
researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad'),
|
|
389
|
-
sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']),
|
|
390
|
-
credibilityThreshold: z.number().min(0).max(1).optional().default(0.3),
|
|
391
|
-
includeRecentOnly: z.boolean().optional().default(false),
|
|
392
|
-
enableConflictDetection: z.boolean().optional().default(true),
|
|
393
|
-
enableSourceVerification: z.boolean().optional().default(true),
|
|
394
|
-
enableSynthesis: z.boolean().optional().default(true),
|
|
395
|
-
outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive'),
|
|
396
|
-
includeRawData: z.boolean().optional().default(false),
|
|
397
|
-
includeActivityLog: z.boolean().optional().default(false),
|
|
398
|
-
queryExpansion: z.object({
|
|
399
|
-
enableSynonyms: z.boolean().optional().default(true),
|
|
400
|
-
enableSpellCheck: z.boolean().optional().default(true),
|
|
401
|
-
enableContextual: z.boolean().optional().default(true),
|
|
402
|
-
maxVariations: z.number().min(1).max(20).optional().default(8)
|
|
403
|
-
}).optional(),
|
|
404
|
-
llmConfig: z.object({
|
|
405
|
-
provider: z.enum(['auto', 'openai', 'anthropic']).optional().default('auto'),
|
|
406
|
-
openai: z.object({
|
|
407
|
-
apiKey: z.string().optional(),
|
|
408
|
-
model: z.string().optional().default('gpt-3.5-turbo'),
|
|
409
|
-
embeddingModel: z.string().optional().default('text-embedding-ada-002')
|
|
410
|
-
}).optional(),
|
|
411
|
-
anthropic: z.object({
|
|
412
|
-
apiKey: z.string().optional(),
|
|
413
|
-
model: z.string().optional().default('claude-3-haiku-20240307')
|
|
414
|
-
}).optional(),
|
|
415
|
-
enableSemanticAnalysis: z.boolean().optional().default(true),
|
|
416
|
-
enableIntelligentSynthesis: z.boolean().optional().default(true)
|
|
417
|
-
}).optional(),
|
|
418
|
-
concurrency: z.number().min(1).max(20).optional().default(5),
|
|
419
|
-
cacheResults: z.boolean().optional().default(true),
|
|
420
|
-
webhook: z.object({
|
|
421
|
-
url: z.string().url(),
|
|
422
|
-
events: z.array(z.enum(['started', 'progress', 'completed', 'failed'])).optional().default(['completed']),
|
|
423
|
-
headers: z.record(z.string()).optional()
|
|
424
|
-
}).optional()
|
|
425
|
-
});
|
|
426
|
-
|
|
427
|
-
// Change Tracking Tool Schema
|
|
428
|
-
const TrackChangesSchema = z.object({
|
|
429
|
-
url: z.string().url(),
|
|
430
|
-
operation: z.enum(['create_baseline', 'compare', 'monitor', 'get_history', 'get_stats']).default('compare'),
|
|
431
|
-
content: z.string().optional(),
|
|
432
|
-
html: z.string().optional(),
|
|
433
|
-
trackingOptions: z.object({
|
|
434
|
-
granularity: z.enum(['page', 'section', 'element', 'text']).default('section'),
|
|
435
|
-
trackText: z.boolean().default(true),
|
|
436
|
-
trackStructure: z.boolean().default(true),
|
|
437
|
-
trackAttributes: z.boolean().default(false),
|
|
438
|
-
trackImages: z.boolean().default(false),
|
|
439
|
-
trackLinks: z.boolean().default(true),
|
|
440
|
-
ignoreWhitespace: z.boolean().default(true),
|
|
441
|
-
ignoreCase: z.boolean().default(false),
|
|
442
|
-
customSelectors: z.array(z.string()).optional(),
|
|
443
|
-
excludeSelectors: z.array(z.string()).optional(),
|
|
444
|
-
significanceThresholds: z.object({
|
|
445
|
-
minor: z.number().min(0).max(1).default(0.1),
|
|
446
|
-
moderate: z.number().min(0).max(1).default(0.3),
|
|
447
|
-
major: z.number().min(0).max(1).default(0.7)
|
|
448
|
-
}).optional()
|
|
449
|
-
}).optional(),
|
|
450
|
-
monitoringOptions: z.object({
|
|
451
|
-
enabled: z.boolean().default(false),
|
|
452
|
-
interval: z.number().min(60000).max(24 * 60 * 60 * 1000).default(300000),
|
|
453
|
-
maxRetries: z.number().min(0).max(5).default(3),
|
|
454
|
-
retryDelay: z.number().min(1000).max(60000).default(5000),
|
|
455
|
-
notificationThreshold: z.enum(['minor', 'moderate', 'major', 'critical']).default('moderate'),
|
|
456
|
-
enableWebhook: z.boolean().default(false),
|
|
457
|
-
webhookUrl: z.string().url().optional(),
|
|
458
|
-
webhookSecret: z.string().optional()
|
|
459
|
-
}).optional(),
|
|
460
|
-
storageOptions: z.object({
|
|
461
|
-
enableSnapshots: z.boolean().default(true),
|
|
462
|
-
retainHistory: z.boolean().default(true),
|
|
463
|
-
maxHistoryEntries: z.number().min(1).max(1000).default(100),
|
|
464
|
-
compressionEnabled: z.boolean().default(true),
|
|
465
|
-
deltaStorageEnabled: z.boolean().default(true)
|
|
466
|
-
}).optional(),
|
|
467
|
-
queryOptions: z.object({
|
|
468
|
-
limit: z.number().min(1).max(500).default(50),
|
|
469
|
-
offset: z.number().min(0).default(0),
|
|
470
|
-
startTime: z.number().optional(),
|
|
471
|
-
endTime: z.number().optional(),
|
|
472
|
-
includeContent: z.boolean().default(false),
|
|
473
|
-
significanceFilter: z.enum(['all', 'minor', 'moderate', 'major', 'critical']).optional()
|
|
474
|
-
}).optional(),
|
|
475
|
-
notificationOptions: z.object({
|
|
476
|
-
webhook: z.object({
|
|
477
|
-
enabled: z.boolean().default(false),
|
|
478
|
-
url: z.string().url().optional(),
|
|
479
|
-
method: z.enum(['POST', 'PUT']).default('POST'),
|
|
480
|
-
headers: z.record(z.string()).optional(),
|
|
481
|
-
signingSecret: z.string().optional(),
|
|
482
|
-
includeContent: z.boolean().default(false)
|
|
483
|
-
}).optional(),
|
|
484
|
-
slack: z.object({
|
|
485
|
-
enabled: z.boolean().default(false),
|
|
486
|
-
webhookUrl: z.string().url().optional(),
|
|
487
|
-
channel: z.string().optional(),
|
|
488
|
-
username: z.string().optional()
|
|
489
|
-
}).optional()
|
|
490
|
-
}).optional()
|
|
491
|
-
});
|
|
492
|
-
|
|
493
|
-
// LLMs.txt Generator Tool Schema (Phase 2.5)
|
|
494
|
-
const GenerateLLMsTxtSchema = z.object({
|
|
495
|
-
url: z.string().url(),
|
|
496
|
-
analysisOptions: z.object({
|
|
497
|
-
maxDepth: z.number().min(1).max(5).optional().default(3),
|
|
498
|
-
maxPages: z.number().min(10).max(500).optional().default(100),
|
|
499
|
-
detectAPIs: z.boolean().optional().default(true),
|
|
500
|
-
analyzeContent: z.boolean().optional().default(true),
|
|
501
|
-
checkSecurity: z.boolean().optional().default(true),
|
|
502
|
-
respectRobots: z.boolean().optional().default(true)
|
|
503
|
-
}).optional(),
|
|
504
|
-
outputOptions: z.object({
|
|
505
|
-
includeDetailed: z.boolean().optional().default(true),
|
|
506
|
-
includeAnalysis: z.boolean().optional().default(false),
|
|
507
|
-
contactEmail: z.string().email().optional(),
|
|
508
|
-
organizationName: z.string().optional(),
|
|
509
|
-
customGuidelines: z.array(z.string()).optional(),
|
|
510
|
-
customRestrictions: z.array(z.string()).optional()
|
|
511
|
-
}).optional(),
|
|
512
|
-
complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard'),
|
|
513
|
-
format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both')
|
|
514
|
-
});
|
|
515
|
-
|
|
516
|
-
// Stealth Mode Tool Schema (Wave 3)
|
|
517
|
-
const StealthModeSchema = z.object({
|
|
518
|
-
operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure'),
|
|
519
|
-
stealthConfig: z.object({
|
|
520
|
-
level: z.enum(['basic', 'medium', 'advanced']).default('medium'),
|
|
521
|
-
randomizeFingerprint: z.boolean().default(true),
|
|
522
|
-
hideWebDriver: z.boolean().default(true),
|
|
523
|
-
blockWebRTC: z.boolean().default(true),
|
|
524
|
-
spoofTimezone: z.boolean().default(true),
|
|
525
|
-
randomizeHeaders: z.boolean().default(true),
|
|
526
|
-
useRandomUserAgent: z.boolean().default(true),
|
|
527
|
-
simulateHumanBehavior: z.boolean().default(true),
|
|
528
|
-
customUserAgent: z.string().optional(),
|
|
529
|
-
customViewport: z.object({
|
|
530
|
-
width: z.number().min(800).max(1920),
|
|
531
|
-
height: z.number().min(600).max(1080)
|
|
532
|
-
}).optional(),
|
|
533
|
-
locale: z.string().default('en-US'),
|
|
534
|
-
timezone: z.string().optional(),
|
|
535
|
-
webRTCPublicIP: z.string().optional(),
|
|
536
|
-
webRTCLocalIPs: z.array(z.string()).optional(),
|
|
537
|
-
proxyRotation: z.object({
|
|
538
|
-
enabled: z.boolean().default(false),
|
|
539
|
-
proxies: z.array(z.string()).optional(),
|
|
540
|
-
rotationInterval: z.number().default(300000)
|
|
541
|
-
}).optional(),
|
|
542
|
-
antiDetection: z.object({
|
|
543
|
-
cloudflareBypass: z.boolean().default(true),
|
|
544
|
-
recaptchaHandling: z.boolean().default(true),
|
|
545
|
-
hideAutomation: z.boolean().default(true),
|
|
546
|
-
spoofMediaDevices: z.boolean().default(true),
|
|
547
|
-
spoofBatteryAPI: z.boolean().default(true)
|
|
548
|
-
}).optional(),
|
|
549
|
-
fingerprinting: z.object({
|
|
550
|
-
canvasNoise: z.boolean().default(true),
|
|
551
|
-
webglSpoofing: z.boolean().default(true),
|
|
552
|
-
audioContextSpoofing: z.boolean().default(true),
|
|
553
|
-
fontSpoofing: z.boolean().default(true),
|
|
554
|
-
hardwareSpoofing: z.boolean().default(true)
|
|
555
|
-
}).optional()
|
|
556
|
-
}).optional(),
|
|
557
|
-
contextId: z.string().optional(),
|
|
558
|
-
urlToTest: z.string().url().optional()
|
|
559
|
-
});
|
|
560
|
-
|
|
561
|
-
// Localization Tool Schema (Wave 3)
|
|
562
|
-
const LocalizationSchema = z.object({
|
|
563
|
-
operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country'),
|
|
564
|
-
countryCode: z.string().length(2).optional(),
|
|
565
|
-
language: z.string().optional(),
|
|
566
|
-
timezone: z.string().optional(),
|
|
567
|
-
currency: z.string().length(3).optional(),
|
|
568
|
-
customHeaders: z.record(z.string()).optional(),
|
|
569
|
-
userAgent: z.string().optional(),
|
|
570
|
-
acceptLanguage: z.string().optional(),
|
|
571
|
-
geoLocation: z.object({
|
|
572
|
-
latitude: z.number().min(-90).max(90),
|
|
573
|
-
longitude: z.number().min(-180).max(180),
|
|
574
|
-
accuracy: z.number().min(1).max(100).optional()
|
|
575
|
-
}).optional(),
|
|
576
|
-
proxySettings: z.object({
|
|
577
|
-
enabled: z.boolean().default(false),
|
|
578
|
-
region: z.string().optional(),
|
|
579
|
-
type: z.enum(['http', 'https', 'socks4', 'socks5']).default('https'),
|
|
580
|
-
server: z.string().optional(),
|
|
581
|
-
port: z.number().optional(),
|
|
582
|
-
username: z.string().optional(),
|
|
583
|
-
password: z.string().optional(),
|
|
584
|
-
rotation: z.object({
|
|
585
|
-
enabled: z.boolean().default(false),
|
|
586
|
-
interval: z.number().default(300000),
|
|
587
|
-
strategy: z.enum(['round-robin', 'random', 'failover']).default('round-robin')
|
|
588
|
-
}).optional(),
|
|
589
|
-
fallback: z.object({
|
|
590
|
-
enabled: z.boolean().default(true),
|
|
591
|
-
maxRetries: z.number().default(3),
|
|
592
|
-
timeout: z.number().default(10000)
|
|
593
|
-
}).optional()
|
|
594
|
-
}).optional(),
|
|
595
|
-
searchParams: z.object({
|
|
596
|
-
query: z.string().optional(),
|
|
597
|
-
limit: z.number().optional(),
|
|
598
|
-
offset: z.number().optional(),
|
|
599
|
-
headers: z.record(z.string()).optional()
|
|
600
|
-
}).optional(),
|
|
601
|
-
browserOptions: z.object({
|
|
602
|
-
locale: z.string().optional(),
|
|
603
|
-
timezoneId: z.string().optional(),
|
|
604
|
-
extraHTTPHeaders: z.record(z.string()).optional(),
|
|
605
|
-
userAgent: z.string().optional()
|
|
606
|
-
}).optional(),
|
|
607
|
-
content: z.string().optional(),
|
|
608
|
-
url: z.string().url().optional(),
|
|
609
|
-
response: z.object({
|
|
610
|
-
status: z.number(),
|
|
611
|
-
body: z.string().optional(),
|
|
612
|
-
statusText: z.string().optional()
|
|
613
|
-
}).optional()
|
|
614
|
-
});
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
// Utility function to fetch URL with error handling
|
|
618
|
-
async function fetchWithTimeout(url, options = {}) {
|
|
619
|
-
const { timeout = 10000, headers = {} } = options;
|
|
620
|
-
|
|
621
|
-
const controller = new AbortController();
|
|
622
|
-
const timeoutId = setTimeout(() => controller.abort(), timeout);
|
|
623
|
-
|
|
624
|
-
try {
|
|
625
|
-
const response = await fetch(url, {
|
|
626
|
-
signal: controller.signal,
|
|
627
|
-
headers: {
|
|
628
|
-
'User-Agent': 'CrawlForge/1.0.0',
|
|
629
|
-
...headers
|
|
630
|
-
}
|
|
631
|
-
});
|
|
632
|
-
|
|
633
|
-
clearTimeout(timeoutId);
|
|
634
|
-
return response;
|
|
635
|
-
} catch (error) {
|
|
636
|
-
clearTimeout(timeoutId);
|
|
637
|
-
if (error.name === 'AbortError') {
|
|
638
|
-
throw new Error(`Request timeout after ${timeout}ms`);
|
|
639
|
-
}
|
|
640
|
-
throw error;
|
|
641
|
-
}
|
|
642
|
-
}
|
|
157
|
+
// ─── Tool registrations ────────────────────────────────────────────────────────
|
|
643
158
|
|
|
644
|
-
// Tool: fetch_url
|
|
159
|
+
// Tool: fetch_url
|
|
645
160
|
server.registerTool("fetch_url", {
|
|
646
161
|
description: "Fetch content from a URL with optional headers and timeout",
|
|
647
162
|
annotations: { title: "Fetch URL", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
@@ -650,45 +165,9 @@ server.registerTool("fetch_url", {
|
|
|
650
165
|
headers: z.record(z.string()).optional().describe("Custom HTTP headers to include in the request"),
|
|
651
166
|
timeout: z.number().min(1000).max(30000).optional().default(10000).describe("Request timeout in milliseconds (1000-30000)")
|
|
652
167
|
}
|
|
653
|
-
}, withAuth("fetch_url", async ({ url, headers, timeout }) => {
|
|
654
|
-
try {
|
|
655
|
-
const response = await fetchWithTimeout(url, {
|
|
656
|
-
timeout: timeout || 10000,
|
|
657
|
-
headers: headers || {}
|
|
658
|
-
});
|
|
659
|
-
|
|
660
|
-
const body = await response.text();
|
|
661
|
-
const responseHeaders = {};
|
|
662
|
-
response.headers.forEach((value, key) => {
|
|
663
|
-
responseHeaders[key] = value;
|
|
664
|
-
});
|
|
665
|
-
|
|
666
|
-
return {
|
|
667
|
-
content: [{
|
|
668
|
-
type: "text",
|
|
669
|
-
text: JSON.stringify({
|
|
670
|
-
status: response.status,
|
|
671
|
-
statusText: response.statusText,
|
|
672
|
-
headers: responseHeaders,
|
|
673
|
-
body: body,
|
|
674
|
-
contentType: response.headers.get('content-type') || 'unknown',
|
|
675
|
-
size: body.length,
|
|
676
|
-
url: response.url
|
|
677
|
-
}, null, 2)
|
|
678
|
-
}]
|
|
679
|
-
};
|
|
680
|
-
} catch (error) {
|
|
681
|
-
return {
|
|
682
|
-
content: [{
|
|
683
|
-
type: "text",
|
|
684
|
-
text: `Failed to fetch URL: ${error.message}`
|
|
685
|
-
}],
|
|
686
|
-
isError: true
|
|
687
|
-
};
|
|
688
|
-
}
|
|
689
|
-
}));
|
|
168
|
+
}, withAuth("fetch_url", fetchUrlHandler));
|
|
690
169
|
|
|
691
|
-
// Tool: extract_text
|
|
170
|
+
// Tool: extract_text
|
|
692
171
|
server.registerTool("extract_text", {
|
|
693
172
|
description: "Extract clean text content from a webpage",
|
|
694
173
|
annotations: { title: "Extract Text", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
@@ -697,53 +176,9 @@ server.registerTool("extract_text", {
|
|
|
697
176
|
remove_scripts: z.boolean().optional().default(true).describe("Remove script tags before extraction"),
|
|
698
177
|
remove_styles: z.boolean().optional().default(true).describe("Remove style tags before extraction")
|
|
699
178
|
}
|
|
700
|
-
}, withAuth("extract_text", async ({ url, remove_scripts, remove_styles }) => {
|
|
701
|
-
try {
|
|
702
|
-
const response = await fetchWithTimeout(url);
|
|
703
|
-
if (!response.ok) {
|
|
704
|
-
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
705
|
-
}
|
|
706
|
-
|
|
707
|
-
const html = await response.text();
|
|
708
|
-
const $ = load(html);
|
|
709
|
-
|
|
710
|
-
// Remove unwanted elements
|
|
711
|
-
if (remove_scripts !== false) {
|
|
712
|
-
$('script').remove();
|
|
713
|
-
}
|
|
714
|
-
if (remove_styles !== false) {
|
|
715
|
-
$('style').remove();
|
|
716
|
-
}
|
|
717
|
-
|
|
718
|
-
// Remove common non-content elements
|
|
719
|
-
$('nav, header, footer, aside, .advertisement, .ad, .sidebar').remove();
|
|
720
|
-
|
|
721
|
-
// Extract text content
|
|
722
|
-
const text = $('body').text().replace(/\s+/g, ' ').trim();
|
|
723
|
-
|
|
724
|
-
return {
|
|
725
|
-
content: [{
|
|
726
|
-
type: "text",
|
|
727
|
-
text: JSON.stringify({
|
|
728
|
-
text: text,
|
|
729
|
-
word_count: text.split(/\s+/).filter(word => word.length > 0).length,
|
|
730
|
-
char_count: text.length,
|
|
731
|
-
url: response.url
|
|
732
|
-
}, null, 2)
|
|
733
|
-
}]
|
|
734
|
-
};
|
|
735
|
-
} catch (error) {
|
|
736
|
-
return {
|
|
737
|
-
content: [{
|
|
738
|
-
type: "text",
|
|
739
|
-
text: `Failed to extract text: ${error.message}`
|
|
740
|
-
}],
|
|
741
|
-
isError: true
|
|
742
|
-
};
|
|
743
|
-
}
|
|
744
|
-
}));
|
|
179
|
+
}, withAuth("extract_text", extractTextHandler));
|
|
745
180
|
|
|
746
|
-
// Tool: extract_links
|
|
181
|
+
// Tool: extract_links
|
|
747
182
|
server.registerTool("extract_links", {
|
|
748
183
|
description: "Extract all links from a webpage with optional filtering",
|
|
749
184
|
annotations: { title: "Extract Links", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
@@ -752,163 +187,18 @@ server.registerTool("extract_links", {
|
|
|
752
187
|
filter_external: z.boolean().optional().default(false).describe("Only return external links"),
|
|
753
188
|
base_url: z.string().url().optional().describe("Base URL for resolving relative links")
|
|
754
189
|
}
|
|
755
|
-
}, withAuth("extract_links", async ({ url, filter_external, base_url }) => {
|
|
756
|
-
try {
|
|
757
|
-
const response = await fetchWithTimeout(url);
|
|
758
|
-
if (!response.ok) {
|
|
759
|
-
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
760
|
-
}
|
|
761
|
-
|
|
762
|
-
const html = await response.text();
|
|
763
|
-
const $ = load(html);
|
|
764
|
-
|
|
765
|
-
const baseUrl = base_url || new URL(url).origin;
|
|
766
|
-
const pageUrl = new URL(url);
|
|
767
|
-
const links = [];
|
|
768
|
-
|
|
769
|
-
$('a[href]').each((_, element) => {
|
|
770
|
-
const href = $(element).attr('href');
|
|
771
|
-
const text = $(element).text().trim();
|
|
772
|
-
|
|
773
|
-
if (!href) return;
|
|
774
|
-
|
|
775
|
-
let absoluteUrl;
|
|
776
|
-
let isExternal = false;
|
|
777
|
-
|
|
778
|
-
try {
|
|
779
|
-
if (href.startsWith('http://') || href.startsWith('https://')) {
|
|
780
|
-
absoluteUrl = href;
|
|
781
|
-
isExternal = new URL(href).origin !== pageUrl.origin;
|
|
782
|
-
} else {
|
|
783
|
-
absoluteUrl = new URL(href, baseUrl).toString();
|
|
784
|
-
isExternal = false;
|
|
785
|
-
}
|
|
786
|
-
|
|
787
|
-
// Apply filtering
|
|
788
|
-
if (filter_external && isExternal) {
|
|
789
|
-
return;
|
|
790
|
-
}
|
|
791
|
-
|
|
792
|
-
links.push({
|
|
793
|
-
href: absoluteUrl,
|
|
794
|
-
text: text,
|
|
795
|
-
is_external: isExternal,
|
|
796
|
-
original_href: href
|
|
797
|
-
});
|
|
798
|
-
} catch (urlError) {
|
|
799
|
-
// Skip invalid URLs
|
|
800
|
-
}
|
|
801
|
-
});
|
|
802
|
-
|
|
803
|
-
// Remove duplicates
|
|
804
|
-
const uniqueLinks = links.filter((link, index, arr) =>
|
|
805
|
-
arr.findIndex(l => l.href === link.href) === index
|
|
806
|
-
);
|
|
807
|
-
|
|
808
|
-
return {
|
|
809
|
-
content: [{
|
|
810
|
-
type: "text",
|
|
811
|
-
text: JSON.stringify({
|
|
812
|
-
links: uniqueLinks,
|
|
813
|
-
total_count: uniqueLinks.length,
|
|
814
|
-
internal_count: uniqueLinks.filter(l => !l.is_external).length,
|
|
815
|
-
external_count: uniqueLinks.filter(l => l.is_external).length,
|
|
816
|
-
base_url: baseUrl
|
|
817
|
-
}, null, 2)
|
|
818
|
-
}]
|
|
819
|
-
};
|
|
820
|
-
} catch (error) {
|
|
821
|
-
return {
|
|
822
|
-
content: [{
|
|
823
|
-
type: "text",
|
|
824
|
-
text: `Failed to extract links: ${error.message}`
|
|
825
|
-
}],
|
|
826
|
-
isError: true
|
|
827
|
-
};
|
|
828
|
-
}
|
|
829
|
-
}));
|
|
190
|
+
}, withAuth("extract_links", extractLinksHandler));
|
|
830
191
|
|
|
831
|
-
// Tool: extract_metadata
|
|
192
|
+
// Tool: extract_metadata
|
|
832
193
|
server.registerTool("extract_metadata", {
|
|
833
194
|
description: "Extract metadata from a webpage (title, description, keywords, etc.)",
|
|
834
195
|
annotations: { title: "Extract Metadata", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
835
196
|
inputSchema: {
|
|
836
197
|
url: z.string().url().describe("The URL to extract metadata from")
|
|
837
198
|
}
|
|
838
|
-
}, withAuth("extract_metadata", async ({ url }) => {
|
|
839
|
-
try {
|
|
840
|
-
const response = await fetchWithTimeout(url);
|
|
841
|
-
if (!response.ok) {
|
|
842
|
-
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
843
|
-
}
|
|
844
|
-
|
|
845
|
-
const html = await response.text();
|
|
846
|
-
const $ = load(html);
|
|
847
|
-
|
|
848
|
-
// Extract basic metadata
|
|
849
|
-
const title = $('title').text().trim() || $('h1').first().text().trim();
|
|
850
|
-
const description = $('meta[name="description"]').attr('content') ||
|
|
851
|
-
$('meta[property="og:description"]').attr('content') || '';
|
|
852
|
-
const keywords = $('meta[name="keywords"]').attr('content') || '';
|
|
853
|
-
const canonical = $('link[rel="canonical"]').attr('href') || '';
|
|
854
|
-
|
|
855
|
-
// Extract Open Graph tags
|
|
856
|
-
const ogTags = {};
|
|
857
|
-
$('meta[property^="og:"]').each((_, element) => {
|
|
858
|
-
const property = $(element).attr('property');
|
|
859
|
-
const content = $(element).attr('content');
|
|
860
|
-
if (property && content) {
|
|
861
|
-
ogTags[property.replace('og:', '')] = content;
|
|
862
|
-
}
|
|
863
|
-
});
|
|
864
|
-
|
|
865
|
-
// Extract Twitter Card tags
|
|
866
|
-
const twitterTags = {};
|
|
867
|
-
$('meta[name^="twitter:"]').each((_, element) => {
|
|
868
|
-
const name = $(element).attr('name');
|
|
869
|
-
const content = $(element).attr('content');
|
|
870
|
-
if (name && content) {
|
|
871
|
-
twitterTags[name.replace('twitter:', '')] = content;
|
|
872
|
-
}
|
|
873
|
-
});
|
|
874
|
-
|
|
875
|
-
// Extract additional metadata
|
|
876
|
-
const author = $('meta[name="author"]').attr('content') || '';
|
|
877
|
-
const robots = $('meta[name="robots"]').attr('content') || '';
|
|
878
|
-
const viewport = $('meta[name="viewport"]').attr('content') || '';
|
|
879
|
-
const charset = $('meta[charset]').attr('charset') ||
|
|
880
|
-
$('meta[http-equiv="Content-Type"]').attr('content') || '';
|
|
881
|
-
|
|
882
|
-
return {
|
|
883
|
-
content: [{
|
|
884
|
-
type: "text",
|
|
885
|
-
text: JSON.stringify({
|
|
886
|
-
title: title,
|
|
887
|
-
description: description,
|
|
888
|
-
keywords: keywords.split(',').map(k => k.trim()).filter(k => k),
|
|
889
|
-
canonical_url: canonical,
|
|
890
|
-
author: author,
|
|
891
|
-
robots: robots,
|
|
892
|
-
viewport: viewport,
|
|
893
|
-
charset: charset,
|
|
894
|
-
og_tags: ogTags,
|
|
895
|
-
twitter_tags: twitterTags,
|
|
896
|
-
url: response.url
|
|
897
|
-
}, null, 2)
|
|
898
|
-
}]
|
|
899
|
-
};
|
|
900
|
-
} catch (error) {
|
|
901
|
-
return {
|
|
902
|
-
content: [{
|
|
903
|
-
type: "text",
|
|
904
|
-
text: `Failed to extract metadata: ${error.message}`
|
|
905
|
-
}],
|
|
906
|
-
isError: true
|
|
907
|
-
};
|
|
908
|
-
}
|
|
909
|
-
}));
|
|
199
|
+
}, withAuth("extract_metadata", extractMetadataHandler));
|
|
910
200
|
|
|
911
|
-
// Tool: scrape_structured
|
|
201
|
+
// Tool: scrape_structured
|
|
912
202
|
server.registerTool("scrape_structured", {
|
|
913
203
|
description: "Extract structured data from a webpage using CSS selectors",
|
|
914
204
|
annotations: { title: "Scrape Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
@@ -916,63 +206,9 @@ server.registerTool("scrape_structured", {
|
|
|
916
206
|
url: z.string().url().describe("The URL to scrape"),
|
|
917
207
|
selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors")
|
|
918
208
|
}
|
|
919
|
-
}, withAuth("scrape_structured", async ({ url, selectors }) => {
|
|
920
|
-
try {
|
|
921
|
-
const response = await fetchWithTimeout(url);
|
|
922
|
-
if (!response.ok) {
|
|
923
|
-
throw new Error(`HTTP ${response.status}: ${response.statusText}`);
|
|
924
|
-
}
|
|
925
|
-
|
|
926
|
-
const html = await response.text();
|
|
927
|
-
const $ = load(html);
|
|
928
|
-
|
|
929
|
-
const results = {};
|
|
930
|
-
|
|
931
|
-
for (const [fieldName, selector] of Object.entries(selectors)) {
|
|
932
|
-
try {
|
|
933
|
-
const elements = $(selector);
|
|
934
|
-
|
|
935
|
-
if (elements.length === 0) {
|
|
936
|
-
results[fieldName] = null;
|
|
937
|
-
} else if (elements.length === 1) {
|
|
938
|
-
// Single element - return text content
|
|
939
|
-
results[fieldName] = elements.text().trim();
|
|
940
|
-
} else {
|
|
941
|
-
// Multiple elements - return array of text content
|
|
942
|
-
results[fieldName] = elements.map((_, el) => $(el).text().trim()).get();
|
|
943
|
-
}
|
|
944
|
-
} catch (selectorError) {
|
|
945
|
-
results[fieldName] = {
|
|
946
|
-
error: `Invalid selector: ${selector}`,
|
|
947
|
-
message: selectorError.message
|
|
948
|
-
};
|
|
949
|
-
}
|
|
950
|
-
}
|
|
951
|
-
|
|
952
|
-
return {
|
|
953
|
-
content: [{
|
|
954
|
-
type: "text",
|
|
955
|
-
text: JSON.stringify({
|
|
956
|
-
data: results,
|
|
957
|
-
selectors_used: selectors,
|
|
958
|
-
elements_found: Object.keys(results).length,
|
|
959
|
-
url: response.url
|
|
960
|
-
}, null, 2)
|
|
961
|
-
}]
|
|
962
|
-
};
|
|
963
|
-
} catch (error) {
|
|
964
|
-
return {
|
|
965
|
-
content: [{
|
|
966
|
-
type: "text",
|
|
967
|
-
text: `Failed to scrape structured data: ${error.message}`
|
|
968
|
-
}],
|
|
969
|
-
isError: true
|
|
970
|
-
};
|
|
971
|
-
}
|
|
972
|
-
}));
|
|
209
|
+
}, withAuth("scrape_structured", scrapeStructuredHandler));
|
|
973
210
|
|
|
974
|
-
// Tool: search_web
|
|
975
|
-
// Tool: search_web - Search the web using Google Search via CrawlForge proxy
|
|
211
|
+
// Tool: search_web
|
|
976
212
|
server.registerTool("search_web", {
|
|
977
213
|
description: "Search the web using Google Search API (proxied through CrawlForge)",
|
|
978
214
|
annotations: { title: "Search the Web", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
@@ -989,34 +225,16 @@ server.registerTool("search_web", {
|
|
|
989
225
|
}, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => {
|
|
990
226
|
try {
|
|
991
227
|
if (!query) {
|
|
992
|
-
return {
|
|
993
|
-
content: [{
|
|
994
|
-
type: "text",
|
|
995
|
-
text: "Query parameter is required"
|
|
996
|
-
}],
|
|
997
|
-
isError: true
|
|
998
|
-
};
|
|
228
|
+
return { content: [{ type: "text", text: "Query parameter is required" }], isError: true };
|
|
999
229
|
}
|
|
1000
|
-
|
|
1001
230
|
const result = await searchWebTool.execute({ query, limit, offset, lang, safe_search, time_range, site, file_type });
|
|
1002
|
-
return {
|
|
1003
|
-
content: [{
|
|
1004
|
-
type: "text",
|
|
1005
|
-
text: JSON.stringify(result, null, 2)
|
|
1006
|
-
}]
|
|
1007
|
-
};
|
|
231
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
1008
232
|
} catch (error) {
|
|
1009
|
-
return {
|
|
1010
|
-
content: [{
|
|
1011
|
-
type: "text",
|
|
1012
|
-
text: `Search failed: ${error.message}`
|
|
1013
|
-
}],
|
|
1014
|
-
isError: true
|
|
1015
|
-
};
|
|
233
|
+
return { content: [{ type: "text", text: `Search failed: ${error.message}` }], isError: true };
|
|
1016
234
|
}
|
|
1017
235
|
}));
|
|
1018
236
|
|
|
1019
|
-
// Tool: crawl_deep
|
|
237
|
+
// Tool: crawl_deep
|
|
1020
238
|
server.registerTool("crawl_deep", {
|
|
1021
239
|
description: "Crawl websites deeply using breadth-first search",
|
|
1022
240
|
annotations: { title: "Deep Crawl", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
@@ -1034,34 +252,16 @@ server.registerTool("crawl_deep", {
|
|
|
1034
252
|
}, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency }) => {
|
|
1035
253
|
try {
|
|
1036
254
|
if (!url) {
|
|
1037
|
-
return {
|
|
1038
|
-
content: [{
|
|
1039
|
-
type: "text",
|
|
1040
|
-
text: "URL parameter is required"
|
|
1041
|
-
}],
|
|
1042
|
-
isError: true
|
|
1043
|
-
};
|
|
255
|
+
return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
|
|
1044
256
|
}
|
|
1045
|
-
|
|
1046
257
|
const result = await crawlDeepTool.execute({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency });
|
|
1047
|
-
return {
|
|
1048
|
-
content: [{
|
|
1049
|
-
type: "text",
|
|
1050
|
-
text: JSON.stringify(result, null, 2)
|
|
1051
|
-
}]
|
|
1052
|
-
};
|
|
258
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
1053
259
|
} catch (error) {
|
|
1054
|
-
return {
|
|
1055
|
-
content: [{
|
|
1056
|
-
type: "text",
|
|
1057
|
-
text: `Crawl failed: ${error.message}`
|
|
1058
|
-
}],
|
|
1059
|
-
isError: true
|
|
1060
|
-
};
|
|
260
|
+
return { content: [{ type: "text", text: `Crawl failed: ${error.message}` }], isError: true };
|
|
1061
261
|
}
|
|
1062
262
|
}));
|
|
1063
263
|
|
|
1064
|
-
// Tool: map_site
|
|
264
|
+
// Tool: map_site
|
|
1065
265
|
server.registerTool("map_site", {
|
|
1066
266
|
description: "Discover and map website structure",
|
|
1067
267
|
annotations: { title: "Map Website", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
@@ -1075,36 +275,16 @@ server.registerTool("map_site", {
|
|
|
1075
275
|
}, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata }) => {
|
|
1076
276
|
try {
|
|
1077
277
|
if (!url) {
|
|
1078
|
-
return {
|
|
1079
|
-
content: [{
|
|
1080
|
-
type: "text",
|
|
1081
|
-
text: "URL parameter is required"
|
|
1082
|
-
}],
|
|
1083
|
-
isError: true
|
|
1084
|
-
};
|
|
278
|
+
return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
|
|
1085
279
|
}
|
|
1086
|
-
|
|
1087
280
|
const result = await mapSiteTool.execute({ url, include_sitemap, max_urls, group_by_path, include_metadata });
|
|
1088
|
-
return {
|
|
1089
|
-
content: [{
|
|
1090
|
-
type: "text",
|
|
1091
|
-
text: JSON.stringify(result, null, 2)
|
|
1092
|
-
}]
|
|
1093
|
-
};
|
|
281
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
1094
282
|
} catch (error) {
|
|
1095
|
-
return {
|
|
1096
|
-
content: [{
|
|
1097
|
-
type: "text",
|
|
1098
|
-
text: `Site mapping failed: ${error.message}`
|
|
1099
|
-
}],
|
|
1100
|
-
isError: true
|
|
1101
|
-
};
|
|
283
|
+
return { content: [{ type: "text", text: `Site mapping failed: ${error.message}` }], isError: true };
|
|
1102
284
|
}
|
|
1103
285
|
}));
|
|
1104
286
|
|
|
1105
|
-
//
|
|
1106
|
-
|
|
1107
|
-
// Tool: extract_content - Enhanced content extraction with readability detection
|
|
287
|
+
// Tool: extract_content
|
|
1108
288
|
server.registerTool("extract_content", {
|
|
1109
289
|
description: "Extract and analyze main content from web pages with enhanced readability detection",
|
|
1110
290
|
annotations: { title: "Extract Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
@@ -1115,34 +295,16 @@ server.registerTool("extract_content", {
|
|
|
1115
295
|
}, withAuth("extract_content", async ({ url, options }) => {
|
|
1116
296
|
try {
|
|
1117
297
|
if (!url) {
|
|
1118
|
-
return {
|
|
1119
|
-
content: [{
|
|
1120
|
-
type: "text",
|
|
1121
|
-
text: "URL parameter is required"
|
|
1122
|
-
}],
|
|
1123
|
-
isError: true
|
|
1124
|
-
};
|
|
298
|
+
return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
|
|
1125
299
|
}
|
|
1126
|
-
|
|
1127
300
|
const result = await extractContentTool.execute({ url, options });
|
|
1128
|
-
return {
|
|
1129
|
-
content: [{
|
|
1130
|
-
type: "text",
|
|
1131
|
-
text: JSON.stringify(result, null, 2)
|
|
1132
|
-
}]
|
|
1133
|
-
};
|
|
301
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
1134
302
|
} catch (error) {
|
|
1135
|
-
return {
|
|
1136
|
-
content: [{
|
|
1137
|
-
type: "text",
|
|
1138
|
-
text: `Content extraction failed: ${error.message}`
|
|
1139
|
-
}],
|
|
1140
|
-
isError: true
|
|
1141
|
-
};
|
|
303
|
+
return { content: [{ type: "text", text: `Content extraction failed: ${error.message}` }], isError: true };
|
|
1142
304
|
}
|
|
1143
305
|
}));
|
|
1144
306
|
|
|
1145
|
-
// Tool: process_document
|
|
307
|
+
// Tool: process_document
|
|
1146
308
|
server.registerTool("process_document", {
|
|
1147
309
|
description: "Process documents from multiple sources and formats including PDFs and web pages",
|
|
1148
310
|
annotations: { title: "Process Document", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
@@ -1154,34 +316,16 @@ server.registerTool("process_document", {
|
|
|
1154
316
|
}, withAuth("process_document", async ({ source, sourceType, options }) => {
|
|
1155
317
|
try {
|
|
1156
318
|
if (!source) {
|
|
1157
|
-
return {
|
|
1158
|
-
content: [{
|
|
1159
|
-
type: "text",
|
|
1160
|
-
text: "Source parameter is required"
|
|
1161
|
-
}],
|
|
1162
|
-
isError: true
|
|
1163
|
-
};
|
|
319
|
+
return { content: [{ type: "text", text: "Source parameter is required" }], isError: true };
|
|
1164
320
|
}
|
|
1165
|
-
|
|
1166
321
|
const result = await processDocumentTool.execute({ source, sourceType, options });
|
|
1167
|
-
return {
|
|
1168
|
-
content: [{
|
|
1169
|
-
type: "text",
|
|
1170
|
-
text: JSON.stringify(result, null, 2)
|
|
1171
|
-
}]
|
|
1172
|
-
};
|
|
322
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
1173
323
|
} catch (error) {
|
|
1174
|
-
return {
|
|
1175
|
-
content: [{
|
|
1176
|
-
type: "text",
|
|
1177
|
-
text: `Document processing failed: ${error.message}`
|
|
1178
|
-
}],
|
|
1179
|
-
isError: true
|
|
1180
|
-
};
|
|
324
|
+
return { content: [{ type: "text", text: `Document processing failed: ${error.message}` }], isError: true };
|
|
1181
325
|
}
|
|
1182
326
|
}));
|
|
1183
327
|
|
|
1184
|
-
// Tool: summarize_content
|
|
328
|
+
// Tool: summarize_content
|
|
1185
329
|
server.registerTool("summarize_content", {
|
|
1186
330
|
description: "Generate intelligent summaries of text content with configurable options",
|
|
1187
331
|
annotations: { title: "Summarize Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
|
|
@@ -1192,34 +336,16 @@ server.registerTool("summarize_content", {
|
|
|
1192
336
|
}, withAuth("summarize_content", async ({ text, options }) => {
|
|
1193
337
|
try {
|
|
1194
338
|
if (!text) {
|
|
1195
|
-
return {
|
|
1196
|
-
content: [{
|
|
1197
|
-
type: "text",
|
|
1198
|
-
text: "Text parameter is required"
|
|
1199
|
-
}],
|
|
1200
|
-
isError: true
|
|
1201
|
-
};
|
|
339
|
+
return { content: [{ type: "text", text: "Text parameter is required" }], isError: true };
|
|
1202
340
|
}
|
|
1203
|
-
|
|
1204
341
|
const result = await summarizeContentTool.execute({ text, options });
|
|
1205
|
-
return {
|
|
1206
|
-
content: [{
|
|
1207
|
-
type: "text",
|
|
1208
|
-
text: JSON.stringify(result, null, 2)
|
|
1209
|
-
}]
|
|
1210
|
-
};
|
|
342
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
1211
343
|
} catch (error) {
|
|
1212
|
-
return {
|
|
1213
|
-
content: [{
|
|
1214
|
-
type: "text",
|
|
1215
|
-
text: `Content summarization failed: ${error.message}`
|
|
1216
|
-
}],
|
|
1217
|
-
isError: true
|
|
1218
|
-
};
|
|
344
|
+
return { content: [{ type: "text", text: `Content summarization failed: ${error.message}` }], isError: true };
|
|
1219
345
|
}
|
|
1220
346
|
}));
|
|
1221
347
|
|
|
1222
|
-
// Tool: analyze_content
|
|
348
|
+
// Tool: analyze_content
|
|
1223
349
|
server.registerTool("analyze_content", {
|
|
1224
350
|
description: "Perform comprehensive content analysis including language detection and topic extraction",
|
|
1225
351
|
annotations: { title: "Analyze Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
|
|
@@ -1230,38 +356,16 @@ server.registerTool("analyze_content", {
|
|
|
1230
356
|
}, withAuth("analyze_content", async ({ text, options }) => {
|
|
1231
357
|
try {
|
|
1232
358
|
if (!text) {
|
|
1233
|
-
return {
|
|
1234
|
-
content: [{
|
|
1235
|
-
type: "text",
|
|
1236
|
-
text: "Text parameter is required"
|
|
1237
|
-
}],
|
|
1238
|
-
isError: true
|
|
1239
|
-
};
|
|
359
|
+
return { content: [{ type: "text", text: "Text parameter is required" }], isError: true };
|
|
1240
360
|
}
|
|
1241
|
-
|
|
1242
361
|
const result = await analyzeContentTool.execute({ text, options });
|
|
1243
|
-
return {
|
|
1244
|
-
content: [{
|
|
1245
|
-
type: "text",
|
|
1246
|
-
text: JSON.stringify(result, null, 2)
|
|
1247
|
-
}]
|
|
1248
|
-
};
|
|
362
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
1249
363
|
} catch (error) {
|
|
1250
|
-
return {
|
|
1251
|
-
content: [{
|
|
1252
|
-
type: "text",
|
|
1253
|
-
text: `Content analysis failed: ${error.message}`
|
|
1254
|
-
}],
|
|
1255
|
-
isError: true
|
|
1256
|
-
};
|
|
364
|
+
return { content: [{ type: "text", text: `Content analysis failed: ${error.message}` }], isError: true };
|
|
1257
365
|
}
|
|
1258
366
|
}));
|
|
1259
367
|
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
// Phase 1: LLM-Powered Structured Extraction
|
|
1263
|
-
|
|
1264
|
-
// Tool: extract_structured - Extract structured data from a URL using LLM and JSON Schema
|
|
368
|
+
// Tool: extract_structured
|
|
1265
369
|
server.registerTool("extract_structured", {
|
|
1266
370
|
description: "Extract structured data from a webpage using LLM-powered analysis and a JSON Schema. Falls back to CSS selector extraction when no LLM provider is configured.",
|
|
1267
371
|
annotations: { title: "Extract Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
@@ -1282,35 +386,36 @@ server.registerTool("extract_structured", {
|
|
|
1282
386
|
}
|
|
1283
387
|
}, withAuth("extract_structured", async ({ url, schema, prompt, llmConfig, fallbackToSelectors, selectorHints }) => {
|
|
1284
388
|
try {
|
|
1285
|
-
const result = await extractStructuredTool.execute({
|
|
1286
|
-
|
|
1287
|
-
schema,
|
|
1288
|
-
prompt,
|
|
1289
|
-
llmConfig,
|
|
1290
|
-
fallbackToSelectors,
|
|
1291
|
-
selectorHints
|
|
1292
|
-
});
|
|
1293
|
-
return {
|
|
1294
|
-
content: [{
|
|
1295
|
-
type: "text",
|
|
1296
|
-
text: JSON.stringify(result, null, 2)
|
|
1297
|
-
}]
|
|
1298
|
-
};
|
|
389
|
+
const result = await extractStructuredTool.execute({ url, schema, prompt, llmConfig, fallbackToSelectors, selectorHints });
|
|
390
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
1299
391
|
} catch (error) {
|
|
1300
|
-
return {
|
|
1301
|
-
content: [{
|
|
1302
|
-
type: "text",
|
|
1303
|
-
text: `Structured extraction failed: ${error.message}`
|
|
1304
|
-
}],
|
|
1305
|
-
isError: true
|
|
1306
|
-
};
|
|
392
|
+
return { content: [{ type: "text", text: `Structured extraction failed: ${error.message}` }], isError: true };
|
|
1307
393
|
}
|
|
1308
394
|
}));
|
|
1309
395
|
|
|
396
|
+
// Tool: extract_with_llm
|
|
397
|
+
server.registerTool("extract_with_llm", {
|
|
398
|
+
description: "Extract structured data from a URL or text using a natural-language prompt, powered by OpenAI or Anthropic. Requires OPENAI_API_KEY or ANTHROPIC_API_KEY in the environment.",
|
|
399
|
+
annotations: { title: "Extract With LLM", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
400
|
+
inputSchema: {
|
|
401
|
+
url: z.string().url().optional().describe("URL to fetch and extract from (one of url/content required)"),
|
|
402
|
+
content: z.string().optional().describe("Pre-fetched text to extract from (one of url/content required)"),
|
|
403
|
+
prompt: z.string().describe("Natural-language extraction instruction"),
|
|
404
|
+
schema: z.record(z.unknown()).optional().describe("Optional JSON-schema-like hint for output shape"),
|
|
405
|
+
provider: z.enum(["openai", "anthropic", "auto"]).optional().default("auto").describe("LLM provider"),
|
|
406
|
+
model: z.string().optional().describe("Override default model"),
|
|
407
|
+
maxTokens: z.number().optional().default(4096).describe("Maximum output tokens")
|
|
408
|
+
}
|
|
409
|
+
}, withAuth("extract_with_llm", async (params) => {
|
|
410
|
+
try {
|
|
411
|
+
const result = await extractWithLlmTool.execute(params);
|
|
412
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
413
|
+
} catch (error) {
|
|
414
|
+
return { content: [{ type: "text", text: `LLM extraction failed: ${error.message}` }], isError: true };
|
|
415
|
+
}
|
|
416
|
+
}));
|
|
1310
417
|
|
|
1311
|
-
//
|
|
1312
|
-
|
|
1313
|
-
// Tool: batch_scrape - Process multiple URLs simultaneously with job management
|
|
418
|
+
// Tool: batch_scrape
|
|
1314
419
|
server.registerTool("batch_scrape", {
|
|
1315
420
|
description: "Process multiple URLs simultaneously with support for async job management and webhook notifications",
|
|
1316
421
|
annotations: { title: "Batch Scrape", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
@@ -1349,24 +454,13 @@ server.registerTool("batch_scrape", {
|
|
|
1349
454
|
}, withAuth("batch_scrape", async (params) => {
|
|
1350
455
|
try {
|
|
1351
456
|
const result = await batchScrapeTool.execute(params);
|
|
1352
|
-
return {
|
|
1353
|
-
content: [{
|
|
1354
|
-
type: "text",
|
|
1355
|
-
text: JSON.stringify(result, null, 2)
|
|
1356
|
-
}]
|
|
1357
|
-
};
|
|
457
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
1358
458
|
} catch (error) {
|
|
1359
|
-
return {
|
|
1360
|
-
content: [{
|
|
1361
|
-
type: "text",
|
|
1362
|
-
text: `Batch scrape failed: ${error.message}`
|
|
1363
|
-
}],
|
|
1364
|
-
isError: true
|
|
1365
|
-
};
|
|
459
|
+
return { content: [{ type: "text", text: `Batch scrape failed: ${error.message}` }], isError: true };
|
|
1366
460
|
}
|
|
1367
461
|
}));
|
|
1368
462
|
|
|
1369
|
-
// Tool: scrape_with_actions
|
|
463
|
+
// Tool: scrape_with_actions
|
|
1370
464
|
server.registerTool("scrape_with_actions", {
|
|
1371
465
|
description: "Execute browser action chains before scraping content, with form auto-fill and intermediate state capture",
|
|
1372
466
|
annotations: { title: "Scrape with Browser Actions", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
@@ -1416,24 +510,13 @@ server.registerTool("scrape_with_actions", {
|
|
|
1416
510
|
}, withAuth("scrape_with_actions", async (params) => {
|
|
1417
511
|
try {
|
|
1418
512
|
const result = await scrapeWithActionsTool.execute(params);
|
|
1419
|
-
return {
|
|
1420
|
-
content: [{
|
|
1421
|
-
type: "text",
|
|
1422
|
-
text: JSON.stringify(result, null, 2)
|
|
1423
|
-
}]
|
|
1424
|
-
};
|
|
513
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
1425
514
|
} catch (error) {
|
|
1426
|
-
return {
|
|
1427
|
-
content: [{
|
|
1428
|
-
type: "text",
|
|
1429
|
-
text: `Scrape with actions failed: ${error.message}`
|
|
1430
|
-
}],
|
|
1431
|
-
isError: true
|
|
1432
|
-
};
|
|
515
|
+
return { content: [{ type: "text", text: `Scrape with actions failed: ${error.message}` }], isError: true };
|
|
1433
516
|
}
|
|
1434
517
|
}));
|
|
1435
518
|
|
|
1436
|
-
// Tool: deep_research
|
|
519
|
+
// Tool: deep_research
|
|
1437
520
|
server.registerTool("deep_research", {
|
|
1438
521
|
description: "Conduct comprehensive multi-stage research with intelligent query expansion, source verification, and conflict detection",
|
|
1439
522
|
annotations: { title: "Deep Research", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
@@ -1483,42 +566,22 @@ server.registerTool("deep_research", {
|
|
|
1483
566
|
}, withAuth("deep_research", async (params) => {
|
|
1484
567
|
try {
|
|
1485
568
|
const result = await deepResearchTool.execute(params);
|
|
1486
|
-
return {
|
|
1487
|
-
content: [{
|
|
1488
|
-
type: "text",
|
|
1489
|
-
text: JSON.stringify(result, null, 2)
|
|
1490
|
-
}]
|
|
1491
|
-
};
|
|
569
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
1492
570
|
} catch (error) {
|
|
1493
|
-
return {
|
|
1494
|
-
content: [{
|
|
1495
|
-
type: "text",
|
|
1496
|
-
text: `Deep research failed: ${error.message}`
|
|
1497
|
-
}],
|
|
1498
|
-
isError: true
|
|
1499
|
-
};
|
|
571
|
+
return { content: [{ type: "text", text: `Deep research failed: ${error.message}` }], isError: true };
|
|
1500
572
|
}
|
|
1501
573
|
}));
|
|
1502
574
|
|
|
1503
|
-
// Tool: track_changes
|
|
575
|
+
// Tool: track_changes
|
|
1504
576
|
server.registerTool("track_changes", {
|
|
1505
577
|
description: "Enhanced content change tracking with baseline capture, comparison, scheduled monitoring, advanced comparison engine, alert system, and historical analysis",
|
|
1506
578
|
annotations: { title: "Track Changes", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
1507
579
|
inputSchema: {
|
|
1508
580
|
url: z.string().url().describe("The URL to track changes for"),
|
|
1509
581
|
operation: z.enum([
|
|
1510
|
-
'create_baseline',
|
|
1511
|
-
'
|
|
1512
|
-
'
|
|
1513
|
-
'get_history',
|
|
1514
|
-
'get_stats',
|
|
1515
|
-
'create_scheduled_monitor',
|
|
1516
|
-
'stop_scheduled_monitor',
|
|
1517
|
-
'get_dashboard',
|
|
1518
|
-
'export_history',
|
|
1519
|
-
'create_alert_rule',
|
|
1520
|
-
'generate_trend_report',
|
|
1521
|
-
'get_monitoring_templates'
|
|
582
|
+
'create_baseline', 'compare', 'monitor', 'get_history', 'get_stats',
|
|
583
|
+
'create_scheduled_monitor', 'stop_scheduled_monitor', 'get_dashboard',
|
|
584
|
+
'export_history', 'create_alert_rule', 'generate_trend_report', 'get_monitoring_templates'
|
|
1522
585
|
]).default('compare').describe("Tracking operation to perform"),
|
|
1523
586
|
content: z.string().optional().describe("Content to compare against baseline"),
|
|
1524
587
|
html: z.string().optional().describe("HTML content to compare against baseline"),
|
|
@@ -1580,15 +643,14 @@ server.registerTool("track_changes", {
|
|
|
1580
643
|
username: z.string().optional()
|
|
1581
644
|
}).optional()
|
|
1582
645
|
}).optional().describe("Notification configuration for webhooks and Slack"),
|
|
1583
|
-
// Enhanced Phase 2.4 options
|
|
1584
646
|
scheduledMonitorOptions: z.object({
|
|
1585
|
-
schedule: z.string().optional(),
|
|
1586
|
-
templateId: z.string().optional(),
|
|
647
|
+
schedule: z.string().optional(),
|
|
648
|
+
templateId: z.string().optional(),
|
|
1587
649
|
enabled: z.boolean().default(true)
|
|
1588
650
|
}).optional().describe("Scheduled monitoring options with cron expressions"),
|
|
1589
651
|
alertRuleOptions: z.object({
|
|
1590
652
|
ruleId: z.string().optional(),
|
|
1591
|
-
condition: z.string().optional(),
|
|
653
|
+
condition: z.string().optional(),
|
|
1592
654
|
actions: z.array(z.enum(['webhook', 'email', 'slack'])).optional(),
|
|
1593
655
|
throttle: z.number().min(0).optional(),
|
|
1594
656
|
priority: z.enum(['low', 'medium', 'high']).optional()
|
|
@@ -1609,24 +671,13 @@ server.registerTool("track_changes", {
|
|
|
1609
671
|
}, withAuth("track_changes", async (params) => {
|
|
1610
672
|
try {
|
|
1611
673
|
const result = await trackChangesTool.execute(params);
|
|
1612
|
-
return {
|
|
1613
|
-
content: [{
|
|
1614
|
-
type: "text",
|
|
1615
|
-
text: JSON.stringify(result, null, 2)
|
|
1616
|
-
}]
|
|
1617
|
-
};
|
|
674
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
1618
675
|
} catch (error) {
|
|
1619
|
-
return {
|
|
1620
|
-
content: [{
|
|
1621
|
-
type: "text",
|
|
1622
|
-
text: `Change tracking failed: ${error.message}`
|
|
1623
|
-
}],
|
|
1624
|
-
isError: true
|
|
1625
|
-
};
|
|
676
|
+
return { content: [{ type: "text", text: `Change tracking failed: ${error.message}` }], isError: true };
|
|
1626
677
|
}
|
|
1627
678
|
}));
|
|
1628
679
|
|
|
1629
|
-
// Tool: generate_llms_txt
|
|
680
|
+
// Tool: generate_llms_txt
|
|
1630
681
|
server.registerTool("generate_llms_txt", {
|
|
1631
682
|
description: "Analyze websites and generate standard-compliant LLMs.txt and LLMs-full.txt files defining AI model interaction guidelines",
|
|
1632
683
|
annotations: { title: "Generate llms.txt", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
|
|
@@ -1654,24 +705,13 @@ server.registerTool("generate_llms_txt", {
|
|
|
1654
705
|
}, withAuth("generate_llms_txt", async (params) => {
|
|
1655
706
|
try {
|
|
1656
707
|
const result = await generateLLMsTxtTool.execute(params);
|
|
1657
|
-
return {
|
|
1658
|
-
content: [{
|
|
1659
|
-
type: "text",
|
|
1660
|
-
text: JSON.stringify(result, null, 2)
|
|
1661
|
-
}]
|
|
1662
|
-
};
|
|
708
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
1663
709
|
} catch (error) {
|
|
1664
|
-
return {
|
|
1665
|
-
content: [{
|
|
1666
|
-
type: "text",
|
|
1667
|
-
text: `LLMs.txt generation failed: ${error.message}`
|
|
1668
|
-
}],
|
|
1669
|
-
isError: true
|
|
1670
|
-
};
|
|
710
|
+
return { content: [{ type: "text", text: `LLMs.txt generation failed: ${error.message}` }], isError: true };
|
|
1671
711
|
}
|
|
1672
712
|
}));
|
|
1673
713
|
|
|
1674
|
-
// Tool: stealth_mode
|
|
714
|
+
// Tool: stealth_mode
|
|
1675
715
|
server.registerTool("stealth_mode", {
|
|
1676
716
|
description: "Advanced anti-detection browser management with stealth features, fingerprint randomization, and human behavior simulation",
|
|
1677
717
|
annotations: { title: "Stealth Mode", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
@@ -1721,7 +761,6 @@ server.registerTool("stealth_mode", {
|
|
|
1721
761
|
}, withAuth("stealth_mode", async ({ operation, stealthConfig, contextId, urlToTest }) => {
|
|
1722
762
|
try {
|
|
1723
763
|
let result;
|
|
1724
|
-
|
|
1725
764
|
switch (operation) {
|
|
1726
765
|
case 'configure':
|
|
1727
766
|
if (stealthConfig) {
|
|
@@ -1731,69 +770,42 @@ server.registerTool("stealth_mode", {
|
|
|
1731
770
|
result = { error: 'stealthConfig is required for configure operation' };
|
|
1732
771
|
}
|
|
1733
772
|
break;
|
|
1734
|
-
|
|
1735
773
|
case 'enable':
|
|
1736
774
|
stealthBrowserManager.enableStealthMode(stealthConfig?.level || 'medium');
|
|
1737
775
|
result = { enabled: true, level: stealthConfig?.level || 'medium' };
|
|
1738
776
|
break;
|
|
1739
|
-
|
|
1740
777
|
case 'disable':
|
|
1741
778
|
stealthBrowserManager.disableStealthMode();
|
|
1742
779
|
result = { disabled: true };
|
|
1743
780
|
break;
|
|
1744
|
-
|
|
1745
|
-
case 'create_context':
|
|
781
|
+
case 'create_context': {
|
|
1746
782
|
const contextData = await stealthBrowserManager.createStealthContext(stealthConfig);
|
|
1747
|
-
result = {
|
|
1748
|
-
contextId: contextData.contextId,
|
|
1749
|
-
fingerprint: contextData.fingerprint,
|
|
1750
|
-
created: true
|
|
1751
|
-
};
|
|
783
|
+
result = { contextId: contextData.contextId, fingerprint: contextData.fingerprint, created: true };
|
|
1752
784
|
break;
|
|
1753
|
-
|
|
1754
|
-
case 'create_page':
|
|
1755
|
-
if (!contextId)
|
|
1756
|
-
throw new Error('contextId is required for create_page operation');
|
|
1757
|
-
}
|
|
785
|
+
}
|
|
786
|
+
case 'create_page': {
|
|
787
|
+
if (!contextId) throw new Error('contextId is required for create_page operation');
|
|
1758
788
|
const page = await stealthBrowserManager.createStealthPage(contextId);
|
|
1759
|
-
result = {
|
|
1760
|
-
pageCreated: true,
|
|
1761
|
-
contextId: contextId,
|
|
1762
|
-
url: urlToTest ? await page.goto(urlToTest) : null
|
|
1763
|
-
};
|
|
789
|
+
result = { pageCreated: true, contextId, url: urlToTest ? await page.goto(urlToTest) : null };
|
|
1764
790
|
break;
|
|
1765
|
-
|
|
791
|
+
}
|
|
1766
792
|
case 'get_stats':
|
|
1767
793
|
result = stealthBrowserManager.getStats();
|
|
1768
794
|
break;
|
|
1769
|
-
|
|
1770
795
|
case 'cleanup':
|
|
1771
796
|
await stealthBrowserManager.cleanup();
|
|
1772
797
|
result = { cleaned: true };
|
|
1773
798
|
break;
|
|
1774
|
-
|
|
1775
799
|
default:
|
|
1776
800
|
result = { error: `Unknown operation: ${operation}` };
|
|
1777
801
|
}
|
|
1778
|
-
|
|
1779
|
-
return {
|
|
1780
|
-
content: [{
|
|
1781
|
-
type: "text",
|
|
1782
|
-
text: JSON.stringify(result, null, 2)
|
|
1783
|
-
}]
|
|
1784
|
-
};
|
|
802
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
1785
803
|
} catch (error) {
|
|
1786
|
-
return {
|
|
1787
|
-
content: [{
|
|
1788
|
-
type: "text",
|
|
1789
|
-
text: `Stealth mode operation failed: ${error.message}`
|
|
1790
|
-
}],
|
|
1791
|
-
isError: true
|
|
1792
|
-
};
|
|
804
|
+
return { content: [{ type: "text", text: `Stealth mode operation failed: ${error.message}` }], isError: true };
|
|
1793
805
|
}
|
|
1794
806
|
}));
|
|
1795
807
|
|
|
1796
|
-
// Tool: localization
|
|
808
|
+
// Tool: localization
|
|
1797
809
|
server.registerTool("localization", {
|
|
1798
810
|
description: "Multi-language and geo-location management with country-specific settings, browser locale emulation, timezone spoofing, and geo-blocked content handling",
|
|
1799
811
|
annotations: { title: "Localization", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
|
|
@@ -1854,186 +866,110 @@ server.registerTool("localization", {
|
|
|
1854
866
|
try {
|
|
1855
867
|
const { operation } = params;
|
|
1856
868
|
let result;
|
|
1857
|
-
|
|
1858
869
|
switch (operation) {
|
|
1859
870
|
case 'configure_country':
|
|
1860
|
-
if (!params.countryCode)
|
|
1861
|
-
throw new Error('countryCode is required for configure_country operation');
|
|
1862
|
-
}
|
|
871
|
+
if (!params.countryCode) throw new Error('countryCode is required for configure_country operation');
|
|
1863
872
|
result = await localizationManager.configureCountry(params.countryCode, params);
|
|
1864
873
|
break;
|
|
1865
|
-
|
|
1866
874
|
case 'localize_search':
|
|
1867
|
-
if (!params.searchParams)
|
|
1868
|
-
throw new Error('searchParams is required for localize_search operation');
|
|
1869
|
-
}
|
|
875
|
+
if (!params.searchParams) throw new Error('searchParams is required for localize_search operation');
|
|
1870
876
|
result = await localizationManager.localizeSearchQuery(params.searchParams, params.countryCode);
|
|
1871
877
|
break;
|
|
1872
|
-
|
|
1873
878
|
case 'localize_browser':
|
|
1874
|
-
if (!params.browserOptions)
|
|
1875
|
-
throw new Error('browserOptions is required for localize_browser operation');
|
|
1876
|
-
}
|
|
879
|
+
if (!params.browserOptions) throw new Error('browserOptions is required for localize_browser operation');
|
|
1877
880
|
result = await localizationManager.localizeBrowserContext(params.browserOptions, params.countryCode);
|
|
1878
881
|
break;
|
|
1879
|
-
|
|
1880
882
|
case 'generate_timezone_spoof':
|
|
1881
883
|
result = {
|
|
1882
884
|
timezoneScript: await localizationManager.generateTimezoneSpoof(params.countryCode),
|
|
1883
885
|
countryCode: params.countryCode || localizationManager.getCurrentSettings().countryCode
|
|
1884
886
|
};
|
|
1885
887
|
break;
|
|
1886
|
-
|
|
1887
888
|
case 'handle_geo_blocking':
|
|
1888
|
-
if (!params.url || !params.response)
|
|
1889
|
-
throw new Error('url and response are required for handle_geo_blocking operation');
|
|
1890
|
-
}
|
|
889
|
+
if (!params.url || !params.response) throw new Error('url and response are required for handle_geo_blocking operation');
|
|
1891
890
|
result = await localizationManager.handleGeoBlocking(params.url, params.response);
|
|
1892
891
|
break;
|
|
1893
|
-
|
|
1894
892
|
case 'auto_detect':
|
|
1895
|
-
if (!params.content || !params.url)
|
|
1896
|
-
throw new Error('content and url are required for auto_detect operation');
|
|
1897
|
-
}
|
|
893
|
+
if (!params.content || !params.url) throw new Error('content and url are required for auto_detect operation');
|
|
1898
894
|
result = await localizationManager.autoDetectLocalization(params.content, params.url);
|
|
1899
895
|
break;
|
|
1900
|
-
|
|
1901
896
|
case 'get_stats':
|
|
1902
897
|
result = localizationManager.getStats();
|
|
1903
898
|
break;
|
|
1904
|
-
|
|
1905
899
|
case 'get_supported_countries':
|
|
1906
900
|
result = {
|
|
1907
901
|
supportedCountries: localizationManager.getSupportedCountries(),
|
|
1908
902
|
totalCount: localizationManager.getSupportedCountries().length
|
|
1909
903
|
};
|
|
1910
904
|
break;
|
|
1911
|
-
|
|
1912
905
|
default:
|
|
1913
906
|
result = { error: `Unknown operation: ${operation}` };
|
|
1914
907
|
}
|
|
1915
|
-
|
|
1916
|
-
return {
|
|
1917
|
-
content: [{
|
|
1918
|
-
type: "text",
|
|
1919
|
-
text: JSON.stringify(result, null, 2)
|
|
1920
|
-
}]
|
|
1921
|
-
};
|
|
908
|
+
return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
|
|
1922
909
|
} catch (error) {
|
|
1923
|
-
return {
|
|
1924
|
-
content: [{
|
|
1925
|
-
type: "text",
|
|
1926
|
-
text: `Localization operation failed: ${error.message}`
|
|
1927
|
-
}],
|
|
1928
|
-
isError: true
|
|
1929
|
-
};
|
|
910
|
+
return { content: [{ type: "text", text: `Localization operation failed: ${error.message}` }], isError: true };
|
|
1930
911
|
}
|
|
1931
912
|
}));
|
|
1932
913
|
|
|
1933
|
-
//
|
|
914
|
+
// ─── Transport + startup ───────────────────────────────────────────────────────
|
|
915
|
+
|
|
1934
916
|
const useHttp = process.argv.includes('--http') || process.env.MCP_HTTP === 'true';
|
|
917
|
+
const useLegacyHttp = process.argv.includes('--legacy-http') || process.env.CRAWLFORGE_LEGACY_HTTP === 'true';
|
|
1935
918
|
|
|
1936
|
-
// Set up transport and start the server
|
|
1937
919
|
async function runServer() {
|
|
1938
920
|
if (useHttp) {
|
|
1939
921
|
const port = parseInt(process.env.PORT || '3000', 10);
|
|
1940
922
|
|
|
1941
|
-
|
|
1942
|
-
|
|
1943
|
-
|
|
1944
|
-
|
|
1945
|
-
}
|
|
1946
|
-
|
|
1947
|
-
|
|
1948
|
-
|
|
1949
|
-
|
|
1950
|
-
|
|
1951
|
-
|
|
1952
|
-
|
|
1953
|
-
|
|
1954
|
-
|
|
1955
|
-
|
|
1956
|
-
|
|
1957
|
-
res.end();
|
|
1958
|
-
return;
|
|
1959
|
-
}
|
|
1960
|
-
|
|
1961
|
-
// Health check endpoint
|
|
1962
|
-
if (req.url === '/health') {
|
|
1963
|
-
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
1964
|
-
res.end(JSON.stringify({ status: 'ok', version: '3.0' }));
|
|
1965
|
-
return;
|
|
1966
|
-
}
|
|
1967
|
-
|
|
1968
|
-
// MCP server card for Smithery discovery
|
|
1969
|
-
if (req.url === '/.well-known/mcp/server-card.json') {
|
|
1970
|
-
res.writeHead(200, { 'Content-Type': 'application/json' });
|
|
1971
|
-
res.end(JSON.stringify({
|
|
1972
|
-
serverInfo: {
|
|
1973
|
-
name: "crawlforge",
|
|
1974
|
-
version: "3.0.12",
|
|
1975
|
-
description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
|
|
1976
|
-
homepage: "https://www.crawlforge.dev",
|
|
1977
|
-
icon: "https://www.crawlforge.dev/icon.png"
|
|
1978
|
-
},
|
|
1979
|
-
transport: {
|
|
1980
|
-
type: "streamable-http",
|
|
1981
|
-
url: "/mcp"
|
|
1982
|
-
},
|
|
1983
|
-
configSchema: {
|
|
1984
|
-
type: "object",
|
|
1985
|
-
properties: {
|
|
1986
|
-
apiKey: {
|
|
1987
|
-
type: "string",
|
|
1988
|
-
title: "CrawlForge API Key",
|
|
1989
|
-
description: "Your CrawlForge API key. Get one free at https://www.crawlforge.dev/signup (includes 1,000 credits)",
|
|
1990
|
-
"x-from": { header: "x-api-key" }
|
|
1991
|
-
}
|
|
1992
|
-
},
|
|
1993
|
-
required: ["apiKey"]
|
|
1994
|
-
}
|
|
1995
|
-
}));
|
|
1996
|
-
return;
|
|
1997
|
-
}
|
|
1998
|
-
|
|
1999
|
-
// Route /mcp to the transport handler
|
|
2000
|
-
if (req.url === '/mcp' || req.url === '/') {
|
|
2001
|
-
await transport.handleRequest(req, res);
|
|
2002
|
-
return;
|
|
923
|
+
if (useLegacyHttp) {
|
|
924
|
+
// One-release deprecation window for stateless legacy transport.
|
|
925
|
+
console.error('WARNING: --legacy-http is deprecated and will be removed in v3.3.0. Use the default Streamable HTTP transport.');
|
|
926
|
+
await connectHttp(server, AuthManager, logger, port);
|
|
927
|
+
} else {
|
|
928
|
+
// OAuth (opt-in)
|
|
929
|
+
let oauthProvider = null;
|
|
930
|
+
if (process.env.CRAWLFORGE_OAUTH_ENABLED === 'true') {
|
|
931
|
+
const issuer = process.env.CRAWLFORGE_OAUTH_ISSUER || `http://localhost:${port}`;
|
|
932
|
+
const apiKey = AuthManager.getConfig()?.apiKey;
|
|
933
|
+
if (!apiKey) {
|
|
934
|
+
console.error('OAuth enabled but no CrawlForge API key is configured — falling back to static-key auth.');
|
|
935
|
+
} else {
|
|
936
|
+
oauthProvider = createOAuthProvider({ issuer, apiKey, logger });
|
|
937
|
+
console.error(`OAuth 2.1 enabled — discovery at ${issuer}/.well-known/oauth-authorization-server`);
|
|
938
|
+
}
|
|
2003
939
|
}
|
|
2004
940
|
|
|
2005
|
-
|
|
2006
|
-
|
|
2007
|
-
|
|
2008
|
-
|
|
2009
|
-
|
|
2010
|
-
|
|
2011
|
-
|
|
2012
|
-
console.error(`Health check: http://localhost:${port}/health`);
|
|
2013
|
-
});
|
|
941
|
+
await connectStreamableHttp(server, AuthManager, logger, {
|
|
942
|
+
port,
|
|
943
|
+
legacy: false,
|
|
944
|
+
oauth: oauthProvider,
|
|
945
|
+
metrics
|
|
946
|
+
});
|
|
947
|
+
}
|
|
2014
948
|
} else {
|
|
2015
|
-
|
|
2016
|
-
await server.connect(transport);
|
|
2017
|
-
console.error("CrawlForge MCP Server v3.0 running on stdio");
|
|
949
|
+
await connectStdio(server);
|
|
2018
950
|
}
|
|
951
|
+
|
|
2019
952
|
console.error(`Environment: ${config.server.nodeEnv}`);
|
|
2020
|
-
|
|
2021
953
|
console.error("Search enabled: true (via CrawlForge proxy)");
|
|
2022
|
-
|
|
2023
|
-
const baseTools = "fetch_url, extract_text, extract_links, extract_metadata, scrape_structured, crawl_deep, map_site";
|
|
2024
|
-
const searchTool = ", search_web";
|
|
2025
|
-
const phase3Tools = ", extract_content, process_document, summarize_content, analyze_content";
|
|
2026
|
-
const wave2Tools = ", batch_scrape, scrape_with_actions";
|
|
2027
|
-
const researchTools = ", deep_research";
|
|
2028
|
-
const trackingTools = ", track_changes";
|
|
2029
|
-
const llmsTxtTools = ", generate_llms_txt";
|
|
2030
|
-
const wave3Tools = ", stealth_mode, localization";
|
|
2031
|
-
const phase1Tools = ", extract_structured";
|
|
2032
|
-
console.error(`Tools available: ${baseTools}${searchTool}${phase3Tools}${wave2Tools}${researchTools}${trackingTools}${llmsTxtTools}${wave3Tools}${phase1Tools}`);
|
|
2033
954
|
|
|
955
|
+
const allTools = [
|
|
956
|
+
"fetch_url", "extract_text", "extract_links", "extract_metadata", "scrape_structured",
|
|
957
|
+
"search_web", "crawl_deep", "map_site",
|
|
958
|
+
"extract_content", "process_document", "summarize_content", "analyze_content",
|
|
959
|
+
"batch_scrape", "scrape_with_actions",
|
|
960
|
+
"deep_research", "track_changes", "generate_llms_txt",
|
|
961
|
+
"stealth_mode", "localization", "extract_structured", "extract_with_llm"
|
|
962
|
+
];
|
|
963
|
+
console.error(`Tools available: ${allTools.join(', ')}`);
|
|
2034
964
|
|
|
2035
|
-
//
|
|
2036
|
-
|
|
965
|
+
// Start memory monitoring in development
|
|
966
|
+
if (config.server.nodeEnv === "development") {
|
|
967
|
+
memoryMonitor.start();
|
|
968
|
+
console.error("Memory monitoring started");
|
|
969
|
+
}
|
|
970
|
+
}
|
|
971
|
+
|
|
972
|
+
// ─── Graceful shutdown ─────────────────────────────────────────────────────────
|
|
2037
973
|
|
|
2038
974
|
let isShuttingDown = false;
|
|
2039
975
|
|
|
@@ -2042,26 +978,19 @@ async function gracefulShutdown(signal) {
|
|
|
2042
978
|
console.error("Force shutdown...");
|
|
2043
979
|
process.exit(1);
|
|
2044
980
|
}
|
|
2045
|
-
|
|
981
|
+
|
|
2046
982
|
isShuttingDown = true;
|
|
2047
983
|
console.error(`Received ${signal}. Starting graceful shutdown...`);
|
|
2048
|
-
|
|
984
|
+
|
|
2049
985
|
try {
|
|
2050
|
-
// Cleanup tools that have destroy methods
|
|
2051
986
|
const toolsToCleanup = [
|
|
2052
|
-
batchScrapeTool,
|
|
2053
|
-
|
|
2054
|
-
|
|
2055
|
-
trackChangesTool,
|
|
2056
|
-
generateLLMsTxtTool,
|
|
2057
|
-
stealthBrowserManager,
|
|
2058
|
-
localizationManager,
|
|
2059
|
-
extractStructuredTool
|
|
987
|
+
batchScrapeTool, scrapeWithActionsTool, deepResearchTool,
|
|
988
|
+
trackChangesTool, generateLLMsTxtTool, stealthBrowserManager,
|
|
989
|
+
localizationManager, extractStructuredTool
|
|
2060
990
|
].filter(tool => tool && (typeof tool.destroy === 'function' || typeof tool.cleanup === 'function'));
|
|
2061
|
-
|
|
991
|
+
|
|
2062
992
|
console.error(`Cleaning up ${toolsToCleanup.length} tools...`);
|
|
2063
|
-
|
|
2064
|
-
// Cleanup tools with timeout
|
|
993
|
+
|
|
2065
994
|
await Promise.race([
|
|
2066
995
|
Promise.all(toolsToCleanup.map(async (tool) => {
|
|
2067
996
|
try {
|
|
@@ -2075,40 +1004,33 @@ async function gracefulShutdown(signal) {
|
|
|
2075
1004
|
console.error(`Error cleaning up ${tool.constructor.name}:`, error.message);
|
|
2076
1005
|
}
|
|
2077
1006
|
})),
|
|
2078
|
-
new Promise(resolve => setTimeout(resolve, 5000))
|
|
1007
|
+
new Promise(resolve => setTimeout(resolve, 5000))
|
|
2079
1008
|
]);
|
|
2080
|
-
|
|
2081
|
-
// Stop memory monitoring
|
|
1009
|
+
|
|
2082
1010
|
if (memoryMonitor.isMonitoring) {
|
|
2083
1011
|
memoryMonitor.stop();
|
|
2084
1012
|
console.error("Memory monitoring stopped");
|
|
2085
1013
|
}
|
|
2086
1014
|
|
|
2087
|
-
// Force garbage collection if available
|
|
2088
1015
|
if (global.gc) {
|
|
2089
1016
|
console.error("Running final garbage collection...");
|
|
2090
1017
|
global.gc();
|
|
2091
1018
|
}
|
|
2092
|
-
|
|
1019
|
+
|
|
2093
1020
|
console.error("Graceful shutdown completed");
|
|
2094
1021
|
process.exit(0);
|
|
2095
|
-
|
|
2096
1022
|
} catch (error) {
|
|
2097
1023
|
console.error("Error during graceful shutdown:", error);
|
|
2098
1024
|
process.exit(1);
|
|
2099
1025
|
}
|
|
2100
1026
|
}
|
|
2101
1027
|
|
|
2102
|
-
// Register signal handlers
|
|
2103
1028
|
process.on('SIGINT', () => gracefulShutdown('SIGINT'));
|
|
2104
1029
|
process.on('SIGTERM', () => gracefulShutdown('SIGTERM'));
|
|
2105
|
-
|
|
2106
|
-
// Handle uncaught exceptions and unhandled rejections
|
|
2107
1030
|
process.on('uncaughtException', (error) => {
|
|
2108
1031
|
console.error('Uncaught Exception:', error);
|
|
2109
1032
|
gracefulShutdown('uncaughtException');
|
|
2110
1033
|
});
|
|
2111
|
-
|
|
2112
1034
|
process.on('unhandledRejection', (reason, promise) => {
|
|
2113
1035
|
console.error('Unhandled Rejection at:', promise, 'reason:', reason);
|
|
2114
1036
|
gracefulShutdown('unhandledRejection');
|
|
@@ -2119,17 +1041,10 @@ if (config.server.nodeEnv === 'development') {
|
|
|
2119
1041
|
setInterval(() => {
|
|
2120
1042
|
const usage = process.memoryUsage();
|
|
2121
1043
|
const memoryMB = (usage.heapUsed / 1024 / 1024).toFixed(2);
|
|
2122
|
-
if (memoryMB > 200) {
|
|
1044
|
+
if (memoryMB > 200) {
|
|
2123
1045
|
console.error(`Memory usage: ${memoryMB}MB (high usage detected)`);
|
|
2124
1046
|
}
|
|
2125
|
-
}, 60000);
|
|
2126
|
-
}
|
|
2127
|
-
|
|
2128
|
-
// Start memory monitoring in development
|
|
2129
|
-
if (config.server.nodeEnv === "development") {
|
|
2130
|
-
memoryMonitor.start();
|
|
2131
|
-
console.error("Memory monitoring started");
|
|
2132
|
-
}
|
|
1047
|
+
}, 60000);
|
|
2133
1048
|
}
|
|
2134
1049
|
|
|
2135
1050
|
runServer().catch((error) => {
|