crawlforge-mcp-server 3.0.17 → 3.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/CLAUDE.md +2 -0
  2. package/README.md +1 -0
  3. package/package.json +6 -2
  4. package/server.js +192 -1277
  5. package/src/constants/config.js +2 -1
  6. package/src/core/ActionExecutor.js +2 -43
  7. package/src/core/AuthManager.js +230 -32
  8. package/src/core/BrowserContextPool.js +187 -0
  9. package/src/core/JobManager.js +7 -5
  10. package/src/core/LocalizationManager.js +14 -125
  11. package/src/core/ResearchOrchestrator.js +86 -5
  12. package/src/core/StealthBrowserManager.js +26 -18
  13. package/src/core/cache/CacheManager.js +4 -1
  14. package/src/core/crawlers/BFSCrawler.js +19 -5
  15. package/src/core/endpointGuard.js +37 -0
  16. package/src/observability/metrics.js +137 -0
  17. package/src/observability/tracing.js +74 -0
  18. package/src/server/auth/oauth.js +388 -0
  19. package/src/server/registerTool.js +41 -0
  20. package/src/server/schemas/common.js +29 -0
  21. package/src/server/transports/http.js +22 -0
  22. package/src/server/transports/stdio.js +16 -0
  23. package/src/server/transports/streamableHttp.js +226 -0
  24. package/src/server/withAuth.js +121 -0
  25. package/src/tools/advanced/BatchScrapeTool.js +12 -1086
  26. package/src/tools/advanced/ScrapeWithActionsTool.js +105 -19
  27. package/src/tools/advanced/batchScrape/index.js +328 -0
  28. package/src/tools/advanced/batchScrape/queue.js +91 -0
  29. package/src/tools/advanced/batchScrape/reporter.js +26 -0
  30. package/src/tools/advanced/batchScrape/schema.js +37 -0
  31. package/src/tools/advanced/batchScrape/worker.js +179 -0
  32. package/src/tools/advanced/scrapeWithActions/recorder.js +188 -0
  33. package/src/tools/basic/_fetch.js +35 -0
  34. package/src/tools/basic/extractLinks.js +74 -0
  35. package/src/tools/basic/extractMetadata.js +74 -0
  36. package/src/tools/basic/extractText.js +46 -0
  37. package/src/tools/basic/fetchUrl.js +44 -0
  38. package/src/tools/basic/scrapeStructured.js +58 -0
  39. package/src/tools/crawl/_sessionContext.js +234 -0
  40. package/src/tools/crawl/crawlDeep.js +55 -5
  41. package/src/tools/crawl/mapSite.js +23 -2
  42. package/src/tools/extract/_fetchAndParse.js +57 -0
  43. package/src/tools/extract/extractStructured.js +3 -19
  44. package/src/tools/extract/extractWithLlm.js +295 -0
  45. package/src/tools/research/deepResearch.js +33 -8
  46. package/src/tools/search/providers/searxng.js +126 -0
  47. package/src/tools/search/ranking/ResultDeduplicator.js +18 -11
  48. package/src/tools/search/ranking/ResultRanker.js +17 -10
  49. package/src/tools/search/ranking/SearchResultCache.js +52 -0
  50. package/src/tools/search/searchWeb.js +112 -6
  51. package/src/tools/tracking/trackChanges/differ.js +98 -0
  52. package/src/tools/tracking/trackChanges/index.js +432 -0
  53. package/src/tools/tracking/trackChanges/monitor.js +93 -0
  54. package/src/tools/tracking/trackChanges/notifier.js +105 -0
  55. package/src/tools/tracking/trackChanges/schema.js +127 -0
  56. package/src/tools/tracking/trackChanges.js +12 -1374
package/server.js CHANGED
@@ -6,12 +6,8 @@ export { isCreatorModeVerified } from './src/core/creatorMode.js';
6
6
 
7
7
  // Import everything else
8
8
  import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
9
- import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
10
- import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
11
- import { createServer } from "node:http";
12
- import { randomUUID } from "node:crypto";
13
9
  import { z } from "zod";
14
- import { load } from "cheerio";
10
+ import { logger } from "./src/utils/Logger.js";
15
11
  import { SearchWebTool } from "./src/tools/search/searchWeb.js";
16
12
  import { CrawlDeepTool } from "./src/tools/crawl/crawlDeep.js";
17
13
  import { MapSiteTool } from "./src/tools/crawl/mapSite.js";
@@ -19,24 +15,33 @@ import { ExtractContentTool } from "./src/tools/extract/extractContent.js";
19
15
  import { ProcessDocumentTool } from "./src/tools/extract/processDocument.js";
20
16
  import { SummarizeContentTool } from "./src/tools/extract/summarizeContent.js";
21
17
  import { AnalyzeContentTool } from "./src/tools/extract/analyzeContent.js";
22
- // Phase 1: LLM-Powered Structured Extraction
23
18
  import { ExtractStructuredTool } from "./src/tools/extract/extractStructured.js";
24
- // Wave 2 Advanced Tools
19
+ import { ExtractWithLlm } from "./src/tools/extract/extractWithLlm.js";
25
20
  import { BatchScrapeTool } from "./src/tools/advanced/BatchScrapeTool.js";
26
21
  import { ScrapeWithActionsTool } from "./src/tools/advanced/ScrapeWithActionsTool.js";
27
- // Deep Research Tool
28
22
  import { DeepResearchTool } from "./src/tools/research/deepResearch.js";
29
- // Change Tracking Tool
30
- import { TrackChangesTool } from "./src/tools/tracking/trackChanges.js";
31
- // LLMs.txt Generator Tool (Phase 2.5)
23
+ import { TrackChangesTool } from "./src/tools/tracking/trackChanges/index.js";
32
24
  import { GenerateLLMsTxtTool } from "./src/tools/llmstxt/generateLLMsTxt.js";
33
- // Wave 3-4 Core Managers
34
25
  import { StealthBrowserManager } from "./src/core/StealthBrowserManager.js";
35
26
  import { LocalizationManager } from "./src/core/LocalizationManager.js";
36
27
  import { memoryMonitor } from "./src/utils/MemoryMonitor.js";
37
28
  import { config, validateConfig, getToolConfig } from "./src/constants/config.js";
38
- // Authentication Manager
39
29
  import AuthManager from "./src/core/AuthManager.js";
30
+ import { makeWithAuth } from "./src/server/withAuth.js";
31
+ // Transport helpers
32
+ import { connectStdio } from "./src/server/transports/stdio.js";
33
+ import { connectHttp } from "./src/server/transports/http.js";
34
+ import { connectStreamableHttp } from "./src/server/transports/streamableHttp.js";
35
+ // OAuth 2.1 (HTTP transport only — opt-in via CRAWLFORGE_OAUTH_ENABLED=true)
36
+ import { createOAuthProvider } from "./src/server/auth/oauth.js";
37
+ // Observability (no-op by default — enable via CRAWLFORGE_METRICS / OTEL_SDK_DISABLED)
38
+ import { createMetricsRegistry } from "./src/observability/metrics.js";
39
+ // Basic tool handlers (extracted from server.js)
40
+ import { fetchUrlHandler } from "./src/tools/basic/fetchUrl.js";
41
+ import { extractTextHandler } from "./src/tools/basic/extractText.js";
42
+ import { extractLinksHandler } from "./src/tools/basic/extractLinks.js";
43
+ import { extractMetadataHandler } from "./src/tools/basic/extractMetadata.js";
44
+ import { scrapeStructuredHandler } from "./src/tools/basic/scrapeStructured.js";
40
45
 
41
46
  // Initialize Authentication Manager
42
47
  await AuthManager.initialize();
@@ -84,7 +89,7 @@ if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
84
89
  // Create the server
85
90
  const server = new McpServer({
86
91
  name: "crawlforge",
87
- version: "3.0.12",
92
+ version: "3.2.0",
88
93
  description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
89
94
  homepage: "https://www.crawlforge.dev",
90
95
  icon: "https://www.crawlforge.dev/icon.png"
@@ -99,7 +104,7 @@ server.prompt("getting-started", {
99
104
  role: "user",
100
105
  content: {
101
106
  type: "text",
102
- text: "You have access to CrawlForge MCP with 20 web scraping tools. Key tools:\n\n" +
107
+ text: "You have access to CrawlForge MCP with 21 web scraping tools. Key tools:\n\n" +
103
108
  "- fetch_url: Fetch raw HTML/content from any URL\n" +
104
109
  "- extract_text: Extract clean text from a webpage\n" +
105
110
  "- extract_content: Smart content extraction with readability\n" +
@@ -111,6 +116,7 @@ server.prompt("getting-started", {
111
116
  "- deep_research: Multi-source research on any topic\n" +
112
117
  "- stealth_mode: Anti-detection browsing for protected sites\n" +
113
118
  "- extract_structured: LLM-powered structured data extraction\n" +
119
+ "- extract_with_llm: Natural-language extraction via OpenAI/Anthropic\n" +
114
120
  "- track_changes: Monitor website changes over time\n" +
115
121
  "- generate_llms_txt: Generate llms.txt for any website\n\n" +
116
122
  "Workflow: search_web -> fetch_url -> extract_content -> analyze_content\n\n" +
@@ -120,528 +126,37 @@ server.prompt("getting-started", {
120
126
  };
121
127
  });
122
128
 
123
- // Helper function to wrap tool handlers with authentication and credit tracking
124
- function withAuth(toolName, handler) {
125
- return async (params) => {
126
- const startTime = Date.now();
127
-
128
- try {
129
- // Skip credit checks in creator mode
130
- if (!AuthManager.isCreatorMode()) {
131
- // Check credits before executing
132
- const creditCost = AuthManager.getToolCost(toolName);
133
- const hasCredits = await AuthManager.checkCredits(creditCost);
134
-
135
- if (!hasCredits) {
136
- return {
137
- content: [{
138
- type: "text",
139
- text: JSON.stringify({
140
- error: "Insufficient credits",
141
- message: `This operation requires ${creditCost} credits. Please upgrade your plan at https://www.crawlforge.dev/pricing`,
142
- creditsRequired: creditCost
143
- }, null, 2)
144
- }]
145
- };
146
- }
147
- }
148
-
149
- // Execute the tool
150
- const result = await handler(params);
151
-
152
- // Report usage for successful execution (skip in creator mode)
153
- const processingTime = Date.now() - startTime;
154
- if (!AuthManager.isCreatorMode()) {
155
- const creditCost = AuthManager.getToolCost(toolName);
156
- await AuthManager.reportUsage(
157
- toolName,
158
- creditCost,
159
- params,
160
- 200,
161
- processingTime
162
- );
163
- }
164
-
165
- return result;
166
- } catch (error) {
167
- // Report usage even for errors (reduced credit cost) - skip in creator mode
168
- const processingTime = Date.now() - startTime;
169
- if (!AuthManager.isCreatorMode()) {
170
- await AuthManager.reportUsage(
171
- toolName,
172
- Math.max(1, Math.floor(AuthManager.getToolCost(toolName) * 0.5)), // Half credits for errors
173
- params,
174
- 500,
175
- processingTime
176
- );
177
- }
178
-
179
- throw error;
180
- }
181
- };
182
- }
129
+ // Observability registry: only emit metrics in HTTP mode when explicitly enabled.
130
+ // Stdio mode stays silent to match MCP host expectations.
131
+ const metricsEnabled =
132
+ (process.argv.includes('--http') || process.env.MCP_HTTP === 'true') &&
133
+ process.env.CRAWLFORGE_METRICS === 'true';
134
+ const metrics = metricsEnabled ? createMetricsRegistry() : null;
183
135
 
184
- // Initialize Search Web Tool - always available with CrawlForge API key
136
+ // Tool-handler wrapper: auth + credit tracking + structured invocation logging + observability.
137
+ const withAuth = makeWithAuth({ authManager: AuthManager, logger, metrics });
138
+
139
+ // Initialize tools
185
140
  const searchWebTool = new SearchWebTool(getToolConfig("search_web"));
186
141
  const crawlDeepTool = new CrawlDeepTool(getToolConfig('crawl_deep'));
187
142
  const mapSiteTool = new MapSiteTool(getToolConfig('map_site'));
188
-
189
- // Initialize Phase 3 tools
190
143
  const extractContentTool = new ExtractContentTool();
191
144
  const processDocumentTool = new ProcessDocumentTool();
192
145
  const summarizeContentTool = new SummarizeContentTool();
193
146
  const analyzeContentTool = new AnalyzeContentTool();
194
-
195
- // Phase 1: LLM-Powered Structured Extraction Tool
196
147
  const extractStructuredTool = new ExtractStructuredTool();
197
-
198
- // Initialize Wave 2 Advanced Tools
148
+ const extractWithLlmTool = new ExtractWithLlm();
199
149
  const batchScrapeTool = new BatchScrapeTool();
200
150
  const scrapeWithActionsTool = new ScrapeWithActionsTool();
201
-
202
- // Initialize Deep Research Tool
203
151
  const deepResearchTool = new DeepResearchTool();
204
-
205
- // Initialize Change Tracking Tool
206
152
  const trackChangesTool = new TrackChangesTool();
207
-
208
- // Initialize LLMs.txt Generator Tool (Phase 2.5)
209
153
  const generateLLMsTxtTool = new GenerateLLMsTxtTool();
210
-
211
- // Initialize Wave 3-4 Core Managers
212
154
  const stealthBrowserManager = new StealthBrowserManager();
213
155
  const localizationManager = new LocalizationManager();
214
156
 
215
- // Zod schemas for tool parameters and responses
216
- const FetchUrlSchema = z.object({
217
- url: z.string().url(),
218
- headers: z.record(z.string()).optional(),
219
- timeout: z.number().min(1000).max(30000).optional().default(10000)
220
- });
221
-
222
- const ExtractTextSchema = z.object({
223
- url: z.string().url(),
224
- remove_scripts: z.boolean().optional().default(true),
225
- remove_styles: z.boolean().optional().default(true)
226
- });
227
-
228
- const ExtractLinksSchema = z.object({
229
- url: z.string().url(),
230
- filter_external: z.boolean().optional().default(false),
231
- base_url: z.string().url().optional()
232
- });
233
-
234
- const ExtractMetadataSchema = z.object({
235
- url: z.string().url()
236
- });
237
-
238
- const ScrapeStructuredSchema = z.object({
239
- url: z.string().url(),
240
- selectors: z.record(z.string())
241
- });
242
-
243
- const SearchWebSchema = z.object({
244
- query: z.string(),
245
- limit: z.number().min(1).max(100).optional(),
246
- offset: z.number().min(0).optional(),
247
- lang: z.string().optional(),
248
- safe_search: z.boolean().optional(),
249
- time_range: z.enum(['day', 'week', 'month', 'year', 'all']).optional(),
250
- site: z.string().optional(),
251
- file_type: z.string().optional()
252
- });
253
-
254
- const CrawlDeepSchema = z.object({
255
- url: z.string().url(),
256
- max_depth: z.number().min(1).max(5).optional(),
257
- max_pages: z.number().min(1).max(1000).optional(),
258
- include_patterns: z.array(z.string()).optional(),
259
- exclude_patterns: z.array(z.string()).optional(),
260
- follow_external: z.boolean().optional(),
261
- respect_robots: z.boolean().optional(),
262
- extract_content: z.boolean().optional(),
263
- concurrency: z.number().min(1).max(20).optional()
264
- });
265
-
266
- const MapSiteSchema = z.object({
267
- url: z.string().url(),
268
- include_sitemap: z.boolean().optional(),
269
- max_urls: z.number().min(1).max(10000).optional(),
270
- group_by_path: z.boolean().optional(),
271
- include_metadata: z.boolean().optional()
272
- });
273
-
274
- const ExtractContentSchema = z.object({
275
- url: z.string().url(),
276
- options: z.object({}).optional()
277
- });
278
-
279
- const ProcessDocumentSchema = z.object({
280
- source: z.string(),
281
- sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional(),
282
- options: z.object({}).optional()
283
- });
284
-
285
- const SummarizeContentSchema = z.object({
286
- text: z.string(),
287
- options: z.object({}).optional()
288
- });
289
-
290
- const AnalyzeContentSchema = z.object({
291
- text: z.string(),
292
- options: z.object({}).optional()
293
- });
294
-
295
- // Wave 2 Advanced Tools Schemas
296
- const BatchScrapeSchema = z.object({
297
- urls: z.array(z.union([
298
- z.string().url(),
299
- z.object({
300
- url: z.string().url(),
301
- selectors: z.record(z.string()).optional(),
302
- headers: z.record(z.string()).optional(),
303
- timeout: z.number().min(1000).max(30000).optional(),
304
- metadata: z.record(z.any()).optional()
305
- })
306
- ])).min(1).max(50),
307
-
308
- formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
309
- mode: z.enum(['sync', 'async']).default('sync'),
310
-
311
- webhook: z.object({
312
- url: z.string().url(),
313
- events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']),
314
- headers: z.record(z.string()).optional(),
315
- signingSecret: z.string().optional()
316
- }).optional(),
317
-
318
- extractionSchema: z.record(z.string()).optional(),
319
- maxConcurrency: z.number().min(1).max(20).default(10),
320
- delayBetweenRequests: z.number().min(0).max(10000).default(100),
321
- includeMetadata: z.boolean().default(true),
322
- includeFailed: z.boolean().default(true),
323
- pageSize: z.number().min(1).max(100).default(25),
324
-
325
- jobOptions: z.object({
326
- priority: z.number().default(0),
327
- ttl: z.number().min(60000).default(24 * 60 * 60 * 1000),
328
- maxRetries: z.number().min(0).max(5).default(1),
329
- tags: z.array(z.string()).default([])
330
- }).optional()
331
- });
332
-
333
- const ScrapeWithActionsSchema = z.object({
334
- url: z.string().url(),
335
- actions: z.array(z.object({
336
- type: z.enum(['wait', 'click', 'type', 'press', 'scroll', 'screenshot', 'executeJavaScript']),
337
- selector: z.string().optional(),
338
- text: z.string().optional(),
339
- key: z.string().optional(),
340
- script: z.string().optional(),
341
- timeout: z.number().optional(),
342
- description: z.string().optional(),
343
- continueOnError: z.boolean().default(false),
344
- retries: z.number().min(0).max(5).default(0)
345
- })).min(1).max(20),
346
-
347
- formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']),
348
- captureIntermediateStates: z.boolean().default(false),
349
- captureScreenshots: z.boolean().default(true),
350
-
351
- formAutoFill: z.object({
352
- fields: z.array(z.object({
353
- selector: z.string(),
354
- value: z.string(),
355
- type: z.enum(['text', 'select', 'checkbox', 'radio', 'file']).default('text'),
356
- waitAfter: z.number().min(0).max(5000).default(100)
357
- })),
358
- submitSelector: z.string().optional(),
359
- waitAfterSubmit: z.number().min(0).max(30000).default(2000)
360
- }).optional(),
361
-
362
- browserOptions: z.object({
363
- headless: z.boolean().default(true),
364
- userAgent: z.string().optional(),
365
- viewportWidth: z.number().min(800).max(1920).default(1280),
366
- viewportHeight: z.number().min(600).max(1080).default(720),
367
- timeout: z.number().min(10000).max(120000).default(30000)
368
- }).optional(),
369
-
370
- extractionOptions: z.object({
371
- selectors: z.record(z.string()).optional(),
372
- includeMetadata: z.boolean().default(true),
373
- includeLinks: z.boolean().default(true),
374
- includeImages: z.boolean().default(true)
375
- }).optional(),
376
-
377
- continueOnActionError: z.boolean().default(false),
378
- maxRetries: z.number().min(0).max(3).default(1),
379
- screenshotOnError: z.boolean().default(true)
380
- });
381
-
382
- // Deep Research Tool Schema
383
- const DeepResearchSchema = z.object({
384
- topic: z.string().min(3).max(500),
385
- maxDepth: z.number().min(1).max(10).optional().default(5),
386
- maxUrls: z.number().min(1).max(1000).optional().default(50),
387
- timeLimit: z.number().min(30000).max(300000).optional().default(120000),
388
- researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad'),
389
- sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']),
390
- credibilityThreshold: z.number().min(0).max(1).optional().default(0.3),
391
- includeRecentOnly: z.boolean().optional().default(false),
392
- enableConflictDetection: z.boolean().optional().default(true),
393
- enableSourceVerification: z.boolean().optional().default(true),
394
- enableSynthesis: z.boolean().optional().default(true),
395
- outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive'),
396
- includeRawData: z.boolean().optional().default(false),
397
- includeActivityLog: z.boolean().optional().default(false),
398
- queryExpansion: z.object({
399
- enableSynonyms: z.boolean().optional().default(true),
400
- enableSpellCheck: z.boolean().optional().default(true),
401
- enableContextual: z.boolean().optional().default(true),
402
- maxVariations: z.number().min(1).max(20).optional().default(8)
403
- }).optional(),
404
- llmConfig: z.object({
405
- provider: z.enum(['auto', 'openai', 'anthropic']).optional().default('auto'),
406
- openai: z.object({
407
- apiKey: z.string().optional(),
408
- model: z.string().optional().default('gpt-3.5-turbo'),
409
- embeddingModel: z.string().optional().default('text-embedding-ada-002')
410
- }).optional(),
411
- anthropic: z.object({
412
- apiKey: z.string().optional(),
413
- model: z.string().optional().default('claude-3-haiku-20240307')
414
- }).optional(),
415
- enableSemanticAnalysis: z.boolean().optional().default(true),
416
- enableIntelligentSynthesis: z.boolean().optional().default(true)
417
- }).optional(),
418
- concurrency: z.number().min(1).max(20).optional().default(5),
419
- cacheResults: z.boolean().optional().default(true),
420
- webhook: z.object({
421
- url: z.string().url(),
422
- events: z.array(z.enum(['started', 'progress', 'completed', 'failed'])).optional().default(['completed']),
423
- headers: z.record(z.string()).optional()
424
- }).optional()
425
- });
426
-
427
- // Change Tracking Tool Schema
428
- const TrackChangesSchema = z.object({
429
- url: z.string().url(),
430
- operation: z.enum(['create_baseline', 'compare', 'monitor', 'get_history', 'get_stats']).default('compare'),
431
- content: z.string().optional(),
432
- html: z.string().optional(),
433
- trackingOptions: z.object({
434
- granularity: z.enum(['page', 'section', 'element', 'text']).default('section'),
435
- trackText: z.boolean().default(true),
436
- trackStructure: z.boolean().default(true),
437
- trackAttributes: z.boolean().default(false),
438
- trackImages: z.boolean().default(false),
439
- trackLinks: z.boolean().default(true),
440
- ignoreWhitespace: z.boolean().default(true),
441
- ignoreCase: z.boolean().default(false),
442
- customSelectors: z.array(z.string()).optional(),
443
- excludeSelectors: z.array(z.string()).optional(),
444
- significanceThresholds: z.object({
445
- minor: z.number().min(0).max(1).default(0.1),
446
- moderate: z.number().min(0).max(1).default(0.3),
447
- major: z.number().min(0).max(1).default(0.7)
448
- }).optional()
449
- }).optional(),
450
- monitoringOptions: z.object({
451
- enabled: z.boolean().default(false),
452
- interval: z.number().min(60000).max(24 * 60 * 60 * 1000).default(300000),
453
- maxRetries: z.number().min(0).max(5).default(3),
454
- retryDelay: z.number().min(1000).max(60000).default(5000),
455
- notificationThreshold: z.enum(['minor', 'moderate', 'major', 'critical']).default('moderate'),
456
- enableWebhook: z.boolean().default(false),
457
- webhookUrl: z.string().url().optional(),
458
- webhookSecret: z.string().optional()
459
- }).optional(),
460
- storageOptions: z.object({
461
- enableSnapshots: z.boolean().default(true),
462
- retainHistory: z.boolean().default(true),
463
- maxHistoryEntries: z.number().min(1).max(1000).default(100),
464
- compressionEnabled: z.boolean().default(true),
465
- deltaStorageEnabled: z.boolean().default(true)
466
- }).optional(),
467
- queryOptions: z.object({
468
- limit: z.number().min(1).max(500).default(50),
469
- offset: z.number().min(0).default(0),
470
- startTime: z.number().optional(),
471
- endTime: z.number().optional(),
472
- includeContent: z.boolean().default(false),
473
- significanceFilter: z.enum(['all', 'minor', 'moderate', 'major', 'critical']).optional()
474
- }).optional(),
475
- notificationOptions: z.object({
476
- webhook: z.object({
477
- enabled: z.boolean().default(false),
478
- url: z.string().url().optional(),
479
- method: z.enum(['POST', 'PUT']).default('POST'),
480
- headers: z.record(z.string()).optional(),
481
- signingSecret: z.string().optional(),
482
- includeContent: z.boolean().default(false)
483
- }).optional(),
484
- slack: z.object({
485
- enabled: z.boolean().default(false),
486
- webhookUrl: z.string().url().optional(),
487
- channel: z.string().optional(),
488
- username: z.string().optional()
489
- }).optional()
490
- }).optional()
491
- });
492
-
493
- // LLMs.txt Generator Tool Schema (Phase 2.5)
494
- const GenerateLLMsTxtSchema = z.object({
495
- url: z.string().url(),
496
- analysisOptions: z.object({
497
- maxDepth: z.number().min(1).max(5).optional().default(3),
498
- maxPages: z.number().min(10).max(500).optional().default(100),
499
- detectAPIs: z.boolean().optional().default(true),
500
- analyzeContent: z.boolean().optional().default(true),
501
- checkSecurity: z.boolean().optional().default(true),
502
- respectRobots: z.boolean().optional().default(true)
503
- }).optional(),
504
- outputOptions: z.object({
505
- includeDetailed: z.boolean().optional().default(true),
506
- includeAnalysis: z.boolean().optional().default(false),
507
- contactEmail: z.string().email().optional(),
508
- organizationName: z.string().optional(),
509
- customGuidelines: z.array(z.string()).optional(),
510
- customRestrictions: z.array(z.string()).optional()
511
- }).optional(),
512
- complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard'),
513
- format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both')
514
- });
515
-
516
- // Stealth Mode Tool Schema (Wave 3)
517
- const StealthModeSchema = z.object({
518
- operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure'),
519
- stealthConfig: z.object({
520
- level: z.enum(['basic', 'medium', 'advanced']).default('medium'),
521
- randomizeFingerprint: z.boolean().default(true),
522
- hideWebDriver: z.boolean().default(true),
523
- blockWebRTC: z.boolean().default(true),
524
- spoofTimezone: z.boolean().default(true),
525
- randomizeHeaders: z.boolean().default(true),
526
- useRandomUserAgent: z.boolean().default(true),
527
- simulateHumanBehavior: z.boolean().default(true),
528
- customUserAgent: z.string().optional(),
529
- customViewport: z.object({
530
- width: z.number().min(800).max(1920),
531
- height: z.number().min(600).max(1080)
532
- }).optional(),
533
- locale: z.string().default('en-US'),
534
- timezone: z.string().optional(),
535
- webRTCPublicIP: z.string().optional(),
536
- webRTCLocalIPs: z.array(z.string()).optional(),
537
- proxyRotation: z.object({
538
- enabled: z.boolean().default(false),
539
- proxies: z.array(z.string()).optional(),
540
- rotationInterval: z.number().default(300000)
541
- }).optional(),
542
- antiDetection: z.object({
543
- cloudflareBypass: z.boolean().default(true),
544
- recaptchaHandling: z.boolean().default(true),
545
- hideAutomation: z.boolean().default(true),
546
- spoofMediaDevices: z.boolean().default(true),
547
- spoofBatteryAPI: z.boolean().default(true)
548
- }).optional(),
549
- fingerprinting: z.object({
550
- canvasNoise: z.boolean().default(true),
551
- webglSpoofing: z.boolean().default(true),
552
- audioContextSpoofing: z.boolean().default(true),
553
- fontSpoofing: z.boolean().default(true),
554
- hardwareSpoofing: z.boolean().default(true)
555
- }).optional()
556
- }).optional(),
557
- contextId: z.string().optional(),
558
- urlToTest: z.string().url().optional()
559
- });
560
-
561
- // Localization Tool Schema (Wave 3)
562
- const LocalizationSchema = z.object({
563
- operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country'),
564
- countryCode: z.string().length(2).optional(),
565
- language: z.string().optional(),
566
- timezone: z.string().optional(),
567
- currency: z.string().length(3).optional(),
568
- customHeaders: z.record(z.string()).optional(),
569
- userAgent: z.string().optional(),
570
- acceptLanguage: z.string().optional(),
571
- geoLocation: z.object({
572
- latitude: z.number().min(-90).max(90),
573
- longitude: z.number().min(-180).max(180),
574
- accuracy: z.number().min(1).max(100).optional()
575
- }).optional(),
576
- proxySettings: z.object({
577
- enabled: z.boolean().default(false),
578
- region: z.string().optional(),
579
- type: z.enum(['http', 'https', 'socks4', 'socks5']).default('https'),
580
- server: z.string().optional(),
581
- port: z.number().optional(),
582
- username: z.string().optional(),
583
- password: z.string().optional(),
584
- rotation: z.object({
585
- enabled: z.boolean().default(false),
586
- interval: z.number().default(300000),
587
- strategy: z.enum(['round-robin', 'random', 'failover']).default('round-robin')
588
- }).optional(),
589
- fallback: z.object({
590
- enabled: z.boolean().default(true),
591
- maxRetries: z.number().default(3),
592
- timeout: z.number().default(10000)
593
- }).optional()
594
- }).optional(),
595
- searchParams: z.object({
596
- query: z.string().optional(),
597
- limit: z.number().optional(),
598
- offset: z.number().optional(),
599
- headers: z.record(z.string()).optional()
600
- }).optional(),
601
- browserOptions: z.object({
602
- locale: z.string().optional(),
603
- timezoneId: z.string().optional(),
604
- extraHTTPHeaders: z.record(z.string()).optional(),
605
- userAgent: z.string().optional()
606
- }).optional(),
607
- content: z.string().optional(),
608
- url: z.string().url().optional(),
609
- response: z.object({
610
- status: z.number(),
611
- body: z.string().optional(),
612
- statusText: z.string().optional()
613
- }).optional()
614
- });
615
-
616
-
617
- // Utility function to fetch URL with error handling
618
- async function fetchWithTimeout(url, options = {}) {
619
- const { timeout = 10000, headers = {} } = options;
620
-
621
- const controller = new AbortController();
622
- const timeoutId = setTimeout(() => controller.abort(), timeout);
623
-
624
- try {
625
- const response = await fetch(url, {
626
- signal: controller.signal,
627
- headers: {
628
- 'User-Agent': 'CrawlForge/1.0.0',
629
- ...headers
630
- }
631
- });
632
-
633
- clearTimeout(timeoutId);
634
- return response;
635
- } catch (error) {
636
- clearTimeout(timeoutId);
637
- if (error.name === 'AbortError') {
638
- throw new Error(`Request timeout after ${timeout}ms`);
639
- }
640
- throw error;
641
- }
642
- }
157
+ // ─── Tool registrations ────────────────────────────────────────────────────────
643
158
 
644
- // Tool: fetch_url - Basic URL fetching with headers and response handling
159
+ // Tool: fetch_url
645
160
  server.registerTool("fetch_url", {
646
161
  description: "Fetch content from a URL with optional headers and timeout",
647
162
  annotations: { title: "Fetch URL", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
@@ -650,45 +165,9 @@ server.registerTool("fetch_url", {
650
165
  headers: z.record(z.string()).optional().describe("Custom HTTP headers to include in the request"),
651
166
  timeout: z.number().min(1000).max(30000).optional().default(10000).describe("Request timeout in milliseconds (1000-30000)")
652
167
  }
653
- }, withAuth("fetch_url", async ({ url, headers, timeout }) => {
654
- try {
655
- const response = await fetchWithTimeout(url, {
656
- timeout: timeout || 10000,
657
- headers: headers || {}
658
- });
659
-
660
- const body = await response.text();
661
- const responseHeaders = {};
662
- response.headers.forEach((value, key) => {
663
- responseHeaders[key] = value;
664
- });
665
-
666
- return {
667
- content: [{
668
- type: "text",
669
- text: JSON.stringify({
670
- status: response.status,
671
- statusText: response.statusText,
672
- headers: responseHeaders,
673
- body: body,
674
- contentType: response.headers.get('content-type') || 'unknown',
675
- size: body.length,
676
- url: response.url
677
- }, null, 2)
678
- }]
679
- };
680
- } catch (error) {
681
- return {
682
- content: [{
683
- type: "text",
684
- text: `Failed to fetch URL: ${error.message}`
685
- }],
686
- isError: true
687
- };
688
- }
689
- }));
168
+ }, withAuth("fetch_url", fetchUrlHandler));
690
169
 
691
- // Tool: extract_text - Extract clean text content from HTML
170
+ // Tool: extract_text
692
171
  server.registerTool("extract_text", {
693
172
  description: "Extract clean text content from a webpage",
694
173
  annotations: { title: "Extract Text", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
@@ -697,53 +176,9 @@ server.registerTool("extract_text", {
697
176
  remove_scripts: z.boolean().optional().default(true).describe("Remove script tags before extraction"),
698
177
  remove_styles: z.boolean().optional().default(true).describe("Remove style tags before extraction")
699
178
  }
700
- }, withAuth("extract_text", async ({ url, remove_scripts, remove_styles }) => {
701
- try {
702
- const response = await fetchWithTimeout(url);
703
- if (!response.ok) {
704
- throw new Error(`HTTP ${response.status}: ${response.statusText}`);
705
- }
706
-
707
- const html = await response.text();
708
- const $ = load(html);
709
-
710
- // Remove unwanted elements
711
- if (remove_scripts !== false) {
712
- $('script').remove();
713
- }
714
- if (remove_styles !== false) {
715
- $('style').remove();
716
- }
717
-
718
- // Remove common non-content elements
719
- $('nav, header, footer, aside, .advertisement, .ad, .sidebar').remove();
720
-
721
- // Extract text content
722
- const text = $('body').text().replace(/\s+/g, ' ').trim();
723
-
724
- return {
725
- content: [{
726
- type: "text",
727
- text: JSON.stringify({
728
- text: text,
729
- word_count: text.split(/\s+/).filter(word => word.length > 0).length,
730
- char_count: text.length,
731
- url: response.url
732
- }, null, 2)
733
- }]
734
- };
735
- } catch (error) {
736
- return {
737
- content: [{
738
- type: "text",
739
- text: `Failed to extract text: ${error.message}`
740
- }],
741
- isError: true
742
- };
743
- }
744
- }));
179
+ }, withAuth("extract_text", extractTextHandler));
745
180
 
746
- // Tool: extract_links - Extract all links from a webpage with optional filtering
181
+ // Tool: extract_links
747
182
  server.registerTool("extract_links", {
748
183
  description: "Extract all links from a webpage with optional filtering",
749
184
  annotations: { title: "Extract Links", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
@@ -752,163 +187,18 @@ server.registerTool("extract_links", {
752
187
  filter_external: z.boolean().optional().default(false).describe("Only return external links"),
753
188
  base_url: z.string().url().optional().describe("Base URL for resolving relative links")
754
189
  }
755
- }, withAuth("extract_links", async ({ url, filter_external, base_url }) => {
756
- try {
757
- const response = await fetchWithTimeout(url);
758
- if (!response.ok) {
759
- throw new Error(`HTTP ${response.status}: ${response.statusText}`);
760
- }
761
-
762
- const html = await response.text();
763
- const $ = load(html);
764
-
765
- const baseUrl = base_url || new URL(url).origin;
766
- const pageUrl = new URL(url);
767
- const links = [];
768
-
769
- $('a[href]').each((_, element) => {
770
- const href = $(element).attr('href');
771
- const text = $(element).text().trim();
772
-
773
- if (!href) return;
774
-
775
- let absoluteUrl;
776
- let isExternal = false;
777
-
778
- try {
779
- if (href.startsWith('http://') || href.startsWith('https://')) {
780
- absoluteUrl = href;
781
- isExternal = new URL(href).origin !== pageUrl.origin;
782
- } else {
783
- absoluteUrl = new URL(href, baseUrl).toString();
784
- isExternal = false;
785
- }
786
-
787
- // Apply filtering
788
- if (filter_external && isExternal) {
789
- return;
790
- }
791
-
792
- links.push({
793
- href: absoluteUrl,
794
- text: text,
795
- is_external: isExternal,
796
- original_href: href
797
- });
798
- } catch (urlError) {
799
- // Skip invalid URLs
800
- }
801
- });
802
-
803
- // Remove duplicates
804
- const uniqueLinks = links.filter((link, index, arr) =>
805
- arr.findIndex(l => l.href === link.href) === index
806
- );
807
-
808
- return {
809
- content: [{
810
- type: "text",
811
- text: JSON.stringify({
812
- links: uniqueLinks,
813
- total_count: uniqueLinks.length,
814
- internal_count: uniqueLinks.filter(l => !l.is_external).length,
815
- external_count: uniqueLinks.filter(l => l.is_external).length,
816
- base_url: baseUrl
817
- }, null, 2)
818
- }]
819
- };
820
- } catch (error) {
821
- return {
822
- content: [{
823
- type: "text",
824
- text: `Failed to extract links: ${error.message}`
825
- }],
826
- isError: true
827
- };
828
- }
829
- }));
190
+ }, withAuth("extract_links", extractLinksHandler));
830
191
 
831
- // Tool: extract_metadata - Extract page metadata
192
+ // Tool: extract_metadata
832
193
  server.registerTool("extract_metadata", {
833
194
  description: "Extract metadata from a webpage (title, description, keywords, etc.)",
834
195
  annotations: { title: "Extract Metadata", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
835
196
  inputSchema: {
836
197
  url: z.string().url().describe("The URL to extract metadata from")
837
198
  }
838
- }, withAuth("extract_metadata", async ({ url }) => {
839
- try {
840
- const response = await fetchWithTimeout(url);
841
- if (!response.ok) {
842
- throw new Error(`HTTP ${response.status}: ${response.statusText}`);
843
- }
844
-
845
- const html = await response.text();
846
- const $ = load(html);
847
-
848
- // Extract basic metadata
849
- const title = $('title').text().trim() || $('h1').first().text().trim();
850
- const description = $('meta[name="description"]').attr('content') ||
851
- $('meta[property="og:description"]').attr('content') || '';
852
- const keywords = $('meta[name="keywords"]').attr('content') || '';
853
- const canonical = $('link[rel="canonical"]').attr('href') || '';
854
-
855
- // Extract Open Graph tags
856
- const ogTags = {};
857
- $('meta[property^="og:"]').each((_, element) => {
858
- const property = $(element).attr('property');
859
- const content = $(element).attr('content');
860
- if (property && content) {
861
- ogTags[property.replace('og:', '')] = content;
862
- }
863
- });
864
-
865
- // Extract Twitter Card tags
866
- const twitterTags = {};
867
- $('meta[name^="twitter:"]').each((_, element) => {
868
- const name = $(element).attr('name');
869
- const content = $(element).attr('content');
870
- if (name && content) {
871
- twitterTags[name.replace('twitter:', '')] = content;
872
- }
873
- });
874
-
875
- // Extract additional metadata
876
- const author = $('meta[name="author"]').attr('content') || '';
877
- const robots = $('meta[name="robots"]').attr('content') || '';
878
- const viewport = $('meta[name="viewport"]').attr('content') || '';
879
- const charset = $('meta[charset]').attr('charset') ||
880
- $('meta[http-equiv="Content-Type"]').attr('content') || '';
881
-
882
- return {
883
- content: [{
884
- type: "text",
885
- text: JSON.stringify({
886
- title: title,
887
- description: description,
888
- keywords: keywords.split(',').map(k => k.trim()).filter(k => k),
889
- canonical_url: canonical,
890
- author: author,
891
- robots: robots,
892
- viewport: viewport,
893
- charset: charset,
894
- og_tags: ogTags,
895
- twitter_tags: twitterTags,
896
- url: response.url
897
- }, null, 2)
898
- }]
899
- };
900
- } catch (error) {
901
- return {
902
- content: [{
903
- type: "text",
904
- text: `Failed to extract metadata: ${error.message}`
905
- }],
906
- isError: true
907
- };
908
- }
909
- }));
199
+ }, withAuth("extract_metadata", extractMetadataHandler));
910
200
 
911
- // Tool: scrape_structured - Extract structured data using CSS selectors
201
+ // Tool: scrape_structured
912
202
  server.registerTool("scrape_structured", {
913
203
  description: "Extract structured data from a webpage using CSS selectors",
914
204
  annotations: { title: "Scrape Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
@@ -916,63 +206,9 @@ server.registerTool("scrape_structured", {
916
206
  url: z.string().url().describe("The URL to scrape"),
917
207
  selectors: z.record(z.string()).describe("CSS selectors mapping field names to selectors")
918
208
  }
919
- }, withAuth("scrape_structured", async ({ url, selectors }) => {
920
- try {
921
- const response = await fetchWithTimeout(url);
922
- if (!response.ok) {
923
- throw new Error(`HTTP ${response.status}: ${response.statusText}`);
924
- }
925
-
926
- const html = await response.text();
927
- const $ = load(html);
928
-
929
- const results = {};
930
-
931
- for (const [fieldName, selector] of Object.entries(selectors)) {
932
- try {
933
- const elements = $(selector);
934
-
935
- if (elements.length === 0) {
936
- results[fieldName] = null;
937
- } else if (elements.length === 1) {
938
- // Single element - return text content
939
- results[fieldName] = elements.text().trim();
940
- } else {
941
- // Multiple elements - return array of text content
942
- results[fieldName] = elements.map((_, el) => $(el).text().trim()).get();
943
- }
944
- } catch (selectorError) {
945
- results[fieldName] = {
946
- error: `Invalid selector: ${selector}`,
947
- message: selectorError.message
948
- };
949
- }
950
- }
951
-
952
- return {
953
- content: [{
954
- type: "text",
955
- text: JSON.stringify({
956
- data: results,
957
- selectors_used: selectors,
958
- elements_found: Object.keys(results).length,
959
- url: response.url
960
- }, null, 2)
961
- }]
962
- };
963
- } catch (error) {
964
- return {
965
- content: [{
966
- type: "text",
967
- text: `Failed to scrape structured data: ${error.message}`
968
- }],
969
- isError: true
970
- };
971
- }
972
- }));
209
+ }, withAuth("scrape_structured", scrapeStructuredHandler));
973
210
 
974
- // Tool: search_web - Web search with configurable providers
975
- // Tool: search_web - Search the web using Google Search via CrawlForge proxy
211
+ // Tool: search_web
976
212
  server.registerTool("search_web", {
977
213
  description: "Search the web using Google Search API (proxied through CrawlForge)",
978
214
  annotations: { title: "Search the Web", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
@@ -989,34 +225,16 @@ server.registerTool("search_web", {
989
225
  }, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => {
990
226
  try {
991
227
  if (!query) {
992
- return {
993
- content: [{
994
- type: "text",
995
- text: "Query parameter is required"
996
- }],
997
- isError: true
998
- };
228
+ return { content: [{ type: "text", text: "Query parameter is required" }], isError: true };
999
229
  }
1000
-
1001
230
  const result = await searchWebTool.execute({ query, limit, offset, lang, safe_search, time_range, site, file_type });
1002
- return {
1003
- content: [{
1004
- type: "text",
1005
- text: JSON.stringify(result, null, 2)
1006
- }]
1007
- };
231
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
1008
232
  } catch (error) {
1009
- return {
1010
- content: [{
1011
- type: "text",
1012
- text: `Search failed: ${error.message}`
1013
- }],
1014
- isError: true
1015
- };
233
+ return { content: [{ type: "text", text: `Search failed: ${error.message}` }], isError: true };
1016
234
  }
1017
235
  }));
1018
236
 
1019
- // Tool: crawl_deep - Deep crawl websites with BFS algorithm
237
+ // Tool: crawl_deep
1020
238
  server.registerTool("crawl_deep", {
1021
239
  description: "Crawl websites deeply using breadth-first search",
1022
240
  annotations: { title: "Deep Crawl", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
@@ -1034,34 +252,16 @@ server.registerTool("crawl_deep", {
1034
252
  }, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency }) => {
1035
253
  try {
1036
254
  if (!url) {
1037
- return {
1038
- content: [{
1039
- type: "text",
1040
- text: "URL parameter is required"
1041
- }],
1042
- isError: true
1043
- };
255
+ return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
1044
256
  }
1045
-
1046
257
  const result = await crawlDeepTool.execute({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency });
1047
- return {
1048
- content: [{
1049
- type: "text",
1050
- text: JSON.stringify(result, null, 2)
1051
- }]
1052
- };
258
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
1053
259
  } catch (error) {
1054
- return {
1055
- content: [{
1056
- type: "text",
1057
- text: `Crawl failed: ${error.message}`
1058
- }],
1059
- isError: true
1060
- };
260
+ return { content: [{ type: "text", text: `Crawl failed: ${error.message}` }], isError: true };
1061
261
  }
1062
262
  }));
1063
263
 
1064
- // Tool: map_site - Discover and map website structure
264
+ // Tool: map_site
1065
265
  server.registerTool("map_site", {
1066
266
  description: "Discover and map website structure",
1067
267
  annotations: { title: "Map Website", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
@@ -1075,36 +275,16 @@ server.registerTool("map_site", {
1075
275
  }, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata }) => {
1076
276
  try {
1077
277
  if (!url) {
1078
- return {
1079
- content: [{
1080
- type: "text",
1081
- text: "URL parameter is required"
1082
- }],
1083
- isError: true
1084
- };
278
+ return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
1085
279
  }
1086
-
1087
280
  const result = await mapSiteTool.execute({ url, include_sitemap, max_urls, group_by_path, include_metadata });
1088
- return {
1089
- content: [{
1090
- type: "text",
1091
- text: JSON.stringify(result, null, 2)
1092
- }]
1093
- };
281
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
1094
282
  } catch (error) {
1095
- return {
1096
- content: [{
1097
- type: "text",
1098
- text: `Site mapping failed: ${error.message}`
1099
- }],
1100
- isError: true
1101
- };
283
+ return { content: [{ type: "text", text: `Site mapping failed: ${error.message}` }], isError: true };
1102
284
  }
1103
285
  }));
1104
286
 
1105
- // Phase 3 Tools: Enhanced Content Processing
1106
-
1107
- // Tool: extract_content - Enhanced content extraction with readability detection
287
+ // Tool: extract_content
1108
288
  server.registerTool("extract_content", {
1109
289
  description: "Extract and analyze main content from web pages with enhanced readability detection",
1110
290
  annotations: { title: "Extract Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
@@ -1115,34 +295,16 @@ server.registerTool("extract_content", {
1115
295
  }, withAuth("extract_content", async ({ url, options }) => {
1116
296
  try {
1117
297
  if (!url) {
1118
- return {
1119
- content: [{
1120
- type: "text",
1121
- text: "URL parameter is required"
1122
- }],
1123
- isError: true
1124
- };
298
+ return { content: [{ type: "text", text: "URL parameter is required" }], isError: true };
1125
299
  }
1126
-
1127
300
  const result = await extractContentTool.execute({ url, options });
1128
- return {
1129
- content: [{
1130
- type: "text",
1131
- text: JSON.stringify(result, null, 2)
1132
- }]
1133
- };
301
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
1134
302
  } catch (error) {
1135
- return {
1136
- content: [{
1137
- type: "text",
1138
- text: `Content extraction failed: ${error.message}`
1139
- }],
1140
- isError: true
1141
- };
303
+ return { content: [{ type: "text", text: `Content extraction failed: ${error.message}` }], isError: true };
1142
304
  }
1143
305
  }));
1144
306
 
1145
- // Tool: process_document - Multi-format document processing
307
+ // Tool: process_document
1146
308
  server.registerTool("process_document", {
1147
309
  description: "Process documents from multiple sources and formats including PDFs and web pages",
1148
310
  annotations: { title: "Process Document", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
@@ -1154,34 +316,16 @@ server.registerTool("process_document", {
1154
316
  }, withAuth("process_document", async ({ source, sourceType, options }) => {
1155
317
  try {
1156
318
  if (!source) {
1157
- return {
1158
- content: [{
1159
- type: "text",
1160
- text: "Source parameter is required"
1161
- }],
1162
- isError: true
1163
- };
319
+ return { content: [{ type: "text", text: "Source parameter is required" }], isError: true };
1164
320
  }
1165
-
1166
321
  const result = await processDocumentTool.execute({ source, sourceType, options });
1167
- return {
1168
- content: [{
1169
- type: "text",
1170
- text: JSON.stringify(result, null, 2)
1171
- }]
1172
- };
322
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
1173
323
  } catch (error) {
1174
- return {
1175
- content: [{
1176
- type: "text",
1177
- text: `Document processing failed: ${error.message}`
1178
- }],
1179
- isError: true
1180
- };
324
+ return { content: [{ type: "text", text: `Document processing failed: ${error.message}` }], isError: true };
1181
325
  }
1182
326
  }));
1183
327
 
1184
- // Tool: summarize_content - Intelligent content summarization
328
+ // Tool: summarize_content
1185
329
  server.registerTool("summarize_content", {
1186
330
  description: "Generate intelligent summaries of text content with configurable options",
1187
331
  annotations: { title: "Summarize Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
@@ -1192,34 +336,16 @@ server.registerTool("summarize_content", {
1192
336
  }, withAuth("summarize_content", async ({ text, options }) => {
1193
337
  try {
1194
338
  if (!text) {
1195
- return {
1196
- content: [{
1197
- type: "text",
1198
- text: "Text parameter is required"
1199
- }],
1200
- isError: true
1201
- };
339
+ return { content: [{ type: "text", text: "Text parameter is required" }], isError: true };
1202
340
  }
1203
-
1204
341
  const result = await summarizeContentTool.execute({ text, options });
1205
- return {
1206
- content: [{
1207
- type: "text",
1208
- text: JSON.stringify(result, null, 2)
1209
- }]
1210
- };
342
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
1211
343
  } catch (error) {
1212
- return {
1213
- content: [{
1214
- type: "text",
1215
- text: `Content summarization failed: ${error.message}`
1216
- }],
1217
- isError: true
1218
- };
344
+ return { content: [{ type: "text", text: `Content summarization failed: ${error.message}` }], isError: true };
1219
345
  }
1220
346
  }));
1221
347
 
1222
- // Tool: analyze_content - Comprehensive content analysis
348
+ // Tool: analyze_content
1223
349
  server.registerTool("analyze_content", {
1224
350
  description: "Perform comprehensive content analysis including language detection and topic extraction",
1225
351
  annotations: { title: "Analyze Content", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: false },
@@ -1230,38 +356,16 @@ server.registerTool("analyze_content", {
1230
356
  }, withAuth("analyze_content", async ({ text, options }) => {
1231
357
  try {
1232
358
  if (!text) {
1233
- return {
1234
- content: [{
1235
- type: "text",
1236
- text: "Text parameter is required"
1237
- }],
1238
- isError: true
1239
- };
359
+ return { content: [{ type: "text", text: "Text parameter is required" }], isError: true };
1240
360
  }
1241
-
1242
361
  const result = await analyzeContentTool.execute({ text, options });
1243
- return {
1244
- content: [{
1245
- type: "text",
1246
- text: JSON.stringify(result, null, 2)
1247
- }]
1248
- };
362
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
1249
363
  } catch (error) {
1250
- return {
1251
- content: [{
1252
- type: "text",
1253
- text: `Content analysis failed: ${error.message}`
1254
- }],
1255
- isError: true
1256
- };
364
+ return { content: [{ type: "text", text: `Content analysis failed: ${error.message}` }], isError: true };
1257
365
  }
1258
366
  }));
1259
367
 
1260
-
1261
-
1262
- // Phase 1: LLM-Powered Structured Extraction
1263
-
1264
- // Tool: extract_structured - Extract structured data from a URL using LLM and JSON Schema
368
+ // Tool: extract_structured
1265
369
  server.registerTool("extract_structured", {
1266
370
  description: "Extract structured data from a webpage using LLM-powered analysis and a JSON Schema. Falls back to CSS selector extraction when no LLM provider is configured.",
1267
371
  annotations: { title: "Extract Structured Data", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
@@ -1282,35 +386,36 @@ server.registerTool("extract_structured", {
1282
386
  }
1283
387
  }, withAuth("extract_structured", async ({ url, schema, prompt, llmConfig, fallbackToSelectors, selectorHints }) => {
1284
388
  try {
1285
- const result = await extractStructuredTool.execute({
1286
- url,
1287
- schema,
1288
- prompt,
1289
- llmConfig,
1290
- fallbackToSelectors,
1291
- selectorHints
1292
- });
1293
- return {
1294
- content: [{
1295
- type: "text",
1296
- text: JSON.stringify(result, null, 2)
1297
- }]
1298
- };
389
+ const result = await extractStructuredTool.execute({ url, schema, prompt, llmConfig, fallbackToSelectors, selectorHints });
390
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
1299
391
  } catch (error) {
1300
- return {
1301
- content: [{
1302
- type: "text",
1303
- text: `Structured extraction failed: ${error.message}`
1304
- }],
1305
- isError: true
1306
- };
392
+ return { content: [{ type: "text", text: `Structured extraction failed: ${error.message}` }], isError: true };
1307
393
  }
1308
394
  }));
1309
395
 
396
+ // Tool: extract_with_llm
397
+ server.registerTool("extract_with_llm", {
398
+ description: "Extract structured data from a URL or text using a natural-language prompt, powered by OpenAI or Anthropic. Requires OPENAI_API_KEY or ANTHROPIC_API_KEY in the environment.",
399
+ annotations: { title: "Extract With LLM", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
400
+ inputSchema: {
401
+ url: z.string().url().optional().describe("URL to fetch and extract from (one of url/content required)"),
402
+ content: z.string().optional().describe("Pre-fetched text to extract from (one of url/content required)"),
403
+ prompt: z.string().describe("Natural-language extraction instruction"),
404
+ schema: z.record(z.unknown()).optional().describe("Optional JSON-schema-like hint for output shape"),
405
+ provider: z.enum(["openai", "anthropic", "auto"]).optional().default("auto").describe("LLM provider"),
406
+ model: z.string().optional().describe("Override default model"),
407
+ maxTokens: z.number().optional().default(4096).describe("Maximum output tokens")
408
+ }
409
+ }, withAuth("extract_with_llm", async (params) => {
410
+ try {
411
+ const result = await extractWithLlmTool.execute(params);
412
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
413
+ } catch (error) {
414
+ return { content: [{ type: "text", text: `LLM extraction failed: ${error.message}` }], isError: true };
415
+ }
416
+ }));
1310
417
 
1311
- // Wave 2 Advanced Tools
1312
-
1313
- // Tool: batch_scrape - Process multiple URLs simultaneously with job management
418
+ // Tool: batch_scrape
1314
419
  server.registerTool("batch_scrape", {
1315
420
  description: "Process multiple URLs simultaneously with support for async job management and webhook notifications",
1316
421
  annotations: { title: "Batch Scrape", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
@@ -1349,24 +454,13 @@ server.registerTool("batch_scrape", {
1349
454
  }, withAuth("batch_scrape", async (params) => {
1350
455
  try {
1351
456
  const result = await batchScrapeTool.execute(params);
1352
- return {
1353
- content: [{
1354
- type: "text",
1355
- text: JSON.stringify(result, null, 2)
1356
- }]
1357
- };
457
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
1358
458
  } catch (error) {
1359
- return {
1360
- content: [{
1361
- type: "text",
1362
- text: `Batch scrape failed: ${error.message}`
1363
- }],
1364
- isError: true
1365
- };
459
+ return { content: [{ type: "text", text: `Batch scrape failed: ${error.message}` }], isError: true };
1366
460
  }
1367
461
  }));
1368
462
 
1369
- // Tool: scrape_with_actions - Execute action chains before scraping
463
+ // Tool: scrape_with_actions
1370
464
  server.registerTool("scrape_with_actions", {
1371
465
  description: "Execute browser action chains before scraping content, with form auto-fill and intermediate state capture",
1372
466
  annotations: { title: "Scrape with Browser Actions", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
@@ -1416,24 +510,13 @@ server.registerTool("scrape_with_actions", {
1416
510
  }, withAuth("scrape_with_actions", async (params) => {
1417
511
  try {
1418
512
  const result = await scrapeWithActionsTool.execute(params);
1419
- return {
1420
- content: [{
1421
- type: "text",
1422
- text: JSON.stringify(result, null, 2)
1423
- }]
1424
- };
513
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
1425
514
  } catch (error) {
1426
- return {
1427
- content: [{
1428
- type: "text",
1429
- text: `Scrape with actions failed: ${error.message}`
1430
- }],
1431
- isError: true
1432
- };
515
+ return { content: [{ type: "text", text: `Scrape with actions failed: ${error.message}` }], isError: true };
1433
516
  }
1434
517
  }));
1435
518
 
1436
- // Tool: deep_research - Comprehensive multi-stage research with source verification
519
+ // Tool: deep_research
1437
520
  server.registerTool("deep_research", {
1438
521
  description: "Conduct comprehensive multi-stage research with intelligent query expansion, source verification, and conflict detection",
1439
522
  annotations: { title: "Deep Research", readOnlyHint: true, destructiveHint: false, idempotentHint: false, openWorldHint: true },
@@ -1483,42 +566,22 @@ server.registerTool("deep_research", {
1483
566
  }, withAuth("deep_research", async (params) => {
1484
567
  try {
1485
568
  const result = await deepResearchTool.execute(params);
1486
- return {
1487
- content: [{
1488
- type: "text",
1489
- text: JSON.stringify(result, null, 2)
1490
- }]
1491
- };
569
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
1492
570
  } catch (error) {
1493
- return {
1494
- content: [{
1495
- type: "text",
1496
- text: `Deep research failed: ${error.message}`
1497
- }],
1498
- isError: true
1499
- };
571
+ return { content: [{ type: "text", text: `Deep research failed: ${error.message}` }], isError: true };
1500
572
  }
1501
573
  }));
1502
574
 
1503
- // Tool: track_changes - Enhanced Content change tracking with baseline capture and monitoring (Phase 2.4)
575
+ // Tool: track_changes
1504
576
  server.registerTool("track_changes", {
1505
577
  description: "Enhanced content change tracking with baseline capture, comparison, scheduled monitoring, advanced comparison engine, alert system, and historical analysis",
1506
578
  annotations: { title: "Track Changes", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
1507
579
  inputSchema: {
1508
580
  url: z.string().url().describe("The URL to track changes for"),
1509
581
  operation: z.enum([
1510
- 'create_baseline',
1511
- 'compare',
1512
- 'monitor',
1513
- 'get_history',
1514
- 'get_stats',
1515
- 'create_scheduled_monitor',
1516
- 'stop_scheduled_monitor',
1517
- 'get_dashboard',
1518
- 'export_history',
1519
- 'create_alert_rule',
1520
- 'generate_trend_report',
1521
- 'get_monitoring_templates'
582
+ 'create_baseline', 'compare', 'monitor', 'get_history', 'get_stats',
583
+ 'create_scheduled_monitor', 'stop_scheduled_monitor', 'get_dashboard',
584
+ 'export_history', 'create_alert_rule', 'generate_trend_report', 'get_monitoring_templates'
1522
585
  ]).default('compare').describe("Tracking operation to perform"),
1523
586
  content: z.string().optional().describe("Content to compare against baseline"),
1524
587
  html: z.string().optional().describe("HTML content to compare against baseline"),
@@ -1580,15 +643,14 @@ server.registerTool("track_changes", {
1580
643
  username: z.string().optional()
1581
644
  }).optional()
1582
645
  }).optional().describe("Notification configuration for webhooks and Slack"),
1583
- // Enhanced Phase 2.4 options
1584
646
  scheduledMonitorOptions: z.object({
1585
- schedule: z.string().optional(), // Cron expression
1586
- templateId: z.string().optional(), // Monitoring template ID
647
+ schedule: z.string().optional(),
648
+ templateId: z.string().optional(),
1587
649
  enabled: z.boolean().default(true)
1588
650
  }).optional().describe("Scheduled monitoring options with cron expressions"),
1589
651
  alertRuleOptions: z.object({
1590
652
  ruleId: z.string().optional(),
1591
- condition: z.string().optional(), // Condition description
653
+ condition: z.string().optional(),
1592
654
  actions: z.array(z.enum(['webhook', 'email', 'slack'])).optional(),
1593
655
  throttle: z.number().min(0).optional(),
1594
656
  priority: z.enum(['low', 'medium', 'high']).optional()
@@ -1609,24 +671,13 @@ server.registerTool("track_changes", {
1609
671
  }, withAuth("track_changes", async (params) => {
1610
672
  try {
1611
673
  const result = await trackChangesTool.execute(params);
1612
- return {
1613
- content: [{
1614
- type: "text",
1615
- text: JSON.stringify(result, null, 2)
1616
- }]
1617
- };
674
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
1618
675
  } catch (error) {
1619
- return {
1620
- content: [{
1621
- type: "text",
1622
- text: `Change tracking failed: ${error.message}`
1623
- }],
1624
- isError: true
1625
- };
676
+ return { content: [{ type: "text", text: `Change tracking failed: ${error.message}` }], isError: true };
1626
677
  }
1627
678
  }));
1628
679
 
1629
- // Tool: generate_llms_txt - Generate LLMs.txt and LLMs-full.txt files (Phase 2.5)
680
+ // Tool: generate_llms_txt
1630
681
  server.registerTool("generate_llms_txt", {
1631
682
  description: "Analyze websites and generate standard-compliant LLMs.txt and LLMs-full.txt files defining AI model interaction guidelines",
1632
683
  annotations: { title: "Generate llms.txt", readOnlyHint: true, destructiveHint: false, idempotentHint: true, openWorldHint: true },
@@ -1654,24 +705,13 @@ server.registerTool("generate_llms_txt", {
1654
705
  }, withAuth("generate_llms_txt", async (params) => {
1655
706
  try {
1656
707
  const result = await generateLLMsTxtTool.execute(params);
1657
- return {
1658
- content: [{
1659
- type: "text",
1660
- text: JSON.stringify(result, null, 2)
1661
- }]
1662
- };
708
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
1663
709
  } catch (error) {
1664
- return {
1665
- content: [{
1666
- type: "text",
1667
- text: `LLMs.txt generation failed: ${error.message}`
1668
- }],
1669
- isError: true
1670
- };
710
+ return { content: [{ type: "text", text: `LLMs.txt generation failed: ${error.message}` }], isError: true };
1671
711
  }
1672
712
  }));
1673
713
 
1674
- // Tool: stealth_mode - Advanced anti-detection browser management (Wave 3)
714
+ // Tool: stealth_mode
1675
715
  server.registerTool("stealth_mode", {
1676
716
  description: "Advanced anti-detection browser management with stealth features, fingerprint randomization, and human behavior simulation",
1677
717
  annotations: { title: "Stealth Mode", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
@@ -1721,7 +761,6 @@ server.registerTool("stealth_mode", {
1721
761
  }, withAuth("stealth_mode", async ({ operation, stealthConfig, contextId, urlToTest }) => {
1722
762
  try {
1723
763
  let result;
1724
-
1725
764
  switch (operation) {
1726
765
  case 'configure':
1727
766
  if (stealthConfig) {
@@ -1731,69 +770,42 @@ server.registerTool("stealth_mode", {
1731
770
  result = { error: 'stealthConfig is required for configure operation' };
1732
771
  }
1733
772
  break;
1734
-
1735
773
  case 'enable':
1736
774
  stealthBrowserManager.enableStealthMode(stealthConfig?.level || 'medium');
1737
775
  result = { enabled: true, level: stealthConfig?.level || 'medium' };
1738
776
  break;
1739
-
1740
777
  case 'disable':
1741
778
  stealthBrowserManager.disableStealthMode();
1742
779
  result = { disabled: true };
1743
780
  break;
1744
-
1745
- case 'create_context':
781
+ case 'create_context': {
1746
782
  const contextData = await stealthBrowserManager.createStealthContext(stealthConfig);
1747
- result = {
1748
- contextId: contextData.contextId,
1749
- fingerprint: contextData.fingerprint,
1750
- created: true
1751
- };
783
+ result = { contextId: contextData.contextId, fingerprint: contextData.fingerprint, created: true };
1752
784
  break;
1753
-
1754
- case 'create_page':
1755
- if (!contextId) {
1756
- throw new Error('contextId is required for create_page operation');
1757
- }
785
+ }
786
+ case 'create_page': {
787
+ if (!contextId) throw new Error('contextId is required for create_page operation');
1758
788
  const page = await stealthBrowserManager.createStealthPage(contextId);
1759
- result = {
1760
- pageCreated: true,
1761
- contextId: contextId,
1762
- url: urlToTest ? await page.goto(urlToTest) : null
1763
- };
789
+ result = { pageCreated: true, contextId, url: urlToTest ? await page.goto(urlToTest) : null };
1764
790
  break;
1765
-
791
+ }
1766
792
  case 'get_stats':
1767
793
  result = stealthBrowserManager.getStats();
1768
794
  break;
1769
-
1770
795
  case 'cleanup':
1771
796
  await stealthBrowserManager.cleanup();
1772
797
  result = { cleaned: true };
1773
798
  break;
1774
-
1775
799
  default:
1776
800
  result = { error: `Unknown operation: ${operation}` };
1777
801
  }
1778
-
1779
- return {
1780
- content: [{
1781
- type: "text",
1782
- text: JSON.stringify(result, null, 2)
1783
- }]
1784
- };
802
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
1785
803
  } catch (error) {
1786
- return {
1787
- content: [{
1788
- type: "text",
1789
- text: `Stealth mode operation failed: ${error.message}`
1790
- }],
1791
- isError: true
1792
- };
804
+ return { content: [{ type: "text", text: `Stealth mode operation failed: ${error.message}` }], isError: true };
1793
805
  }
1794
806
  }));
1795
807
 
1796
- // Tool: localization - Multi-language and geo-location management (Wave 3)
808
+ // Tool: localization
1797
809
  server.registerTool("localization", {
1798
810
  description: "Multi-language and geo-location management with country-specific settings, browser locale emulation, timezone spoofing, and geo-blocked content handling",
1799
811
  annotations: { title: "Localization", readOnlyHint: false, destructiveHint: false, idempotentHint: false, openWorldHint: true },
@@ -1854,186 +866,110 @@ server.registerTool("localization", {
1854
866
  try {
1855
867
  const { operation } = params;
1856
868
  let result;
1857
-
1858
869
  switch (operation) {
1859
870
  case 'configure_country':
1860
- if (!params.countryCode) {
1861
- throw new Error('countryCode is required for configure_country operation');
1862
- }
871
+ if (!params.countryCode) throw new Error('countryCode is required for configure_country operation');
1863
872
  result = await localizationManager.configureCountry(params.countryCode, params);
1864
873
  break;
1865
-
1866
874
  case 'localize_search':
1867
- if (!params.searchParams) {
1868
- throw new Error('searchParams is required for localize_search operation');
1869
- }
875
+ if (!params.searchParams) throw new Error('searchParams is required for localize_search operation');
1870
876
  result = await localizationManager.localizeSearchQuery(params.searchParams, params.countryCode);
1871
877
  break;
1872
-
1873
878
  case 'localize_browser':
1874
- if (!params.browserOptions) {
1875
- throw new Error('browserOptions is required for localize_browser operation');
1876
- }
879
+ if (!params.browserOptions) throw new Error('browserOptions is required for localize_browser operation');
1877
880
  result = await localizationManager.localizeBrowserContext(params.browserOptions, params.countryCode);
1878
881
  break;
1879
-
1880
882
  case 'generate_timezone_spoof':
1881
883
  result = {
1882
884
  timezoneScript: await localizationManager.generateTimezoneSpoof(params.countryCode),
1883
885
  countryCode: params.countryCode || localizationManager.getCurrentSettings().countryCode
1884
886
  };
1885
887
  break;
1886
-
1887
888
  case 'handle_geo_blocking':
1888
- if (!params.url || !params.response) {
1889
- throw new Error('url and response are required for handle_geo_blocking operation');
1890
- }
889
+ if (!params.url || !params.response) throw new Error('url and response are required for handle_geo_blocking operation');
1891
890
  result = await localizationManager.handleGeoBlocking(params.url, params.response);
1892
891
  break;
1893
-
1894
892
  case 'auto_detect':
1895
- if (!params.content || !params.url) {
1896
- throw new Error('content and url are required for auto_detect operation');
1897
- }
893
+ if (!params.content || !params.url) throw new Error('content and url are required for auto_detect operation');
1898
894
  result = await localizationManager.autoDetectLocalization(params.content, params.url);
1899
895
  break;
1900
-
1901
896
  case 'get_stats':
1902
897
  result = localizationManager.getStats();
1903
898
  break;
1904
-
1905
899
  case 'get_supported_countries':
1906
900
  result = {
1907
901
  supportedCountries: localizationManager.getSupportedCountries(),
1908
902
  totalCount: localizationManager.getSupportedCountries().length
1909
903
  };
1910
904
  break;
1911
-
1912
905
  default:
1913
906
  result = { error: `Unknown operation: ${operation}` };
1914
907
  }
1915
-
1916
- return {
1917
- content: [{
1918
- type: "text",
1919
- text: JSON.stringify(result, null, 2)
1920
- }]
1921
- };
908
+ return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] };
1922
909
  } catch (error) {
1923
- return {
1924
- content: [{
1925
- type: "text",
1926
- text: `Localization operation failed: ${error.message}`
1927
- }],
1928
- isError: true
1929
- };
910
+ return { content: [{ type: "text", text: `Localization operation failed: ${error.message}` }], isError: true };
1930
911
  }
1931
912
  }));
1932
913
 
1933
- // Determine transport mode: HTTP if --http flag or MCP_HTTP env var is set
914
+ // ─── Transport + startup ───────────────────────────────────────────────────────
915
+
1934
916
  const useHttp = process.argv.includes('--http') || process.env.MCP_HTTP === 'true';
917
+ const useLegacyHttp = process.argv.includes('--legacy-http') || process.env.CRAWLFORGE_LEGACY_HTTP === 'true';
1935
918
 
1936
- // Set up transport and start the server
1937
919
  async function runServer() {
1938
920
  if (useHttp) {
1939
921
  const port = parseInt(process.env.PORT || '3000', 10);
1940
922
 
1941
- // Stateless transport — no session tracking, each request is independent
1942
- // This avoids the bug where server.connect(newTransport) kills previous sessions
1943
- const transport = new StreamableHTTPServerTransport({
1944
- sessionIdGenerator: undefined,
1945
- });
1946
- await server.connect(transport);
1947
-
1948
- const httpServer = createServer(async (req, res) => {
1949
- // CORS headers for Smithery gateway
1950
- res.setHeader('Access-Control-Allow-Origin', '*');
1951
- res.setHeader('Access-Control-Allow-Methods', 'GET, POST, DELETE, OPTIONS');
1952
- res.setHeader('Access-Control-Allow-Headers', 'Content-Type, mcp-session-id');
1953
- res.setHeader('Access-Control-Expose-Headers', 'mcp-session-id');
1954
-
1955
- if (req.method === 'OPTIONS') {
1956
- res.writeHead(204);
1957
- res.end();
1958
- return;
1959
- }
1960
-
1961
- // Health check endpoint
1962
- if (req.url === '/health') {
1963
- res.writeHead(200, { 'Content-Type': 'application/json' });
1964
- res.end(JSON.stringify({ status: 'ok', version: '3.0' }));
1965
- return;
1966
- }
1967
-
1968
- // MCP server card for Smithery discovery
1969
- if (req.url === '/.well-known/mcp/server-card.json') {
1970
- res.writeHead(200, { 'Content-Type': 'application/json' });
1971
- res.end(JSON.stringify({
1972
- serverInfo: {
1973
- name: "crawlforge",
1974
- version: "3.0.12",
1975
- description: "Production-ready MCP server with 20 web scraping, crawling, and content processing tools. Features stealth browsing, deep research, structured extraction, and change tracking.",
1976
- homepage: "https://www.crawlforge.dev",
1977
- icon: "https://www.crawlforge.dev/icon.png"
1978
- },
1979
- transport: {
1980
- type: "streamable-http",
1981
- url: "/mcp"
1982
- },
1983
- configSchema: {
1984
- type: "object",
1985
- properties: {
1986
- apiKey: {
1987
- type: "string",
1988
- title: "CrawlForge API Key",
1989
- description: "Your CrawlForge API key. Get one free at https://www.crawlforge.dev/signup (includes 1,000 credits)",
1990
- "x-from": { header: "x-api-key" }
1991
- }
1992
- },
1993
- required: ["apiKey"]
1994
- }
1995
- }));
1996
- return;
1997
- }
1998
-
1999
- // Route /mcp to the transport handler
2000
- if (req.url === '/mcp' || req.url === '/') {
2001
- await transport.handleRequest(req, res);
2002
- return;
923
+ if (useLegacyHttp) {
924
+ // One-release deprecation window for stateless legacy transport.
925
+ console.error('WARNING: --legacy-http is deprecated and will be removed in v3.3.0. Use the default Streamable HTTP transport.');
926
+ await connectHttp(server, AuthManager, logger, port);
927
+ } else {
928
+ // OAuth (opt-in)
929
+ let oauthProvider = null;
930
+ if (process.env.CRAWLFORGE_OAUTH_ENABLED === 'true') {
931
+ const issuer = process.env.CRAWLFORGE_OAUTH_ISSUER || `http://localhost:${port}`;
932
+ const apiKey = AuthManager.getConfig()?.apiKey;
933
+ if (!apiKey) {
934
+ console.error('OAuth enabled but no CrawlForge API key is configured — falling back to static-key auth.');
935
+ } else {
936
+ oauthProvider = createOAuthProvider({ issuer, apiKey, logger });
937
+ console.error(`OAuth 2.1 enabled discovery at ${issuer}/.well-known/oauth-authorization-server`);
938
+ }
2003
939
  }
2004
940
 
2005
- res.writeHead(404);
2006
- res.end('Not Found');
2007
- });
2008
-
2009
- httpServer.listen(port, () => {
2010
- console.error(`CrawlForge MCP Server v3.0 running on HTTP port ${port}`);
2011
- console.error(`MCP endpoint: http://localhost:${port}/mcp`);
2012
- console.error(`Health check: http://localhost:${port}/health`);
2013
- });
941
+ await connectStreamableHttp(server, AuthManager, logger, {
942
+ port,
943
+ legacy: false,
944
+ oauth: oauthProvider,
945
+ metrics
946
+ });
947
+ }
2014
948
  } else {
2015
- const transport = new StdioServerTransport();
2016
- await server.connect(transport);
2017
- console.error("CrawlForge MCP Server v3.0 running on stdio");
949
+ await connectStdio(server);
2018
950
  }
951
+
2019
952
  console.error(`Environment: ${config.server.nodeEnv}`);
2020
-
2021
953
  console.error("Search enabled: true (via CrawlForge proxy)");
2022
-
2023
- const baseTools = "fetch_url, extract_text, extract_links, extract_metadata, scrape_structured, crawl_deep, map_site";
2024
- const searchTool = ", search_web";
2025
- const phase3Tools = ", extract_content, process_document, summarize_content, analyze_content";
2026
- const wave2Tools = ", batch_scrape, scrape_with_actions";
2027
- const researchTools = ", deep_research";
2028
- const trackingTools = ", track_changes";
2029
- const llmsTxtTools = ", generate_llms_txt";
2030
- const wave3Tools = ", stealth_mode, localization";
2031
- const phase1Tools = ", extract_structured";
2032
- console.error(`Tools available: ${baseTools}${searchTool}${phase3Tools}${wave2Tools}${researchTools}${trackingTools}${llmsTxtTools}${wave3Tools}${phase1Tools}`);
2033
954
 
955
+ const allTools = [
956
+ "fetch_url", "extract_text", "extract_links", "extract_metadata", "scrape_structured",
957
+ "search_web", "crawl_deep", "map_site",
958
+ "extract_content", "process_document", "summarize_content", "analyze_content",
959
+ "batch_scrape", "scrape_with_actions",
960
+ "deep_research", "track_changes", "generate_llms_txt",
961
+ "stealth_mode", "localization", "extract_structured", "extract_with_llm"
962
+ ];
963
+ console.error(`Tools available: ${allTools.join(', ')}`);
2034
964
 
2035
- // === MEMORY LEAK PREVENTION ===
2036
- // Add graceful shutdown handling to prevent memory leaks
965
+ // Start memory monitoring in development
966
+ if (config.server.nodeEnv === "development") {
967
+ memoryMonitor.start();
968
+ console.error("Memory monitoring started");
969
+ }
970
+ }
971
+
972
+ // ─── Graceful shutdown ─────────────────────────────────────────────────────────
2037
973
 
2038
974
  let isShuttingDown = false;
2039
975
 
@@ -2042,26 +978,19 @@ async function gracefulShutdown(signal) {
2042
978
  console.error("Force shutdown...");
2043
979
  process.exit(1);
2044
980
  }
2045
-
981
+
2046
982
  isShuttingDown = true;
2047
983
  console.error(`Received ${signal}. Starting graceful shutdown...`);
2048
-
984
+
2049
985
  try {
2050
- // Cleanup tools that have destroy methods
2051
986
  const toolsToCleanup = [
2052
- batchScrapeTool,
2053
- scrapeWithActionsTool,
2054
- deepResearchTool,
2055
- trackChangesTool,
2056
- generateLLMsTxtTool,
2057
- stealthBrowserManager,
2058
- localizationManager,
2059
- extractStructuredTool
987
+ batchScrapeTool, scrapeWithActionsTool, deepResearchTool,
988
+ trackChangesTool, generateLLMsTxtTool, stealthBrowserManager,
989
+ localizationManager, extractStructuredTool
2060
990
  ].filter(tool => tool && (typeof tool.destroy === 'function' || typeof tool.cleanup === 'function'));
2061
-
991
+
2062
992
  console.error(`Cleaning up ${toolsToCleanup.length} tools...`);
2063
-
2064
- // Cleanup tools with timeout
993
+
2065
994
  await Promise.race([
2066
995
  Promise.all(toolsToCleanup.map(async (tool) => {
2067
996
  try {
@@ -2075,40 +1004,33 @@ async function gracefulShutdown(signal) {
2075
1004
  console.error(`Error cleaning up ${tool.constructor.name}:`, error.message);
2076
1005
  }
2077
1006
  })),
2078
- new Promise(resolve => setTimeout(resolve, 5000)) // 5 second timeout
1007
+ new Promise(resolve => setTimeout(resolve, 5000))
2079
1008
  ]);
2080
-
2081
- // Stop memory monitoring
1009
+
2082
1010
  if (memoryMonitor.isMonitoring) {
2083
1011
  memoryMonitor.stop();
2084
1012
  console.error("Memory monitoring stopped");
2085
1013
  }
2086
1014
 
2087
- // Force garbage collection if available
2088
1015
  if (global.gc) {
2089
1016
  console.error("Running final garbage collection...");
2090
1017
  global.gc();
2091
1018
  }
2092
-
1019
+
2093
1020
  console.error("Graceful shutdown completed");
2094
1021
  process.exit(0);
2095
-
2096
1022
  } catch (error) {
2097
1023
  console.error("Error during graceful shutdown:", error);
2098
1024
  process.exit(1);
2099
1025
  }
2100
1026
  }
2101
1027
 
2102
- // Register signal handlers
2103
1028
  process.on('SIGINT', () => gracefulShutdown('SIGINT'));
2104
1029
  process.on('SIGTERM', () => gracefulShutdown('SIGTERM'));
2105
-
2106
- // Handle uncaught exceptions and unhandled rejections
2107
1030
  process.on('uncaughtException', (error) => {
2108
1031
  console.error('Uncaught Exception:', error);
2109
1032
  gracefulShutdown('uncaughtException');
2110
1033
  });
2111
-
2112
1034
  process.on('unhandledRejection', (reason, promise) => {
2113
1035
  console.error('Unhandled Rejection at:', promise, 'reason:', reason);
2114
1036
  gracefulShutdown('unhandledRejection');
@@ -2119,17 +1041,10 @@ if (config.server.nodeEnv === 'development') {
2119
1041
  setInterval(() => {
2120
1042
  const usage = process.memoryUsage();
2121
1043
  const memoryMB = (usage.heapUsed / 1024 / 1024).toFixed(2);
2122
- if (memoryMB > 200) { // Alert if over 200MB
1044
+ if (memoryMB > 200) {
2123
1045
  console.error(`Memory usage: ${memoryMB}MB (high usage detected)`);
2124
1046
  }
2125
- }, 60000); // Check every minute
2126
- }
2127
-
2128
- // Start memory monitoring in development
2129
- if (config.server.nodeEnv === "development") {
2130
- memoryMonitor.start();
2131
- console.error("Memory monitoring started");
2132
- }
1047
+ }, 60000);
2133
1048
  }
2134
1049
 
2135
1050
  runServer().catch((error) => {