crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
package/server.js ADDED
@@ -0,0 +1,1963 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
4
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
5
+ import { z } from "zod";
6
+ import { load } from "cheerio";
7
+ import { SearchWebTool } from "./src/tools/search/searchWeb.js";
8
+ import { CrawlDeepTool } from "./src/tools/crawl/crawlDeep.js";
9
+ import { MapSiteTool } from "./src/tools/crawl/mapSite.js";
10
+ import { ExtractContentTool } from "./src/tools/extract/extractContent.js";
11
+ import { ProcessDocumentTool } from "./src/tools/extract/processDocument.js";
12
+ import { SummarizeContentTool } from "./src/tools/extract/summarizeContent.js";
13
+ import { AnalyzeContentTool } from "./src/tools/extract/analyzeContent.js";
14
+ // Wave 2 Advanced Tools
15
+ import { BatchScrapeTool } from "./src/tools/advanced/BatchScrapeTool.js";
16
+ import { ScrapeWithActionsTool } from "./src/tools/advanced/ScrapeWithActionsTool.js";
17
+ // Deep Research Tool
18
+ import { DeepResearchTool } from "./src/tools/research/deepResearch.js";
19
+ // Change Tracking Tool - commented out due to import issue
20
+ // import { TrackChangesTool } from "./src/tools/tracking/trackChanges.js";
21
+ // LLMs.txt Generator Tool (Phase 2.5)
22
+ import { GenerateLLMsTxtTool } from "./src/tools/llmstxt/generateLLMsTxt.js";
23
+ // Wave 3-4 Core Managers
24
+ import { StealthBrowserManager } from "./src/core/StealthBrowserManager.js";
25
+ import { LocalizationManager } from "./src/core/LocalizationManager.js";
26
+ import { memoryMonitor } from "./src/utils/MemoryMonitor.js";
27
+ import { config, validateConfig, isSearchConfigured, getToolConfig, getActiveSearchProvider } from "./src/constants/config.js";
28
+ // Authentication Manager
29
+ import AuthManager from "./src/core/AuthManager.js";
30
+
31
// Enable creator mode if BYPASS_API_KEY is set.
// NOTE(review): presumably AuthManager reads CRAWLFORGE_CREATOR_MODE to answer
// isCreatorMode() — confirm, because creator mode skips the credit checks below.
if (process.env.BYPASS_API_KEY === 'true') {
  process.env.CRAWLFORGE_CREATOR_MODE = 'true';
}

// Initialize Authentication Manager
await AuthManager.initialize();

// First-run setup (skipped in creator mode).
//
// Everything printed here goes to stderr on purpose: this file imports
// StdioServerTransport, and with a stdio transport stdout is the protocol
// channel, so human-readable text written there would be read by the client
// as (invalid) protocol data.
if (!AuthManager.isAuthenticated() && !AuthManager.isCreatorMode()) {
  const apiKey = process.env.CRAWLFORGE_API_KEY;

  if (apiKey) {
    // Auto-setup if an API key is provided via the environment.
    console.error('🔧 Auto-configuring CrawlForge with provided API key...');
    const success = await AuthManager.runSetup(apiKey);
    if (!success) {
      console.error('❌ Failed to authenticate with provided API key');
      console.error('Please check your API key or run: npm run setup');
      process.exit(1);
    }
  } else {
    // No credentials at all: explain how to obtain and configure a key, then exit.
    const setupInstructions = [
      '',
      '╔═══════════════════════════════════════════════════════╗',
      '║ CrawlForge MCP Server - Setup Required ║',
      '╚═══════════════════════════════════════════════════════╝',
      '',
      'Welcome! This appears to be your first time using CrawlForge.',
      '',
      'To get started, please run:',
      ' npm run setup',
      '',
      'Or set your API key via environment variable:',
      ' export CRAWLFORGE_API_KEY="your_api_key_here"',
      '',
      'Get your free API key at: https://crawlforge.com/signup',
      '(Includes 1,000 free credits!)',
      '',
    ];
    for (const line of setupInstructions) {
      console.error(line);
    }
    process.exit(0);
  }
}

// Validate configuration. Errors are fatal only when running in production.
const configErrors = validateConfig();
if (configErrors.length > 0 && config.server.nodeEnv === 'production') {
  console.error('Configuration errors:', configErrors);
  process.exit(1);
}

// Create the server
const server = new McpServer({ name: "crawlforge", version: "3.0.0" });
81
+
82
/**
 * Wrap a tool handler with credit checking and usage reporting.
 *
 * Flow for every call (all of it is skipped when AuthManager.isCreatorMode()
 * is true, in which case the handler is invoked directly):
 *   1. Look up the tool's credit cost and ask AuthManager whether the caller
 *      has that many credits. If not, return an "Insufficient credits" payload
 *      without running the handler.
 *   2. Run the handler.
 *   3. On success, report usage at full cost with status 200.
 *      On handler failure, report usage at half cost (minimum 1) with
 *      status 500 and rethrow the handler's error.
 *
 * Usage is reported exactly once per executed handler. A failure while
 * reporting an errored call is logged to stderr and never replaces the
 * handler's own error. A failure of the credit check itself propagates
 * without any usage being reported, because the tool never ran.
 *
 * @param {string} toolName - Name used for cost lookup and usage reports.
 * @param {(params: object) => Promise<object>} handler - Tool implementation.
 * @returns {(params: object) => Promise<object>} The wrapped handler.
 */
function withAuth(toolName, handler) {
  return async (params) => {
    // Creator mode bypasses credit checks and usage reporting entirely.
    if (AuthManager.isCreatorMode()) {
      return handler(params);
    }

    const startTime = Date.now();
    const creditCost = AuthManager.getToolCost(toolName);

    // Check credits before executing.
    const hasCredits = await AuthManager.checkCredits(creditCost);
    if (!hasCredits) {
      return {
        content: [{
          type: "text",
          text: JSON.stringify({
            error: "Insufficient credits",
            message: `This operation requires ${creditCost} credits. Please upgrade your plan at https://crawlforge.com/pricing`,
            creditsRequired: creditCost
          }, null, 2)
        }]
      };
    }

    // Only the handler itself is guarded here, so that a later failure in the
    // success-path report cannot be mistaken for a tool failure and billed twice.
    let result;
    try {
      result = await handler(params);
    } catch (error) {
      // Half credits for errors, but never less than one.
      const errorCost = Math.max(1, Math.floor(creditCost * 0.5));
      try {
        await AuthManager.reportUsage(toolName, errorCost, params, 500, Date.now() - startTime);
      } catch (reportError) {
        // Keep the handler's error as the one the caller sees.
        console.error(`Failed to report usage for ${toolName}:`, reportError);
      }
      throw error;
    }

    // Report usage for successful execution.
    await AuthManager.reportUsage(toolName, creditCost, params, 200, Date.now() - startTime);
    return result;
  };
}
142
+
143
// Core tools. search_web is only instantiated when isSearchConfigured()
// reports a usable configuration; otherwise it stays null.
let searchWebTool = isSearchConfigured()
  ? new SearchWebTool(getToolConfig('search_web'))
  : null;
const crawlDeepTool = new CrawlDeepTool(getToolConfig('crawl_deep'));
const mapSiteTool = new MapSiteTool(getToolConfig('map_site'));

// Phase 3 tools: content extraction, document processing, summarization, analysis.
const extractContentTool = new ExtractContentTool();
const processDocumentTool = new ProcessDocumentTool();
const summarizeContentTool = new SummarizeContentTool();
const analyzeContentTool = new AnalyzeContentTool();

// Wave 2 advanced tools.
const batchScrapeTool = new BatchScrapeTool();
const scrapeWithActionsTool = new ScrapeWithActionsTool();

// Deep research tool.
const deepResearchTool = new DeepResearchTool();

// Change tracking tool - temporarily disabled due to import issue
// const trackChangesTool = new TrackChangesTool();

// LLMs.txt generator tool (Phase 2.5).
const generateLLMsTxtTool = new GenerateLLMsTxtTool();

// Wave 3-4 core managers.
const stealthBrowserManager = new StealthBrowserManager();
const localizationManager = new LocalizationManager();
173
+
174
+ // Zod schemas for tool parameters and responses
175
+ const FetchUrlSchema = z.object({
176
+ url: z.string().url(),
177
+ headers: z.record(z.string()).optional(),
178
+ timeout: z.number().min(1000).max(30000).optional().default(10000)
179
+ });
180
+
181
+ const ExtractTextSchema = z.object({
182
+ url: z.string().url(),
183
+ remove_scripts: z.boolean().optional().default(true),
184
+ remove_styles: z.boolean().optional().default(true)
185
+ });
186
+
187
+ const ExtractLinksSchema = z.object({
188
+ url: z.string().url(),
189
+ filter_external: z.boolean().optional().default(false),
190
+ base_url: z.string().url().optional()
191
+ });
192
+
193
+ const ExtractMetadataSchema = z.object({
194
+ url: z.string().url()
195
+ });
196
+
197
+ const ScrapeStructuredSchema = z.object({
198
+ url: z.string().url(),
199
+ selectors: z.record(z.string())
200
+ });
201
+
202
+ const SearchWebSchema = z.object({
203
+ query: z.string(),
204
+ limit: z.number().min(1).max(100).optional(),
205
+ offset: z.number().min(0).optional(),
206
+ lang: z.string().optional(),
207
+ safe_search: z.boolean().optional(),
208
+ time_range: z.enum(['day', 'week', 'month', 'year', 'all']).optional(),
209
+ site: z.string().optional(),
210
+ file_type: z.string().optional()
211
+ });
212
+
213
+ const CrawlDeepSchema = z.object({
214
+ url: z.string().url(),
215
+ max_depth: z.number().min(1).max(5).optional(),
216
+ max_pages: z.number().min(1).max(1000).optional(),
217
+ include_patterns: z.array(z.string()).optional(),
218
+ exclude_patterns: z.array(z.string()).optional(),
219
+ follow_external: z.boolean().optional(),
220
+ respect_robots: z.boolean().optional(),
221
+ extract_content: z.boolean().optional(),
222
+ concurrency: z.number().min(1).max(20).optional()
223
+ });
224
+
225
+ const MapSiteSchema = z.object({
226
+ url: z.string().url(),
227
+ include_sitemap: z.boolean().optional(),
228
+ max_urls: z.number().min(1).max(10000).optional(),
229
+ group_by_path: z.boolean().optional(),
230
+ include_metadata: z.boolean().optional()
231
+ });
232
+
233
+ const ExtractContentSchema = z.object({
234
+ url: z.string().url(),
235
+ options: z.object({}).optional()
236
+ });
237
+
238
+ const ProcessDocumentSchema = z.object({
239
+ source: z.string(),
240
+ sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional(),
241
+ options: z.object({}).optional()
242
+ });
243
+
244
+ const SummarizeContentSchema = z.object({
245
+ text: z.string(),
246
+ options: z.object({}).optional()
247
+ });
248
+
249
+ const AnalyzeContentSchema = z.object({
250
+ text: z.string(),
251
+ options: z.object({}).optional()
252
+ });
253
+
254
+ // Wave 2 Advanced Tools Schemas
255
+ const BatchScrapeSchema = z.object({
256
+ urls: z.array(z.union([
257
+ z.string().url(),
258
+ z.object({
259
+ url: z.string().url(),
260
+ selectors: z.record(z.string()).optional(),
261
+ headers: z.record(z.string()).optional(),
262
+ timeout: z.number().min(1000).max(30000).optional(),
263
+ metadata: z.record(z.any()).optional()
264
+ })
265
+ ])).min(1).max(50),
266
+
267
+ formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
268
+ mode: z.enum(['sync', 'async']).default('sync'),
269
+
270
+ webhook: z.object({
271
+ url: z.string().url(),
272
+ events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']),
273
+ headers: z.record(z.string()).optional(),
274
+ signingSecret: z.string().optional()
275
+ }).optional(),
276
+
277
+ extractionSchema: z.record(z.string()).optional(),
278
+ maxConcurrency: z.number().min(1).max(20).default(10),
279
+ delayBetweenRequests: z.number().min(0).max(10000).default(100),
280
+ includeMetadata: z.boolean().default(true),
281
+ includeFailed: z.boolean().default(true),
282
+ pageSize: z.number().min(1).max(100).default(25),
283
+
284
+ jobOptions: z.object({
285
+ priority: z.number().default(0),
286
+ ttl: z.number().min(60000).default(24 * 60 * 60 * 1000),
287
+ maxRetries: z.number().min(0).max(5).default(1),
288
+ tags: z.array(z.string()).default([])
289
+ }).optional()
290
+ });
291
+
292
+ const ScrapeWithActionsSchema = z.object({
293
+ url: z.string().url(),
294
+ actions: z.array(z.object({
295
+ type: z.enum(['wait', 'click', 'type', 'press', 'scroll', 'screenshot', 'executeJavaScript']),
296
+ selector: z.string().optional(),
297
+ text: z.string().optional(),
298
+ key: z.string().optional(),
299
+ script: z.string().optional(),
300
+ timeout: z.number().optional(),
301
+ description: z.string().optional(),
302
+ continueOnError: z.boolean().default(false),
303
+ retries: z.number().min(0).max(5).default(0)
304
+ })).min(1).max(20),
305
+
306
+ formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']),
307
+ captureIntermediateStates: z.boolean().default(false),
308
+ captureScreenshots: z.boolean().default(true),
309
+
310
+ formAutoFill: z.object({
311
+ fields: z.array(z.object({
312
+ selector: z.string(),
313
+ value: z.string(),
314
+ type: z.enum(['text', 'select', 'checkbox', 'radio', 'file']).default('text'),
315
+ waitAfter: z.number().min(0).max(5000).default(100)
316
+ })),
317
+ submitSelector: z.string().optional(),
318
+ waitAfterSubmit: z.number().min(0).max(30000).default(2000)
319
+ }).optional(),
320
+
321
+ browserOptions: z.object({
322
+ headless: z.boolean().default(true),
323
+ userAgent: z.string().optional(),
324
+ viewportWidth: z.number().min(800).max(1920).default(1280),
325
+ viewportHeight: z.number().min(600).max(1080).default(720),
326
+ timeout: z.number().min(10000).max(120000).default(30000)
327
+ }).optional(),
328
+
329
+ extractionOptions: z.object({
330
+ selectors: z.record(z.string()).optional(),
331
+ includeMetadata: z.boolean().default(true),
332
+ includeLinks: z.boolean().default(true),
333
+ includeImages: z.boolean().default(true)
334
+ }).optional(),
335
+
336
+ continueOnActionError: z.boolean().default(false),
337
+ maxRetries: z.number().min(0).max(3).default(1),
338
+ screenshotOnError: z.boolean().default(true)
339
+ });
340
+
341
+ // Deep Research Tool Schema
342
+ const DeepResearchSchema = z.object({
343
+ topic: z.string().min(3).max(500),
344
+ maxDepth: z.number().min(1).max(10).optional().default(5),
345
+ maxUrls: z.number().min(1).max(1000).optional().default(50),
346
+ timeLimit: z.number().min(30000).max(300000).optional().default(120000),
347
+ researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad'),
348
+ sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']),
349
+ credibilityThreshold: z.number().min(0).max(1).optional().default(0.3),
350
+ includeRecentOnly: z.boolean().optional().default(false),
351
+ enableConflictDetection: z.boolean().optional().default(true),
352
+ enableSourceVerification: z.boolean().optional().default(true),
353
+ enableSynthesis: z.boolean().optional().default(true),
354
+ outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive'),
355
+ includeRawData: z.boolean().optional().default(false),
356
+ includeActivityLog: z.boolean().optional().default(false),
357
+ queryExpansion: z.object({
358
+ enableSynonyms: z.boolean().optional().default(true),
359
+ enableSpellCheck: z.boolean().optional().default(true),
360
+ enableContextual: z.boolean().optional().default(true),
361
+ maxVariations: z.number().min(1).max(20).optional().default(8)
362
+ }).optional(),
363
+ llmConfig: z.object({
364
+ provider: z.enum(['auto', 'openai', 'anthropic']).optional().default('auto'),
365
+ openai: z.object({
366
+ apiKey: z.string().optional(),
367
+ model: z.string().optional().default('gpt-3.5-turbo'),
368
+ embeddingModel: z.string().optional().default('text-embedding-ada-002')
369
+ }).optional(),
370
+ anthropic: z.object({
371
+ apiKey: z.string().optional(),
372
+ model: z.string().optional().default('claude-3-haiku-20240307')
373
+ }).optional(),
374
+ enableSemanticAnalysis: z.boolean().optional().default(true),
375
+ enableIntelligentSynthesis: z.boolean().optional().default(true)
376
+ }).optional(),
377
+ concurrency: z.number().min(1).max(20).optional().default(5),
378
+ cacheResults: z.boolean().optional().default(true),
379
+ webhook: z.object({
380
+ url: z.string().url(),
381
+ events: z.array(z.enum(['started', 'progress', 'completed', 'failed'])).optional().default(['completed']),
382
+ headers: z.record(z.string()).optional()
383
+ }).optional()
384
+ });
385
+
386
+ // Change Tracking Tool Schema
387
+ const TrackChangesSchema = z.object({
388
+ url: z.string().url(),
389
+ operation: z.enum(['create_baseline', 'compare', 'monitor', 'get_history', 'get_stats']).default('compare'),
390
+ content: z.string().optional(),
391
+ html: z.string().optional(),
392
+ trackingOptions: z.object({
393
+ granularity: z.enum(['page', 'section', 'element', 'text']).default('section'),
394
+ trackText: z.boolean().default(true),
395
+ trackStructure: z.boolean().default(true),
396
+ trackAttributes: z.boolean().default(false),
397
+ trackImages: z.boolean().default(false),
398
+ trackLinks: z.boolean().default(true),
399
+ ignoreWhitespace: z.boolean().default(true),
400
+ ignoreCase: z.boolean().default(false),
401
+ customSelectors: z.array(z.string()).optional(),
402
+ excludeSelectors: z.array(z.string()).optional(),
403
+ significanceThresholds: z.object({
404
+ minor: z.number().min(0).max(1).default(0.1),
405
+ moderate: z.number().min(0).max(1).default(0.3),
406
+ major: z.number().min(0).max(1).default(0.7)
407
+ }).optional()
408
+ }).optional(),
409
+ monitoringOptions: z.object({
410
+ enabled: z.boolean().default(false),
411
+ interval: z.number().min(60000).max(24 * 60 * 60 * 1000).default(300000),
412
+ maxRetries: z.number().min(0).max(5).default(3),
413
+ retryDelay: z.number().min(1000).max(60000).default(5000),
414
+ notificationThreshold: z.enum(['minor', 'moderate', 'major', 'critical']).default('moderate'),
415
+ enableWebhook: z.boolean().default(false),
416
+ webhookUrl: z.string().url().optional(),
417
+ webhookSecret: z.string().optional()
418
+ }).optional(),
419
+ storageOptions: z.object({
420
+ enableSnapshots: z.boolean().default(true),
421
+ retainHistory: z.boolean().default(true),
422
+ maxHistoryEntries: z.number().min(1).max(1000).default(100),
423
+ compressionEnabled: z.boolean().default(true),
424
+ deltaStorageEnabled: z.boolean().default(true)
425
+ }).optional(),
426
+ queryOptions: z.object({
427
+ limit: z.number().min(1).max(500).default(50),
428
+ offset: z.number().min(0).default(0),
429
+ startTime: z.number().optional(),
430
+ endTime: z.number().optional(),
431
+ includeContent: z.boolean().default(false),
432
+ significanceFilter: z.enum(['all', 'minor', 'moderate', 'major', 'critical']).optional()
433
+ }).optional(),
434
+ notificationOptions: z.object({
435
+ webhook: z.object({
436
+ enabled: z.boolean().default(false),
437
+ url: z.string().url().optional(),
438
+ method: z.enum(['POST', 'PUT']).default('POST'),
439
+ headers: z.record(z.string()).optional(),
440
+ signingSecret: z.string().optional(),
441
+ includeContent: z.boolean().default(false)
442
+ }).optional(),
443
+ slack: z.object({
444
+ enabled: z.boolean().default(false),
445
+ webhookUrl: z.string().url().optional(),
446
+ channel: z.string().optional(),
447
+ username: z.string().optional()
448
+ }).optional()
449
+ }).optional()
450
+ });
451
+
452
+ // LLMs.txt Generator Tool Schema (Phase 2.5)
453
+ const GenerateLLMsTxtSchema = z.object({
454
+ url: z.string().url(),
455
+ analysisOptions: z.object({
456
+ maxDepth: z.number().min(1).max(5).optional().default(3),
457
+ maxPages: z.number().min(10).max(500).optional().default(100),
458
+ detectAPIs: z.boolean().optional().default(true),
459
+ analyzeContent: z.boolean().optional().default(true),
460
+ checkSecurity: z.boolean().optional().default(true),
461
+ respectRobots: z.boolean().optional().default(true)
462
+ }).optional(),
463
+ outputOptions: z.object({
464
+ includeDetailed: z.boolean().optional().default(true),
465
+ includeAnalysis: z.boolean().optional().default(false),
466
+ contactEmail: z.string().email().optional(),
467
+ organizationName: z.string().optional(),
468
+ customGuidelines: z.array(z.string()).optional(),
469
+ customRestrictions: z.array(z.string()).optional()
470
+ }).optional(),
471
+ complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard'),
472
+ format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both')
473
+ });
474
+
475
+ // Stealth Mode Tool Schema (Wave 3)
476
+ const StealthModeSchema = z.object({
477
+ operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure'),
478
+ stealthConfig: z.object({
479
+ level: z.enum(['basic', 'medium', 'advanced']).default('medium'),
480
+ randomizeFingerprint: z.boolean().default(true),
481
+ hideWebDriver: z.boolean().default(true),
482
+ blockWebRTC: z.boolean().default(true),
483
+ spoofTimezone: z.boolean().default(true),
484
+ randomizeHeaders: z.boolean().default(true),
485
+ useRandomUserAgent: z.boolean().default(true),
486
+ simulateHumanBehavior: z.boolean().default(true),
487
+ customUserAgent: z.string().optional(),
488
+ customViewport: z.object({
489
+ width: z.number().min(800).max(1920),
490
+ height: z.number().min(600).max(1080)
491
+ }).optional(),
492
+ locale: z.string().default('en-US'),
493
+ timezone: z.string().optional(),
494
+ webRTCPublicIP: z.string().optional(),
495
+ webRTCLocalIPs: z.array(z.string()).optional(),
496
+ proxyRotation: z.object({
497
+ enabled: z.boolean().default(false),
498
+ proxies: z.array(z.string()).optional(),
499
+ rotationInterval: z.number().default(300000)
500
+ }).optional(),
501
+ antiDetection: z.object({
502
+ cloudflareBypass: z.boolean().default(true),
503
+ recaptchaHandling: z.boolean().default(true),
504
+ hideAutomation: z.boolean().default(true),
505
+ spoofMediaDevices: z.boolean().default(true),
506
+ spoofBatteryAPI: z.boolean().default(true)
507
+ }).optional(),
508
+ fingerprinting: z.object({
509
+ canvasNoise: z.boolean().default(true),
510
+ webglSpoofing: z.boolean().default(true),
511
+ audioContextSpoofing: z.boolean().default(true),
512
+ fontSpoofing: z.boolean().default(true),
513
+ hardwareSpoofing: z.boolean().default(true)
514
+ }).optional()
515
+ }).optional(),
516
+ contextId: z.string().optional(),
517
+ urlToTest: z.string().url().optional()
518
+ });
519
+
520
+ // Localization Tool Schema (Wave 3)
521
+ const LocalizationSchema = z.object({
522
+ operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country'),
523
+ countryCode: z.string().length(2).optional(),
524
+ language: z.string().optional(),
525
+ timezone: z.string().optional(),
526
+ currency: z.string().length(3).optional(),
527
+ customHeaders: z.record(z.string()).optional(),
528
+ userAgent: z.string().optional(),
529
+ acceptLanguage: z.string().optional(),
530
+ geoLocation: z.object({
531
+ latitude: z.number().min(-90).max(90),
532
+ longitude: z.number().min(-180).max(180),
533
+ accuracy: z.number().min(1).max(100).optional()
534
+ }).optional(),
535
+ proxySettings: z.object({
536
+ enabled: z.boolean().default(false),
537
+ region: z.string().optional(),
538
+ type: z.enum(['http', 'https', 'socks4', 'socks5']).default('https'),
539
+ server: z.string().optional(),
540
+ port: z.number().optional(),
541
+ username: z.string().optional(),
542
+ password: z.string().optional(),
543
+ rotation: z.object({
544
+ enabled: z.boolean().default(false),
545
+ interval: z.number().default(300000),
546
+ strategy: z.enum(['round-robin', 'random', 'failover']).default('round-robin')
547
+ }).optional(),
548
+ fallback: z.object({
549
+ enabled: z.boolean().default(true),
550
+ maxRetries: z.number().default(3),
551
+ timeout: z.number().default(10000)
552
+ }).optional()
553
+ }).optional(),
554
+ searchParams: z.object({
555
+ query: z.string().optional(),
556
+ limit: z.number().optional(),
557
+ offset: z.number().optional(),
558
+ headers: z.record(z.string()).optional()
559
+ }).optional(),
560
+ browserOptions: z.object({
561
+ locale: z.string().optional(),
562
+ timezoneId: z.string().optional(),
563
+ extraHTTPHeaders: z.record(z.string()).optional(),
564
+ userAgent: z.string().optional()
565
+ }).optional(),
566
+ content: z.string().optional(),
567
+ url: z.string().url().optional(),
568
+ response: z.object({
569
+ status: z.number(),
570
+ body: z.string().optional(),
571
+ statusText: z.string().optional()
572
+ }).optional()
573
+ });
574
+
575
+
576
/**
 * Fetch a URL, aborting the request if it has not produced a response within
 * `timeout` milliseconds.
 *
 * The timer is cleared as soon as `fetch` settles, so it bounds the wait for
 * the response, not the time later spent reading the body.
 *
 * @param {string} url - URL to request.
 * @param {object} [options]
 * @param {number} [options.timeout=10000] - Milliseconds before the request is aborted.
 * @param {Record<string, string>} [options.headers={}] - Extra request headers.
 *   A caller-supplied User-Agent (in any letter case) replaces the default.
 * @returns {Promise<Response>} The fetch Response.
 * @throws {Error} `Request timeout after <timeout>ms` when the timer aborts the
 *   request (the original abort error is attached as `cause`); any other
 *   fetch error is rethrown unchanged.
 */
async function fetchWithTimeout(url, options = {}) {
  const { timeout = 10000, headers = {} } = options;

  // Header names are case-insensitive. Merging through Headers means a caller's
  // "user-agent" overrides the default instead of being sent alongside it and
  // combined into "CrawlForge/1.0.0, <custom>".
  const requestHeaders = new Headers(headers);
  if (!requestHeaders.has('User-Agent')) {
    requestHeaders.set('User-Agent', 'CrawlForge/1.0.0');
  }

  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), timeout);

  try {
    return await fetch(url, {
      signal: controller.signal,
      headers: requestHeaders
    });
  } catch (error) {
    if (error.name === 'AbortError') {
      throw new Error(`Request timeout after ${timeout}ms`, { cause: error });
    }
    throw error;
  } finally {
    // Runs on success and on failure, so the timer can never outlive the call.
    clearTimeout(timeoutId);
  }
}
602
+
603
// Tool: fetch_url
// Requests a URL and returns the status, response headers and body as a JSON
// text payload. Failures are returned as an isError result rather than thrown.
server.registerTool("fetch_url", {
  description: "Fetch content from a URL with optional headers and timeout",
  inputSchema: {
    url: z.string().url(),
    headers: z.record(z.string()).optional(),
    timeout: z.number().min(1000).max(30000).optional().default(10000)
  }
}, withAuth("fetch_url", async ({ url, headers, timeout }) => {
  try {
    const requestOptions = {
      timeout: timeout || 10000,
      headers: headers || {}
    };
    const response = await fetchWithTimeout(url, requestOptions);
    const body = await response.text();

    // Flatten the Headers object into a plain object so it can be serialized.
    const responseHeaders = {};
    for (const [key, value] of response.headers) {
      responseHeaders[key] = value;
    }

    const payload = {
      status: response.status,
      statusText: response.statusText,
      headers: responseHeaders,
      body: body,
      contentType: response.headers.get('content-type') || 'unknown',
      size: body.length,
      url: response.url
    };

    return {
      content: [{ type: "text", text: JSON.stringify(payload, null, 2) }]
    };
  } catch (error) {
    const failure = { type: "text", text: `Failed to fetch URL: ${error.message}` };
    return { content: [failure], isError: true };
  }
}));
648
+
649
// Tool: extract_text
// Downloads a page, strips scripts/styles (unless disabled) plus common
// navigation and advertising containers, and returns the remaining body text
// with word and character counts.
server.registerTool("extract_text", {
  description: "Extract clean text content from a webpage",
  inputSchema: {
    url: z.string().url(),
    remove_scripts: z.boolean().optional().default(true),
    remove_styles: z.boolean().optional().default(true)
  }
}, withAuth("extract_text", async ({ url, remove_scripts, remove_styles }) => {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const $ = load(await response.text());

    // Only an explicit `false` keeps scripts or styles in the document.
    const keepScripts = remove_scripts === false;
    const keepStyles = remove_styles === false;
    if (!keepScripts) {
      $('script').remove();
    }
    if (!keepStyles) {
      $('style').remove();
    }

    // Drop common non-content elements.
    $('nav, header, footer, aside, .advertisement, .ad, .sidebar').remove();

    // Collapse every whitespace run to a single space.
    const text = $('body').text().replace(/\s+/g, ' ').trim();
    const words = text.split(/\s+/).filter((word) => word.length > 0);

    const summary = {
      text: text,
      word_count: words.length,
      char_count: text.length,
      url: response.url
    };

    return {
      content: [{ type: "text", text: JSON.stringify(summary, null, 2) }]
    };
  } catch (error) {
    const failure = { type: "text", text: `Failed to extract text: ${error.message}` };
    return { content: [failure], isError: true };
  }
}));
702
+
703
// Tool: extract_links - Extract all links from a webpage with optional filtering.
//
// Inputs:
//   url             - page to fetch.
//   filter_external - when true, links pointing at another origin are dropped.
//   base_url        - optional base used to resolve relative hrefs.
// Output (JSON text): { links, total_count, internal_count, external_count, base_url }
// where each link is { href, text, is_external, original_href } and links are
// de-duplicated by href (first occurrence wins).
// Fetch/HTTP failures are returned as an isError response rather than thrown.
server.registerTool("extract_links", {
  description: "Extract all links from a webpage with optional filtering",
  inputSchema: {
    url: z.string().url(),
    filter_external: z.boolean().optional().default(false),
    base_url: z.string().url().optional()
  }
}, withAuth("extract_links", async ({ url, filter_external, base_url }) => {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const html = await response.text();
    const $ = load(html);

    const pageUrl = new URL(url);
    // Value echoed back to the caller as `base_url` (unchanged contract).
    const baseUrl = base_url || pageUrl.origin;
    // Relative hrefs must resolve against the full page URL, not just its
    // origin: "next.html" on /docs/page.html is /docs/next.html, not /next.html.
    const resolutionBase = base_url || pageUrl.href;

    const links = [];
    const seenHrefs = new Set();

    $('a[href]').each((_, element) => {
      const href = $(element).attr('href');
      if (!href) return;

      let resolved;
      try {
        resolved = new URL(href, resolutionBase);
      } catch {
        // Deliberate best-effort: hrefs that cannot be parsed are skipped.
        return;
      }

      // Only hrefs that name their own host (absolute http(s) in any letter
      // case, or protocol-relative "//host/...") can be external; path-only,
      // fragment and non-http hrefs stay classified as internal.
      const namesHost = /^(https?:)?\/\//i.test(href);
      const isExternal = namesHost && resolved.origin !== pageUrl.origin;

      if (filter_external && isExternal) return;

      // Explicit http(s) hrefs are reported verbatim; everything else is
      // reported in resolved absolute form.
      const isVerbatimAbsolute = href.startsWith('http://') || href.startsWith('https://');
      const absoluteUrl = isVerbatimAbsolute ? href : resolved.toString();

      // Set-based de-duplication keeps the first occurrence in O(1) per link.
      if (seenHrefs.has(absoluteUrl)) return;
      seenHrefs.add(absoluteUrl);

      links.push({
        href: absoluteUrl,
        text: $(element).text().trim(),
        is_external: isExternal,
        original_href: href
      });
    });

    const externalCount = links.filter(link => link.is_external).length;

    return {
      content: [{
        type: "text",
        text: JSON.stringify({
          links: links,
          total_count: links.length,
          internal_count: links.length - externalCount,
          external_count: externalCount,
          base_url: baseUrl
        }, null, 2)
      }]
    };
  } catch (error) {
    return {
      content: [{
        type: "text",
        text: `Failed to extract links: ${error.message}`
      }],
      isError: true
    };
  }
}));
786
+
787
// Tool: extract_metadata - Extract page metadata.
//
// Fetches `url` and reports title, description, keywords, canonical URL,
// author, robots, viewport, charset, plus all Open Graph (`og:*`) and
// Twitter Card (`twitter:*`) meta tags. Missing values are reported as ''.
// Failures are returned as an isError response instead of being thrown.
server.registerTool("extract_metadata", {
  description: "Extract metadata from a webpage (title, description, keywords, etc.)",
  inputSchema: {
    url: z.string().url()
  }
}, withAuth("extract_metadata", async ({ url }) => {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const $ = load(await response.text());

    // `content` attribute of the first match, or '' when absent/empty.
    const contentOf = (selector) => $(selector).attr('content') || '';

    // Collects { shortName: content } for every meta tag whose `attribute`
    // starts with `prefix`; the prefix is removed from the key.
    const collectPrefixed = (attribute, prefix) => {
      const collected = {};
      $(`meta[${attribute}^="${prefix}"]`).each((_, node) => {
        const key = $(node).attr(attribute);
        const value = $(node).attr('content');
        if (key && value) {
          collected[key.replace(prefix, '')] = value;
        }
      });
      return collected;
    };

    // Title falls back to the first <h1> when <title> is empty.
    const pageTitle = $('title').text().trim() || $('h1').first().text().trim();
    const keywordList = contentOf('meta[name="keywords"]')
      .split(',')
      .map(k => k.trim())
      .filter(k => k);

    const metadata = {
      title: pageTitle,
      description: contentOf('meta[name="description"]') || contentOf('meta[property="og:description"]'),
      keywords: keywordList,
      canonical_url: $('link[rel="canonical"]').attr('href') || '',
      author: contentOf('meta[name="author"]'),
      robots: contentOf('meta[name="robots"]'),
      viewport: contentOf('meta[name="viewport"]'),
      charset: $('meta[charset]').attr('charset') || contentOf('meta[http-equiv="Content-Type"]'),
      og_tags: collectPrefixed('property', 'og:'),
      twitter_tags: collectPrefixed('name', 'twitter:'),
      url: response.url
    };

    return {
      content: [{ type: "text", text: JSON.stringify(metadata, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Failed to extract metadata: ${error.message}` }],
      isError: true
    };
  }
}));
865
+
866
// Tool: scrape_structured - Extract structured data using CSS selectors.
//
// `selectors` maps an output field name to a CSS selector. For each field:
//   - no match          -> null
//   - exactly one match -> trimmed text of that element
//   - several matches   -> array of trimmed texts
//   - selector throws   -> { error, message } for that field only
// NOTE: `elements_found` is the number of fields processed (every selector
// produces a key, including null results), not the number of DOM matches.
server.registerTool("scrape_structured", {
  description: "Extract structured data from a webpage using CSS selectors",
  inputSchema: {
    url: z.string().url(),
    selectors: z.record(z.string())
  }
}, withAuth("scrape_structured", async ({ url, selectors }) => {
  try {
    const response = await fetchWithTimeout(url);
    if (!response.ok) {
      throw new Error(`HTTP ${response.status}: ${response.statusText}`);
    }

    const $ = load(await response.text());

    // Resolves a single selector to null, a string, or an array of strings.
    const readField = (selector) => {
      const matches = $(selector);
      if (matches.length === 0) {
        return null;
      }
      if (matches.length === 1) {
        return matches.text().trim();
      }
      return matches.map((_, el) => $(el).text().trim()).get();
    };

    const results = {};
    Object.entries(selectors).forEach(([fieldName, selector]) => {
      try {
        results[fieldName] = readField(selector);
      } catch (selectorError) {
        // A bad selector only poisons its own field, not the whole response.
        results[fieldName] = {
          error: `Invalid selector: ${selector}`,
          message: selectorError.message
        };
      }
    });

    const payload = {
      data: results,
      selectors_used: selectors,
      elements_found: Object.keys(results).length,
      url: response.url
    };

    return {
      content: [{ type: "text", text: JSON.stringify(payload, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Failed to scrape structured data: ${error.message}` }],
      isError: true
    };
  }
}));
927
+
928
// Tool: search_web - Web search with configurable providers.
//
// The tool is registered only when `searchWebTool` is available; otherwise a
// provider-specific warning is written to stderr and no tool is registered.
if (!searchWebTool) {
  const activeProvider = getActiveSearchProvider();
  const warning = activeProvider === 'google'
    ? "Warning: search_web tool not configured. Set GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID to enable Google search."
    : "Warning: search_web tool initialization failed. Check your SEARCH_PROVIDER configuration.";
  console.error(warning);
} else {
  const activeProvider = getActiveSearchProvider();

  // Human-readable provider label used in the tool description.
  let providerName = 'Auto-selected provider';
  if (activeProvider === 'google') {
    providerName = 'Google Custom Search API';
  } else if (activeProvider === 'duckduckgo') {
    providerName = 'DuckDuckGo';
  }

  server.registerTool("search_web", {
    description: `Search the web using ${providerName}`,
    inputSchema: {
      query: z.string(),
      limit: z.number().min(1).max(100).optional(),
      offset: z.number().min(0).optional(),
      lang: z.string().optional(),
      safe_search: z.boolean().optional(),
      time_range: z.enum(['day', 'week', 'month', 'year', 'all']).optional(),
      site: z.string().optional(),
      file_type: z.string().optional()
    }
  }, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => {
    // An empty query passes the schema (plain z.string()), so reject it here.
    if (!query) {
      return {
        content: [{ type: "text", text: "Query parameter is required" }],
        isError: true
      };
    }

    try {
      const searchArgs = { query, limit, offset, lang, safe_search, time_range, site, file_type };
      const outcome = await searchWebTool.execute(searchArgs);
      return {
        content: [{ type: "text", text: JSON.stringify(outcome, null, 2) }]
      };
    } catch (error) {
      return {
        content: [{ type: "text", text: `Search failed: ${error.message}` }],
        isError: true
      };
    }
  }));
}
983
+
984
// Tool: crawl_deep - Deep crawl websites with BFS algorithm.
//
// Validates the arguments against the schema below and forwards them
// unchanged to `crawlDeepTool.execute`; the result is returned as
// pretty-printed JSON text. Errors become an isError response.
server.registerTool("crawl_deep", {
  description: "Crawl websites deeply using breadth-first search",
  inputSchema: {
    url: z.string().url(),
    max_depth: z.number().min(1).max(5).optional(),
    max_pages: z.number().min(1).max(1000).optional(),
    include_patterns: z.array(z.string()).optional(),
    exclude_patterns: z.array(z.string()).optional(),
    follow_external: z.boolean().optional(),
    respect_robots: z.boolean().optional(),
    extract_content: z.boolean().optional(),
    concurrency: z.number().min(1).max(20).optional()
  }
}, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency }) => {
  // Defensive guard; the schema already requires a URL.
  if (!url) {
    return {
      content: [{ type: "text", text: "URL parameter is required" }],
      isError: true
    };
  }

  try {
    const crawlArgs = {
      url,
      max_depth,
      max_pages,
      include_patterns,
      exclude_patterns,
      follow_external,
      respect_robots,
      extract_content,
      concurrency
    };
    const outcome = await crawlDeepTool.execute(crawlArgs);
    return {
      content: [{ type: "text", text: JSON.stringify(outcome, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Crawl failed: ${error.message}` }],
      isError: true
    };
  }
}));
1027
+
1028
// Tool: map_site - Discover and map website structure.
//
// Forwards the validated arguments to `mapSiteTool.execute` and returns its
// result as pretty-printed JSON text. Errors become an isError response.
server.registerTool("map_site", {
  description: "Discover and map website structure",
  inputSchema: {
    url: z.string().url(),
    include_sitemap: z.boolean().optional(),
    max_urls: z.number().min(1).max(10000).optional(),
    group_by_path: z.boolean().optional(),
    include_metadata: z.boolean().optional()
  }
}, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata }) => {
  // Defensive guard; the schema already requires a URL.
  if (!url) {
    return {
      content: [{ type: "text", text: "URL parameter is required" }],
      isError: true
    };
  }

  try {
    const mapArgs = { url, include_sitemap, max_urls, group_by_path, include_metadata };
    const outcome = await mapSiteTool.execute(mapArgs);
    return {
      content: [{ type: "text", text: JSON.stringify(outcome, null, 2) }]
    };
  } catch (error) {
    return {
      content: [{ type: "text", text: `Site mapping failed: ${error.message}` }],
      isError: true
    };
  }
}));
1067
+
1068
+ // Phase 3 Tools: Enhanced Content Processing
1069
+
1070
// Tool: extract_content - Enhanced content extraction with readability detection.
//
// Inputs:
//   url     - page to process (required).
//   options - free-form options object forwarded to extractContentTool.
// The result of `extractContentTool.execute` is returned as pretty-printed
// JSON text; any thrown error becomes an isError response.
server.registerTool("extract_content", {
  description: "Extract and analyze main content from web pages with enhanced readability detection",
  inputSchema: {
    url: z.string().url(),
    // Zod object schemas drop unrecognized keys when parsing, so a bare
    // z.object({}) would reduce every caller-supplied options object to {}.
    // passthrough() keeps the keys so they actually reach the tool.
    options: z.object({}).passthrough().optional()
  }
}, withAuth("extract_content", async ({ url, options }) => {
  try {
    // Defensive guard; the schema already requires a URL.
    if (!url) {
      return {
        content: [{
          type: "text",
          text: "URL parameter is required"
        }],
        isError: true
      };
    }

    const result = await extractContentTool.execute({ url, options });
    return {
      content: [{
        type: "text",
        text: JSON.stringify(result, null, 2)
      }]
    };
  } catch (error) {
    return {
      content: [{
        type: "text",
        text: `Content extraction failed: ${error.message}`
      }],
      isError: true
    };
  }
}));
1106
+
1107
// Tool: process_document - Multi-format document processing.
//
// Inputs:
//   source     - document location (required, non-empty).
//   sourceType - one of 'url', 'pdf_url', 'file', 'pdf_file' (optional).
//   options    - free-form options object forwarded to processDocumentTool.
// The result of `processDocumentTool.execute` is returned as pretty-printed
// JSON text; any thrown error becomes an isError response.
server.registerTool("process_document", {
  description: "Process documents from multiple sources and formats including PDFs and web pages",
  inputSchema: {
    source: z.string(),
    sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional(),
    // Zod object schemas drop unrecognized keys when parsing, so a bare
    // z.object({}) would reduce every caller-supplied options object to {}.
    // passthrough() keeps the keys so they actually reach the tool.
    options: z.object({}).passthrough().optional()
  }
}, withAuth("process_document", async ({ source, sourceType, options }) => {
  try {
    // z.string() accepts '', so an empty source is rejected here.
    if (!source) {
      return {
        content: [{
          type: "text",
          text: "Source parameter is required"
        }],
        isError: true
      };
    }

    const result = await processDocumentTool.execute({ source, sourceType, options });
    return {
      content: [{
        type: "text",
        text: JSON.stringify(result, null, 2)
      }]
    };
  } catch (error) {
    return {
      content: [{
        type: "text",
        text: `Document processing failed: ${error.message}`
      }],
      isError: true
    };
  }
}));
1144
+
1145
// Tool: summarize_content - Intelligent content summarization.
//
// Inputs:
//   text    - content to summarize (required, non-empty).
//   options - free-form options object forwarded to summarizeContentTool.
// The result of `summarizeContentTool.execute` is returned as pretty-printed
// JSON text; any thrown error becomes an isError response.
server.registerTool("summarize_content", {
  description: "Generate intelligent summaries of text content with configurable options",
  inputSchema: {
    text: z.string(),
    // Zod object schemas drop unrecognized keys when parsing, so a bare
    // z.object({}) would reduce every caller-supplied options object to {}.
    // passthrough() keeps the keys so they actually reach the tool.
    options: z.object({}).passthrough().optional()
  }
}, withAuth("summarize_content", async ({ text, options }) => {
  try {
    // z.string() accepts '', so empty text is rejected here.
    if (!text) {
      return {
        content: [{
          type: "text",
          text: "Text parameter is required"
        }],
        isError: true
      };
    }

    const result = await summarizeContentTool.execute({ text, options });
    return {
      content: [{
        type: "text",
        text: JSON.stringify(result, null, 2)
      }]
    };
  } catch (error) {
    return {
      content: [{
        type: "text",
        text: `Content summarization failed: ${error.message}`
      }],
      isError: true
    };
  }
}));
1181
+
1182
// Tool: analyze_content - Comprehensive content analysis.
//
// Inputs:
//   text    - content to analyze (required, non-empty).
//   options - free-form options object forwarded to analyzeContentTool.
// The result of `analyzeContentTool.execute` is returned as pretty-printed
// JSON text; any thrown error becomes an isError response.
server.registerTool("analyze_content", {
  description: "Perform comprehensive content analysis including language detection and topic extraction",
  inputSchema: {
    text: z.string(),
    // Zod object schemas drop unrecognized keys when parsing, so a bare
    // z.object({}) would reduce every caller-supplied options object to {}.
    // passthrough() keeps the keys so they actually reach the tool.
    options: z.object({}).passthrough().optional()
  }
}, withAuth("analyze_content", async ({ text, options }) => {
  try {
    // z.string() accepts '', so empty text is rejected here.
    if (!text) {
      return {
        content: [{
          type: "text",
          text: "Text parameter is required"
        }],
        isError: true
      };
    }

    const result = await analyzeContentTool.execute({ text, options });
    return {
      content: [{
        type: "text",
        text: JSON.stringify(result, null, 2)
      }]
    };
  } catch (error) {
    return {
      content: [{
        type: "text",
        text: `Content analysis failed: ${error.message}`
      }],
      isError: true
    };
  }
}));
1218
+
1219
+
1220
+ // Wave 2 Advanced Tools
1221
+
1222
// Tool: batch_scrape - Process multiple URLs simultaneously with job management.
//
// The validated parameter object is handed as-is to `batchScrapeTool.execute`
// and its result is returned as pretty-printed JSON text. Errors become an
// isError response. Sub-schemas are named locally (block-scoped) for
// readability only.
{
  const ONE_DAY_MS = 24 * 60 * 60 * 1000;

  // A batch entry is either a bare URL or an object with per-URL overrides.
  const batchEntrySchema = z.union([
    z.string().url(),
    z.object({
      url: z.string().url(),
      selectors: z.record(z.string()).optional(),
      headers: z.record(z.string()).optional(),
      timeout: z.number().min(1000).max(30000).optional(),
      metadata: z.record(z.any()).optional()
    })
  ]);

  const batchWebhookSchema = z.object({
    url: z.string().url(),
    events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']),
    headers: z.record(z.string()).optional(),
    signingSecret: z.string().optional()
  });

  const batchJobOptionsSchema = z.object({
    priority: z.number().default(0),
    ttl: z.number().min(60000).default(ONE_DAY_MS),
    maxRetries: z.number().min(0).max(5).default(1),
    tags: z.array(z.string()).default([])
  });

  server.registerTool("batch_scrape", {
    description: "Process multiple URLs simultaneously with support for async job management and webhook notifications",
    inputSchema: {
      urls: z.array(batchEntrySchema).min(1).max(50),
      formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']),
      mode: z.enum(['sync', 'async']).default('sync'),
      webhook: batchWebhookSchema.optional(),
      extractionSchema: z.record(z.string()).optional(),
      maxConcurrency: z.number().min(1).max(20).default(10),
      delayBetweenRequests: z.number().min(0).max(10000).default(100),
      includeMetadata: z.boolean().default(true),
      includeFailed: z.boolean().default(true),
      pageSize: z.number().min(1).max(100).default(25),
      jobOptions: batchJobOptionsSchema.optional()
    }
  }, withAuth("batch_scrape", async (params) => {
    try {
      const outcome = await batchScrapeTool.execute(params);
      const serialized = JSON.stringify(outcome, null, 2);
      return { content: [{ type: "text", text: serialized }] };
    } catch (error) {
      return {
        content: [{ type: "text", text: `Batch scrape failed: ${error.message}` }],
        isError: true
      };
    }
  }));
}
1276
+
1277
// Tool: scrape_with_actions - Execute action chains before scraping.
//
// The validated parameter object is handed as-is to
// `scrapeWithActionsTool.execute` and its result is returned as
// pretty-printed JSON text. Errors become an isError response.
// Sub-schemas are named locally (block-scoped) for readability only.
{
  // One step of the browser action chain (1-20 steps per request).
  const actionStepSchema = z.object({
    type: z.enum(['wait', 'click', 'type', 'press', 'scroll', 'screenshot', 'executeJavaScript']),
    selector: z.string().optional(),
    text: z.string().optional(),
    key: z.string().optional(),
    script: z.string().optional(),
    timeout: z.number().optional(),
    description: z.string().optional(),
    continueOnError: z.boolean().default(false),
    retries: z.number().min(0).max(5).default(0)
  });

  const formFieldSchema = z.object({
    selector: z.string(),
    value: z.string(),
    type: z.enum(['text', 'select', 'checkbox', 'radio', 'file']).default('text'),
    waitAfter: z.number().min(0).max(5000).default(100)
  });

  const formAutoFillSchema = z.object({
    fields: z.array(formFieldSchema),
    submitSelector: z.string().optional(),
    waitAfterSubmit: z.number().min(0).max(30000).default(2000)
  });

  const browserOptionsSchema = z.object({
    headless: z.boolean().default(true),
    userAgent: z.string().optional(),
    viewportWidth: z.number().min(800).max(1920).default(1280),
    viewportHeight: z.number().min(600).max(1080).default(720),
    timeout: z.number().min(10000).max(120000).default(30000)
  });

  const extractionOptionsSchema = z.object({
    selectors: z.record(z.string()).optional(),
    includeMetadata: z.boolean().default(true),
    includeLinks: z.boolean().default(true),
    includeImages: z.boolean().default(true)
  });

  server.registerTool("scrape_with_actions", {
    description: "Execute browser action chains before scraping content, with form auto-fill and intermediate state capture",
    inputSchema: {
      url: z.string().url(),
      actions: z.array(actionStepSchema).min(1).max(20),
      formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']),
      captureIntermediateStates: z.boolean().default(false),
      captureScreenshots: z.boolean().default(true),
      formAutoFill: formAutoFillSchema.optional(),
      browserOptions: browserOptionsSchema.optional(),
      extractionOptions: extractionOptionsSchema.optional(),
      continueOnActionError: z.boolean().default(false),
      maxRetries: z.number().min(0).max(3).default(1),
      screenshotOnError: z.boolean().default(true)
    }
  }, withAuth("scrape_with_actions", async (params) => {
    try {
      const outcome = await scrapeWithActionsTool.execute(params);
      const serialized = JSON.stringify(outcome, null, 2);
      return { content: [{ type: "text", text: serialized }] };
    } catch (error) {
      return {
        content: [{ type: "text", text: `Scrape with actions failed: ${error.message}` }],
        isError: true
      };
    }
  }));
}
1342
+
1343
// Tool: deep_research - Comprehensive multi-stage research with source verification.
//
// The validated parameter object is handed as-is to
// `deepResearchTool.execute` and its result is returned as pretty-printed
// JSON text. Errors become an isError response.
// Sub-schemas are named locally (block-scoped) for readability only.
{
  const queryExpansionSchema = z.object({
    enableSynonyms: z.boolean().optional().default(true),
    enableSpellCheck: z.boolean().optional().default(true),
    enableContextual: z.boolean().optional().default(true),
    maxVariations: z.number().min(1).max(20).optional().default(8)
  });

  const openAiConfigSchema = z.object({
    apiKey: z.string().optional(),
    model: z.string().optional().default('gpt-3.5-turbo'),
    embeddingModel: z.string().optional().default('text-embedding-ada-002')
  });

  const anthropicConfigSchema = z.object({
    apiKey: z.string().optional(),
    model: z.string().optional().default('claude-3-haiku-20240307')
  });

  const llmConfigSchema = z.object({
    provider: z.enum(['auto', 'openai', 'anthropic']).optional().default('auto'),
    openai: openAiConfigSchema.optional(),
    anthropic: anthropicConfigSchema.optional(),
    enableSemanticAnalysis: z.boolean().optional().default(true),
    enableIntelligentSynthesis: z.boolean().optional().default(true)
  });

  const researchWebhookSchema = z.object({
    url: z.string().url(),
    events: z.array(z.enum(['started', 'progress', 'completed', 'failed'])).optional().default(['completed']),
    headers: z.record(z.string()).optional()
  });

  server.registerTool("deep_research", {
    description: "Conduct comprehensive multi-stage research with intelligent query expansion, source verification, and conflict detection",
    inputSchema: {
      topic: z.string().min(3).max(500),
      maxDepth: z.number().min(1).max(10).optional().default(5),
      maxUrls: z.number().min(1).max(1000).optional().default(50),
      timeLimit: z.number().min(30000).max(300000).optional().default(120000),
      researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad'),
      sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']),
      credibilityThreshold: z.number().min(0).max(1).optional().default(0.3),
      includeRecentOnly: z.boolean().optional().default(false),
      enableConflictDetection: z.boolean().optional().default(true),
      enableSourceVerification: z.boolean().optional().default(true),
      enableSynthesis: z.boolean().optional().default(true),
      outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive'),
      includeRawData: z.boolean().optional().default(false),
      includeActivityLog: z.boolean().optional().default(false),
      queryExpansion: queryExpansionSchema.optional(),
      llmConfig: llmConfigSchema.optional(),
      concurrency: z.number().min(1).max(20).optional().default(5),
      cacheResults: z.boolean().optional().default(true),
      webhook: researchWebhookSchema.optional()
    }
  }, withAuth("deep_research", async (params) => {
    try {
      const outcome = await deepResearchTool.execute(params);
      const serialized = JSON.stringify(outcome, null, 2);
      return { content: [{ type: "text", text: serialized }] };
    } catch (error) {
      return {
        content: [{ type: "text", text: `Deep research failed: ${error.message}` }],
        isError: true
      };
    }
  }));
}
1408
+
1409
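// Tool: track_changes - Enhanced content change tracking with baseline capture and monitoring (Phase 2.4)
// Temporarily disabled due to an import issue. The block comment that follows disables BOTH
// track_changes and generate_llms_txt (Phase 2.5).
// FIXME before re-enabling: track_changes opens `inputSchema: {` but closes it with `})`, and its
// handler is not wrapped in withAuth like the other tools; generate_llms_txt opens
// `withAuth(` but ends with `});` instead of `}));`.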
1411
+ /*
1412
+ server.registerTool("track_changes", {
1413
+ description: "Enhanced content change tracking with baseline capture, comparison, scheduled monitoring, advanced comparison engine, alert system, and historical analysis",
1414
+ inputSchema: {
1415
+ url: z.string().url(),
1416
+ operation: z.enum([
1417
+ 'create_baseline',
1418
+ 'compare',
1419
+ 'monitor',
1420
+ 'get_history',
1421
+ 'get_stats',
1422
+ 'create_scheduled_monitor',
1423
+ 'stop_scheduled_monitor',
1424
+ 'get_dashboard',
1425
+ 'export_history',
1426
+ 'create_alert_rule',
1427
+ 'generate_trend_report',
1428
+ 'get_monitoring_templates'
1429
+ ]).default('compare'),
1430
+ content: z.string().optional(),
1431
+ html: z.string().optional(),
1432
+ trackingOptions: z.object({
1433
+ granularity: z.enum(['page', 'section', 'element', 'text']).default('section'),
1434
+ trackText: z.boolean().default(true),
1435
+ trackStructure: z.boolean().default(true),
1436
+ trackAttributes: z.boolean().default(false),
1437
+ trackImages: z.boolean().default(false),
1438
+ trackLinks: z.boolean().default(true),
1439
+ ignoreWhitespace: z.boolean().default(true),
1440
+ ignoreCase: z.boolean().default(false),
1441
+ customSelectors: z.array(z.string()).optional(),
1442
+ excludeSelectors: z.array(z.string()).optional(),
1443
+ significanceThresholds: z.object({
1444
+ minor: z.number().min(0).max(1).default(0.1),
1445
+ moderate: z.number().min(0).max(1).default(0.3),
1446
+ major: z.number().min(0).max(1).default(0.7)
1447
+ }).optional()
1448
+ }).optional(),
1449
+ monitoringOptions: z.object({
1450
+ enabled: z.boolean().default(false),
1451
+ interval: z.number().min(60000).max(24 * 60 * 60 * 1000).default(300000),
1452
+ maxRetries: z.number().min(0).max(5).default(3),
1453
+ retryDelay: z.number().min(1000).max(60000).default(5000),
1454
+ notificationThreshold: z.enum(['minor', 'moderate', 'major', 'critical']).default('moderate'),
1455
+ enableWebhook: z.boolean().default(false),
1456
+ webhookUrl: z.string().url().optional(),
1457
+ webhookSecret: z.string().optional()
1458
+ }).optional(),
1459
+ storageOptions: z.object({
1460
+ enableSnapshots: z.boolean().default(true),
1461
+ retainHistory: z.boolean().default(true),
1462
+ maxHistoryEntries: z.number().min(1).max(1000).default(100),
1463
+ compressionEnabled: z.boolean().default(true),
1464
+ deltaStorageEnabled: z.boolean().default(true)
1465
+ }).optional(),
1466
+ queryOptions: z.object({
1467
+ limit: z.number().min(1).max(500).default(50),
1468
+ offset: z.number().min(0).default(0),
1469
+ startTime: z.number().optional(),
1470
+ endTime: z.number().optional(),
1471
+ includeContent: z.boolean().default(false),
1472
+ significanceFilter: z.enum(['all', 'minor', 'moderate', 'major', 'critical']).optional()
1473
+ }).optional(),
1474
+ notificationOptions: z.object({
1475
+ webhook: z.object({
1476
+ enabled: z.boolean().default(false),
1477
+ url: z.string().url().optional(),
1478
+ method: z.enum(['POST', 'PUT']).default('POST'),
1479
+ headers: z.record(z.string()).optional(),
1480
+ signingSecret: z.string().optional(),
1481
+ includeContent: z.boolean().default(false)
1482
+ }).optional(),
1483
+ slack: z.object({
1484
+ enabled: z.boolean().default(false),
1485
+ webhookUrl: z.string().url().optional(),
1486
+ channel: z.string().optional(),
1487
+ username: z.string().optional()
1488
+ }).optional()
1489
+ }).optional(),
1490
+ // Enhanced Phase 2.4 options
1491
+ scheduledMonitorOptions: z.object({
1492
+ schedule: z.string().optional(), // Cron expression
1493
+ templateId: z.string().optional(), // Monitoring template ID
1494
+ enabled: z.boolean().default(true)
1495
+ }).optional(),
1496
+ alertRuleOptions: z.object({
1497
+ ruleId: z.string().optional(),
1498
+ condition: z.string().optional(), // Condition description
1499
+ actions: z.array(z.enum(['webhook', 'email', 'slack'])).optional(),
1500
+ throttle: z.number().min(0).optional(),
1501
+ priority: z.enum(['low', 'medium', 'high']).optional()
1502
+ }).optional(),
1503
+ exportOptions: z.object({
1504
+ format: z.enum(['json', 'csv']).default('json'),
1505
+ startTime: z.number().optional(),
1506
+ endTime: z.number().optional(),
1507
+ includeContent: z.boolean().default(false),
1508
+ includeSnapshots: z.boolean().default(false)
1509
+ }).optional(),
1510
+ dashboardOptions: z.object({
1511
+ includeRecentAlerts: z.boolean().default(true),
1512
+ includeTrends: z.boolean().default(true),
1513
+ includeMonitorStatus: z.boolean().default(true)
1514
+ }).optional()
1515
+ })
1516
+ }, async (params) => {
1517
+ try {
1518
+ const result = await trackChangesTool.execute(params);
1519
+ return {
1520
+ content: [{
1521
+ type: "text",
1522
+ text: JSON.stringify(result, null, 2)
1523
+ }]
1524
+ };
1525
+ } catch (error) {
1526
+ return {
1527
+ content: [{
1528
+ type: "text",
1529
+ text: `Change tracking failed: ${error.message}`
1530
+ }],
1531
+ isError: true
1532
+ };
1533
+ }
1534
+ });
1535
+
1536
+ // Tool: generate_llms_txt - Generate LLMs.txt and LLMs-full.txt files (Phase 2.5)
1537
+ server.registerTool("generate_llms_txt", {
1538
+ description: "Analyze websites and generate standard-compliant LLMs.txt and LLMs-full.txt files defining AI model interaction guidelines",
1539
+ inputSchema: {
1540
+ url: z.string().url(),
1541
+ analysisOptions: z.object({
1542
+ maxDepth: z.number().min(1).max(5).optional().default(3),
1543
+ maxPages: z.number().min(10).max(500).optional().default(100),
1544
+ detectAPIs: z.boolean().optional().default(true),
1545
+ analyzeContent: z.boolean().optional().default(true),
1546
+ checkSecurity: z.boolean().optional().default(true),
1547
+ respectRobots: z.boolean().optional().default(true)
1548
+ }).optional(),
1549
+ outputOptions: z.object({
1550
+ includeDetailed: z.boolean().optional().default(true),
1551
+ includeAnalysis: z.boolean().optional().default(false),
1552
+ contactEmail: z.string().email().optional(),
1553
+ organizationName: z.string().optional(),
1554
+ customGuidelines: z.array(z.string()).optional(),
1555
+ customRestrictions: z.array(z.string()).optional()
1556
+ }).optional(),
1557
+ complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard'),
1558
+ format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both')
1559
+ }
1560
+ }, withAuth("generate_llms_txt", async (params) => {
1561
+ try {
1562
+ const result = await generateLLMsTxtTool.execute(params);
1563
+ return {
1564
+ content: [{
1565
+ type: "text",
1566
+ text: JSON.stringify(result, null, 2)
1567
+ }]
1568
+ };
1569
+ } catch (error) {
1570
+ return {
1571
+ content: [{
1572
+ type: "text",
1573
+ text: `LLMs.txt generation failed: ${error.message}`
1574
+ }],
1575
+ isError: true
1576
+ };
1577
+ }
1578
+ });
1579
+ */
1580
+
1581
// Tool: stealth_mode - Advanced anti-detection browser management (Wave 3)
//
// Dispatches on `operation` to the shared stealthBrowserManager instance:
//   configure      - validate `stealthConfig` and echo the validated config
//   enable/disable - toggle stealth mode (level defaults to 'medium')
//   create_context - create a stealth context, returning its id and fingerprint
//   create_page    - create a page in `contextId`, optionally navigating to `urlToTest`
//   get_stats      - return the manager's statistics
//   cleanup        - release the manager's resources
// Every failure (including missing required arguments) is reported through the
// catch block so the client always receives `isError: true` for errors.
server.registerTool("stealth_mode", {
  description: "Advanced anti-detection browser management with stealth features, fingerprint randomization, and human behavior simulation",
  inputSchema: {
    operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure'),
    stealthConfig: z.object({
      level: z.enum(['basic', 'medium', 'advanced']).default('medium'),
      randomizeFingerprint: z.boolean().default(true),
      hideWebDriver: z.boolean().default(true),
      blockWebRTC: z.boolean().default(true),
      spoofTimezone: z.boolean().default(true),
      randomizeHeaders: z.boolean().default(true),
      useRandomUserAgent: z.boolean().default(true),
      simulateHumanBehavior: z.boolean().default(true),
      customUserAgent: z.string().optional(),
      customViewport: z.object({
        width: z.number().min(800).max(1920),
        height: z.number().min(600).max(1080)
      }).optional(),
      locale: z.string().default('en-US'),
      timezone: z.string().optional(),
      webRTCPublicIP: z.string().optional(),
      webRTCLocalIPs: z.array(z.string()).optional(),
      proxyRotation: z.object({
        enabled: z.boolean().default(false),
        proxies: z.array(z.string()).optional(),
        rotationInterval: z.number().default(300000)
      }).optional(),
      antiDetection: z.object({
        cloudflareBypass: z.boolean().default(true),
        recaptchaHandling: z.boolean().default(true),
        hideAutomation: z.boolean().default(true),
        spoofMediaDevices: z.boolean().default(true),
        spoofBatteryAPI: z.boolean().default(true)
      }).optional(),
      fingerprinting: z.object({
        canvasNoise: z.boolean().default(true),
        webglSpoofing: z.boolean().default(true),
        audioContextSpoofing: z.boolean().default(true),
        fontSpoofing: z.boolean().default(true),
        hardwareSpoofing: z.boolean().default(true)
      }).optional()
    }).optional(),
    contextId: z.string().optional(),
    urlToTest: z.string().url().optional()
  }
}, withAuth("stealth_mode", async ({ operation, stealthConfig, contextId, urlToTest }) => {
  try {
    let result;

    // Each case has its own block so its `const` bindings do not leak into
    // sibling cases of the switch.
    switch (operation) {
      case 'configure': {
        // 'configure' is the schema default, so a call without arguments lands
        // here; throw so the failure is flagged with isError like other errors.
        if (!stealthConfig) {
          throw new Error('stealthConfig is required for configure operation');
        }
        const validated = stealthBrowserManager.validateConfig(stealthConfig);
        result = { configured: true, config: validated };
        break;
      }

      case 'enable': {
        const level = stealthConfig?.level || 'medium';
        stealthBrowserManager.enableStealthMode(level);
        result = { enabled: true, level };
        break;
      }

      case 'disable': {
        stealthBrowserManager.disableStealthMode();
        result = { disabled: true };
        break;
      }

      case 'create_context': {
        const contextData = await stealthBrowserManager.createStealthContext(stealthConfig);
        result = {
          contextId: contextData.contextId,
          fingerprint: contextData.fingerprint,
          created: true
        };
        break;
      }

      case 'create_page': {
        if (!contextId) {
          throw new Error('contextId is required for create_page operation');
        }
        const page = await stealthBrowserManager.createStealthPage(contextId);

        let url = null;
        if (urlToTest) {
          // Report the URL the page ended up on, not the value goto() resolves to.
          // NOTE(review): assumes a Playwright/Puppeteer-style page where goto()
          // resolves to a response object and url() returns the current address;
          // falls back to the requested URL if url() is unavailable.
          await page.goto(urlToTest);
          url = typeof page.url === 'function' ? page.url() : urlToTest;
        }

        result = {
          pageCreated: true,
          contextId,
          url
        };
        break;
      }

      case 'get_stats': {
        result = stealthBrowserManager.getStats();
        break;
      }

      case 'cleanup': {
        await stealthBrowserManager.cleanup();
        result = { cleaned: true };
        break;
      }

      default:
        // Not reachable while the enum in inputSchema is enforced; kept as a guard.
        throw new Error(`Unknown operation: ${operation}`);
    }

    return {
      content: [{
        type: "text",
        text: JSON.stringify(result, null, 2)
      }]
    };
  } catch (error) {
    return {
      content: [{
        type: "text",
        text: `Stealth mode operation failed: ${error.message}`
      }],
      isError: true
    };
  }
}));
1701
+
1702
// Tool: localization - Multi-language and geo-location management (Wave 3)
//
// Routes each `operation` to the matching localizationManager call after
// checking the parameters that operation requires. Successful results are
// returned as pretty-printed JSON text; thrown errors are reported with
// `isError: true`.
server.registerTool("localization", {
  description: "Multi-language and geo-location management with country-specific settings, browser locale emulation, timezone spoofing, and geo-blocked content handling",
  inputSchema: {
    operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country'),
    countryCode: z.string().length(2).optional(),
    language: z.string().optional(),
    timezone: z.string().optional(),
    currency: z.string().length(3).optional(),
    customHeaders: z.record(z.string()).optional(),
    userAgent: z.string().optional(),
    acceptLanguage: z.string().optional(),
    geoLocation: z.object({
      latitude: z.number().min(-90).max(90),
      longitude: z.number().min(-180).max(180),
      accuracy: z.number().min(1).max(100).optional()
    }).optional(),
    proxySettings: z.object({
      enabled: z.boolean().default(false),
      region: z.string().optional(),
      type: z.enum(['http', 'https', 'socks4', 'socks5']).default('https'),
      server: z.string().optional(),
      port: z.number().optional(),
      username: z.string().optional(),
      password: z.string().optional(),
      rotation: z.object({
        enabled: z.boolean().default(false),
        interval: z.number().default(300000),
        strategy: z.enum(['round-robin', 'random', 'failover']).default('round-robin')
      }).optional(),
      fallback: z.object({
        enabled: z.boolean().default(true),
        maxRetries: z.number().default(3),
        timeout: z.number().default(10000)
      }).optional()
    }).optional(),
    searchParams: z.object({
      query: z.string().optional(),
      limit: z.number().optional(),
      offset: z.number().optional(),
      headers: z.record(z.string()).optional()
    }).optional(),
    browserOptions: z.object({
      locale: z.string().optional(),
      timezoneId: z.string().optional(),
      extraHTTPHeaders: z.record(z.string()).optional(),
      userAgent: z.string().optional()
    }).optional(),
    content: z.string().optional(),
    url: z.string().url().optional(),
    response: z.object({
      status: z.number(),
      body: z.string().optional(),
      statusText: z.string().optional()
    }).optional()
  }
}, withAuth("localization", async (params) => {
  try {
    const { operation } = params;
    let payload;

    if (operation === 'configure_country') {
      if (!params.countryCode) {
        throw new Error('countryCode is required for configure_country operation');
      }
      payload = await localizationManager.configureCountry(params.countryCode, params);
    } else if (operation === 'localize_search') {
      if (!params.searchParams) {
        throw new Error('searchParams is required for localize_search operation');
      }
      payload = await localizationManager.localizeSearchQuery(params.searchParams, params.countryCode);
    } else if (operation === 'localize_browser') {
      if (!params.browserOptions) {
        throw new Error('browserOptions is required for localize_browser operation');
      }
      payload = await localizationManager.localizeBrowserContext(params.browserOptions, params.countryCode);
    } else if (operation === 'generate_timezone_spoof') {
      // The script is generated first; the manager's current country is only
      // consulted when the caller did not supply one.
      const timezoneScript = await localizationManager.generateTimezoneSpoof(params.countryCode);
      const countryCode = params.countryCode || localizationManager.getCurrentSettings().countryCode;
      payload = { timezoneScript, countryCode };
    } else if (operation === 'handle_geo_blocking') {
      const hasUrlAndResponse = Boolean(params.url && params.response);
      if (!hasUrlAndResponse) {
        throw new Error('url and response are required for handle_geo_blocking operation');
      }
      payload = await localizationManager.handleGeoBlocking(params.url, params.response);
    } else if (operation === 'auto_detect') {
      const hasContentAndUrl = Boolean(params.content && params.url);
      if (!hasContentAndUrl) {
        throw new Error('content and url are required for auto_detect operation');
      }
      payload = await localizationManager.autoDetectLocalization(params.content, params.url);
    } else if (operation === 'get_stats') {
      payload = localizationManager.getStats();
    } else if (operation === 'get_supported_countries') {
      const supportedCountries = localizationManager.getSupportedCountries();
      const totalCount = localizationManager.getSupportedCountries().length;
      payload = { supportedCountries, totalCount };
    } else {
      payload = { error: `Unknown operation: ${operation}` };
    }

    const text = JSON.stringify(payload, null, 2);
    return {
      content: [{ type: "text", text }]
    };
  } catch (error) {
    const text = `Localization operation failed: ${error.message}`;
    return {
      content: [{ type: "text", text }],
      isError: true
    };
  }
}));
1837
+
1838
// Set up the stdio transport and start the server
/**
 * Connect the MCP server to a stdio transport and print a startup banner.
 *
 * All banner output goes to stderr (console.error); stdout is not written to
 * by this function.
 *
 * @returns {Promise<void>} Resolves once the server is connected and the
 *   banner has been printed; rejects if `server.connect` fails.
 */
async function runServer() {
  const transport = new StdioServerTransport();
  await server.connect(transport);
  console.error("CrawlForge MCP Server v3.0 running on stdio");
  console.error(`Environment: ${config.server.nodeEnv}`);

  // Evaluate once so the status line and the tool list cannot disagree.
  const searchEnabled = isSearchConfigured();
  if (searchEnabled) {
    const activeProvider = getActiveSearchProvider();
    console.error(`Search enabled: ${searchEnabled} (provider: ${activeProvider})`);
  } else {
    console.error(`Search enabled: ${searchEnabled}`);
  }

  const baseTools = 'fetch_url, extract_text, extract_links, extract_metadata, scrape_structured, crawl_deep, map_site';
  const searchTool = searchEnabled ? ', search_web' : '';
  const phase3Tools = ', extract_content, process_document, summarize_content, analyze_content';
  const wave2Tools = ', batch_scrape, scrape_with_actions';
  const researchTools = ', deep_research';
  const trackingTools = ''; // track_changes temporarily disabled
  // The generate_llms_txt registration is commented out, so it is not advertised.
  // Restore ', generate_llms_txt' here when that registration is re-enabled.
  const llmsTxtTools = '';
  const wave3Tools = ', stealth_mode, localization';
  console.error(`Tools available: ${baseTools}${searchTool}${phase3Tools}${wave2Tools}${researchTools}${trackingTools}${llmsTxtTools}${wave3Tools}`);

  // Start memory monitoring in development
  if (config.server.nodeEnv === "development") {
    memoryMonitor.start();
    console.error("Memory monitoring started");
  }
}
1868
+
1869
// Launch the server; a startup failure is logged to stderr and ends the
// process with a non-zero exit code.
runServer().catch(function onServerFailure(startupError) {
  console.error("Server error:", startupError);
  process.exit(1);
});
1873
// === MEMORY LEAK PREVENTION ===
// Add graceful shutdown handling to prevent memory leaks

// Set on the first shutdown request; a second request forces an immediate exit.
let isShuttingDown = false;

/**
 * Release tool resources and terminate the process.
 *
 * Each tool exposing `destroy()` (preferred) or `cleanup()` is cleaned up in
 * parallel, bounded by a 5 second timeout. A failure in one tool is logged and
 * does not stop the others.
 *
 * @param {string} signal - What triggered the shutdown: a signal name such as
 *   'SIGINT'/'SIGTERM', or 'uncaughtException'/'unhandledRejection'.
 * @returns {Promise<void>} In practice does not resolve normally, because the
 *   process exits: code 0 for signal-driven shutdowns, code 1 when the trigger
 *   was an uncaught exception / unhandled rejection, when cleanup itself
 *   throws, or when a shutdown is already in progress.
 */
async function gracefulShutdown(signal) {
  if (isShuttingDown) {
    console.error("Force shutdown...");
    process.exit(1);
    return;
  }

  isShuttingDown = true;
  console.error(`Received ${signal}. Starting graceful shutdown...`);

  // A shutdown caused by a crash must not report success to the parent process.
  const exitCode = (signal === 'uncaughtException' || signal === 'unhandledRejection') ? 1 : 0;
  const CLEANUP_TIMEOUT_MS = 5000;
  let cleanupTimer;

  try {
    // Cleanup tools that have destroy methods
    const toolsToCleanup = [
      batchScrapeTool,
      scrapeWithActionsTool,
      deepResearchTool,
      // trackChangesTool, // temporarily disabled
      generateLLMsTxtTool,
      stealthBrowserManager,
      localizationManager
    ].filter(tool => tool && (typeof tool.destroy === 'function' || typeof tool.cleanup === 'function'));

    console.error(`Cleaning up ${toolsToCleanup.length} tools...`);

    const cleanupAll = Promise.all(toolsToCleanup.map(async (tool) => {
      try {
        if (typeof tool.destroy === 'function') {
          await tool.destroy();
        } else if (typeof tool.cleanup === 'function') {
          await tool.cleanup();
        }
        console.error(`Cleaned up ${tool.constructor.name}`);
      } catch (error) {
        console.error(`Error cleaning up ${tool.constructor.name}:`, error.message);
      }
    }));

    // Sentinel lets us tell "timer fired" apart from "cleanup finished".
    const timedOut = Symbol('cleanup-timeout');
    const timeout = new Promise((resolve) => {
      cleanupTimer = setTimeout(() => resolve(timedOut), CLEANUP_TIMEOUT_MS);
    });

    // Cleanup tools with timeout
    const outcome = await Promise.race([cleanupAll, timeout]);
    clearTimeout(cleanupTimer); // do not leave a pending timer behind
    if (outcome === timedOut) {
      console.error(`Tool cleanup timed out after ${CLEANUP_TIMEOUT_MS}ms; continuing shutdown`);
    }

    // Stop memory monitoring
    if (memoryMonitor.isMonitoring) {
      memoryMonitor.stop();
      console.error("Memory monitoring stopped");
    }

    // Force garbage collection if available
    if (global.gc) {
      console.error("Running final garbage collection...");
      global.gc();
    }

    console.error("Graceful shutdown completed");
    process.exit(exitCode);

  } catch (error) {
    clearTimeout(cleanupTimer);
    console.error("Error during graceful shutdown:", error);
    process.exit(1);
  }
}
1938
+
1939
// Register signal handlers: both termination signals start a graceful shutdown
// labelled with the signal's own name.
for (const signalName of ['SIGINT', 'SIGTERM']) {
  process.on(signalName, () => gracefulShutdown(signalName));
}

// Handle uncaught exceptions and unhandled rejections: log the failure to
// stderr, then shut down through the same path as the signals.
process.on('uncaughtException', function onUncaughtException(error) {
  console.error('Uncaught Exception:', error);
  gracefulShutdown('uncaughtException');
});

process.on('unhandledRejection', function onUnhandledRejection(reason, promise) {
  console.error('Unhandled Rejection at:', promise, 'reason:', reason);
  gracefulShutdown('unhandledRejection');
});
1953
+
1954
// Memory monitoring (development only)
// Once a minute, report heap usage on stderr when it exceeds the threshold.
if (config.server.nodeEnv === 'development') {
  const HIGH_MEMORY_THRESHOLD_MB = 200; // Alert if over 200MB
  const CHECK_INTERVAL_MS = 60000; // Check every minute

  const memoryCheckTimer = setInterval(() => {
    // Keep the value numeric for the comparison; format it only for display.
    const heapUsedMB = process.memoryUsage().heapUsed / 1024 / 1024;
    if (heapUsedMB > HIGH_MEMORY_THRESHOLD_MB) {
      console.error(`Memory usage: ${heapUsedMB.toFixed(2)}MB (high usage detected)`);
    }
  }, CHECK_INTERVAL_MS);

  // A diagnostic timer must not be the only thing keeping the process alive.
  memoryCheckTimer.unref();
}