crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75):
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,706 @@
1
+ import { z } from 'zod';
2
+ import { ResearchOrchestrator } from '../../core/ResearchOrchestrator.js';
3
+ import { Logger } from '../../utils/Logger.js';
4
+
5
+ /**
6
+ * DeepResearchTool - MCP tool for conducting comprehensive multi-stage research
7
+ * Provides intelligent research orchestration with source verification,
8
+ * conflict detection, and information synthesis
9
+ */
10
+
11
// Enumerated option values, named so the schema below reads declaratively.
const RESEARCH_APPROACHES = ['broad', 'focused', 'academic', 'current_events', 'comparative'];
const SOURCE_TYPES = ['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'];
const OUTPUT_FORMATS = ['comprehensive', 'summary', 'citations_only', 'conflicts_focus'];
const LLM_PROVIDERS = ['auto', 'openai', 'anthropic'];
const WEBHOOK_EVENTS = ['started', 'progress', 'completed', 'failed'];

// Query expansion tuning (optional as a whole; nested defaults apply only when the object is given).
const QueryExpansionSchema = z.object({
  enableSynonyms: z.boolean().optional().default(true),
  enableSpellCheck: z.boolean().optional().default(true),
  enableContextual: z.boolean().optional().default(true),
  maxVariations: z.number().min(1).max(20).optional().default(8)
});

// Per-provider LLM settings.
const OpenAIConfigSchema = z.object({
  apiKey: z.string().optional(),
  model: z.string().optional().default('gpt-3.5-turbo'),
  embeddingModel: z.string().optional().default('text-embedding-ada-002')
});

const AnthropicConfigSchema = z.object({
  apiKey: z.string().optional(),
  model: z.string().optional().default('claude-3-haiku-20240307')
});

// LLM configuration (optional as a whole).
const LLMConfigSchema = z.object({
  provider: z.enum(LLM_PROVIDERS).optional().default('auto'),
  openai: OpenAIConfigSchema.optional(),
  anthropic: AnthropicConfigSchema.optional(),
  enableSemanticAnalysis: z.boolean().optional().default(true),
  enableIntelligentSynthesis: z.boolean().optional().default(true)
});

// Webhook notifications for long-running research.
const WebhookSchema = z.object({
  url: z.string().url(),
  events: z.array(z.enum(WEBHOOK_EVENTS)).optional().default(['completed']),
  headers: z.record(z.string()).optional()
});

/**
 * Validation schema for the parameters accepted by the deep research tool.
 * Only `topic` is required; every other top-level option is optional and
 * most carry a default.
 */
const DeepResearchSchema = z.object({
  topic: z.string().min(3).max(500),

  // Research scope configuration
  maxDepth: z.number().min(1).max(10).optional().default(5),
  maxUrls: z.number().min(1).max(1000).optional().default(50),
  timeLimit: z.number().min(30000).max(300000).optional().default(120000), // 30s to 5min

  // Research approach options
  researchApproach: z.enum(RESEARCH_APPROACHES).optional().default('broad'),

  // Source filtering preferences
  sourceTypes: z.array(z.enum(SOURCE_TYPES)).optional().default(['any']),
  credibilityThreshold: z.number().min(0).max(1).optional().default(0.3),
  includeRecentOnly: z.boolean().optional().default(false),

  // Analysis configuration
  enableConflictDetection: z.boolean().optional().default(true),
  enableSourceVerification: z.boolean().optional().default(true),
  enableSynthesis: z.boolean().optional().default(true),

  // Output preferences
  outputFormat: z.enum(OUTPUT_FORMATS).optional().default('comprehensive'),
  includeRawData: z.boolean().optional().default(false),
  includeActivityLog: z.boolean().optional().default(false),

  // Advanced options
  queryExpansion: QueryExpansionSchema.optional(),
  llmConfig: LLMConfigSchema.optional(),

  concurrency: z.number().min(1).max(20).optional().default(5),
  cacheResults: z.boolean().optional().default(true),

  webhook: WebhookSchema.optional()
});
71
+
72
/** Severity assumed for a conflict that carries no usable numeric severity. */
const DEFAULT_CONFLICT_SEVERITY = 0.5;

/** Upper bound (ms) for a single webhook delivery attempt. */
const WEBHOOK_TIMEOUT_MS = 10000;

/** Placeholder written to logs in place of secret values. */
const REDACTED_VALUE = '[REDACTED]';

/** Return `value` when it is an array, otherwise an empty array. */
function asArray(value) {
  return Array.isArray(value) ? value : [];
}

/**
 * Classify a hostname into a coarse source type by inspecting whole DNS
 * labels, so that e.g. "community.org" is not mistaken for a ".com" host.
 * The "scholar" substring heuristic is kept as-is.
 */
function classifySourceDomain(hostname) {
  const host = hostname.toLowerCase();
  const labels = host.split('.');

  if (labels.includes('edu') || host.includes('scholar')) return 'academic';
  if (labels.includes('gov')) return 'government';
  if (labels.includes('com')) return 'commercial';
  return 'other';
}

/** Read a finite numeric severity from a conflict, or fall back. */
function numericSeverity(conflict, fallback) {
  const severity = conflict?.severity;
  return typeof severity === 'number' && Number.isFinite(severity) ? severity : fallback;
}

/** Copy an LLM configuration, masking every nested `apiKey`. */
function redactApiKeys(llmConfig) {
  return Object.fromEntries(
    Object.entries(llmConfig).map(([key, value]) => {
      const holdsApiKey =
        value !== null &&
        typeof value === 'object' &&
        !Array.isArray(value) &&
        value.apiKey !== undefined;
      return [key, holdsApiKey ? { ...value, apiKey: REDACTED_VALUE } : value];
    })
  );
}

/**
 * MCP tool that runs multi-stage research through a ResearchOrchestrator,
 * tracks the sessions it starts, and shapes the orchestrator output into
 * one of several output formats.
 */
export class DeepResearchTool {
  /**
   * @param {object} [options]
   * @param {number} [options.defaultTimeLimit=120000] Stored on the instance.
   * @param {number} [options.maxConcurrentResearch=3] Maximum simultaneously running sessions.
   * @param {boolean} [options.cacheEnabled=true] Forwarded to the orchestrator.
   * @param {number} [options.cacheTTL=1800000] Forwarded to the orchestrator (30 minutes).
   * Any remaining options are forwarded to the orchestrator unchanged.
   */
  constructor(options = {}) {
    const {
      defaultTimeLimit = 120000,
      maxConcurrentResearch = 3,
      cacheEnabled = true,
      cacheTTL = 1800000, // 30 minutes
      ...orchestratorOptions
    } = options;

    this.defaultTimeLimit = defaultTimeLimit;
    this.maxConcurrentResearch = maxConcurrentResearch;
    this.logger = new Logger({ component: 'DeepResearchTool' });

    // Track active research sessions
    this.activeSessions = new Map();
    this.sessionQueue = [];

    // Default orchestrator configuration
    this.defaultOrchestratorConfig = {
      cacheEnabled,
      cacheTTL,
      ...orchestratorOptions
    };
  }

  /**
   * Validate the parameters, run one research session and return the
   * formatted outcome. Never throws: every failure is reported as an object
   * with `success: false`.
   *
   * @param {object} params Raw tool parameters (validated against DeepResearchSchema).
   * @returns {Promise<object>} Result object with a `success` flag.
   */
  async execute(params) {
    let sessionId;

    try {
      const validated = DeepResearchSchema.parse(params);
      sessionId = this.generateSessionId();

      this.logger.info('Starting deep research', {
        sessionId,
        topic: validated.topic,
        config: this.sanitizeConfigForLogging(validated)
      });

      // Check concurrent session limits
      if (this.activeSessions.size >= this.maxConcurrentResearch) {
        return {
          success: false,
          error: 'Maximum concurrent research sessions reached. Please try again later.',
          queuePosition: this.sessionQueue.length + 1,
          estimatedWaitTime: this.estimateWaitTime()
        };
      }

      // Configure research orchestrator based on research approach
      const orchestratorConfig = this.buildOrchestratorConfig(validated);
      const orchestrator = new ResearchOrchestrator(orchestratorConfig);

      // Keep the start time locally: the session entry is removed before the
      // completion log is written, so it cannot be read back from the map.
      const startTime = Date.now();
      this.activeSessions.set(sessionId, {
        orchestrator,
        startTime,
        topic: validated.topic,
        status: 'running'
      });

      // Set up event listeners for progress tracking
      this.setupEventListeners(orchestrator, sessionId, validated);

      try {
        const researchResults = await orchestrator.conductResearch(
          validated.topic,
          this.buildResearchOptions(validated)
        );

        // Format results according to output preference
        const formattedResults = this.formatResults(researchResults, validated);

        this.logger.info('Research completed successfully', {
          sessionId,
          duration: Date.now() - startTime,
          findingsCount: researchResults.findings?.length || 0
        });

        return {
          success: true,
          sessionId,
          ...formattedResults
        };
      } catch (researchError) {
        const message = researchError?.message ?? String(researchError);

        this.logger.error('Research execution failed', {
          sessionId,
          error: message
        });

        const partialResults = orchestrator.getResearchState();

        return {
          success: false,
          sessionId,
          error: message,
          partialResults: validated.includeRawData ? partialResults : undefined,
          recommendations: this.generateErrorRecoveryRecommendations(researchError, validated)
        };
      } finally {
        // Always release the slot, even if result formatting or error
        // reporting itself throws.
        this.activeSessions.delete(sessionId);
      }
    } catch (validationError) {
      // A failure after registration but before the inner try (e.g. while
      // attaching listeners) must not leave a stale session behind.
      if (sessionId !== undefined) {
        this.activeSessions.delete(sessionId);
      }

      if (validationError instanceof z.ZodError) {
        // `issues` is the canonical list; `errors` is kept as a fallback alias.
        const issues = validationError.issues ?? validationError.errors ?? [];
        return {
          success: false,
          error: 'Invalid parameters',
          details: issues.map(issue => ({
            field: issue.path.join('.'),
            message: issue.message,
            received: issue.received
          }))
        };
      }

      const message = validationError?.message ?? String(validationError);
      this.logger.error('Unexpected error in deep research', { error: message });
      return {
        success: false,
        error: 'An unexpected error occurred during research initialization',
        details: message
      };
    }
  }

  /**
   * Build orchestrator configuration based on research approach.
   *
   * The caller's scope limits (`maxDepth`, `maxUrls`, `timeLimit`) are applied
   * for every approach; individual approaches then tighten or extend them.
   *
   * @param {object} params Validated tool parameters.
   * @returns {object} Configuration object for the ResearchOrchestrator.
   */
  buildOrchestratorConfig(params) {
    const baseConfig = { ...this.defaultOrchestratorConfig };

    // Only copy limits that were actually supplied so constructor-level
    // defaults are not overwritten with `undefined`.
    for (const key of ['maxDepth', 'maxUrls', 'timeLimit']) {
      if (params[key] !== undefined) {
        baseConfig[key] = params[key];
      }
    }

    // Add LLM configuration if provided
    if (params.llmConfig) {
      baseConfig.llmConfig = params.llmConfig;
    }

    // Adjust configuration based on research approach
    switch (params.researchApproach) {
      case 'academic':
        return {
          ...baseConfig,
          maxDepth: Math.min(params.maxDepth, 8),
          enableSourceVerification: true,
          searchConfig: {
            enableRanking: true,
            rankingWeights: {
              authority: 0.4, // Higher weight for academic sources
              semantic: 0.3,
              bm25: 0.2,
              freshness: 0.1
            }
          }
        };

      case 'current_events':
        return {
          ...baseConfig,
          maxDepth: Math.min(params.maxDepth, 6),
          searchConfig: {
            enableRanking: true,
            rankingWeights: {
              freshness: 0.4, // Prioritize recent content
              semantic: 0.3,
              bm25: 0.2,
              authority: 0.1
            }
          }
        };

      case 'focused':
        return {
          ...baseConfig,
          maxDepth: Math.min(params.maxDepth, 4),
          maxUrls: Math.min(params.maxUrls, 30),
          concurrency: Math.min(params.concurrency, 3)
        };

      case 'comparative':
        return {
          ...baseConfig,
          enableConflictDetection: true,
          maxDepth: params.maxDepth,
          searchConfig: {
            enableDeduplication: true,
            deduplicationThresholds: {
              url: 0.9,
              title: 0.8,
              content: 0.7
            }
          }
        };

      case 'broad':
      default:
        return {
          ...baseConfig,
          maxDepth: params.maxDepth,
          maxUrls: params.maxUrls,
          timeLimit: params.timeLimit
        };
    }
  }

  /**
   * Build research options for the orchestrator.
   *
   * @param {object} params Validated tool parameters.
   * @returns {object} Options passed as the second argument of `conductResearch`.
   */
  buildResearchOptions(params) {
    return {
      sourceTypes: params.sourceTypes,
      credibilityThreshold: params.credibilityThreshold,
      includeRecentOnly: params.includeRecentOnly,
      queryExpansion: params.queryExpansion,
      enableConflictDetection: params.enableConflictDetection,
      enableSourceVerification: params.enableSourceVerification,
      enableSynthesis: params.enableSynthesis,
      concurrency: params.concurrency
    };
  }

  /**
   * Set up event listeners for research progress tracking.
   *
   * Webhook deliveries are intentionally fire-and-forget:
   * `sendWebhookNotification` handles its own errors, so research is never
   * blocked on (or failed by) a slow or broken webhook endpoint.
   *
   * @param {object} orchestrator Event emitter exposing `on(event, handler)`.
   * @param {string} sessionId Session the events belong to.
   * @param {object} params Validated tool parameters.
   */
  setupEventListeners(orchestrator, sessionId, params) {
    const { webhook } = params;

    if (webhook) {
      orchestrator.on('researchCompleted', (data) => {
        void this.sendWebhookNotification(webhook, 'completed', { sessionId, ...data });
      });

      orchestrator.on('researchFailed', (data) => {
        void this.sendWebhookNotification(webhook, 'failed', { sessionId, ...data });
      });

      orchestrator.on('activityLogged', (activity) => {
        void this.sendWebhookNotification(webhook, 'progress', { sessionId, activity });
      });
    }

    // Internal progress tracking
    orchestrator.on('activityLogged', (activity) => {
      const session = this.activeSessions.get(sessionId);
      if (session) {
        session.lastActivity = activity;
        session.status = activity.type;
      }
    });
  }

  /**
   * Format research results according to output preferences.
   *
   * List-valued result fields that are missing are treated as empty in the
   * reduced formats, so a sparse orchestrator result does not crash formatting.
   *
   * @param {object} results Orchestrator result.
   * @param {object} params Validated tool parameters (`outputFormat`, `includeActivityLog`, `includeRawData`).
   * @returns {object} Result payload for the selected output format.
   */
  formatResults(results, params) {
    const formatted = {
      researchSummary: results.researchSummary,
      metadata: results.metadata
    };

    const findings = asArray(results.findings);
    const evidence = asArray(results.supportingEvidence);
    const conflicts = asArray(results.conflicts);
    const recommendations = asArray(results.recommendations);

    switch (params.outputFormat) {
      case 'comprehensive':
        return {
          ...formatted,
          findings: results.findings,
          supportingEvidence: results.supportingEvidence,
          consensus: results.consensus,
          conflicts: results.conflicts,
          researchGaps: results.researchGaps,
          recommendations: results.recommendations,
          credibilityAssessment: results.credibilityAssessment,
          performance: results.performance,
          activityLog: params.includeActivityLog ? results.activityLog : undefined,
          rawData: params.includeRawData ? this.extractRawData(results) : undefined
        };

      case 'summary':
        return {
          ...formatted,
          keyFindings: findings.slice(0, 5),
          topSources: evidence.slice(0, 5),
          mainConflicts: conflicts.slice(0, 3),
          primaryRecommendations: recommendations.slice(0, 3),
          credibilityOverview: {
            averageCredibility: results.credibilityAssessment?.averageCredibility,
            highCredibilitySources: results.credibilityAssessment?.highCredibilitySources
          }
        };

      case 'citations_only':
        return {
          ...formatted,
          sources: evidence.map(source => ({
            title: source.title,
            url: source.url,
            credibility: source.credibility,
            relevance: source.relevance
          })),
          citationCount: evidence.length,
          citationSummary: this.generateCitationSummary(evidence)
        };

      case 'conflicts_focus':
        return {
          ...formatted,
          conflicts,
          conflictAnalysis: this.analyzeConflicts(conflicts),
          consensusAreas: results.consensus,
          recommendedActions: recommendations.filter(r =>
            r.type === 'conflict_resolution' || r.type === 'validation'
          )
        };

      default:
        return formatted;
    }
  }

  /**
   * Generate citation summary for citation-focused output.
   *
   * @param {Array<object>} sources Evidence entries with `url` and optional `credibility`.
   * @returns {object} Totals, top five domains, source type distribution and
   *   average credibility (0 when there are no sources).
   */
  generateCitationSummary(sources) {
    const list = asArray(sources);
    // A Map keeps arbitrary hostnames from colliding with Object.prototype keys.
    const domainCounts = new Map();
    const typeDistribution = { academic: 0, commercial: 0, government: 0, other: 0 };

    for (const source of list) {
      let hostname;
      try {
        hostname = new URL(source.url).hostname;
      } catch {
        // Missing or unparseable URL: count it, but attribute no domain.
        typeDistribution.other++;
        continue;
      }

      domainCounts.set(hostname, (domainCounts.get(hostname) ?? 0) + 1);
      typeDistribution[classifySourceDomain(hostname)]++;
    }

    const credibilityTotal = list.reduce((sum, s) => sum + (s?.credibility || 0), 0);

    return {
      totalSources: list.length,
      uniqueDomains: domainCounts.size,
      topDomains: [...domainCounts.entries()]
        .sort(([, a], [, b]) => b - a)
        .slice(0, 5)
        .map(([domain, count]) => ({ domain, count })),
      sourceTypeDistribution: typeDistribution,
      averageCredibility: list.length > 0 ? credibilityTotal / list.length : 0
    };
  }

  /**
   * Analyze conflicts for conflict-focused output.
   *
   * A conflict without a finite numeric `severity` is counted as 0.5; an
   * explicit severity of 0 is respected.
   *
   * @param {Array<object>} conflicts Conflicts with optional `severity` and `type`.
   * @returns {object} Aggregate severity statistics and recommendations.
   */
  analyzeConflicts(conflicts) {
    const list = asArray(conflicts);

    if (list.length === 0) {
      return {
        conflictCount: 0,
        severity: 'none',
        resolutionPriority: 'low'
      };
    }

    const severityLevels = list.map(c => numericSeverity(c, DEFAULT_CONFLICT_SEVERITY));
    const avgSeverity = severityLevels.reduce((sum, s) => sum + s, 0) / severityLevels.length;

    let resolutionPriority = 'low';
    if (avgSeverity >= 0.7) {
      resolutionPriority = 'high';
    } else if (avgSeverity >= 0.4) {
      resolutionPriority = 'medium';
    }

    return {
      conflictCount: list.length,
      averageSeverity: avgSeverity,
      severityDistribution: {
        high: severityLevels.filter(s => s >= 0.7).length,
        medium: severityLevels.filter(s => s >= 0.4 && s < 0.7).length,
        low: severityLevels.filter(s => s < 0.4).length
      },
      conflictTypes: [...new Set(list.map(c => c?.type))],
      resolutionPriority,
      recommendations: this.generateConflictResolutionRecommendations(list)
    };
  }

  /**
   * Generate conflict resolution recommendations.
   *
   * @param {Array<object>} conflicts Conflicts with optional `severity` and `type`.
   * @returns {Array<object>} Recommendations; the expert-consultation entry is always last.
   */
  generateConflictResolutionRecommendations(conflicts) {
    const list = asArray(conflicts);
    const recommendations = [];

    const highSeverityCount = list.filter(c => numericSeverity(c, 0) >= 0.7).length;
    if (highSeverityCount > 0) {
      recommendations.push({
        type: 'priority_investigation',
        description: `Investigate ${highSeverityCount} high-severity conflicts immediately`,
        urgency: 'high'
      });
    }

    if (list.some(c => c?.type === 'contradiction')) {
      recommendations.push({
        type: 'cross_reference',
        description: 'Cross-reference contradictory claims with additional authoritative sources',
        urgency: 'medium'
      });
    }

    recommendations.push({
      type: 'expert_consultation',
      description: 'Consider consulting domain experts for conflict resolution',
      urgency: 'low'
    });

    return recommendations;
  }

  /**
   * Extract raw data for debugging/analysis by grouping activity log entries
   * by their `type`. Every field is `undefined` when there is no activity log.
   *
   * @param {object} results Orchestrator result.
   * @returns {object} Activity log entries grouped by research stage.
   */
  extractRawData(results) {
    const entriesOfType = (type) => results.activityLog?.filter(log => log.type === type);

    return {
      searchQueries: entriesOfType('topic_expansion'),
      sourcesDiscovered: entriesOfType('initial_gathering'),
      extractionResults: entriesOfType('deep_exploration'),
      verificationResults: entriesOfType('source_verification'),
      synthesisProcess: entriesOfType('information_synthesis')
    };
  }

  /**
   * Generate error recovery recommendations from the error message.
   * Matching is case-insensitive and tolerates errors without a `message`.
   *
   * @param {unknown} error The failure raised by the research run.
   * @param {object} params Validated tool parameters.
   * @returns {Array<object>} Recommendations; the fallback-approach entry is always last.
   */
  generateErrorRecoveryRecommendations(error, params) {
    const recommendations = [];
    const message = String(error?.message ?? error ?? '').toLowerCase();

    if (message.includes('timeout') || message.includes('time limit')) {
      recommendations.push({
        type: 'increase_time_limit',
        description: `Consider increasing time limit beyond ${params.timeLimit}ms`,
        suggestedValue: Math.min(params.timeLimit * 2, 300000)
      });

      recommendations.push({
        type: 'reduce_scope',
        description: 'Reduce research scope to fit within time constraints',
        suggestions: {
          maxUrls: Math.ceil(params.maxUrls * 0.7),
          maxDepth: Math.max(1, params.maxDepth - 1)
        }
      });
    }

    if (message.includes('network') || message.includes('fetch')) {
      recommendations.push({
        type: 'retry_with_delay',
        description: 'Network issues detected. Retry with increased delays between requests',
        suggestedDelay: 2000
      });
    }

    if (message.includes('rate limit')) {
      recommendations.push({
        type: 'reduce_concurrency',
        description: 'Rate limiting detected. Reduce concurrent operations',
        suggestedConcurrency: Math.max(1, Math.ceil(params.concurrency / 2))
      });
    }

    recommendations.push({
      type: 'fallback_approach',
      description: 'Try a more focused research approach',
      suggestedApproach: 'focused'
    });

    return recommendations;
  }

  /**
   * Send webhook notification. Best effort: delivery problems are logged and
   * never thrown. Events the webhook did not subscribe to are skipped.
   *
   * NOTE(review): `webhook.url` is caller-supplied and is requested as-is;
   * consider validating it against the project's SSRF protections before
   * dispatch — confirm whether that already happens upstream.
   *
   * @param {object} webhook `{ url, events, headers? }`.
   * @param {string} event Event name.
   * @param {object} data Event payload.
   * @returns {Promise<void>}
   */
  async sendWebhookNotification(webhook, event, data) {
    if (!webhook?.events?.includes(event)) return;

    try {
      const payload = {
        event,
        timestamp: new Date().toISOString(),
        data
      };

      const response = await fetch(webhook.url, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'User-Agent': 'MCP-WebScraper-DeepResearch/1.0',
          ...webhook.headers
        },
        body: JSON.stringify(payload),
        // Bound the request so an unresponsive endpoint cannot hold a
        // connection open indefinitely.
        signal: AbortSignal.timeout(WEBHOOK_TIMEOUT_MS)
      });

      if (!response.ok) {
        this.logger.warn('Webhook notification failed', {
          url: webhook.url,
          status: response.status,
          event
        });
      }
    } catch (error) {
      this.logger.error('Webhook notification error', {
        url: webhook.url,
        error: error.message,
        event
      });
    }
  }

  /**
   * Create a session identifier from the current time and a short random suffix.
   *
   * @returns {string}
   */
  generateSessionId() {
    return `deep_research_${Date.now()}_${Math.random().toString(36).substring(2, 8)}`;
  }

  /**
   * Produce a copy of the validated configuration that is safe to log:
   * webhook headers are dropped and LLM provider API keys are masked.
   *
   * @param {object} config Validated tool parameters.
   * @returns {object} Log-safe copy; the input is not modified.
   */
  sanitizeConfigForLogging(config) {
    const { webhook, llmConfig, ...safeConfig } = config;

    const sanitized = {
      ...safeConfig,
      webhook: webhook ? { url: webhook.url, events: webhook.events } : undefined
    };

    if (llmConfig !== undefined) {
      sanitized.llmConfig =
        llmConfig !== null && typeof llmConfig === 'object' ? redactApiKeys(llmConfig) : llmConfig;
    }

    return sanitized;
  }

  /**
   * Estimate the wait (ms) for a new session from the current queue length,
   * assuming two minutes per queued session.
   *
   * @returns {number}
   */
  estimateWaitTime() {
    const avgSessionTime = 120000; // 2 minutes average
    const queueSize = this.sessionQueue.length;
    return queueSize * avgSessionTime;
  }

  /**
   * List the currently registered sessions.
   *
   * @returns {Array<object>} One summary per active session.
   */
  getActiveSessions() {
    return Array.from(this.activeSessions.entries()).map(([id, session]) => ({
      sessionId: id,
      topic: session.topic,
      status: session.status,
      duration: Date.now() - session.startTime,
      lastActivity: session.lastActivity?.type
    }));
  }

  /**
   * Describe a single session.
   *
   * @param {string} sessionId
   * @returns {object|null} Session status, or `null` when the id is unknown.
   */
  getSessionStatus(sessionId) {
    const session = this.activeSessions.get(sessionId);
    if (!session) return null;

    return {
      sessionId,
      topic: session.topic,
      status: session.status,
      duration: Date.now() - session.startTime,
      lastActivity: session.lastActivity,
      metrics: session.orchestrator.getMetrics()
    };
  }

  /**
   * Stop a running session and remove it from the registry.
   *
   * @param {string} sessionId
   * @returns {boolean} `true` when a session was found and cancelled.
   */
  cancelSession(sessionId) {
    const session = this.activeSessions.get(sessionId);
    if (session) {
      session.orchestrator.stopResearch();
      this.activeSessions.delete(sessionId);
      return true;
    }
    return false;
  }

  /**
   * Get tool statistics and health information.
   *
   * @returns {object} Session counts plus process uptime and memory usage.
   */
  getStats() {
    return {
      activeSessions: this.activeSessions.size,
      queuedSessions: this.sessionQueue.length,
      maxConcurrent: this.maxConcurrentResearch,
      uptime: process.uptime(),
      memory: process.memoryUsage()
    };
  }
}
672
+
673
export default DeepResearchTool;

// Create and export tool instance for MCP compatibility
export const deepResearchTool = new DeepResearchTool();

// Add name property for MCP protocol compliance
deepResearchTool.name = 'deep_research';

// Add validateParameters method for MCP protocol compliance.
// Returns the parsed parameters (with defaults applied) or throws the
// schema's validation error.
deepResearchTool.validateParameters = function(params) {
  return DeepResearchSchema.parse(params);
};

// Add description property for MCP protocol compliance
deepResearchTool.description = 'Conduct comprehensive multi-stage research with intelligent query expansion, source verification, and information synthesis';

// Add inputSchema property for MCP protocol compliance.
// This JSON Schema mirrors DeepResearchSchema so that clients can discover
// every supported option; keep the two in sync when either changes.
deepResearchTool.inputSchema = {
  type: 'object',
  properties: {
    topic: {
      type: 'string',
      minLength: 3,
      maxLength: 500,
      description: 'The research topic or question to investigate'
    },
    maxDepth: {
      type: 'number',
      minimum: 1,
      maximum: 10,
      default: 5,
      description: 'Maximum depth for research exploration'
    },
    maxUrls: {
      type: 'number',
      minimum: 1,
      maximum: 1000,
      default: 50,
      description: 'Maximum number of URLs to process'
    },
    timeLimit: {
      type: 'number',
      minimum: 30000,
      maximum: 300000,
      default: 120000,
      description: 'Time limit for the research in milliseconds (30 seconds to 5 minutes)'
    },
    researchApproach: {
      type: 'string',
      enum: ['broad', 'focused', 'academic', 'current_events', 'comparative'],
      default: 'broad',
      description: 'Overall research strategy'
    },
    sourceTypes: {
      type: 'array',
      items: {
        type: 'string',
        enum: ['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any']
      },
      default: ['any'],
      description: 'Preferred source types'
    },
    credibilityThreshold: {
      type: 'number',
      minimum: 0,
      maximum: 1,
      default: 0.3,
      description: 'Credibility threshold for sources (0 to 1)'
    },
    includeRecentOnly: {
      type: 'boolean',
      default: false,
      description: 'Restrict research to recent content'
    },
    enableConflictDetection: {
      type: 'boolean',
      default: true,
      description: 'Detect conflicting information across sources'
    },
    enableSourceVerification: {
      type: 'boolean',
      default: true,
      description: 'Verify sources'
    },
    enableSynthesis: {
      type: 'boolean',
      default: true,
      description: 'Synthesize gathered information'
    },
    outputFormat: {
      type: 'string',
      enum: ['comprehensive', 'summary', 'citations_only', 'conflicts_focus'],
      default: 'comprehensive',
      description: 'Shape of the returned result'
    },
    includeRawData: {
      type: 'boolean',
      default: false,
      description: 'Include raw data (and partial results on failure) in the response'
    },
    includeActivityLog: {
      type: 'boolean',
      default: false,
      description: 'Include the activity log in comprehensive output'
    },
    queryExpansion: {
      type: 'object',
      description: 'Query expansion options',
      properties: {
        enableSynonyms: { type: 'boolean', default: true },
        enableSpellCheck: { type: 'boolean', default: true },
        enableContextual: { type: 'boolean', default: true },
        maxVariations: { type: 'number', minimum: 1, maximum: 20, default: 8 }
      }
    },
    llmConfig: {
      type: 'object',
      description: 'LLM provider configuration',
      properties: {
        provider: { type: 'string', enum: ['auto', 'openai', 'anthropic'], default: 'auto' },
        openai: {
          type: 'object',
          properties: {
            apiKey: { type: 'string' },
            model: { type: 'string', default: 'gpt-3.5-turbo' },
            embeddingModel: { type: 'string', default: 'text-embedding-ada-002' }
          }
        },
        anthropic: {
          type: 'object',
          properties: {
            apiKey: { type: 'string' },
            model: { type: 'string', default: 'claude-3-haiku-20240307' }
          }
        },
        enableSemanticAnalysis: { type: 'boolean', default: true },
        enableIntelligentSynthesis: { type: 'boolean', default: true }
      }
    },
    concurrency: {
      type: 'number',
      minimum: 1,
      maximum: 20,
      default: 5,
      description: 'Number of concurrent operations'
    },
    cacheResults: {
      type: 'boolean',
      default: true,
      description: 'Whether to cache results'
    },
    webhook: {
      type: 'object',
      description: 'Webhook notifications for long-running research',
      properties: {
        url: { type: 'string', format: 'uri' },
        events: {
          type: 'array',
          items: { type: 'string', enum: ['started', 'progress', 'completed', 'failed'] },
          default: ['completed']
        },
        headers: {
          type: 'object',
          additionalProperties: { type: 'string' }
        }
      },
      required: ['url']
    }
  },
  required: ['topic']
};