crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,706 @@
|
|
|
1
|
+
import { z } from 'zod';
|
|
2
|
+
import { ResearchOrchestrator } from '../../core/ResearchOrchestrator.js';
|
|
3
|
+
import { Logger } from '../../utils/Logger.js';
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* DeepResearchTool - MCP tool for conducting comprehensive multi-stage research
|
|
7
|
+
* Provides intelligent research orchestration with source verification,
|
|
8
|
+
* conflict detection, and information synthesis
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
/**
 * Validation schema for the parameters accepted by the deep research tool.
 * Only `topic` is required; every other top-level field is optional, and
 * most fall back to a default when omitted (the nested `queryExpansion`,
 * `llmConfig` and `webhook` objects have no default and stay undefined).
 * The schema is assembled inside an IIFE so the small field builders do
 * not leak into module scope.
 */
const DeepResearchSchema = (() => {
  // Field builders: each produces "optional with a default" in the same
  // order of modifiers (.optional() first, then .default()).
  const flag = (fallback) => z.boolean().optional().default(fallback);
  const ranged = (low, high, fallback) => z.number().min(low).max(high).optional().default(fallback);
  const oneOf = (choices, fallback) => z.enum(choices).optional().default(fallback);
  const text = (fallback) => z.string().optional().default(fallback);

  // Advanced options: query expansion behaviour
  const queryExpansionSchema = z.object({
    enableSynonyms: flag(true),
    enableSpellCheck: flag(true),
    enableContextual: flag(true),
    maxVariations: ranged(1, 20, 8)
  }).optional();

  // LLM provider selection and per-provider settings
  const openaiSchema = z.object({
    apiKey: z.string().optional(),
    model: text('gpt-3.5-turbo'),
    embeddingModel: text('text-embedding-ada-002')
  }).optional();

  const anthropicSchema = z.object({
    apiKey: z.string().optional(),
    model: text('claude-3-haiku-20240307')
  }).optional();

  const llmConfigSchema = z.object({
    provider: oneOf(['auto', 'openai', 'anthropic'], 'auto'),
    openai: openaiSchema,
    anthropic: anthropicSchema,
    enableSemanticAnalysis: flag(true),
    enableIntelligentSynthesis: flag(true)
  }).optional();

  // Webhook notifications for long-running research
  const webhookSchema = z.object({
    url: z.string().url(),
    events: z.array(z.enum(['started', 'progress', 'completed', 'failed'])).optional().default(['completed']),
    headers: z.record(z.string()).optional()
  }).optional();

  return z.object({
    topic: z.string().min(3).max(500),

    // Research scope configuration
    maxDepth: ranged(1, 10, 5),
    maxUrls: ranged(1, 1000, 50),
    timeLimit: ranged(30000, 300000, 120000), // 30s to 5min

    // Research approach options
    researchApproach: oneOf(['broad', 'focused', 'academic', 'current_events', 'comparative'], 'broad'),

    // Source filtering preferences
    sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']),
    credibilityThreshold: ranged(0, 1, 0.3),
    includeRecentOnly: flag(false),

    // Analysis configuration
    enableConflictDetection: flag(true),
    enableSourceVerification: flag(true),
    enableSynthesis: flag(true),

    // Output preferences
    outputFormat: oneOf(['comprehensive', 'summary', 'citations_only', 'conflicts_focus'], 'comprehensive'),
    includeRawData: flag(false),
    includeActivityLog: flag(false),

    // Advanced options
    queryExpansion: queryExpansionSchema,

    // LLM Configuration
    llmConfig: llmConfigSchema,

    concurrency: ranged(1, 20, 5),
    cacheResults: flag(true),

    // Webhook notifications for long-running research
    webhook: webhookSchema
  });
})();
|
|
71
|
+
|
|
72
|
+
/**
 * MCP tool that validates a research request, runs it through a
 * ResearchOrchestrator and shapes the outcome for the caller.
 *
 * Running sessions are tracked in `activeSessions` (sessionId -> session
 * record). `sessionQueue` is reported by `getStats()` and
 * `estimateWaitTime()` but nothing in this class adds to it: requests over
 * the concurrency limit are rejected rather than queued.
 */
export class DeepResearchTool {
  /**
   * @param {object} [options]
   * @param {number} [options.defaultTimeLimit=120000] Stored on the instance.
   * @param {number} [options.maxConcurrentResearch=3] Maximum number of
   *   simultaneously running sessions before `execute` rejects new ones.
   * @param {boolean} [options.cacheEnabled=true] Forwarded to the orchestrator.
   * @param {number} [options.cacheTTL=1800000] Forwarded to the orchestrator.
   *   Any other option is forwarded to the orchestrator unchanged.
   */
  constructor(options = {}) {
    const {
      defaultTimeLimit = 120000,
      maxConcurrentResearch = 3,
      cacheEnabled = true,
      cacheTTL = 1800000, // 30 minutes
      ...orchestratorOptions
    } = options;

    this.defaultTimeLimit = defaultTimeLimit;
    this.maxConcurrentResearch = maxConcurrentResearch;
    this.logger = new Logger({ component: 'DeepResearchTool' });

    // Track active research sessions
    this.activeSessions = new Map();
    this.sessionQueue = [];

    // Default orchestrator configuration
    this.defaultOrchestratorConfig = {
      cacheEnabled,
      cacheTTL,
      ...orchestratorOptions
    };
  }

  /**
   * Validate the parameters and run one research session.
   *
   * Never throws: every failure is reported as `{ success: false, ... }`.
   *
   * @param {object} params Raw tool parameters (validated against DeepResearchSchema).
   * @returns {Promise<object>} `{ success: true, sessionId, ...formattedResults }`
   *   on success; otherwise an object with `success: false` and an `error`.
   */
  async execute(params) {
    try {
      const validated = DeepResearchSchema.parse(params);
      const sessionId = this.generateSessionId();

      this.logger.info('Starting deep research', {
        sessionId,
        topic: validated.topic,
        config: this.sanitizeConfigForLogging(validated)
      });

      // Check concurrent session limits
      if (this.activeSessions.size >= this.maxConcurrentResearch) {
        return {
          success: false,
          error: 'Maximum concurrent research sessions reached. Please try again later.',
          queuePosition: this.sessionQueue.length + 1,
          estimatedWaitTime: this.estimateWaitTime()
        };
      }

      // Configure research orchestrator based on research approach
      const orchestratorConfig = this.buildOrchestratorConfig(validated);
      const orchestrator = new ResearchOrchestrator(orchestratorConfig);

      // Keep the start time in a local: the session record is removed
      // before the completion log is written.
      const startTime = Date.now();
      this.activeSessions.set(sessionId, {
        orchestrator,
        startTime,
        topic: validated.topic,
        status: 'running'
      });

      try {
        // Set up event listeners for progress tracking
        this.setupEventListeners(orchestrator, sessionId, validated);

        // Conduct the research
        const researchResults = await orchestrator.conductResearch(
          validated.topic,
          this.buildResearchOptions(validated)
        );

        // Format results according to output preference
        const formattedResults = this.formatResults(researchResults, validated);

        this.logger.info('Research completed successfully', {
          sessionId,
          duration: Date.now() - startTime,
          findingsCount: researchResults.findings?.length || 0
        });

        return {
          success: true,
          sessionId,
          ...formattedResults
        };
      } catch (researchError) {
        // Handle research-specific errors
        const message = researchError?.message ?? String(researchError);
        this.logger.error('Research execution failed', {
          sessionId,
          error: message
        });

        const partialResults = orchestrator.getResearchState();

        return {
          success: false,
          sessionId,
          error: message,
          partialResults: validated.includeRawData ? partialResults : undefined,
          recommendations: this.generateErrorRecoveryRecommendations(researchError, validated)
        };
      } finally {
        // Always release the concurrency slot, even if the error path
        // above throws.
        this.activeSessions.delete(sessionId);
      }
    } catch (error) {
      if (error instanceof z.ZodError) {
        return {
          success: false,
          error: 'Invalid parameters',
          details: error.issues.map((issue) => ({
            field: issue.path.join('.'),
            message: issue.message,
            received: issue.received
          }))
        };
      }

      const message = error?.message ?? String(error);
      this.logger.error('Unexpected error in deep research', { error: message });
      return {
        success: false,
        error: 'An unexpected error occurred during research initialization',
        details: message
      };
    }
  }

  /**
   * Build orchestrator configuration based on research approach.
   *
   * The caller's `maxDepth`, `maxUrls` and `timeLimit` are applied for every
   * approach; individual approaches then cap or extend them.
   *
   * @param {object} params Validated tool parameters.
   * @returns {object} Configuration passed to the ResearchOrchestrator constructor.
   */
  buildOrchestratorConfig(params) {
    const baseConfig = {
      ...this.defaultOrchestratorConfig,
      maxDepth: params.maxDepth,
      maxUrls: params.maxUrls,
      timeLimit: params.timeLimit
    };

    // Add LLM configuration if provided
    if (params.llmConfig) {
      baseConfig.llmConfig = params.llmConfig;
    }

    // Adjust configuration based on research approach
    switch (params.researchApproach) {
      case 'academic':
        return {
          ...baseConfig,
          maxDepth: Math.min(params.maxDepth, 8),
          enableSourceVerification: true,
          searchConfig: {
            enableRanking: true,
            rankingWeights: {
              authority: 0.4, // Higher weight for academic sources
              semantic: 0.3,
              bm25: 0.2,
              freshness: 0.1
            }
          }
        };

      case 'current_events':
        return {
          ...baseConfig,
          maxDepth: Math.min(params.maxDepth, 6),
          searchConfig: {
            enableRanking: true,
            rankingWeights: {
              freshness: 0.4, // Prioritize recent content
              semantic: 0.3,
              bm25: 0.2,
              authority: 0.1
            }
          }
        };

      case 'focused':
        return {
          ...baseConfig,
          maxDepth: Math.min(params.maxDepth, 4),
          maxUrls: Math.min(params.maxUrls, 30),
          concurrency: Math.min(params.concurrency, 3)
        };

      case 'comparative':
        return {
          ...baseConfig,
          enableConflictDetection: true,
          searchConfig: {
            enableDeduplication: true,
            deduplicationThresholds: {
              url: 0.9,
              title: 0.8,
              content: 0.7
            }
          }
        };

      case 'broad':
      default:
        return baseConfig;
    }
  }

  /**
   * Build research options for the orchestrator.
   *
   * @param {object} params Validated tool parameters.
   * @returns {object} Options passed as the second argument of `conductResearch`.
   */
  buildResearchOptions(params) {
    return {
      sourceTypes: params.sourceTypes,
      credibilityThreshold: params.credibilityThreshold,
      includeRecentOnly: params.includeRecentOnly,
      queryExpansion: params.queryExpansion,
      enableConflictDetection: params.enableConflictDetection,
      enableSourceVerification: params.enableSourceVerification,
      enableSynthesis: params.enableSynthesis,
      concurrency: params.concurrency
    };
  }

  /**
   * Set up event listeners for research progress tracking.
   *
   * Webhook deliveries are fire-and-forget: `sendWebhookNotification`
   * handles its own errors, so the returned promises are intentionally not
   * awaited.
   *
   * @param {object} orchestrator Event emitter exposing `on(event, handler)`.
   * @param {string} sessionId Session the listeners belong to.
   * @param {object} params Validated tool parameters.
   */
  setupEventListeners(orchestrator, sessionId, params) {
    const { webhook } = params;

    if (webhook) {
      orchestrator.on('researchCompleted', (data) => {
        void this.sendWebhookNotification(webhook, 'completed', { sessionId, ...data });
      });

      orchestrator.on('researchFailed', (data) => {
        void this.sendWebhookNotification(webhook, 'failed', { sessionId, ...data });
      });

      orchestrator.on('activityLogged', (activity) => {
        if (webhook.events.includes('progress')) {
          void this.sendWebhookNotification(webhook, 'progress', { sessionId, activity });
        }
      });
    }

    // Internal progress tracking
    orchestrator.on('activityLogged', (activity) => {
      const session = this.activeSessions.get(sessionId);
      if (session) {
        session.lastActivity = activity;
        session.status = activity.type;
      }
    });
  }

  /**
   * Format research results according to output preferences.
   *
   * Missing result collections are treated as empty for the reduced
   * formats so a sparse result does not abort formatting.
   *
   * @param {object} results Result object returned by the orchestrator.
   * @param {object} params Validated tool parameters (`outputFormat`,
   *   `includeActivityLog`, `includeRawData` are read).
   * @returns {object} Result object in the requested shape.
   */
  formatResults(results, params) {
    const formatted = {
      researchSummary: results.researchSummary,
      metadata: results.metadata
    };

    const findings = results.findings ?? [];
    const evidence = results.supportingEvidence ?? [];
    const conflicts = results.conflicts ?? [];
    const recommendations = results.recommendations ?? [];

    switch (params.outputFormat) {
      case 'comprehensive':
        return {
          ...formatted,
          findings: results.findings,
          supportingEvidence: results.supportingEvidence,
          consensus: results.consensus,
          conflicts: results.conflicts,
          researchGaps: results.researchGaps,
          recommendations: results.recommendations,
          credibilityAssessment: results.credibilityAssessment,
          performance: results.performance,
          activityLog: params.includeActivityLog ? results.activityLog : undefined,
          rawData: params.includeRawData ? this.extractRawData(results) : undefined
        };

      case 'summary':
        return {
          ...formatted,
          keyFindings: findings.slice(0, 5),
          topSources: evidence.slice(0, 5),
          mainConflicts: conflicts.slice(0, 3),
          primaryRecommendations: recommendations.slice(0, 3),
          credibilityOverview: {
            averageCredibility: results.credibilityAssessment?.averageCredibility,
            highCredibilitySources: results.credibilityAssessment?.highCredibilitySources
          }
        };

      case 'citations_only':
        return {
          ...formatted,
          sources: evidence.map((source) => ({
            title: source.title,
            url: source.url,
            credibility: source.credibility,
            relevance: source.relevance
          })),
          citationCount: evidence.length,
          citationSummary: this.generateCitationSummary(evidence)
        };

      case 'conflicts_focus':
        return {
          ...formatted,
          conflicts,
          conflictAnalysis: this.analyzeConflicts(conflicts),
          consensusAreas: results.consensus,
          recommendedActions: recommendations.filter(
            (r) => r.type === 'conflict_resolution' || r.type === 'validation'
          )
        };

      default:
        return formatted;
    }
  }

  /**
   * Classify a hostname by its top-level or second-level label
   * (e.g. `mit.edu`, `unsw.edu.au`, `www.gov.uk`). Hostnames containing
   * "scholar" are also treated as academic.
   *
   * @param {string} domain Lower-case hostname.
   * @returns {'academic'|'government'|'commercial'|'other'}
   */
  classifySourceDomain(domain) {
    const labels = domain.replace(/\.$/, '').split('.');
    const tld = labels[labels.length - 1];
    // Only look at the second-level label when there is a registrable
    // name in front of it, so "edu.com" is not mistaken for an .edu host.
    const secondLevel = labels.length > 2 ? labels[labels.length - 2] : undefined;
    const hasSuffix = (label) => tld === label || secondLevel === label;

    if (hasSuffix('edu') || domain.includes('scholar')) return 'academic';
    if (hasSuffix('gov')) return 'government';
    if (hasSuffix('com')) return 'commercial';
    return 'other';
  }

  /**
   * Generate citation summary for citation-focused output.
   *
   * @param {Array<object>} sources Evidence entries; `url` and
   *   `credibility` are read from each. Entries whose URL cannot be parsed
   *   count as type "other" and do not contribute a domain.
   * @returns {object} Totals, the five most frequent domains, the source
   *   type distribution and the mean credibility (0 for an empty list).
   */
  generateCitationSummary(sources) {
    const list = Array.isArray(sources) ? sources : [];
    // A Map keeps arbitrary hostnames from colliding with Object.prototype keys.
    const domainCounts = new Map();
    const typeDistribution = { academic: 0, commercial: 0, government: 0, other: 0 };
    let credibilityTotal = 0;

    for (const source of list) {
      credibilityTotal += source?.credibility || 0;

      let domain;
      try {
        domain = new URL(source?.url).hostname.toLowerCase();
      } catch {
        typeDistribution.other++;
        continue;
      }

      domainCounts.set(domain, (domainCounts.get(domain) ?? 0) + 1);
      typeDistribution[this.classifySourceDomain(domain)]++;
    }

    return {
      totalSources: list.length,
      uniqueDomains: domainCounts.size,
      topDomains: [...domainCounts.entries()]
        .sort(([, a], [, b]) => b - a)
        .slice(0, 5)
        .map(([domain, count]) => ({ domain, count })),
      sourceTypeDistribution: typeDistribution,
      // Avoid 0 / 0 = NaN when there are no sources.
      averageCredibility: list.length > 0 ? credibilityTotal / list.length : 0
    };
  }

  /**
   * Analyze conflicts for conflict-focused output.
   *
   * @param {Array<object>} conflicts Conflict entries; `severity`
   *   (defaults to 0.5 when absent) and `type` are read from each.
   * @returns {object} Counts, severity statistics and resolution recommendations.
   */
  analyzeConflicts(conflicts) {
    if (!Array.isArray(conflicts) || conflicts.length === 0) {
      return {
        conflictCount: 0,
        severity: 'none',
        resolutionPriority: 'low'
      };
    }

    // `??` rather than `||` so an explicit severity of 0 is kept.
    const severityLevels = conflicts.map((c) => c.severity ?? 0.5);
    const avgSeverity = severityLevels.reduce((sum, s) => sum + s, 0) / severityLevels.length;

    return {
      conflictCount: conflicts.length,
      averageSeverity: avgSeverity,
      severityDistribution: {
        high: severityLevels.filter((s) => s >= 0.7).length,
        medium: severityLevels.filter((s) => s >= 0.4 && s < 0.7).length,
        low: severityLevels.filter((s) => s < 0.4).length
      },
      conflictTypes: [...new Set(conflicts.map((c) => c.type))],
      resolutionPriority: avgSeverity >= 0.7 ? 'high' : avgSeverity >= 0.4 ? 'medium' : 'low',
      recommendations: this.generateConflictResolutionRecommendations(conflicts)
    };
  }

  /**
   * Generate conflict resolution recommendations.
   *
   * @param {Array<object>} conflicts Conflict entries (`severity`, `type`).
   * @returns {Array<object>} Recommendations; always ends with an
   *   "expert_consultation" entry.
   */
  generateConflictResolutionRecommendations(conflicts) {
    const recommendations = [];

    const highSeverityConflicts = conflicts.filter((c) => (c.severity ?? 0) >= 0.7);
    if (highSeverityConflicts.length > 0) {
      recommendations.push({
        type: 'priority_investigation',
        description: `Investigate ${highSeverityConflicts.length} high-severity conflicts immediately`,
        urgency: 'high'
      });
    }

    const contradictionConflicts = conflicts.filter((c) => c.type === 'contradiction');
    if (contradictionConflicts.length > 0) {
      recommendations.push({
        type: 'cross_reference',
        description: 'Cross-reference contradictory claims with additional authoritative sources',
        urgency: 'medium'
      });
    }

    recommendations.push({
      type: 'expert_consultation',
      description: 'Consider consulting domain experts for conflict resolution',
      urgency: 'low'
    });

    return recommendations;
  }

  /**
   * Extract raw data for debugging/analysis by grouping activity log
   * entries by their `type`.
   *
   * @param {object} results Result object returned by the orchestrator.
   * @returns {object} Activity log entries per research stage; each value
   *   is undefined when the results carry no activity log.
   */
  extractRawData(results) {
    const entriesOfType = (type) => results.activityLog?.filter((log) => log.type === type);

    return {
      searchQueries: entriesOfType('topic_expansion'),
      sourcesDiscovered: entriesOfType('initial_gathering'),
      extractionResults: entriesOfType('deep_exploration'),
      verificationResults: entriesOfType('source_verification'),
      synthesisProcess: entriesOfType('information_synthesis')
    };
  }

  /**
   * Generate error recovery recommendations.
   *
   * The error text is matched case-insensitively, and non-Error values
   * (for example a thrown string) are accepted.
   *
   * @param {unknown} error The failure raised while conducting research.
   * @param {object} params Validated tool parameters.
   * @returns {Array<object>} Recommendations; always ends with a
   *   "fallback_approach" entry.
   */
  generateErrorRecoveryRecommendations(error, params) {
    const recommendations = [];
    const message = String(error?.message ?? error ?? '').toLowerCase();

    if (message.includes('timeout') || message.includes('time limit')) {
      recommendations.push({
        type: 'increase_time_limit',
        description: `Consider increasing time limit beyond ${params.timeLimit}ms`,
        suggestedValue: Math.min(params.timeLimit * 2, 300000)
      });

      recommendations.push({
        type: 'reduce_scope',
        description: 'Reduce research scope to fit within time constraints',
        suggestions: {
          maxUrls: Math.ceil(params.maxUrls * 0.7),
          maxDepth: Math.max(1, params.maxDepth - 1)
        }
      });
    }

    if (message.includes('network') || message.includes('fetch')) {
      recommendations.push({
        type: 'retry_with_delay',
        description: 'Network issues detected. Retry with increased delays between requests',
        suggestedDelay: 2000
      });
    }

    if (message.includes('rate limit')) {
      recommendations.push({
        type: 'reduce_concurrency',
        description: 'Rate limiting detected. Reduce concurrent operations',
        suggestedConcurrency: Math.max(1, Math.ceil(params.concurrency / 2))
      });
    }

    recommendations.push({
      type: 'fallback_approach',
      description: 'Try a more focused research approach',
      suggestedApproach: 'focused'
    });

    return recommendations;
  }

  /**
   * Send webhook notification.
   *
   * Does nothing unless `event` is listed in `webhook.events`. Delivery
   * failures are logged and never thrown.
   *
   * NOTE(review): `webhook.url` comes from the caller and is requested
   * as-is; confirm whether it should be checked for SSRF before use.
   *
   * @param {{url: string, events: string[], headers?: object}} webhook
   * @param {string} event Event name, e.g. "completed".
   * @param {object} data Payload placed under `data` in the request body.
   * @returns {Promise<void>}
   */
  async sendWebhookNotification(webhook, event, data) {
    if (!webhook.events.includes(event)) return;

    // Bound the request so an unresponsive endpoint cannot hold the
    // connection open indefinitely.
    const timeoutMs = 10000;

    try {
      const payload = {
        event,
        timestamp: new Date().toISOString(),
        data
      };

      const response = await fetch(webhook.url, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
          'User-Agent': 'MCP-WebScraper-DeepResearch/1.0',
          ...webhook.headers
        },
        body: JSON.stringify(payload),
        signal: AbortSignal.timeout(timeoutMs)
      });

      if (!response.ok) {
        this.logger.warn('Webhook notification failed', {
          url: webhook.url,
          status: response.status,
          event
        });
      }
    } catch (error) {
      this.logger.error('Webhook notification error', {
        url: webhook.url,
        error: error.message,
        event
      });
    }
  }

  /**
   * Create an identifier for a new session from the current time and a
   * short random suffix.
   *
   * @returns {string}
   */
  generateSessionId() {
    return `deep_research_${Date.now()}_${Math.random().toString(36).substring(2, 8)}`;
  }

  /**
   * Produce a copy of the validated parameters that is safe to log:
   * webhook headers are dropped and LLM provider API keys are redacted.
   * The input object is not modified.
   *
   * @param {object} config Validated tool parameters.
   * @returns {object}
   */
  sanitizeConfigForLogging(config) {
    const { webhook, llmConfig, ...safeConfig } = config;
    const sanitized = {
      ...safeConfig,
      webhook: webhook ? { url: webhook.url, events: webhook.events } : undefined
    };

    if (llmConfig) {
      sanitized.llmConfig = Object.fromEntries(
        Object.entries(llmConfig).map(([key, value]) => {
          const holdsApiKey = value !== null && typeof value === 'object' && 'apiKey' in value;
          if (!holdsApiKey) return [key, value];
          return [key, { ...value, apiKey: value.apiKey ? '[REDACTED]' : value.apiKey }];
        })
      );
    }

    return sanitized;
  }

  /**
   * Estimate the wait for a rejected request from the queue length and a
   * fixed two-minute average session time.
   *
   * @returns {number} Estimated wait in milliseconds.
   */
  estimateWaitTime() {
    const avgSessionTime = 120000; // 2 minutes average
    const queueSize = this.sessionQueue.length;
    return queueSize * avgSessionTime;
  }

  /**
   * List the currently running sessions.
   *
   * @returns {Array<object>} One summary per active session.
   */
  getActiveSessions() {
    return Array.from(this.activeSessions.entries()).map(([id, session]) => ({
      sessionId: id,
      topic: session.topic,
      status: session.status,
      duration: Date.now() - session.startTime,
      lastActivity: session.lastActivity?.type
    }));
  }

  /**
   * Describe one running session.
   *
   * @param {string} sessionId
   * @returns {object|null} Session details including orchestrator metrics,
   *   or null when the session is not active.
   */
  getSessionStatus(sessionId) {
    const session = this.activeSessions.get(sessionId);
    if (!session) return null;

    return {
      sessionId,
      topic: session.topic,
      status: session.status,
      duration: Date.now() - session.startTime,
      lastActivity: session.lastActivity,
      metrics: session.orchestrator.getMetrics()
    };
  }

  /**
   * Stop a running session and remove it from the active set.
   *
   * @param {string} sessionId
   * @returns {boolean} True when a session was found and stopped.
   */
  cancelSession(sessionId) {
    const session = this.activeSessions.get(sessionId);
    if (session) {
      session.orchestrator.stopResearch();
      this.activeSessions.delete(sessionId);
      return true;
    }
    return false;
  }

  /**
   * Get tool statistics and health information.
   *
   * @returns {object} Session counts plus process uptime and memory usage.
   */
  getStats() {
    return {
      activeSessions: this.activeSessions.size,
      queuedSessions: this.sessionQueue.length,
      maxConcurrent: this.maxConcurrentResearch,
      uptime: process.uptime(),
      memory: process.memoryUsage()
    };
  }
}
|
|
672
|
+
|
|
673
|
+
export default DeepResearchTool;

// Create and export tool instance for MCP compatibility
export const deepResearchTool = new DeepResearchTool();

// Add name property for MCP protocol compliance
deepResearchTool.name = 'deep_research';

// Add validateParameters method for MCP protocol compliance.
// Returns the parsed parameters with defaults applied; throws a ZodError
// when the parameters do not satisfy DeepResearchSchema.
deepResearchTool.validateParameters = function(params) {
  return DeepResearchSchema.parse(params);
};

// Add description property for MCP protocol compliance
deepResearchTool.description = 'Conduct comprehensive multi-stage research with intelligent query expansion, source verification, and information synthesis';

// Add inputSchema property for MCP protocol compliance.
// This JSON Schema mirrors DeepResearchSchema so that clients can discover
// every accepted parameter; keep the two in sync when either changes.
deepResearchTool.inputSchema = {
  type: 'object',
  properties: {
    topic: {
      type: 'string',
      minLength: 3,
      maxLength: 500,
      description: 'The research topic or question to investigate'
    },
    maxDepth: {
      type: 'number',
      minimum: 1,
      maximum: 10,
      default: 5,
      description: 'Maximum depth for research exploration'
    },
    maxUrls: {
      type: 'number',
      minimum: 1,
      maximum: 1000,
      default: 50,
      description: 'Maximum number of URLs to process'
    },
    timeLimit: {
      type: 'number',
      minimum: 30000,
      maximum: 300000,
      default: 120000,
      description: 'Time limit for the research in milliseconds (30s to 5min)'
    },
    researchApproach: {
      type: 'string',
      enum: ['broad', 'focused', 'academic', 'current_events', 'comparative'],
      default: 'broad',
      description: 'Research approach used to configure the orchestrator'
    },
    sourceTypes: {
      type: 'array',
      items: {
        type: 'string',
        enum: ['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any']
      },
      default: ['any'],
      description: 'Source types to include'
    },
    credibilityThreshold: {
      type: 'number',
      minimum: 0,
      maximum: 1,
      default: 0.3,
      description: 'Credibility threshold for sources (0 to 1)'
    },
    includeRecentOnly: {
      type: 'boolean',
      default: false,
      description: 'Restrict research to recent sources'
    },
    enableConflictDetection: {
      type: 'boolean',
      default: true,
      description: 'Enable conflict detection'
    },
    enableSourceVerification: {
      type: 'boolean',
      default: true,
      description: 'Enable source verification'
    },
    enableSynthesis: {
      type: 'boolean',
      default: true,
      description: 'Enable information synthesis'
    },
    outputFormat: {
      type: 'string',
      enum: ['comprehensive', 'summary', 'citations_only', 'conflicts_focus'],
      default: 'comprehensive',
      description: 'Shape of the returned results'
    },
    includeRawData: {
      type: 'boolean',
      default: false,
      description: 'Include raw activity data (and partial results on failure)'
    },
    includeActivityLog: {
      type: 'boolean',
      default: false,
      description: 'Include the activity log in comprehensive output'
    },
    queryExpansion: {
      type: 'object',
      description: 'Query expansion options',
      properties: {
        enableSynonyms: { type: 'boolean', default: true },
        enableSpellCheck: { type: 'boolean', default: true },
        enableContextual: { type: 'boolean', default: true },
        maxVariations: { type: 'number', minimum: 1, maximum: 20, default: 8 }
      }
    },
    llmConfig: {
      type: 'object',
      description: 'LLM provider configuration',
      properties: {
        provider: {
          type: 'string',
          enum: ['auto', 'openai', 'anthropic'],
          default: 'auto'
        },
        openai: {
          type: 'object',
          properties: {
            apiKey: { type: 'string' },
            model: { type: 'string', default: 'gpt-3.5-turbo' },
            embeddingModel: { type: 'string', default: 'text-embedding-ada-002' }
          }
        },
        anthropic: {
          type: 'object',
          properties: {
            apiKey: { type: 'string' },
            model: { type: 'string', default: 'claude-3-haiku-20240307' }
          }
        },
        enableSemanticAnalysis: { type: 'boolean', default: true },
        enableIntelligentSynthesis: { type: 'boolean', default: true }
      }
    },
    concurrency: {
      type: 'number',
      minimum: 1,
      maximum: 20,
      default: 5,
      description: 'Number of concurrent operations'
    },
    cacheResults: {
      type: 'boolean',
      default: true,
      description: 'Whether to cache results'
    },
    webhook: {
      type: 'object',
      description: 'Webhook notifications for long-running research',
      properties: {
        url: { type: 'string', format: 'uri' },
        events: {
          type: 'array',
          items: {
            type: 'string',
            enum: ['started', 'progress', 'completed', 'failed']
          },
          default: ['completed']
        },
        headers: {
          type: 'object',
          additionalProperties: { type: 'string' }
        }
      },
      required: ['url']
    }
  },
  required: ['topic']
};
|