crawlforge-mcp-server 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CLAUDE.md +315 -0
  2. package/LICENSE +21 -0
  3. package/README.md +181 -0
  4. package/package.json +115 -0
  5. package/server.js +1963 -0
  6. package/setup.js +112 -0
  7. package/src/constants/config.js +615 -0
  8. package/src/core/ActionExecutor.js +1104 -0
  9. package/src/core/AlertNotificationSystem.js +601 -0
  10. package/src/core/AuthManager.js +315 -0
  11. package/src/core/ChangeTracker.js +2306 -0
  12. package/src/core/JobManager.js +687 -0
  13. package/src/core/LLMsTxtAnalyzer.js +753 -0
  14. package/src/core/LocalizationManager.js +1615 -0
  15. package/src/core/PerformanceManager.js +828 -0
  16. package/src/core/ResearchOrchestrator.js +1327 -0
  17. package/src/core/SnapshotManager.js +1037 -0
  18. package/src/core/StealthBrowserManager.js +1795 -0
  19. package/src/core/WebhookDispatcher.js +745 -0
  20. package/src/core/analysis/ContentAnalyzer.js +749 -0
  21. package/src/core/analysis/LinkAnalyzer.js +972 -0
  22. package/src/core/cache/CacheManager.js +821 -0
  23. package/src/core/connections/ConnectionPool.js +553 -0
  24. package/src/core/crawlers/BFSCrawler.js +845 -0
  25. package/src/core/integrations/PerformanceIntegration.js +377 -0
  26. package/src/core/llm/AnthropicProvider.js +135 -0
  27. package/src/core/llm/LLMManager.js +415 -0
  28. package/src/core/llm/LLMProvider.js +97 -0
  29. package/src/core/llm/OpenAIProvider.js +127 -0
  30. package/src/core/processing/BrowserProcessor.js +986 -0
  31. package/src/core/processing/ContentProcessor.js +505 -0
  32. package/src/core/processing/PDFProcessor.js +448 -0
  33. package/src/core/processing/StreamProcessor.js +673 -0
  34. package/src/core/queue/QueueManager.js +98 -0
  35. package/src/core/workers/WorkerPool.js +585 -0
  36. package/src/core/workers/worker.js +743 -0
  37. package/src/monitoring/healthCheck.js +600 -0
  38. package/src/monitoring/metrics.js +761 -0
  39. package/src/optimization/wave3-optimizations.js +932 -0
  40. package/src/security/security-patches.js +120 -0
  41. package/src/security/security-tests.js +355 -0
  42. package/src/security/wave3-security.js +652 -0
  43. package/src/tools/advanced/BatchScrapeTool.js +1089 -0
  44. package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
  45. package/src/tools/crawl/crawlDeep.js +449 -0
  46. package/src/tools/crawl/mapSite.js +400 -0
  47. package/src/tools/extract/analyzeContent.js +624 -0
  48. package/src/tools/extract/extractContent.js +329 -0
  49. package/src/tools/extract/processDocument.js +503 -0
  50. package/src/tools/extract/summarizeContent.js +376 -0
  51. package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
  52. package/src/tools/research/deepResearch.js +706 -0
  53. package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
  54. package/src/tools/search/adapters/googleSearch.js +236 -0
  55. package/src/tools/search/adapters/searchProviderFactory.js +96 -0
  56. package/src/tools/search/queryExpander.js +543 -0
  57. package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
  58. package/src/tools/search/ranking/ResultRanker.js +497 -0
  59. package/src/tools/search/searchWeb.js +482 -0
  60. package/src/tools/tracking/trackChanges.js +1355 -0
  61. package/src/utils/CircuitBreaker.js +515 -0
  62. package/src/utils/ErrorHandlingConfig.js +342 -0
  63. package/src/utils/HumanBehaviorSimulator.js +569 -0
  64. package/src/utils/Logger.js +568 -0
  65. package/src/utils/MemoryMonitor.js +173 -0
  66. package/src/utils/RetryManager.js +386 -0
  67. package/src/utils/contentUtils.js +588 -0
  68. package/src/utils/domainFilter.js +612 -0
  69. package/src/utils/inputValidation.js +766 -0
  70. package/src/utils/rateLimiter.js +196 -0
  71. package/src/utils/robotsChecker.js +91 -0
  72. package/src/utils/securityMiddleware.js +416 -0
  73. package/src/utils/sitemapParser.js +678 -0
  74. package/src/utils/ssrfProtection.js +640 -0
  75. package/src/utils/urlNormalizer.js +168 -0
@@ -0,0 +1,1327 @@
1
+ import { EventEmitter } from 'events';
2
+ import { SearchWebTool } from '../tools/search/searchWeb.js';
3
+ import { CrawlDeepTool } from '../tools/crawl/crawlDeep.js';
4
+ import { ExtractContentTool } from '../tools/extract/extractContent.js';
5
+ import { SummarizeContentTool } from '../tools/extract/summarizeContent.js';
6
+ import { QueryExpander } from '../tools/search/queryExpander.js';
7
+ import { ResultRanker } from '../tools/search/ranking/ResultRanker.js';
8
+ import { CacheManager } from './cache/CacheManager.js';
9
+ import { Logger } from '../utils/Logger.js';
10
+ import { LLMManager } from './llm/LLMManager.js';
11
+
12
+ /**
13
+ * ResearchOrchestrator - Multi-stage research orchestration engine with LLM integration
14
+ * Coordinates complex research workflows with intelligent query expansion,
15
+ * source verification, information synthesis, and AI-powered analysis
16
+ *
17
+ * Phase 2.1 Features:
18
+ * - LLM-powered query expansion with semantic understanding
19
+ * - AI-driven relevance scoring and content analysis
20
+ * - Intelligent research synthesis with conflict detection
21
+ * - Advanced provenance tracking and activity logging
22
+ * - Smart URL prioritization based on content quality
23
+ */
24
+ export class ResearchOrchestrator extends EventEmitter {
25
+ constructor(options = {}) {
26
+ super();
27
+
28
+ const {
29
+ maxDepth = 5,
30
+ maxUrls = 100,
31
+ timeLimit = 120000, // 2 minutes default
32
+ concurrency = 5,
33
+ enableSourceVerification = true,
34
+ enableConflictDetection = true,
35
+ cacheEnabled = true,
36
+ cacheTTL = 1800000, // 30 minutes
37
+ searchConfig = {},
38
+ crawlConfig = {},
39
+ extractConfig = {},
40
+ summarizeConfig = {}
41
+ } = options;
42
+
43
+ this.maxDepth = Math.min(Math.max(1, maxDepth), 10);
44
+ this.maxUrls = Math.min(Math.max(1, maxUrls), 1000);
45
+ this.timeLimit = Math.min(Math.max(30000, timeLimit), 300000);
46
+ this.concurrency = Math.min(Math.max(1, concurrency), 20);
47
+ this.enableSourceVerification = enableSourceVerification;
48
+ this.enableConflictDetection = enableConflictDetection;
49
+
50
+ // Initialize tools
51
+ this.searchTool = new SearchWebTool(searchConfig);
52
+ this.crawlTool = new CrawlDeepTool(crawlConfig);
53
+ this.extractTool = new ExtractContentTool(extractConfig);
54
+ this.summarizeTool = new SummarizeContentTool(summarizeConfig);
55
+
56
+ // Initialize utilities
57
+ this.queryExpander = new QueryExpander();
58
+ this.resultRanker = new ResultRanker();
59
+ this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
60
+ this.logger = new Logger({ component: 'ResearchOrchestrator' });
61
+
62
+ // Initialize LLM Manager for AI-powered research
63
+ this.llmManager = new LLMManager(options.llmConfig || {});
64
+ this.enableLLMFeatures = this.llmManager.isAvailable();
65
+
66
+ if (this.enableLLMFeatures) {
67
+ this.logger.info('LLM-powered research features enabled');
68
+ } else {
69
+ this.logger.warn('LLM providers not available, using fallback methods');
70
+ }
71
+
72
+ // Research state tracking
73
+ this.researchState = {
74
+ sessionId: null,
75
+ startTime: null,
76
+ currentDepth: 0,
77
+ visitedUrls: new Set(),
78
+ searchResults: new Map(),
79
+ extractedContent: new Map(),
80
+ researchFindings: [],
81
+ credibilityScores: new Map(),
82
+ conflictMap: new Map(),
83
+ activityLog: [],
84
+ llmAnalysis: new Map(),
85
+ semanticSimilarities: new Map(),
86
+ relevanceScores: new Map(),
87
+ synthesisHistory: []
88
+ };
89
+
90
+ // Performance metrics
91
+ this.metrics = {
92
+ searchQueries: 0,
93
+ urlsProcessed: 0,
94
+ contentExtracted: 0,
95
+ conflictsDetected: 0,
96
+ sourcesVerified: 0,
97
+ cacheHits: 0,
98
+ totalProcessingTime: 0,
99
+ llmAnalysisCalls: 0,
100
+ semanticAnalysisTime: 0,
101
+ queryExpansionTime: 0,
102
+ synthesisTime: 0
103
+ };
104
+ }
105
+
106
+ /**
107
+ * Conduct comprehensive deep research on a topic
108
+ * @param {string} topic - The research topic/question
109
+ * @param {Object} options - Research configuration options
110
+ * @returns {Promise<Object>} Research results
111
+ */
112
+ async conductResearch(topic, options = {}) {
113
+ const sessionId = this.generateSessionId();
114
+ const startTime = Date.now();
115
+
116
+ this.initializeResearchSession(sessionId, topic, startTime);
117
+
118
+ try {
119
+ this.logger.info('Starting deep research', { sessionId, topic, options });
120
+
121
+ // Stage 1: Initial topic exploration and query expansion
122
+ const expandedQueries = await this.expandResearchTopic(topic);
123
+ this.logActivity('topic_expansion', { originalTopic: topic, expandedQueries });
124
+
125
+ // Stage 2: Broad information gathering
126
+ const initialSources = await this.gatherInitialSources(expandedQueries, options);
127
+ this.logActivity('initial_gathering', { sourcesFound: initialSources.length });
128
+
129
+ // Stage 3: Deep exploration of promising sources
130
+ const detailedFindings = await this.exploreSourcesInDepth(initialSources, options);
131
+ this.logActivity('deep_exploration', { findingsCount: detailedFindings.length });
132
+
133
+ // Stage 4: Source credibility assessment
134
+ const verifiedSources = this.enableSourceVerification ?
135
+ await this.verifySourceCredibility(detailedFindings) : detailedFindings;
136
+ this.logActivity('source_verification', { verifiedCount: verifiedSources.length });
137
+
138
+ // Stage 5: Information synthesis and conflict detection
139
+ const synthesizedResults = await this.synthesizeInformation(verifiedSources, topic);
140
+ this.logActivity('information_synthesis', { conflictsFound: synthesizedResults.conflicts.length });
141
+
142
+ // Stage 6: Final result compilation
143
+ const finalResults = this.compileResearchResults(topic, synthesizedResults, options);
144
+
145
+ const totalTime = Date.now() - startTime;
146
+ this.metrics.totalProcessingTime = totalTime;
147
+
148
+ this.logger.info('Research completed', {
149
+ sessionId,
150
+ duration: totalTime,
151
+ findings: finalResults.findings.length
152
+ });
153
+
154
+ this.emit('researchCompleted', {
155
+ sessionId,
156
+ topic,
157
+ duration: totalTime,
158
+ findings: finalResults.findings.length
159
+ });
160
+
161
+ return finalResults;
162
+
163
+ } catch (error) {
164
+ this.logger.error('Research failed', { sessionId, error: error.message });
165
+ this.emit('researchFailed', { sessionId, topic, error: error.message });
166
+
167
+ return this.handleResearchError(error, topic, sessionId);
168
+ }
169
+ }
170
+
171
+ /**
172
+ * Initialize research session state
173
+ */
174
+ initializeResearchSession(sessionId, topic, startTime) {
175
+ this.researchState = {
176
+ sessionId,
177
+ topic,
178
+ startTime,
179
+ currentDepth: 0,
180
+ visitedUrls: new Set(),
181
+ searchResults: new Map(),
182
+ extractedContent: new Map(),
183
+ researchFindings: [],
184
+ credibilityScores: new Map(),
185
+ conflictMap: new Map(),
186
+ activityLog: []
187
+ };
188
+
189
+ // Reset metrics
190
+ Object.keys(this.metrics).forEach(key => {
191
+ this.metrics[key] = 0;
192
+ });
193
+ }
194
+
195
+ /**
196
+ * Expand research topic into multiple targeted queries with LLM enhancement
197
+ */
198
+ async expandResearchTopic(topic) {
199
+ const startTime = Date.now();
200
+
201
+ try {
202
+ const cacheKey = this.cache ? this.cache.generateKey('topic_expansion_v2', { topic, llm: this.enableLLMFeatures }) : null;
203
+
204
+ if (this.cache && cacheKey) {
205
+ const cached = await this.cache.get(cacheKey);
206
+ if (cached) {
207
+ this.metrics.cacheHits++;
208
+ return cached;
209
+ }
210
+ }
211
+
212
+ let expandedQueries = [];
213
+
214
+ // LLM-powered query expansion (preferred)
215
+ if (this.enableLLMFeatures) {
216
+ try {
217
+ this.logger.info('Using LLM for intelligent query expansion');
218
+ expandedQueries = await this.llmManager.expandQuery(topic, {
219
+ maxExpansions: 8,
220
+ includeContextual: true,
221
+ includeSynonyms: true,
222
+ includeRelated: true
223
+ });
224
+ this.metrics.llmAnalysisCalls++;
225
+ } catch (llmError) {
226
+ this.logger.warn('LLM query expansion failed, falling back to traditional methods', { error: llmError.message });
227
+ }
228
+ }
229
+
230
+ // Fallback to traditional expansion if LLM failed or unavailable
231
+ if (expandedQueries.length === 0) {
232
+ expandedQueries = await this.queryExpander.expandQuery(topic, {
233
+ enableSynonyms: true,
234
+ enableSpellCheck: true,
235
+ enablePhraseDetection: true,
236
+ maxExpansions: 8
237
+ });
238
+ }
239
+
240
+ // Add research-specific query variations
241
+ const researchVariations = this.generateResearchVariations(topic);
242
+ const allQueries = [...new Set([topic, ...expandedQueries, ...researchVariations])];
243
+
244
+ // Rank queries by research relevance with semantic understanding
245
+ const rankedQueries = await this.rankResearchQueriesWithSemantics(allQueries, topic);
246
+
247
+ if (this.cache && cacheKey) {
248
+ await this.cache.set(cacheKey, rankedQueries);
249
+ }
250
+
251
+ this.metrics.queryExpansionTime += Date.now() - startTime;
252
+ return rankedQueries;
253
+ } catch (error) {
254
+ this.logger.warn('Topic expansion failed, using original topic', { error: error.message });
255
+ this.metrics.queryExpansionTime += Date.now() - startTime;
256
+ return [topic];
257
+ }
258
+ }
259
+
260
+ /**
261
+ * Generate research-specific query variations
262
+ */
263
+ generateResearchVariations(topic) {
264
+ const variations = [];
265
+
266
+ // Question-based variations
267
+ variations.push(`what is ${topic}`);
268
+ variations.push(`how does ${topic} work`);
269
+ variations.push(`${topic} explained`);
270
+ variations.push(`${topic} research`);
271
+ variations.push(`${topic} studies`);
272
+ variations.push(`${topic} analysis`);
273
+
274
+ // Academic and authoritative variations
275
+ variations.push(`${topic} academic`);
276
+ variations.push(`${topic} scientific`);
277
+ variations.push(`${topic} research paper`);
278
+ variations.push(`${topic} peer reviewed`);
279
+
280
+ // Current and historical context
281
+ variations.push(`latest ${topic}`);
282
+ variations.push(`current ${topic}`);
283
+ variations.push(`${topic} 2024`);
284
+ variations.push(`${topic} trends`);
285
+
286
+ return variations.slice(0, 10); // Limit variations
287
+ }
288
+
289
+ /**
290
+ * Rank research queries by relevance and specificity with semantic analysis
291
+ */
292
+ async rankResearchQueriesWithSemantics(queries, originalTopic) {
293
+ const startTime = Date.now();
294
+
295
+ try {
296
+ const scored = await Promise.all(queries.map(async (query) => {
297
+ let score = 0.5; // Base score
298
+
299
+ // Give original topic highest priority
300
+ if (query === originalTopic) {
301
+ return { query, score: 1.0 };
302
+ }
303
+
304
+ // Traditional scoring
305
+ const topicWords = originalTopic.toLowerCase().split(" ");
306
+ const queryWords = query.toLowerCase().split(" ");
307
+ const overlap = topicWords.filter(word => queryWords.includes(word));
308
+ score += (overlap.length / topicWords.length) * 0.3;
309
+
310
+ // Research-oriented bonus
311
+ const researchKeywords = ["research", "study", "analysis", "academic", "scientific"];
312
+ if (researchKeywords.some(keyword => query.toLowerCase().includes(keyword))) {
313
+ score += 0.2;
314
+ }
315
+
316
+ // Length preference
317
+ if (query.length > 10 && query.length < 100) {
318
+ score += 0.1;
319
+ }
320
+
321
+ // Semantic similarity boost (if LLM available)
322
+ if (this.enableLLMFeatures) {
323
+ try {
324
+ const similarity = await this.llmManager.calculateSimilarity(originalTopic, query);
325
+ score += similarity * 0.3; // Semantic similarity weight
326
+ this.researchState.semanticSimilarities.set(query, similarity);
327
+ } catch (semanticError) {
328
+ this.logger.debug('Semantic similarity calculation failed', { query, error: semanticError.message });
329
+ }
330
+ }
331
+
332
+ return { query, score };
333
+ }));
334
+
335
+ const sortedQueries = scored
336
+ .sort((a, b) => b.score - a.score)
337
+ .map(item => item.query);
338
+
339
+ // Ensure original topic is always first
340
+ const result = [originalTopic];
341
+ sortedQueries.forEach(query => {
342
+ if (query !== originalTopic && result.length < this.maxDepth) {
343
+ result.push(query);
344
+ }
345
+ });
346
+
347
+ this.metrics.semanticAnalysisTime += Date.now() - startTime;
348
+ return result.slice(0, this.maxDepth);
349
+ } catch (error) {
350
+ this.logger.warn('Semantic ranking failed, using fallback', { error: error.message });
351
+ return this.rankResearchQueries(queries, originalTopic);
352
+ }
353
+ }
354
+
355
+ /**
356
+ * Fallback ranking method (original implementation)
357
+ */
358
+ rankResearchQueries(queries, originalTopic) {
359
+ const scored = queries.map(query => {
360
+ let score = 0.5;
361
+
362
+ if (query === originalTopic) {
363
+ score = 1.0;
364
+ } else {
365
+ const topicWords = originalTopic.toLowerCase().split(" ");
366
+ const queryWords = query.toLowerCase().split(" ");
367
+ const overlap = topicWords.filter(word => queryWords.includes(word));
368
+ score += (overlap.length / topicWords.length) * 0.3;
369
+
370
+ const researchKeywords = ["research", "study", "analysis", "academic", "scientific"];
371
+ if (researchKeywords.some(keyword => query.toLowerCase().includes(keyword))) {
372
+ score += 0.2;
373
+ }
374
+
375
+ if (query.length > 10 && query.length < 100) {
376
+ score += 0.1;
377
+ }
378
+ }
379
+
380
+ return { query, score };
381
+ });
382
+
383
+ const sortedQueries = scored
384
+ .sort((a, b) => b.score - a.score)
385
+ .map(item => item.query);
386
+
387
+ const result = [originalTopic];
388
+ sortedQueries.forEach(query => {
389
+ if (query !== originalTopic && result.length < this.maxDepth) {
390
+ result.push(query);
391
+ }
392
+ });
393
+
394
+ return result.slice(0, this.maxDepth);
395
+ }
396
+ /**
397
+ * Gather initial sources using expanded queries
398
+ */
399
+ async gatherInitialSources(queries, options) {
400
+ const allSources = [];
401
+ const maxSourcesPerQuery = Math.ceil(this.maxUrls / queries.length);
402
+
403
+ await this.processWithTimeLimit(async () => {
404
+ const searchPromises = queries.slice(0, 5).map(async (query) => {
405
+ try {
406
+ this.metrics.searchQueries++;
407
+ const searchResults = await this.searchTool.execute({
408
+ query,
409
+ limit: maxSourcesPerQuery,
410
+ enable_ranking: true,
411
+ enable_deduplication: true
412
+ });
413
+
414
+ if (searchResults.results && searchResults.results.length > 0) {
415
+ const processedResults = searchResults.results.map(result => ({
416
+ ...result,
417
+ sourceQuery: query,
418
+ discoveredAt: new Date().toISOString(),
419
+ credibilityScore: this.calculateInitialCredibility(result),
420
+ researchRelevance: this.calculateResearchRelevance(result, query)
421
+ }));
422
+
423
+ this.researchState.searchResults.set(query, processedResults);
424
+ return processedResults;
425
+ }
426
+ return [];
427
+ } catch (error) {
428
+ this.logger.warn('Search failed for query', { query, error: error.message });
429
+ return [];
430
+ }
431
+ });
432
+
433
+ const results = await Promise.all(searchPromises);
434
+ results.forEach(sources => allSources.push(...sources));
435
+ });
436
+
437
+ // Deduplicate and rank sources
438
+ const uniqueSources = this.deduplicateSources(allSources);
439
+ const rankedSources = await this.rankSourcesByResearchValue(uniqueSources);
440
+
441
+ return rankedSources.slice(0, this.maxUrls);
442
+ }
443
+
444
+ /**
445
+ * Explore promising sources in depth with LLM-powered relevance analysis
446
+ */
447
+ async exploreSourcesInDepth(sources, options) {
448
+ const detailedFindings = [];
449
+ const batchSize = Math.min(this.concurrency, 10);
450
+ const { topic } = this.researchState;
451
+
452
+ await this.processWithTimeLimit(async () => {
453
+ for (let i = 0; i < sources.length; i += batchSize) {
454
+ const batch = sources.slice(i, i + batchSize);
455
+
456
+ const batchPromises = batch.map(async (source) => {
457
+ try {
458
+ if (this.researchState.visitedUrls.has(source.link)) {
459
+ return null;
460
+ }
461
+
462
+ this.researchState.visitedUrls.add(source.link);
463
+ this.metrics.urlsProcessed++;
464
+
465
+ // Extract detailed content
466
+ const contentData = await this.extractTool.execute({
467
+ url: source.link,
468
+ options: { includeMetadata: true, includeStructuredData: true }
469
+ });
470
+
471
+ if (contentData && contentData.content) {
472
+ this.metrics.contentExtracted++;
473
+
474
+ // Enhance source with extracted content
475
+ let enhancedSource = {
476
+ ...source,
477
+ extractedContent: contentData.content,
478
+ metadata: contentData.metadata,
479
+ structuredData: contentData.structuredData,
480
+ extractedAt: new Date().toISOString(),
481
+ wordCount: contentData.content.split(' ').length,
482
+ readabilityScore: this.calculateReadabilityScore(contentData.content)
483
+ };
484
+
485
+ // LLM-powered relevance analysis
486
+ if (this.enableLLMFeatures && topic) {
487
+ try {
488
+ const relevanceAnalysis = await this.llmManager.analyzeRelevance(
489
+ contentData.content,
490
+ topic,
491
+ { maxContentLength: 2000 }
492
+ );
493
+
494
+ enhancedSource.llmAnalysis = relevanceAnalysis;
495
+ enhancedSource.relevanceScore = relevanceAnalysis.relevanceScore;
496
+ this.researchState.llmAnalysis.set(source.link, relevanceAnalysis);
497
+ this.researchState.relevanceScores.set(source.link, relevanceAnalysis.relevanceScore);
498
+ this.metrics.llmAnalysisCalls++;
499
+
500
+ this.logger.debug('LLM relevance analysis completed', {
501
+ url: source.link,
502
+ relevanceScore: relevanceAnalysis.relevanceScore,
503
+ keyPoints: relevanceAnalysis.keyPoints.length
504
+ });
505
+ } catch (llmError) {
506
+ this.logger.warn('LLM relevance analysis failed', {
507
+ url: source.link,
508
+ error: llmError.message
509
+ });
510
+ // Set default relevance score
511
+ enhancedSource.relevanceScore = this.calculateTraditionalRelevance(contentData.content, topic);
512
+ }
513
+ } else {
514
+ // Fallback relevance calculation
515
+ enhancedSource.relevanceScore = this.calculateTraditionalRelevance(contentData.content, topic);
516
+ }
517
+
518
+ this.researchState.extractedContent.set(source.link, enhancedSource);
519
+ return enhancedSource;
520
+ }
521
+ return null;
522
+ } catch (error) {
523
+ this.logger.warn('Content extraction failed', {
524
+ url: source.link,
525
+ error: error.message
526
+ });
527
+ return null;
528
+ }
529
+ });
530
+
531
+ const batchResults = await Promise.all(batchPromises);
532
+ const validResults = batchResults.filter(result => result !== null);
533
+ detailedFindings.push(...validResults);
534
+ }
535
+ });
536
+
537
+ // Sort by relevance score (LLM or traditional)
538
+ return detailedFindings.sort((a, b) => (b.relevanceScore || 0) - (a.relevanceScore || 0));
539
+ }
540
+
541
+ /**
542
+ * Verify source credibility using multiple factors
543
+ */
544
+ async verifySourceCredibility(sources) {
545
+ const verifiedSources = [];
546
+
547
+ for (const source of sources) {
548
+ try {
549
+ this.metrics.sourcesVerified++;
550
+
551
+ const credibilityFactors = {
552
+ domainAuthority: this.assessDomainAuthority(source.link),
553
+ contentQuality: this.assessContentQuality(source),
554
+ sourceType: this.identifySourceType(source),
555
+ recency: this.assessContentRecency(source),
556
+ authorityIndicators: this.findAuthorityIndicators(source),
557
+ citationPotential: this.assessCitationPotential(source)
558
+ };
559
+
560
+ const overallCredibility = this.calculateOverallCredibility(credibilityFactors);
561
+
562
+ // Only include sources that meet minimum credibility threshold
563
+ if (overallCredibility >= 0.3) {
564
+ verifiedSources.push({
565
+ ...source,
566
+ credibilityFactors,
567
+ overallCredibility,
568
+ verifiedAt: new Date().toISOString()
569
+ });
570
+
571
+ this.researchState.credibilityScores.set(source.link, overallCredibility);
572
+ }
573
+ } catch (error) {
574
+ this.logger.warn('Credibility verification failed', {
575
+ url: source.link,
576
+ error: error.message
577
+ });
578
+ }
579
+ }
580
+
581
+ return verifiedSources.sort((a, b) => b.overallCredibility - a.overallCredibility);
582
+ }
583
+
584
+ /**
585
+ * Synthesize information and detect conflicts with LLM enhancement
586
+ */
587
+ async synthesizeInformation(sources, topic) {
588
+ const startTime = Date.now();
589
+ const synthesis = {
590
+ keyFindings: [],
591
+ supportingEvidence: [],
592
+ conflicts: [],
593
+ consensus: [],
594
+ gaps: [],
595
+ recommendations: [],
596
+ llmSynthesis: null
597
+ };
598
+
599
+ try {
600
+ // Extract key claims and facts from each source
601
+ const extractedClaims = await this.extractKeyClaims(sources);
602
+
603
+ // Group related claims
604
+ const claimGroups = this.groupRelatedClaims(extractedClaims);
605
+
606
+ // Detect conflicts between claims
607
+ if (this.enableConflictDetection) {
608
+ synthesis.conflicts = this.detectInformationConflicts(claimGroups);
609
+ this.metrics.conflictsDetected = synthesis.conflicts.length;
610
+ }
611
+
612
+ // Identify consensus areas
613
+ synthesis.consensus = this.identifyConsensus(claimGroups);
614
+
615
+ // Generate key findings
616
+ synthesis.keyFindings = this.generateKeyFindings(claimGroups, sources);
617
+
618
+ // Compile supporting evidence
619
+ synthesis.supportingEvidence = this.compileSupportingEvidence(sources);
620
+
621
+ // Identify research gaps
622
+ synthesis.gaps = this.identifyResearchGaps(claimGroups, topic);
623
+
624
+ // Generate recommendations
625
+ synthesis.recommendations = this.generateResearchRecommendations(synthesis, topic);
626
+
627
+ // LLM-powered comprehensive synthesis
628
+ if (this.enableLLMFeatures && sources.length > 0) {
629
+ try {
630
+ this.logger.info('Generating LLM-powered research synthesis');
631
+
632
+ // Prepare findings for LLM analysis
633
+ const findingsForLLM = synthesis.keyFindings.map(finding => ({
634
+ finding: finding.finding,
635
+ credibility: finding.credibility,
636
+ sources: finding.sources.length
637
+ }));
638
+
639
+ const llmSynthesis = await this.llmManager.synthesizeFindings(
640
+ findingsForLLM,
641
+ topic,
642
+ {
643
+ maxFindings: 10,
644
+ includeConflicts: synthesis.conflicts.length > 0
645
+ }
646
+ );
647
+
648
+ synthesis.llmSynthesis = llmSynthesis;
649
+ this.researchState.synthesisHistory.push({
650
+ timestamp: new Date().toISOString(),
651
+ topic,
652
+ synthesis: llmSynthesis,
653
+ sourceCount: sources.length
654
+ });
655
+
656
+ this.metrics.llmAnalysisCalls++;
657
+ this.logger.info('LLM synthesis completed', {
658
+ confidence: llmSynthesis.confidence,
659
+ insights: llmSynthesis.keyInsights?.length || 0,
660
+ themes: llmSynthesis.themes?.length || 0
661
+ });
662
+
663
+ } catch (llmError) {
664
+ this.logger.warn('LLM synthesis failed', { error: llmError.message });
665
+ synthesis.llmSynthesis = {
666
+ error: 'LLM synthesis unavailable',
667
+ fallback: true
668
+ };
669
+ }
670
+ }
671
+
672
+ } catch (error) {
673
+ this.logger.error('Information synthesis failed', { error: error.message });
674
+ synthesis.error = error.message;
675
+ }
676
+
677
+ this.metrics.synthesisTime += Date.now() - startTime;
678
+ return synthesis;
679
+ }
680
+
681
+ /**
682
+ * Extract key claims from source content
683
+ */
684
+ async extractKeyClaims(sources) {
685
+ const claims = [];
686
+
687
+ for (const source of sources) {
688
+ try {
689
+ if (!source.extractedContent) continue;
690
+
691
+ const content = source.extractedContent.substring(0, 5000); // Limit content length
692
+
693
+ // Use summarization to extract key points
694
+ const summary = await this.summarizeTool.execute({
695
+ text: content,
696
+ options: {
697
+ maxLength: 500,
698
+ extractKeyPoints: true,
699
+ includeSupporting: true
700
+ }
701
+ });
702
+
703
+ if (summary.keyPoints) {
704
+ summary.keyPoints.forEach((point, index) => {
705
+ claims.push({
706
+ id: `${source.link}_claim_${index}`,
707
+ claim: point,
708
+ source: source.link,
709
+ sourceTitle: source.title,
710
+ credibility: source.overallCredibility || 0.5,
711
+ context: summary.supporting?.[index] || '',
712
+ extractedAt: new Date().toISOString()
713
+ });
714
+ });
715
+ }
716
+ } catch (error) {
717
+ this.logger.warn('Claim extraction failed', {
718
+ source: source.link,
719
+ error: error.message
720
+ });
721
+ }
722
+ }
723
+
724
+ return claims;
725
+ }
726
+
727
+ /**
728
+ * Group related claims for analysis
729
+ */
730
+ groupRelatedClaims(claims) {
731
+ const groups = new Map();
732
+
733
+ for (const claim of claims) {
734
+ const keywords = this.extractKeywords(claim.claim);
735
+ const groupKey = keywords.slice(0, 3).sort().join('_');
736
+
737
+ if (!groups.has(groupKey)) {
738
+ groups.set(groupKey, {
739
+ id: groupKey,
740
+ keywords,
741
+ claims: [],
742
+ avgCredibility: 0,
743
+ sourceCount: 0
744
+ });
745
+ }
746
+
747
+ groups.get(groupKey).claims.push(claim);
748
+ }
749
+
750
+ // Calculate group statistics
751
+ groups.forEach(group => {
752
+ group.sourceCount = new Set(group.claims.map(c => c.source)).size;
753
+ group.avgCredibility = group.claims.reduce((sum, c) => sum + c.credibility, 0) / group.claims.length;
754
+ });
755
+
756
+ return Array.from(groups.values());
757
+ }
758
+
759
+ /**
760
+ * Detect conflicts between information claims
761
+ */
762
+ detectInformationConflicts(claimGroups) {
763
+ const conflicts = [];
764
+
765
+ for (const group of claimGroups) {
766
+ if (group.claims.length < 2) continue;
767
+
768
+ // Simple conflict detection based on contradictory terms
769
+ const conflictIndicators = [
770
+ ['not', 'is'], ['false', 'true'], ['incorrect', 'correct'],
771
+ ['impossible', 'possible'], ['never', 'always'], ['no', 'yes']
772
+ ];
773
+
774
+ for (let i = 0; i < group.claims.length; i++) {
775
+ for (let j = i + 1; j < group.claims.length; j++) {
776
+ const claim1 = group.claims[i];
777
+ const claim2 = group.claims[j];
778
+
779
+ const text1 = claim1.claim.toLowerCase();
780
+ const text2 = claim2.claim.toLowerCase();
781
+
782
+ for (const [neg, pos] of conflictIndicators) {
783
+ if ((text1.includes(neg) && text2.includes(pos)) ||
784
+ (text1.includes(pos) && text2.includes(neg))) {
785
+
786
+ conflicts.push({
787
+ id: `conflict_${conflicts.length}`,
788
+ type: 'contradiction',
789
+ claim1: claim1,
790
+ claim2: claim2,
791
+ severity: this.calculateConflictSeverity(claim1, claim2),
792
+ detectedAt: new Date().toISOString()
793
+ });
794
+
795
+ break;
796
+ }
797
+ }
798
+ }
799
+ }
800
+ }
801
+
802
+ return conflicts;
803
+ }
804
+
805
+ /**
806
+ * Identify areas of consensus
807
+ */
808
+ identifyConsensus(claimGroups) {
809
+ return claimGroups
810
+ .filter(group => group.sourceCount >= 2 && group.avgCredibility >= 0.6)
811
+ .map(group => ({
812
+ topic: group.keywords.join(' '),
813
+ supportingClaims: group.claims.length,
814
+ supportingSources: group.sourceCount,
815
+ averageCredibility: group.avgCredibility,
816
+ consensusStrength: this.calculateConsensusStrength(group)
817
+ }))
818
+ .sort((a, b) => b.consensusStrength - a.consensusStrength);
819
+ }
820
+
821
+ /**
822
+ * Calculate various scoring functions
823
+ */
824
+ calculateInitialCredibility(source) {
825
+ let score = 0.5;
826
+
827
+ // Domain-based scoring
828
+ try {
829
+ const domain = new URL(source.link).hostname;
830
+ if (domain.includes('edu')) score += 0.3;
831
+ else if (domain.includes('gov')) score += 0.4;
832
+ else if (domain.includes('org')) score += 0.2;
833
+ } catch {}
834
+
835
+ // Content indicators
836
+ if (source.snippet) {
837
+ const snippet = source.snippet.toLowerCase();
838
+ if (snippet.includes('research') || snippet.includes('study')) score += 0.1;
839
+ if (snippet.includes('peer reviewed')) score += 0.2;
840
+ }
841
+
842
+ return Math.min(1, score);
843
+ }
844
+
845
+ calculateResearchRelevance(result, query) {
846
+ let relevance = 0.5;
847
+
848
+ const title = (result.title || '').toLowerCase();
849
+ const snippet = (result.snippet || '').toLowerCase();
850
+ const queryLower = query.toLowerCase();
851
+
852
+ // Title relevance
853
+ if (title.includes(queryLower)) relevance += 0.3;
854
+
855
+ // Snippet relevance
856
+ if (snippet.includes(queryLower)) relevance += 0.2;
857
+
858
+ return Math.min(1, relevance);
859
+ }
860
+
861
+ calculateReadabilityScore(content) {
862
+ if (!content) return 0.5;
863
+
864
+ const words = content.split(' ').length;
865
+ const sentences = content.split(/[.!?]/).length;
866
+ const avgWordsPerSentence = words / Math.max(sentences, 1);
867
+
868
+ // Simple readability approximation
869
+ if (avgWordsPerSentence < 15) return 0.8; // Easy to read
870
+ if (avgWordsPerSentence < 20) return 0.6; // Moderate
871
+ return 0.4; // Difficult
872
+ }
873
+
874
+ calculateOverallCredibility(factors) {
875
+ const weights = {
876
+ domainAuthority: 0.3,
877
+ contentQuality: 0.25,
878
+ sourceType: 0.2,
879
+ recency: 0.1,
880
+ authorityIndicators: 0.1,
881
+ citationPotential: 0.05
882
+ };
883
+
884
+ let score = 0;
885
+ Object.entries(weights).forEach(([factor, weight]) => {
886
+ score += (factors[factor] || 0.5) * weight;
887
+ });
888
+
889
+ return Math.min(1, Math.max(0, score));
890
+ }
891
+
892
+ calculateConflictSeverity(claim1, claim2) {
893
+ const credibilityDiff = Math.abs(claim1.credibility - claim2.credibility);
894
+ return 0.5 + (credibilityDiff * 0.5);
895
+ }
896
+
897
+ calculateConsensusStrength(group) {
898
+ return (group.sourceCount * 0.4) + (group.avgCredibility * 0.6);
899
+ }
900
+
901
+ /**
902
+ * Utility functions
903
+ */
904
+ assessDomainAuthority(url) {
905
+ try {
906
+ const domain = new URL(url).hostname.toLowerCase();
907
+
908
+ // High authority domains
909
+ if (domain.includes('edu') || domain.includes('gov')) return 0.9;
910
+ if (domain.includes('org')) return 0.7;
911
+ if (['wikipedia.org', 'pubmed.ncbi.nlm.nih.gov'].includes(domain)) return 0.8;
912
+
913
+ return 0.5;
914
+ } catch {
915
+ return 0.3;
916
+ }
917
+ }
918
+
919
+ assessContentQuality(source) {
920
+ let score = 0.5;
921
+
922
+ if (source.wordCount > 500) score += 0.2;
923
+ if (source.readabilityScore > 0.6) score += 0.1;
924
+ if (source.metadata?.author) score += 0.1;
925
+ if (source.structuredData) score += 0.1;
926
+
927
+ return Math.min(1, score);
928
+ }
929
+
930
+ identifySourceType(source) {
931
+ const content = (source.extractedContent || '').toLowerCase();
932
+ const title = (source.title || '').toLowerCase();
933
+
934
+ if (content.includes('abstract') || content.includes('methodology')) return 0.9;
935
+ if (title.includes('research') || title.includes('study')) return 0.8;
936
+ if (content.includes('peer reviewed')) return 0.9;
937
+ if (title.includes('news') || title.includes('blog')) return 0.4;
938
+
939
+ return 0.6;
940
+ }
941
+
942
+ assessContentRecency(source) {
943
+ // Simple recency assessment - would need better date extraction in real implementation
944
+ return 0.6; // Neutral score
945
+ }
946
+
947
+ findAuthorityIndicators(source) {
948
+ let score = 0.5;
949
+ const content = (source.extractedContent || '').toLowerCase();
950
+
951
+ if (content.includes('citation') || content.includes('reference')) score += 0.2;
952
+ if (content.includes('doi:')) score += 0.2;
953
+ if (source.metadata?.author) score += 0.1;
954
+
955
+ return Math.min(1, score);
956
+ }
957
+
958
+ assessCitationPotential(source) {
959
+ let score = 0.5;
960
+
961
+ if (source.metadata?.doi) score += 0.3;
962
+ if (source.structuredData?.citations) score += 0.2;
963
+
964
+ return Math.min(1, score);
965
+ }
966
+
967
+ extractKeywords(text) {
968
+ return text
969
+ .toLowerCase()
970
+ .replace(/[^\w\s]/g, ' ')
971
+ .split(/\s+/)
972
+ .filter(word => word.length > 3)
973
+ .slice(0, 10);
974
+ }
975
+
976
+ /**
977
+ * Calculate traditional relevance score without LLM
978
+ */
979
+ calculateTraditionalRelevance(content, topic) {
980
+ if (!content || !topic) return 0.5;
981
+
982
+ const topicWords = topic.toLowerCase().split(/\s+/).filter(word => word.length > 2);
983
+ const contentLower = content.toLowerCase();
984
+
985
+ let matches = 0;
986
+ let totalWeight = 0;
987
+
988
+ topicWords.forEach(word => {
989
+ const regex = new RegExp(`\\b${word}\\b`, 'g');
990
+ const wordMatches = (contentLower.match(regex) || []).length;
991
+ matches += wordMatches;
992
+ totalWeight += word.length * wordMatches; // Weight by word importance
993
+ });
994
+
995
+ // Calculate relevance based on keyword density and content length
996
+ const contentWords = content.split(/\s+/).length;
997
+ const density = matches / Math.max(contentWords, 1);
998
+ const coverage = matches / Math.max(topicWords.length, 1);
999
+
1000
+ // Combine density and coverage with weights
1001
+ const relevanceScore = (density * 0.4) + (coverage * 0.6);
1002
+
1003
+ return Math.min(1, Math.max(0, relevanceScore));
1004
+ }
1005
+
1006
+ /**
1007
+ * Utility methods for research workflow
1008
+ */
1009
+ async processWithTimeLimit(asyncFunction) {
1010
+ const timeoutPromise = new Promise((_, reject) => {
1011
+ setTimeout(() => reject(new Error('Research time limit exceeded')), this.timeLimit);
1012
+ });
1013
+
1014
+ try {
1015
+ await Promise.race([asyncFunction(), timeoutPromise]);
1016
+ } catch (error) {
1017
+ if (error.message === 'Research time limit exceeded') {
1018
+ this.logger.warn('Research time limit reached, returning partial results');
1019
+ } else {
1020
+ throw error;
1021
+ }
1022
+ }
1023
+ }
1024
+
1025
+ deduplicateSources(sources) {
1026
+ const seen = new Set();
1027
+ return sources.filter(source => {
1028
+ const key = source.link;
1029
+ if (seen.has(key)) return false;
1030
+ seen.add(key);
1031
+ return true;
1032
+ });
1033
+ }
1034
+
1035
+ async rankSourcesByResearchValue(sources) {
1036
+ return sources.sort((a, b) => {
1037
+ const scoreA = (a.credibilityScore || 0) + (a.researchRelevance || 0);
1038
+ const scoreB = (b.credibilityScore || 0) + (b.researchRelevance || 0);
1039
+ return scoreB - scoreA;
1040
+ });
1041
+ }
1042
+
1043
+ generateKeyFindings(claimGroups, sources) {
1044
+ return claimGroups
1045
+ .filter(group => group.avgCredibility >= 0.6)
1046
+ .sort((a, b) => b.consensusStrength - a.consensusStrength)
1047
+ .slice(0, 10)
1048
+ .map(group => ({
1049
+ finding: group.keywords.join(' '),
1050
+ supportingClaims: group.claims.length,
1051
+ credibility: group.avgCredibility,
1052
+ sources: group.claims.map(c => c.source)
1053
+ }));
1054
+ }
1055
+
1056
+ compileSupportingEvidence(sources) {
1057
+ return sources
1058
+ .filter(source => source.overallCredibility >= 0.7)
1059
+ .map(source => ({
1060
+ title: source.title,
1061
+ url: source.link,
1062
+ credibility: source.overallCredibility,
1063
+ evidence: source.extractedContent?.substring(0, 300) + '...'
1064
+ }))
1065
+ .slice(0, 15);
1066
+ }
1067
+
1068
+ identifyResearchGaps(claimGroups, topic) {
1069
+ const gaps = [];
1070
+
1071
+ // Identify areas with low claim count or credibility
1072
+ const weakAreas = claimGroups.filter(group =>
1073
+ group.claims.length < 2 || group.avgCredibility < 0.5
1074
+ );
1075
+
1076
+ weakAreas.forEach(area => {
1077
+ gaps.push({
1078
+ area: area.keywords.join(' '),
1079
+ issue: 'Limited reliable sources',
1080
+ suggestion: `More research needed on ${area.keywords.join(' ')} related to ${topic}`
1081
+ });
1082
+ });
1083
+
1084
+ return gaps.slice(0, 5);
1085
+ }
1086
+
1087
+ generateResearchRecommendations(synthesis, topic) {
1088
+ const recommendations = [];
1089
+
1090
+ if (synthesis.conflicts.length > 0) {
1091
+ recommendations.push({
1092
+ type: 'conflict_resolution',
1093
+ priority: 'high',
1094
+ description: `Investigate ${synthesis.conflicts.length} conflicting claims about ${topic}`
1095
+ });
1096
+ }
1097
+
1098
+ if (synthesis.gaps.length > 0) {
1099
+ recommendations.push({
1100
+ type: 'gap_filling',
1101
+ priority: 'medium',
1102
+ description: `Address research gaps in ${synthesis.gaps.map(g => g.area).join(', ')}`
1103
+ });
1104
+ }
1105
+
1106
+ recommendations.push({
1107
+ type: 'validation',
1108
+ priority: 'medium',
1109
+ description: `Validate findings with additional peer-reviewed sources`
1110
+ });
1111
+
1112
+ return recommendations;
1113
+ }
1114
+
1115
+ compileResearchResults(topic, synthesis, options) {
1116
+ const baseResults = {
1117
+ sessionId: this.researchState.sessionId,
1118
+ topic,
1119
+ researchSummary: {
1120
+ totalSources: this.metrics.urlsProcessed,
1121
+ verifiedSources: this.metrics.sourcesVerified,
1122
+ keyFindings: synthesis.keyFindings.length,
1123
+ conflictsFound: synthesis.conflicts.length,
1124
+ consensusAreas: synthesis.consensus.length,
1125
+ llmEnhanced: this.enableLLMFeatures
1126
+ },
1127
+ findings: synthesis.keyFindings,
1128
+ supportingEvidence: synthesis.supportingEvidence,
1129
+ consensus: synthesis.consensus,
1130
+ conflicts: synthesis.conflicts,
1131
+ researchGaps: synthesis.gaps,
1132
+ recommendations: synthesis.recommendations,
1133
+ credibilityAssessment: {
1134
+ highCredibilitySources: Array.from(this.researchState.credibilityScores.entries())
1135
+ .filter(([_, score]) => score >= 0.7)
1136
+ .length,
1137
+ averageCredibility: this.calculateAverageCredibility(),
1138
+ credibilityDistribution: this.getCredibilityDistribution()
1139
+ },
1140
+ activityLog: this.researchState.activityLog,
1141
+ performance: {
1142
+ ...this.metrics,
1143
+ timeLimit: this.timeLimit,
1144
+ completedWithinLimit: this.metrics.totalProcessingTime < this.timeLimit
1145
+ },
1146
+ metadata: {
1147
+ generatedAt: new Date().toISOString(),
1148
+ researchDepth: this.researchState.currentDepth,
1149
+ configuration: {
1150
+ maxDepth: this.maxDepth,
1151
+ maxUrls: this.maxUrls,
1152
+ timeLimit: this.timeLimit,
1153
+ llmEnabled: this.enableLLMFeatures
1154
+ }
1155
+ }
1156
+ };
1157
+
1158
+ // Add LLM-specific analysis if available
1159
+ if (this.enableLLMFeatures) {
1160
+ baseResults.llmAnalysis = {
1161
+ synthesis: synthesis.llmSynthesis,
1162
+ relevanceScores: Object.fromEntries(this.researchState.relevanceScores),
1163
+ semanticSimilarities: Object.fromEntries(this.researchState.semanticSimilarities),
1164
+ analysisHistory: this.researchState.synthesisHistory,
1165
+ llmMetrics: {
1166
+ totalLLMCalls: this.metrics.llmAnalysisCalls,
1167
+ semanticAnalysisTime: this.metrics.semanticAnalysisTime,
1168
+ queryExpansionTime: this.metrics.queryExpansionTime,
1169
+ synthesisTime: this.metrics.synthesisTime
1170
+ }
1171
+ };
1172
+
1173
+ // Enhanced insights from LLM synthesis
1174
+ if (synthesis.llmSynthesis && !synthesis.llmSynthesis.error) {
1175
+ baseResults.insights = {
1176
+ aiSummary: synthesis.llmSynthesis.summary,
1177
+ keyThemes: synthesis.llmSynthesis.themes,
1178
+ confidenceLevel: synthesis.llmSynthesis.confidence,
1179
+ intelligentInsights: synthesis.llmSynthesis.keyInsights,
1180
+ aiRecommendations: synthesis.llmSynthesis.recommendations,
1181
+ identifiedGaps: synthesis.llmSynthesis.gaps
1182
+ };
1183
+ }
1184
+
1185
+ // Provenance tracking for LLM-enhanced sources
1186
+ baseResults.provenance = {
1187
+ sourceAnalysis: Array.from(this.researchState.llmAnalysis.entries()).map(([url, analysis]) => ({
1188
+ url,
1189
+ relevanceScore: analysis.relevanceScore,
1190
+ keyPoints: analysis.keyPoints,
1191
+ topicAlignment: analysis.topicAlignment,
1192
+ credibilityIndicators: analysis.credibilityIndicators
1193
+ })),
1194
+ queryExpansion: this.researchState.semanticSimilarities.size > 0 ?
1195
+ Object.fromEntries(this.researchState.semanticSimilarities) : null,
1196
+ totalAnalyzedSources: this.researchState.llmAnalysis.size
1197
+ };
1198
+ }
1199
+
1200
+ return baseResults;
1201
+ }
1202
+
1203
+ handleResearchError(error, topic, sessionId) {
1204
+ return {
1205
+ sessionId,
1206
+ topic,
1207
+ error: error.message,
1208
+ partialResults: {
1209
+ visitedUrls: Array.from(this.researchState.visitedUrls),
1210
+ activityLog: this.researchState.activityLog,
1211
+ metrics: this.metrics
1212
+ },
1213
+ recommendations: [{
1214
+ type: 'error_recovery',
1215
+ priority: 'high',
1216
+ description: 'Retry research with reduced scope or increased time limit'
1217
+ }],
1218
+ generatedAt: new Date().toISOString()
1219
+ };
1220
+ }
1221
+
1222
+ calculateAverageCredibility() {
1223
+ const scores = Array.from(this.researchState.credibilityScores.values());
1224
+ return scores.length > 0 ?
1225
+ scores.reduce((sum, score) => sum + score, 0) / scores.length : 0;
1226
+ }
1227
+
1228
+ getCredibilityDistribution() {
1229
+ const scores = Array.from(this.researchState.credibilityScores.values());
1230
+ const high = scores.filter(s => s >= 0.7).length;
1231
+ const medium = scores.filter(s => s >= 0.4 && s < 0.7).length;
1232
+ const low = scores.filter(s => s < 0.4).length;
1233
+
1234
+ return { high, medium, low };
1235
+ }
1236
+
1237
+ logActivity(type, data) {
1238
+ const activity = {
1239
+ type,
1240
+ timestamp: new Date().toISOString(),
1241
+ data
1242
+ };
1243
+
1244
+ this.researchState.activityLog.push(activity);
1245
+ this.emit('activityLogged', activity);
1246
+ }
1247
+
1248
+ generateSessionId() {
1249
+ return `research_${Date.now()}_${Math.random().toString(36).substring(2, 8)}`;
1250
+ }
1251
+
1252
+ // Public API methods for monitoring and control
1253
+ getResearchState() {
1254
+ return { ...this.researchState };
1255
+ }
1256
+
1257
+ getMetrics() {
1258
+ return { ...this.metrics };
1259
+ }
1260
+
1261
+ pauseResearch() {
1262
+ this.emit('researchPaused', { sessionId: this.researchState.sessionId });
1263
+ }
1264
+
1265
+ resumeResearch() {
1266
+ this.emit('researchResumed', { sessionId: this.researchState.sessionId });
1267
+ }
1268
+
1269
+ stopResearch() {
1270
+ this.emit('researchStopped', { sessionId: this.researchState.sessionId });
1271
+ }
1272
+
1273
+ /**
1274
+ * Cleanup method for proper resource disposal
1275
+ */
1276
+ async cleanup() {
1277
+ try {
1278
+ // Stop any active research
1279
+ this.stopResearch();
1280
+
1281
+ // Clear cache if available
1282
+ if (this.cache && typeof this.cache.clear === "function") {
1283
+ await this.cache.clear();
1284
+ }
1285
+
1286
+ // Clear all event listeners
1287
+ this.removeAllListeners();
1288
+
1289
+ // Reset research state
1290
+ this.researchState = {
1291
+ sessionId: null,
1292
+ currentDepth: 0,
1293
+ visitedUrls: new Set(),
1294
+ searchResults: new Map(),
1295
+ extractedContent: new Map(),
1296
+ researchFindings: [],
1297
+ credibilityScores: new Map(),
1298
+ conflictMap: new Map(),
1299
+ activityLog: [],
1300
+ llmAnalysis: new Map(),
1301
+ semanticSimilarities: new Map(),
1302
+ relevanceScores: new Map(),
1303
+ synthesisHistory: []
1304
+ };
1305
+
1306
+ // Reset metrics
1307
+ this.metrics = {
1308
+ searchQueries: 0,
1309
+ urlsProcessed: 0,
1310
+ contentExtracted: 0,
1311
+ conflictsDetected: 0,
1312
+ sourcesVerified: 0,
1313
+ cacheHits: 0,
1314
+ totalProcessingTime: 0,
1315
+ llmAnalysisCalls: 0,
1316
+ semanticAnalysisTime: 0,
1317
+ queryExpansionTime: 0,
1318
+ synthesisTime: 0
1319
+ };
1320
+
1321
+ } catch (error) {
1322
+ // Silent cleanup - do not throw errors during cleanup
1323
+ console.warn("Warning during ResearchOrchestrator cleanup:", error.message);
1324
+ }
1325
+ }
1326
+ }
1327
+