crawlforge-mcp-server 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +315 -0
- package/LICENSE +21 -0
- package/README.md +181 -0
- package/package.json +115 -0
- package/server.js +1963 -0
- package/setup.js +112 -0
- package/src/constants/config.js +615 -0
- package/src/core/ActionExecutor.js +1104 -0
- package/src/core/AlertNotificationSystem.js +601 -0
- package/src/core/AuthManager.js +315 -0
- package/src/core/ChangeTracker.js +2306 -0
- package/src/core/JobManager.js +687 -0
- package/src/core/LLMsTxtAnalyzer.js +753 -0
- package/src/core/LocalizationManager.js +1615 -0
- package/src/core/PerformanceManager.js +828 -0
- package/src/core/ResearchOrchestrator.js +1327 -0
- package/src/core/SnapshotManager.js +1037 -0
- package/src/core/StealthBrowserManager.js +1795 -0
- package/src/core/WebhookDispatcher.js +745 -0
- package/src/core/analysis/ContentAnalyzer.js +749 -0
- package/src/core/analysis/LinkAnalyzer.js +972 -0
- package/src/core/cache/CacheManager.js +821 -0
- package/src/core/connections/ConnectionPool.js +553 -0
- package/src/core/crawlers/BFSCrawler.js +845 -0
- package/src/core/integrations/PerformanceIntegration.js +377 -0
- package/src/core/llm/AnthropicProvider.js +135 -0
- package/src/core/llm/LLMManager.js +415 -0
- package/src/core/llm/LLMProvider.js +97 -0
- package/src/core/llm/OpenAIProvider.js +127 -0
- package/src/core/processing/BrowserProcessor.js +986 -0
- package/src/core/processing/ContentProcessor.js +505 -0
- package/src/core/processing/PDFProcessor.js +448 -0
- package/src/core/processing/StreamProcessor.js +673 -0
- package/src/core/queue/QueueManager.js +98 -0
- package/src/core/workers/WorkerPool.js +585 -0
- package/src/core/workers/worker.js +743 -0
- package/src/monitoring/healthCheck.js +600 -0
- package/src/monitoring/metrics.js +761 -0
- package/src/optimization/wave3-optimizations.js +932 -0
- package/src/security/security-patches.js +120 -0
- package/src/security/security-tests.js +355 -0
- package/src/security/wave3-security.js +652 -0
- package/src/tools/advanced/BatchScrapeTool.js +1089 -0
- package/src/tools/advanced/ScrapeWithActionsTool.js +669 -0
- package/src/tools/crawl/crawlDeep.js +449 -0
- package/src/tools/crawl/mapSite.js +400 -0
- package/src/tools/extract/analyzeContent.js +624 -0
- package/src/tools/extract/extractContent.js +329 -0
- package/src/tools/extract/processDocument.js +503 -0
- package/src/tools/extract/summarizeContent.js +376 -0
- package/src/tools/llmstxt/generateLLMsTxt.js +570 -0
- package/src/tools/research/deepResearch.js +706 -0
- package/src/tools/search/adapters/duckduckgoSearch.js +398 -0
- package/src/tools/search/adapters/googleSearch.js +236 -0
- package/src/tools/search/adapters/searchProviderFactory.js +96 -0
- package/src/tools/search/queryExpander.js +543 -0
- package/src/tools/search/ranking/ResultDeduplicator.js +676 -0
- package/src/tools/search/ranking/ResultRanker.js +497 -0
- package/src/tools/search/searchWeb.js +482 -0
- package/src/tools/tracking/trackChanges.js +1355 -0
- package/src/utils/CircuitBreaker.js +515 -0
- package/src/utils/ErrorHandlingConfig.js +342 -0
- package/src/utils/HumanBehaviorSimulator.js +569 -0
- package/src/utils/Logger.js +568 -0
- package/src/utils/MemoryMonitor.js +173 -0
- package/src/utils/RetryManager.js +386 -0
- package/src/utils/contentUtils.js +588 -0
- package/src/utils/domainFilter.js +612 -0
- package/src/utils/inputValidation.js +766 -0
- package/src/utils/rateLimiter.js +196 -0
- package/src/utils/robotsChecker.js +91 -0
- package/src/utils/securityMiddleware.js +416 -0
- package/src/utils/sitemapParser.js +678 -0
- package/src/utils/ssrfProtection.js +640 -0
- package/src/utils/urlNormalizer.js +168 -0
|
@@ -0,0 +1,1327 @@
|
|
|
1
|
+
import { EventEmitter } from 'events';
|
|
2
|
+
import { SearchWebTool } from '../tools/search/searchWeb.js';
|
|
3
|
+
import { CrawlDeepTool } from '../tools/crawl/crawlDeep.js';
|
|
4
|
+
import { ExtractContentTool } from '../tools/extract/extractContent.js';
|
|
5
|
+
import { SummarizeContentTool } from '../tools/extract/summarizeContent.js';
|
|
6
|
+
import { QueryExpander } from '../tools/search/queryExpander.js';
|
|
7
|
+
import { ResultRanker } from '../tools/search/ranking/ResultRanker.js';
|
|
8
|
+
import { CacheManager } from './cache/CacheManager.js';
|
|
9
|
+
import { Logger } from '../utils/Logger.js';
|
|
10
|
+
import { LLMManager } from './llm/LLMManager.js';
|
|
11
|
+
|
|
12
|
+
/**
|
|
13
|
+
* ResearchOrchestrator - Multi-stage research orchestration engine with LLM integration
|
|
14
|
+
* Coordinates complex research workflows with intelligent query expansion,
|
|
15
|
+
* source verification, information synthesis, and AI-powered analysis
|
|
16
|
+
*
|
|
17
|
+
* Phase 2.1 Features:
|
|
18
|
+
* - LLM-powered query expansion with semantic understanding
|
|
19
|
+
* - AI-driven relevance scoring and content analysis
|
|
20
|
+
* - Intelligent research synthesis with conflict detection
|
|
21
|
+
* - Advanced provenance tracking and activity logging
|
|
22
|
+
* - Smart URL prioritization based on content quality
|
|
23
|
+
*/
|
|
24
|
+
export class ResearchOrchestrator extends EventEmitter {
|
|
25
|
+
constructor(options = {}) {
|
|
26
|
+
super();
|
|
27
|
+
|
|
28
|
+
const {
|
|
29
|
+
maxDepth = 5,
|
|
30
|
+
maxUrls = 100,
|
|
31
|
+
timeLimit = 120000, // 2 minutes default
|
|
32
|
+
concurrency = 5,
|
|
33
|
+
enableSourceVerification = true,
|
|
34
|
+
enableConflictDetection = true,
|
|
35
|
+
cacheEnabled = true,
|
|
36
|
+
cacheTTL = 1800000, // 30 minutes
|
|
37
|
+
searchConfig = {},
|
|
38
|
+
crawlConfig = {},
|
|
39
|
+
extractConfig = {},
|
|
40
|
+
summarizeConfig = {}
|
|
41
|
+
} = options;
|
|
42
|
+
|
|
43
|
+
this.maxDepth = Math.min(Math.max(1, maxDepth), 10);
|
|
44
|
+
this.maxUrls = Math.min(Math.max(1, maxUrls), 1000);
|
|
45
|
+
this.timeLimit = Math.min(Math.max(30000, timeLimit), 300000);
|
|
46
|
+
this.concurrency = Math.min(Math.max(1, concurrency), 20);
|
|
47
|
+
this.enableSourceVerification = enableSourceVerification;
|
|
48
|
+
this.enableConflictDetection = enableConflictDetection;
|
|
49
|
+
|
|
50
|
+
// Initialize tools
|
|
51
|
+
this.searchTool = new SearchWebTool(searchConfig);
|
|
52
|
+
this.crawlTool = new CrawlDeepTool(crawlConfig);
|
|
53
|
+
this.extractTool = new ExtractContentTool(extractConfig);
|
|
54
|
+
this.summarizeTool = new SummarizeContentTool(summarizeConfig);
|
|
55
|
+
|
|
56
|
+
// Initialize utilities
|
|
57
|
+
this.queryExpander = new QueryExpander();
|
|
58
|
+
this.resultRanker = new ResultRanker();
|
|
59
|
+
this.cache = cacheEnabled ? new CacheManager({ ttl: cacheTTL }) : null;
|
|
60
|
+
this.logger = new Logger({ component: 'ResearchOrchestrator' });
|
|
61
|
+
|
|
62
|
+
// Initialize LLM Manager for AI-powered research
|
|
63
|
+
this.llmManager = new LLMManager(options.llmConfig || {});
|
|
64
|
+
this.enableLLMFeatures = this.llmManager.isAvailable();
|
|
65
|
+
|
|
66
|
+
if (this.enableLLMFeatures) {
|
|
67
|
+
this.logger.info('LLM-powered research features enabled');
|
|
68
|
+
} else {
|
|
69
|
+
this.logger.warn('LLM providers not available, using fallback methods');
|
|
70
|
+
}
|
|
71
|
+
|
|
72
|
+
// Research state tracking
|
|
73
|
+
this.researchState = {
|
|
74
|
+
sessionId: null,
|
|
75
|
+
startTime: null,
|
|
76
|
+
currentDepth: 0,
|
|
77
|
+
visitedUrls: new Set(),
|
|
78
|
+
searchResults: new Map(),
|
|
79
|
+
extractedContent: new Map(),
|
|
80
|
+
researchFindings: [],
|
|
81
|
+
credibilityScores: new Map(),
|
|
82
|
+
conflictMap: new Map(),
|
|
83
|
+
activityLog: [],
|
|
84
|
+
llmAnalysis: new Map(),
|
|
85
|
+
semanticSimilarities: new Map(),
|
|
86
|
+
relevanceScores: new Map(),
|
|
87
|
+
synthesisHistory: []
|
|
88
|
+
};
|
|
89
|
+
|
|
90
|
+
// Performance metrics
|
|
91
|
+
this.metrics = {
|
|
92
|
+
searchQueries: 0,
|
|
93
|
+
urlsProcessed: 0,
|
|
94
|
+
contentExtracted: 0,
|
|
95
|
+
conflictsDetected: 0,
|
|
96
|
+
sourcesVerified: 0,
|
|
97
|
+
cacheHits: 0,
|
|
98
|
+
totalProcessingTime: 0,
|
|
99
|
+
llmAnalysisCalls: 0,
|
|
100
|
+
semanticAnalysisTime: 0,
|
|
101
|
+
queryExpansionTime: 0,
|
|
102
|
+
synthesisTime: 0
|
|
103
|
+
};
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
/**
|
|
107
|
+
* Conduct comprehensive deep research on a topic
|
|
108
|
+
* @param {string} topic - The research topic/question
|
|
109
|
+
* @param {Object} options - Research configuration options
|
|
110
|
+
* @returns {Promise<Object>} Research results
|
|
111
|
+
*/
|
|
112
|
+
async conductResearch(topic, options = {}) {
|
|
113
|
+
const sessionId = this.generateSessionId();
|
|
114
|
+
const startTime = Date.now();
|
|
115
|
+
|
|
116
|
+
this.initializeResearchSession(sessionId, topic, startTime);
|
|
117
|
+
|
|
118
|
+
try {
|
|
119
|
+
this.logger.info('Starting deep research', { sessionId, topic, options });
|
|
120
|
+
|
|
121
|
+
// Stage 1: Initial topic exploration and query expansion
|
|
122
|
+
const expandedQueries = await this.expandResearchTopic(topic);
|
|
123
|
+
this.logActivity('topic_expansion', { originalTopic: topic, expandedQueries });
|
|
124
|
+
|
|
125
|
+
// Stage 2: Broad information gathering
|
|
126
|
+
const initialSources = await this.gatherInitialSources(expandedQueries, options);
|
|
127
|
+
this.logActivity('initial_gathering', { sourcesFound: initialSources.length });
|
|
128
|
+
|
|
129
|
+
// Stage 3: Deep exploration of promising sources
|
|
130
|
+
const detailedFindings = await this.exploreSourcesInDepth(initialSources, options);
|
|
131
|
+
this.logActivity('deep_exploration', { findingsCount: detailedFindings.length });
|
|
132
|
+
|
|
133
|
+
// Stage 4: Source credibility assessment
|
|
134
|
+
const verifiedSources = this.enableSourceVerification ?
|
|
135
|
+
await this.verifySourceCredibility(detailedFindings) : detailedFindings;
|
|
136
|
+
this.logActivity('source_verification', { verifiedCount: verifiedSources.length });
|
|
137
|
+
|
|
138
|
+
// Stage 5: Information synthesis and conflict detection
|
|
139
|
+
const synthesizedResults = await this.synthesizeInformation(verifiedSources, topic);
|
|
140
|
+
this.logActivity('information_synthesis', { conflictsFound: synthesizedResults.conflicts.length });
|
|
141
|
+
|
|
142
|
+
// Stage 6: Final result compilation
|
|
143
|
+
const finalResults = this.compileResearchResults(topic, synthesizedResults, options);
|
|
144
|
+
|
|
145
|
+
const totalTime = Date.now() - startTime;
|
|
146
|
+
this.metrics.totalProcessingTime = totalTime;
|
|
147
|
+
|
|
148
|
+
this.logger.info('Research completed', {
|
|
149
|
+
sessionId,
|
|
150
|
+
duration: totalTime,
|
|
151
|
+
findings: finalResults.findings.length
|
|
152
|
+
});
|
|
153
|
+
|
|
154
|
+
this.emit('researchCompleted', {
|
|
155
|
+
sessionId,
|
|
156
|
+
topic,
|
|
157
|
+
duration: totalTime,
|
|
158
|
+
findings: finalResults.findings.length
|
|
159
|
+
});
|
|
160
|
+
|
|
161
|
+
return finalResults;
|
|
162
|
+
|
|
163
|
+
} catch (error) {
|
|
164
|
+
this.logger.error('Research failed', { sessionId, error: error.message });
|
|
165
|
+
this.emit('researchFailed', { sessionId, topic, error: error.message });
|
|
166
|
+
|
|
167
|
+
return this.handleResearchError(error, topic, sessionId);
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
|
|
171
|
+
/**
|
|
172
|
+
* Initialize research session state
|
|
173
|
+
*/
|
|
174
|
+
initializeResearchSession(sessionId, topic, startTime) {
|
|
175
|
+
this.researchState = {
|
|
176
|
+
sessionId,
|
|
177
|
+
topic,
|
|
178
|
+
startTime,
|
|
179
|
+
currentDepth: 0,
|
|
180
|
+
visitedUrls: new Set(),
|
|
181
|
+
searchResults: new Map(),
|
|
182
|
+
extractedContent: new Map(),
|
|
183
|
+
researchFindings: [],
|
|
184
|
+
credibilityScores: new Map(),
|
|
185
|
+
conflictMap: new Map(),
|
|
186
|
+
activityLog: []
|
|
187
|
+
};
|
|
188
|
+
|
|
189
|
+
// Reset metrics
|
|
190
|
+
Object.keys(this.metrics).forEach(key => {
|
|
191
|
+
this.metrics[key] = 0;
|
|
192
|
+
});
|
|
193
|
+
}
|
|
194
|
+
|
|
195
|
+
/**
|
|
196
|
+
* Expand research topic into multiple targeted queries with LLM enhancement
|
|
197
|
+
*/
|
|
198
|
+
async expandResearchTopic(topic) {
|
|
199
|
+
const startTime = Date.now();
|
|
200
|
+
|
|
201
|
+
try {
|
|
202
|
+
const cacheKey = this.cache ? this.cache.generateKey('topic_expansion_v2', { topic, llm: this.enableLLMFeatures }) : null;
|
|
203
|
+
|
|
204
|
+
if (this.cache && cacheKey) {
|
|
205
|
+
const cached = await this.cache.get(cacheKey);
|
|
206
|
+
if (cached) {
|
|
207
|
+
this.metrics.cacheHits++;
|
|
208
|
+
return cached;
|
|
209
|
+
}
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
let expandedQueries = [];
|
|
213
|
+
|
|
214
|
+
// LLM-powered query expansion (preferred)
|
|
215
|
+
if (this.enableLLMFeatures) {
|
|
216
|
+
try {
|
|
217
|
+
this.logger.info('Using LLM for intelligent query expansion');
|
|
218
|
+
expandedQueries = await this.llmManager.expandQuery(topic, {
|
|
219
|
+
maxExpansions: 8,
|
|
220
|
+
includeContextual: true,
|
|
221
|
+
includeSynonyms: true,
|
|
222
|
+
includeRelated: true
|
|
223
|
+
});
|
|
224
|
+
this.metrics.llmAnalysisCalls++;
|
|
225
|
+
} catch (llmError) {
|
|
226
|
+
this.logger.warn('LLM query expansion failed, falling back to traditional methods', { error: llmError.message });
|
|
227
|
+
}
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
// Fallback to traditional expansion if LLM failed or unavailable
|
|
231
|
+
if (expandedQueries.length === 0) {
|
|
232
|
+
expandedQueries = await this.queryExpander.expandQuery(topic, {
|
|
233
|
+
enableSynonyms: true,
|
|
234
|
+
enableSpellCheck: true,
|
|
235
|
+
enablePhraseDetection: true,
|
|
236
|
+
maxExpansions: 8
|
|
237
|
+
});
|
|
238
|
+
}
|
|
239
|
+
|
|
240
|
+
// Add research-specific query variations
|
|
241
|
+
const researchVariations = this.generateResearchVariations(topic);
|
|
242
|
+
const allQueries = [...new Set([topic, ...expandedQueries, ...researchVariations])];
|
|
243
|
+
|
|
244
|
+
// Rank queries by research relevance with semantic understanding
|
|
245
|
+
const rankedQueries = await this.rankResearchQueriesWithSemantics(allQueries, topic);
|
|
246
|
+
|
|
247
|
+
if (this.cache && cacheKey) {
|
|
248
|
+
await this.cache.set(cacheKey, rankedQueries);
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
this.metrics.queryExpansionTime += Date.now() - startTime;
|
|
252
|
+
return rankedQueries;
|
|
253
|
+
} catch (error) {
|
|
254
|
+
this.logger.warn('Topic expansion failed, using original topic', { error: error.message });
|
|
255
|
+
this.metrics.queryExpansionTime += Date.now() - startTime;
|
|
256
|
+
return [topic];
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
/**
|
|
261
|
+
* Generate research-specific query variations
|
|
262
|
+
*/
|
|
263
|
+
generateResearchVariations(topic) {
|
|
264
|
+
const variations = [];
|
|
265
|
+
|
|
266
|
+
// Question-based variations
|
|
267
|
+
variations.push(`what is ${topic}`);
|
|
268
|
+
variations.push(`how does ${topic} work`);
|
|
269
|
+
variations.push(`${topic} explained`);
|
|
270
|
+
variations.push(`${topic} research`);
|
|
271
|
+
variations.push(`${topic} studies`);
|
|
272
|
+
variations.push(`${topic} analysis`);
|
|
273
|
+
|
|
274
|
+
// Academic and authoritative variations
|
|
275
|
+
variations.push(`${topic} academic`);
|
|
276
|
+
variations.push(`${topic} scientific`);
|
|
277
|
+
variations.push(`${topic} research paper`);
|
|
278
|
+
variations.push(`${topic} peer reviewed`);
|
|
279
|
+
|
|
280
|
+
// Current and historical context
|
|
281
|
+
variations.push(`latest ${topic}`);
|
|
282
|
+
variations.push(`current ${topic}`);
|
|
283
|
+
variations.push(`${topic} 2024`);
|
|
284
|
+
variations.push(`${topic} trends`);
|
|
285
|
+
|
|
286
|
+
return variations.slice(0, 10); // Limit variations
|
|
287
|
+
}
|
|
288
|
+
|
|
289
|
+
/**
|
|
290
|
+
* Rank research queries by relevance and specificity with semantic analysis
|
|
291
|
+
*/
|
|
292
|
+
async rankResearchQueriesWithSemantics(queries, originalTopic) {
|
|
293
|
+
const startTime = Date.now();
|
|
294
|
+
|
|
295
|
+
try {
|
|
296
|
+
const scored = await Promise.all(queries.map(async (query) => {
|
|
297
|
+
let score = 0.5; // Base score
|
|
298
|
+
|
|
299
|
+
// Give original topic highest priority
|
|
300
|
+
if (query === originalTopic) {
|
|
301
|
+
return { query, score: 1.0 };
|
|
302
|
+
}
|
|
303
|
+
|
|
304
|
+
// Traditional scoring
|
|
305
|
+
const topicWords = originalTopic.toLowerCase().split(" ");
|
|
306
|
+
const queryWords = query.toLowerCase().split(" ");
|
|
307
|
+
const overlap = topicWords.filter(word => queryWords.includes(word));
|
|
308
|
+
score += (overlap.length / topicWords.length) * 0.3;
|
|
309
|
+
|
|
310
|
+
// Research-oriented bonus
|
|
311
|
+
const researchKeywords = ["research", "study", "analysis", "academic", "scientific"];
|
|
312
|
+
if (researchKeywords.some(keyword => query.toLowerCase().includes(keyword))) {
|
|
313
|
+
score += 0.2;
|
|
314
|
+
}
|
|
315
|
+
|
|
316
|
+
// Length preference
|
|
317
|
+
if (query.length > 10 && query.length < 100) {
|
|
318
|
+
score += 0.1;
|
|
319
|
+
}
|
|
320
|
+
|
|
321
|
+
// Semantic similarity boost (if LLM available)
|
|
322
|
+
if (this.enableLLMFeatures) {
|
|
323
|
+
try {
|
|
324
|
+
const similarity = await this.llmManager.calculateSimilarity(originalTopic, query);
|
|
325
|
+
score += similarity * 0.3; // Semantic similarity weight
|
|
326
|
+
this.researchState.semanticSimilarities.set(query, similarity);
|
|
327
|
+
} catch (semanticError) {
|
|
328
|
+
this.logger.debug('Semantic similarity calculation failed', { query, error: semanticError.message });
|
|
329
|
+
}
|
|
330
|
+
}
|
|
331
|
+
|
|
332
|
+
return { query, score };
|
|
333
|
+
}));
|
|
334
|
+
|
|
335
|
+
const sortedQueries = scored
|
|
336
|
+
.sort((a, b) => b.score - a.score)
|
|
337
|
+
.map(item => item.query);
|
|
338
|
+
|
|
339
|
+
// Ensure original topic is always first
|
|
340
|
+
const result = [originalTopic];
|
|
341
|
+
sortedQueries.forEach(query => {
|
|
342
|
+
if (query !== originalTopic && result.length < this.maxDepth) {
|
|
343
|
+
result.push(query);
|
|
344
|
+
}
|
|
345
|
+
});
|
|
346
|
+
|
|
347
|
+
this.metrics.semanticAnalysisTime += Date.now() - startTime;
|
|
348
|
+
return result.slice(0, this.maxDepth);
|
|
349
|
+
} catch (error) {
|
|
350
|
+
this.logger.warn('Semantic ranking failed, using fallback', { error: error.message });
|
|
351
|
+
return this.rankResearchQueries(queries, originalTopic);
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
/**
|
|
356
|
+
* Fallback ranking method (original implementation)
|
|
357
|
+
*/
|
|
358
|
+
rankResearchQueries(queries, originalTopic) {
|
|
359
|
+
const scored = queries.map(query => {
|
|
360
|
+
let score = 0.5;
|
|
361
|
+
|
|
362
|
+
if (query === originalTopic) {
|
|
363
|
+
score = 1.0;
|
|
364
|
+
} else {
|
|
365
|
+
const topicWords = originalTopic.toLowerCase().split(" ");
|
|
366
|
+
const queryWords = query.toLowerCase().split(" ");
|
|
367
|
+
const overlap = topicWords.filter(word => queryWords.includes(word));
|
|
368
|
+
score += (overlap.length / topicWords.length) * 0.3;
|
|
369
|
+
|
|
370
|
+
const researchKeywords = ["research", "study", "analysis", "academic", "scientific"];
|
|
371
|
+
if (researchKeywords.some(keyword => query.toLowerCase().includes(keyword))) {
|
|
372
|
+
score += 0.2;
|
|
373
|
+
}
|
|
374
|
+
|
|
375
|
+
if (query.length > 10 && query.length < 100) {
|
|
376
|
+
score += 0.1;
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
return { query, score };
|
|
381
|
+
});
|
|
382
|
+
|
|
383
|
+
const sortedQueries = scored
|
|
384
|
+
.sort((a, b) => b.score - a.score)
|
|
385
|
+
.map(item => item.query);
|
|
386
|
+
|
|
387
|
+
const result = [originalTopic];
|
|
388
|
+
sortedQueries.forEach(query => {
|
|
389
|
+
if (query !== originalTopic && result.length < this.maxDepth) {
|
|
390
|
+
result.push(query);
|
|
391
|
+
}
|
|
392
|
+
});
|
|
393
|
+
|
|
394
|
+
return result.slice(0, this.maxDepth);
|
|
395
|
+
}
|
|
396
|
+
/**
|
|
397
|
+
* Gather initial sources using expanded queries
|
|
398
|
+
*/
|
|
399
|
+
async gatherInitialSources(queries, options) {
|
|
400
|
+
const allSources = [];
|
|
401
|
+
const maxSourcesPerQuery = Math.ceil(this.maxUrls / queries.length);
|
|
402
|
+
|
|
403
|
+
await this.processWithTimeLimit(async () => {
|
|
404
|
+
const searchPromises = queries.slice(0, 5).map(async (query) => {
|
|
405
|
+
try {
|
|
406
|
+
this.metrics.searchQueries++;
|
|
407
|
+
const searchResults = await this.searchTool.execute({
|
|
408
|
+
query,
|
|
409
|
+
limit: maxSourcesPerQuery,
|
|
410
|
+
enable_ranking: true,
|
|
411
|
+
enable_deduplication: true
|
|
412
|
+
});
|
|
413
|
+
|
|
414
|
+
if (searchResults.results && searchResults.results.length > 0) {
|
|
415
|
+
const processedResults = searchResults.results.map(result => ({
|
|
416
|
+
...result,
|
|
417
|
+
sourceQuery: query,
|
|
418
|
+
discoveredAt: new Date().toISOString(),
|
|
419
|
+
credibilityScore: this.calculateInitialCredibility(result),
|
|
420
|
+
researchRelevance: this.calculateResearchRelevance(result, query)
|
|
421
|
+
}));
|
|
422
|
+
|
|
423
|
+
this.researchState.searchResults.set(query, processedResults);
|
|
424
|
+
return processedResults;
|
|
425
|
+
}
|
|
426
|
+
return [];
|
|
427
|
+
} catch (error) {
|
|
428
|
+
this.logger.warn('Search failed for query', { query, error: error.message });
|
|
429
|
+
return [];
|
|
430
|
+
}
|
|
431
|
+
});
|
|
432
|
+
|
|
433
|
+
const results = await Promise.all(searchPromises);
|
|
434
|
+
results.forEach(sources => allSources.push(...sources));
|
|
435
|
+
});
|
|
436
|
+
|
|
437
|
+
// Deduplicate and rank sources
|
|
438
|
+
const uniqueSources = this.deduplicateSources(allSources);
|
|
439
|
+
const rankedSources = await this.rankSourcesByResearchValue(uniqueSources);
|
|
440
|
+
|
|
441
|
+
return rankedSources.slice(0, this.maxUrls);
|
|
442
|
+
}
|
|
443
|
+
|
|
444
|
+
/**
|
|
445
|
+
* Explore promising sources in depth with LLM-powered relevance analysis
|
|
446
|
+
*/
|
|
447
|
+
async exploreSourcesInDepth(sources, options) {
|
|
448
|
+
const detailedFindings = [];
|
|
449
|
+
const batchSize = Math.min(this.concurrency, 10);
|
|
450
|
+
const { topic } = this.researchState;
|
|
451
|
+
|
|
452
|
+
await this.processWithTimeLimit(async () => {
|
|
453
|
+
for (let i = 0; i < sources.length; i += batchSize) {
|
|
454
|
+
const batch = sources.slice(i, i + batchSize);
|
|
455
|
+
|
|
456
|
+
const batchPromises = batch.map(async (source) => {
|
|
457
|
+
try {
|
|
458
|
+
if (this.researchState.visitedUrls.has(source.link)) {
|
|
459
|
+
return null;
|
|
460
|
+
}
|
|
461
|
+
|
|
462
|
+
this.researchState.visitedUrls.add(source.link);
|
|
463
|
+
this.metrics.urlsProcessed++;
|
|
464
|
+
|
|
465
|
+
// Extract detailed content
|
|
466
|
+
const contentData = await this.extractTool.execute({
|
|
467
|
+
url: source.link,
|
|
468
|
+
options: { includeMetadata: true, includeStructuredData: true }
|
|
469
|
+
});
|
|
470
|
+
|
|
471
|
+
if (contentData && contentData.content) {
|
|
472
|
+
this.metrics.contentExtracted++;
|
|
473
|
+
|
|
474
|
+
// Enhance source with extracted content
|
|
475
|
+
let enhancedSource = {
|
|
476
|
+
...source,
|
|
477
|
+
extractedContent: contentData.content,
|
|
478
|
+
metadata: contentData.metadata,
|
|
479
|
+
structuredData: contentData.structuredData,
|
|
480
|
+
extractedAt: new Date().toISOString(),
|
|
481
|
+
wordCount: contentData.content.split(' ').length,
|
|
482
|
+
readabilityScore: this.calculateReadabilityScore(contentData.content)
|
|
483
|
+
};
|
|
484
|
+
|
|
485
|
+
// LLM-powered relevance analysis
|
|
486
|
+
if (this.enableLLMFeatures && topic) {
|
|
487
|
+
try {
|
|
488
|
+
const relevanceAnalysis = await this.llmManager.analyzeRelevance(
|
|
489
|
+
contentData.content,
|
|
490
|
+
topic,
|
|
491
|
+
{ maxContentLength: 2000 }
|
|
492
|
+
);
|
|
493
|
+
|
|
494
|
+
enhancedSource.llmAnalysis = relevanceAnalysis;
|
|
495
|
+
enhancedSource.relevanceScore = relevanceAnalysis.relevanceScore;
|
|
496
|
+
this.researchState.llmAnalysis.set(source.link, relevanceAnalysis);
|
|
497
|
+
this.researchState.relevanceScores.set(source.link, relevanceAnalysis.relevanceScore);
|
|
498
|
+
this.metrics.llmAnalysisCalls++;
|
|
499
|
+
|
|
500
|
+
this.logger.debug('LLM relevance analysis completed', {
|
|
501
|
+
url: source.link,
|
|
502
|
+
relevanceScore: relevanceAnalysis.relevanceScore,
|
|
503
|
+
keyPoints: relevanceAnalysis.keyPoints.length
|
|
504
|
+
});
|
|
505
|
+
} catch (llmError) {
|
|
506
|
+
this.logger.warn('LLM relevance analysis failed', {
|
|
507
|
+
url: source.link,
|
|
508
|
+
error: llmError.message
|
|
509
|
+
});
|
|
510
|
+
// Set default relevance score
|
|
511
|
+
enhancedSource.relevanceScore = this.calculateTraditionalRelevance(contentData.content, topic);
|
|
512
|
+
}
|
|
513
|
+
} else {
|
|
514
|
+
// Fallback relevance calculation
|
|
515
|
+
enhancedSource.relevanceScore = this.calculateTraditionalRelevance(contentData.content, topic);
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
this.researchState.extractedContent.set(source.link, enhancedSource);
|
|
519
|
+
return enhancedSource;
|
|
520
|
+
}
|
|
521
|
+
return null;
|
|
522
|
+
} catch (error) {
|
|
523
|
+
this.logger.warn('Content extraction failed', {
|
|
524
|
+
url: source.link,
|
|
525
|
+
error: error.message
|
|
526
|
+
});
|
|
527
|
+
return null;
|
|
528
|
+
}
|
|
529
|
+
});
|
|
530
|
+
|
|
531
|
+
const batchResults = await Promise.all(batchPromises);
|
|
532
|
+
const validResults = batchResults.filter(result => result !== null);
|
|
533
|
+
detailedFindings.push(...validResults);
|
|
534
|
+
}
|
|
535
|
+
});
|
|
536
|
+
|
|
537
|
+
// Sort by relevance score (LLM or traditional)
|
|
538
|
+
return detailedFindings.sort((a, b) => (b.relevanceScore || 0) - (a.relevanceScore || 0));
|
|
539
|
+
}
|
|
540
|
+
|
|
541
|
+
/**
|
|
542
|
+
* Verify source credibility using multiple factors
|
|
543
|
+
*/
|
|
544
|
+
async verifySourceCredibility(sources) {
|
|
545
|
+
const verifiedSources = [];
|
|
546
|
+
|
|
547
|
+
for (const source of sources) {
|
|
548
|
+
try {
|
|
549
|
+
this.metrics.sourcesVerified++;
|
|
550
|
+
|
|
551
|
+
const credibilityFactors = {
|
|
552
|
+
domainAuthority: this.assessDomainAuthority(source.link),
|
|
553
|
+
contentQuality: this.assessContentQuality(source),
|
|
554
|
+
sourceType: this.identifySourceType(source),
|
|
555
|
+
recency: this.assessContentRecency(source),
|
|
556
|
+
authorityIndicators: this.findAuthorityIndicators(source),
|
|
557
|
+
citationPotential: this.assessCitationPotential(source)
|
|
558
|
+
};
|
|
559
|
+
|
|
560
|
+
const overallCredibility = this.calculateOverallCredibility(credibilityFactors);
|
|
561
|
+
|
|
562
|
+
// Only include sources that meet minimum credibility threshold
|
|
563
|
+
if (overallCredibility >= 0.3) {
|
|
564
|
+
verifiedSources.push({
|
|
565
|
+
...source,
|
|
566
|
+
credibilityFactors,
|
|
567
|
+
overallCredibility,
|
|
568
|
+
verifiedAt: new Date().toISOString()
|
|
569
|
+
});
|
|
570
|
+
|
|
571
|
+
this.researchState.credibilityScores.set(source.link, overallCredibility);
|
|
572
|
+
}
|
|
573
|
+
} catch (error) {
|
|
574
|
+
this.logger.warn('Credibility verification failed', {
|
|
575
|
+
url: source.link,
|
|
576
|
+
error: error.message
|
|
577
|
+
});
|
|
578
|
+
}
|
|
579
|
+
}
|
|
580
|
+
|
|
581
|
+
return verifiedSources.sort((a, b) => b.overallCredibility - a.overallCredibility);
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
/**
|
|
585
|
+
* Synthesize information and detect conflicts with LLM enhancement
|
|
586
|
+
*/
|
|
587
|
+
async synthesizeInformation(sources, topic) {
|
|
588
|
+
const startTime = Date.now();
|
|
589
|
+
const synthesis = {
|
|
590
|
+
keyFindings: [],
|
|
591
|
+
supportingEvidence: [],
|
|
592
|
+
conflicts: [],
|
|
593
|
+
consensus: [],
|
|
594
|
+
gaps: [],
|
|
595
|
+
recommendations: [],
|
|
596
|
+
llmSynthesis: null
|
|
597
|
+
};
|
|
598
|
+
|
|
599
|
+
try {
|
|
600
|
+
// Extract key claims and facts from each source
|
|
601
|
+
const extractedClaims = await this.extractKeyClaims(sources);
|
|
602
|
+
|
|
603
|
+
// Group related claims
|
|
604
|
+
const claimGroups = this.groupRelatedClaims(extractedClaims);
|
|
605
|
+
|
|
606
|
+
// Detect conflicts between claims
|
|
607
|
+
if (this.enableConflictDetection) {
|
|
608
|
+
synthesis.conflicts = this.detectInformationConflicts(claimGroups);
|
|
609
|
+
this.metrics.conflictsDetected = synthesis.conflicts.length;
|
|
610
|
+
}
|
|
611
|
+
|
|
612
|
+
// Identify consensus areas
|
|
613
|
+
synthesis.consensus = this.identifyConsensus(claimGroups);
|
|
614
|
+
|
|
615
|
+
// Generate key findings
|
|
616
|
+
synthesis.keyFindings = this.generateKeyFindings(claimGroups, sources);
|
|
617
|
+
|
|
618
|
+
// Compile supporting evidence
|
|
619
|
+
synthesis.supportingEvidence = this.compileSupportingEvidence(sources);
|
|
620
|
+
|
|
621
|
+
// Identify research gaps
|
|
622
|
+
synthesis.gaps = this.identifyResearchGaps(claimGroups, topic);
|
|
623
|
+
|
|
624
|
+
// Generate recommendations
|
|
625
|
+
synthesis.recommendations = this.generateResearchRecommendations(synthesis, topic);
|
|
626
|
+
|
|
627
|
+
// LLM-powered comprehensive synthesis
|
|
628
|
+
if (this.enableLLMFeatures && sources.length > 0) {
|
|
629
|
+
try {
|
|
630
|
+
this.logger.info('Generating LLM-powered research synthesis');
|
|
631
|
+
|
|
632
|
+
// Prepare findings for LLM analysis
|
|
633
|
+
const findingsForLLM = synthesis.keyFindings.map(finding => ({
|
|
634
|
+
finding: finding.finding,
|
|
635
|
+
credibility: finding.credibility,
|
|
636
|
+
sources: finding.sources.length
|
|
637
|
+
}));
|
|
638
|
+
|
|
639
|
+
const llmSynthesis = await this.llmManager.synthesizeFindings(
|
|
640
|
+
findingsForLLM,
|
|
641
|
+
topic,
|
|
642
|
+
{
|
|
643
|
+
maxFindings: 10,
|
|
644
|
+
includeConflicts: synthesis.conflicts.length > 0
|
|
645
|
+
}
|
|
646
|
+
);
|
|
647
|
+
|
|
648
|
+
synthesis.llmSynthesis = llmSynthesis;
|
|
649
|
+
this.researchState.synthesisHistory.push({
|
|
650
|
+
timestamp: new Date().toISOString(),
|
|
651
|
+
topic,
|
|
652
|
+
synthesis: llmSynthesis,
|
|
653
|
+
sourceCount: sources.length
|
|
654
|
+
});
|
|
655
|
+
|
|
656
|
+
this.metrics.llmAnalysisCalls++;
|
|
657
|
+
this.logger.info('LLM synthesis completed', {
|
|
658
|
+
confidence: llmSynthesis.confidence,
|
|
659
|
+
insights: llmSynthesis.keyInsights?.length || 0,
|
|
660
|
+
themes: llmSynthesis.themes?.length || 0
|
|
661
|
+
});
|
|
662
|
+
|
|
663
|
+
} catch (llmError) {
|
|
664
|
+
this.logger.warn('LLM synthesis failed', { error: llmError.message });
|
|
665
|
+
synthesis.llmSynthesis = {
|
|
666
|
+
error: 'LLM synthesis unavailable',
|
|
667
|
+
fallback: true
|
|
668
|
+
};
|
|
669
|
+
}
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
} catch (error) {
|
|
673
|
+
this.logger.error('Information synthesis failed', { error: error.message });
|
|
674
|
+
synthesis.error = error.message;
|
|
675
|
+
}
|
|
676
|
+
|
|
677
|
+
this.metrics.synthesisTime += Date.now() - startTime;
|
|
678
|
+
return synthesis;
|
|
679
|
+
}
|
|
680
|
+
|
|
681
|
+
/**
|
|
682
|
+
* Extract key claims from source content
|
|
683
|
+
*/
|
|
684
|
+
async extractKeyClaims(sources) {
|
|
685
|
+
const claims = [];
|
|
686
|
+
|
|
687
|
+
for (const source of sources) {
|
|
688
|
+
try {
|
|
689
|
+
if (!source.extractedContent) continue;
|
|
690
|
+
|
|
691
|
+
const content = source.extractedContent.substring(0, 5000); // Limit content length
|
|
692
|
+
|
|
693
|
+
// Use summarization to extract key points
|
|
694
|
+
const summary = await this.summarizeTool.execute({
|
|
695
|
+
text: content,
|
|
696
|
+
options: {
|
|
697
|
+
maxLength: 500,
|
|
698
|
+
extractKeyPoints: true,
|
|
699
|
+
includeSupporting: true
|
|
700
|
+
}
|
|
701
|
+
});
|
|
702
|
+
|
|
703
|
+
if (summary.keyPoints) {
|
|
704
|
+
summary.keyPoints.forEach((point, index) => {
|
|
705
|
+
claims.push({
|
|
706
|
+
id: `${source.link}_claim_${index}`,
|
|
707
|
+
claim: point,
|
|
708
|
+
source: source.link,
|
|
709
|
+
sourceTitle: source.title,
|
|
710
|
+
credibility: source.overallCredibility || 0.5,
|
|
711
|
+
context: summary.supporting?.[index] || '',
|
|
712
|
+
extractedAt: new Date().toISOString()
|
|
713
|
+
});
|
|
714
|
+
});
|
|
715
|
+
}
|
|
716
|
+
} catch (error) {
|
|
717
|
+
this.logger.warn('Claim extraction failed', {
|
|
718
|
+
source: source.link,
|
|
719
|
+
error: error.message
|
|
720
|
+
});
|
|
721
|
+
}
|
|
722
|
+
}
|
|
723
|
+
|
|
724
|
+
return claims;
|
|
725
|
+
}
|
|
726
|
+
|
|
727
|
+
/**
|
|
728
|
+
* Group related claims for analysis
|
|
729
|
+
*/
|
|
730
|
+
groupRelatedClaims(claims) {
|
|
731
|
+
const groups = new Map();
|
|
732
|
+
|
|
733
|
+
for (const claim of claims) {
|
|
734
|
+
const keywords = this.extractKeywords(claim.claim);
|
|
735
|
+
const groupKey = keywords.slice(0, 3).sort().join('_');
|
|
736
|
+
|
|
737
|
+
if (!groups.has(groupKey)) {
|
|
738
|
+
groups.set(groupKey, {
|
|
739
|
+
id: groupKey,
|
|
740
|
+
keywords,
|
|
741
|
+
claims: [],
|
|
742
|
+
avgCredibility: 0,
|
|
743
|
+
sourceCount: 0
|
|
744
|
+
});
|
|
745
|
+
}
|
|
746
|
+
|
|
747
|
+
groups.get(groupKey).claims.push(claim);
|
|
748
|
+
}
|
|
749
|
+
|
|
750
|
+
// Calculate group statistics
|
|
751
|
+
groups.forEach(group => {
|
|
752
|
+
group.sourceCount = new Set(group.claims.map(c => c.source)).size;
|
|
753
|
+
group.avgCredibility = group.claims.reduce((sum, c) => sum + c.credibility, 0) / group.claims.length;
|
|
754
|
+
});
|
|
755
|
+
|
|
756
|
+
return Array.from(groups.values());
|
|
757
|
+
}
|
|
758
|
+
|
|
759
|
+
/**
|
|
760
|
+
* Detect conflicts between information claims
|
|
761
|
+
*/
|
|
762
|
+
detectInformationConflicts(claimGroups) {
|
|
763
|
+
const conflicts = [];
|
|
764
|
+
|
|
765
|
+
for (const group of claimGroups) {
|
|
766
|
+
if (group.claims.length < 2) continue;
|
|
767
|
+
|
|
768
|
+
// Simple conflict detection based on contradictory terms
|
|
769
|
+
const conflictIndicators = [
|
|
770
|
+
['not', 'is'], ['false', 'true'], ['incorrect', 'correct'],
|
|
771
|
+
['impossible', 'possible'], ['never', 'always'], ['no', 'yes']
|
|
772
|
+
];
|
|
773
|
+
|
|
774
|
+
for (let i = 0; i < group.claims.length; i++) {
|
|
775
|
+
for (let j = i + 1; j < group.claims.length; j++) {
|
|
776
|
+
const claim1 = group.claims[i];
|
|
777
|
+
const claim2 = group.claims[j];
|
|
778
|
+
|
|
779
|
+
const text1 = claim1.claim.toLowerCase();
|
|
780
|
+
const text2 = claim2.claim.toLowerCase();
|
|
781
|
+
|
|
782
|
+
for (const [neg, pos] of conflictIndicators) {
|
|
783
|
+
if ((text1.includes(neg) && text2.includes(pos)) ||
|
|
784
|
+
(text1.includes(pos) && text2.includes(neg))) {
|
|
785
|
+
|
|
786
|
+
conflicts.push({
|
|
787
|
+
id: `conflict_${conflicts.length}`,
|
|
788
|
+
type: 'contradiction',
|
|
789
|
+
claim1: claim1,
|
|
790
|
+
claim2: claim2,
|
|
791
|
+
severity: this.calculateConflictSeverity(claim1, claim2),
|
|
792
|
+
detectedAt: new Date().toISOString()
|
|
793
|
+
});
|
|
794
|
+
|
|
795
|
+
break;
|
|
796
|
+
}
|
|
797
|
+
}
|
|
798
|
+
}
|
|
799
|
+
}
|
|
800
|
+
}
|
|
801
|
+
|
|
802
|
+
return conflicts;
|
|
803
|
+
}
|
|
804
|
+
|
|
805
|
+
/**
|
|
806
|
+
* Identify areas of consensus
|
|
807
|
+
*/
|
|
808
|
+
identifyConsensus(claimGroups) {
|
|
809
|
+
return claimGroups
|
|
810
|
+
.filter(group => group.sourceCount >= 2 && group.avgCredibility >= 0.6)
|
|
811
|
+
.map(group => ({
|
|
812
|
+
topic: group.keywords.join(' '),
|
|
813
|
+
supportingClaims: group.claims.length,
|
|
814
|
+
supportingSources: group.sourceCount,
|
|
815
|
+
averageCredibility: group.avgCredibility,
|
|
816
|
+
consensusStrength: this.calculateConsensusStrength(group)
|
|
817
|
+
}))
|
|
818
|
+
.sort((a, b) => b.consensusStrength - a.consensusStrength);
|
|
819
|
+
}
|
|
820
|
+
|
|
821
|
+
/**
|
|
822
|
+
* Calculate various scoring functions
|
|
823
|
+
*/
|
|
824
|
+
calculateInitialCredibility(source) {
|
|
825
|
+
let score = 0.5;
|
|
826
|
+
|
|
827
|
+
// Domain-based scoring
|
|
828
|
+
try {
|
|
829
|
+
const domain = new URL(source.link).hostname;
|
|
830
|
+
if (domain.includes('edu')) score += 0.3;
|
|
831
|
+
else if (domain.includes('gov')) score += 0.4;
|
|
832
|
+
else if (domain.includes('org')) score += 0.2;
|
|
833
|
+
} catch {}
|
|
834
|
+
|
|
835
|
+
// Content indicators
|
|
836
|
+
if (source.snippet) {
|
|
837
|
+
const snippet = source.snippet.toLowerCase();
|
|
838
|
+
if (snippet.includes('research') || snippet.includes('study')) score += 0.1;
|
|
839
|
+
if (snippet.includes('peer reviewed')) score += 0.2;
|
|
840
|
+
}
|
|
841
|
+
|
|
842
|
+
return Math.min(1, score);
|
|
843
|
+
}
|
|
844
|
+
|
|
845
|
+
calculateResearchRelevance(result, query) {
|
|
846
|
+
let relevance = 0.5;
|
|
847
|
+
|
|
848
|
+
const title = (result.title || '').toLowerCase();
|
|
849
|
+
const snippet = (result.snippet || '').toLowerCase();
|
|
850
|
+
const queryLower = query.toLowerCase();
|
|
851
|
+
|
|
852
|
+
// Title relevance
|
|
853
|
+
if (title.includes(queryLower)) relevance += 0.3;
|
|
854
|
+
|
|
855
|
+
// Snippet relevance
|
|
856
|
+
if (snippet.includes(queryLower)) relevance += 0.2;
|
|
857
|
+
|
|
858
|
+
return Math.min(1, relevance);
|
|
859
|
+
}
|
|
860
|
+
|
|
861
|
+
calculateReadabilityScore(content) {
|
|
862
|
+
if (!content) return 0.5;
|
|
863
|
+
|
|
864
|
+
const words = content.split(' ').length;
|
|
865
|
+
const sentences = content.split(/[.!?]/).length;
|
|
866
|
+
const avgWordsPerSentence = words / Math.max(sentences, 1);
|
|
867
|
+
|
|
868
|
+
// Simple readability approximation
|
|
869
|
+
if (avgWordsPerSentence < 15) return 0.8; // Easy to read
|
|
870
|
+
if (avgWordsPerSentence < 20) return 0.6; // Moderate
|
|
871
|
+
return 0.4; // Difficult
|
|
872
|
+
}
|
|
873
|
+
|
|
874
|
+
calculateOverallCredibility(factors) {
|
|
875
|
+
const weights = {
|
|
876
|
+
domainAuthority: 0.3,
|
|
877
|
+
contentQuality: 0.25,
|
|
878
|
+
sourceType: 0.2,
|
|
879
|
+
recency: 0.1,
|
|
880
|
+
authorityIndicators: 0.1,
|
|
881
|
+
citationPotential: 0.05
|
|
882
|
+
};
|
|
883
|
+
|
|
884
|
+
let score = 0;
|
|
885
|
+
Object.entries(weights).forEach(([factor, weight]) => {
|
|
886
|
+
score += (factors[factor] || 0.5) * weight;
|
|
887
|
+
});
|
|
888
|
+
|
|
889
|
+
return Math.min(1, Math.max(0, score));
|
|
890
|
+
}
|
|
891
|
+
|
|
892
|
+
calculateConflictSeverity(claim1, claim2) {
|
|
893
|
+
const credibilityDiff = Math.abs(claim1.credibility - claim2.credibility);
|
|
894
|
+
return 0.5 + (credibilityDiff * 0.5);
|
|
895
|
+
}
|
|
896
|
+
|
|
897
|
+
calculateConsensusStrength(group) {
|
|
898
|
+
return (group.sourceCount * 0.4) + (group.avgCredibility * 0.6);
|
|
899
|
+
}
|
|
900
|
+
|
|
901
|
+
/**
|
|
902
|
+
* Utility functions
|
|
903
|
+
*/
|
|
904
|
+
assessDomainAuthority(url) {
|
|
905
|
+
try {
|
|
906
|
+
const domain = new URL(url).hostname.toLowerCase();
|
|
907
|
+
|
|
908
|
+
// High authority domains
|
|
909
|
+
if (domain.includes('edu') || domain.includes('gov')) return 0.9;
|
|
910
|
+
if (domain.includes('org')) return 0.7;
|
|
911
|
+
if (['wikipedia.org', 'pubmed.ncbi.nlm.nih.gov'].includes(domain)) return 0.8;
|
|
912
|
+
|
|
913
|
+
return 0.5;
|
|
914
|
+
} catch {
|
|
915
|
+
return 0.3;
|
|
916
|
+
}
|
|
917
|
+
}
|
|
918
|
+
|
|
919
|
+
assessContentQuality(source) {
|
|
920
|
+
let score = 0.5;
|
|
921
|
+
|
|
922
|
+
if (source.wordCount > 500) score += 0.2;
|
|
923
|
+
if (source.readabilityScore > 0.6) score += 0.1;
|
|
924
|
+
if (source.metadata?.author) score += 0.1;
|
|
925
|
+
if (source.structuredData) score += 0.1;
|
|
926
|
+
|
|
927
|
+
return Math.min(1, score);
|
|
928
|
+
}
|
|
929
|
+
|
|
930
|
+
identifySourceType(source) {
|
|
931
|
+
const content = (source.extractedContent || '').toLowerCase();
|
|
932
|
+
const title = (source.title || '').toLowerCase();
|
|
933
|
+
|
|
934
|
+
if (content.includes('abstract') || content.includes('methodology')) return 0.9;
|
|
935
|
+
if (title.includes('research') || title.includes('study')) return 0.8;
|
|
936
|
+
if (content.includes('peer reviewed')) return 0.9;
|
|
937
|
+
if (title.includes('news') || title.includes('blog')) return 0.4;
|
|
938
|
+
|
|
939
|
+
return 0.6;
|
|
940
|
+
}
|
|
941
|
+
|
|
942
|
+
assessContentRecency(source) {
|
|
943
|
+
// Simple recency assessment - would need better date extraction in real implementation
|
|
944
|
+
return 0.6; // Neutral score
|
|
945
|
+
}
|
|
946
|
+
|
|
947
|
+
findAuthorityIndicators(source) {
|
|
948
|
+
let score = 0.5;
|
|
949
|
+
const content = (source.extractedContent || '').toLowerCase();
|
|
950
|
+
|
|
951
|
+
if (content.includes('citation') || content.includes('reference')) score += 0.2;
|
|
952
|
+
if (content.includes('doi:')) score += 0.2;
|
|
953
|
+
if (source.metadata?.author) score += 0.1;
|
|
954
|
+
|
|
955
|
+
return Math.min(1, score);
|
|
956
|
+
}
|
|
957
|
+
|
|
958
|
+
assessCitationPotential(source) {
|
|
959
|
+
let score = 0.5;
|
|
960
|
+
|
|
961
|
+
if (source.metadata?.doi) score += 0.3;
|
|
962
|
+
if (source.structuredData?.citations) score += 0.2;
|
|
963
|
+
|
|
964
|
+
return Math.min(1, score);
|
|
965
|
+
}
|
|
966
|
+
|
|
967
|
+
extractKeywords(text) {
|
|
968
|
+
return text
|
|
969
|
+
.toLowerCase()
|
|
970
|
+
.replace(/[^\w\s]/g, ' ')
|
|
971
|
+
.split(/\s+/)
|
|
972
|
+
.filter(word => word.length > 3)
|
|
973
|
+
.slice(0, 10);
|
|
974
|
+
}
|
|
975
|
+
|
|
976
|
+
/**
|
|
977
|
+
* Calculate traditional relevance score without LLM
|
|
978
|
+
*/
|
|
979
|
+
calculateTraditionalRelevance(content, topic) {
|
|
980
|
+
if (!content || !topic) return 0.5;
|
|
981
|
+
|
|
982
|
+
const topicWords = topic.toLowerCase().split(/\s+/).filter(word => word.length > 2);
|
|
983
|
+
const contentLower = content.toLowerCase();
|
|
984
|
+
|
|
985
|
+
let matches = 0;
|
|
986
|
+
let totalWeight = 0;
|
|
987
|
+
|
|
988
|
+
topicWords.forEach(word => {
|
|
989
|
+
const regex = new RegExp(`\\b${word}\\b`, 'g');
|
|
990
|
+
const wordMatches = (contentLower.match(regex) || []).length;
|
|
991
|
+
matches += wordMatches;
|
|
992
|
+
totalWeight += word.length * wordMatches; // Weight by word importance
|
|
993
|
+
});
|
|
994
|
+
|
|
995
|
+
// Calculate relevance based on keyword density and content length
|
|
996
|
+
const contentWords = content.split(/\s+/).length;
|
|
997
|
+
const density = matches / Math.max(contentWords, 1);
|
|
998
|
+
const coverage = matches / Math.max(topicWords.length, 1);
|
|
999
|
+
|
|
1000
|
+
// Combine density and coverage with weights
|
|
1001
|
+
const relevanceScore = (density * 0.4) + (coverage * 0.6);
|
|
1002
|
+
|
|
1003
|
+
return Math.min(1, Math.max(0, relevanceScore));
|
|
1004
|
+
}
|
|
1005
|
+
|
|
1006
|
+
/**
|
|
1007
|
+
* Utility methods for research workflow
|
|
1008
|
+
*/
|
|
1009
|
+
async processWithTimeLimit(asyncFunction) {
|
|
1010
|
+
const timeoutPromise = new Promise((_, reject) => {
|
|
1011
|
+
setTimeout(() => reject(new Error('Research time limit exceeded')), this.timeLimit);
|
|
1012
|
+
});
|
|
1013
|
+
|
|
1014
|
+
try {
|
|
1015
|
+
await Promise.race([asyncFunction(), timeoutPromise]);
|
|
1016
|
+
} catch (error) {
|
|
1017
|
+
if (error.message === 'Research time limit exceeded') {
|
|
1018
|
+
this.logger.warn('Research time limit reached, returning partial results');
|
|
1019
|
+
} else {
|
|
1020
|
+
throw error;
|
|
1021
|
+
}
|
|
1022
|
+
}
|
|
1023
|
+
}
|
|
1024
|
+
|
|
1025
|
+
deduplicateSources(sources) {
|
|
1026
|
+
const seen = new Set();
|
|
1027
|
+
return sources.filter(source => {
|
|
1028
|
+
const key = source.link;
|
|
1029
|
+
if (seen.has(key)) return false;
|
|
1030
|
+
seen.add(key);
|
|
1031
|
+
return true;
|
|
1032
|
+
});
|
|
1033
|
+
}
|
|
1034
|
+
|
|
1035
|
+
async rankSourcesByResearchValue(sources) {
|
|
1036
|
+
return sources.sort((a, b) => {
|
|
1037
|
+
const scoreA = (a.credibilityScore || 0) + (a.researchRelevance || 0);
|
|
1038
|
+
const scoreB = (b.credibilityScore || 0) + (b.researchRelevance || 0);
|
|
1039
|
+
return scoreB - scoreA;
|
|
1040
|
+
});
|
|
1041
|
+
}
|
|
1042
|
+
|
|
1043
|
+
generateKeyFindings(claimGroups, sources) {
|
|
1044
|
+
return claimGroups
|
|
1045
|
+
.filter(group => group.avgCredibility >= 0.6)
|
|
1046
|
+
.sort((a, b) => b.consensusStrength - a.consensusStrength)
|
|
1047
|
+
.slice(0, 10)
|
|
1048
|
+
.map(group => ({
|
|
1049
|
+
finding: group.keywords.join(' '),
|
|
1050
|
+
supportingClaims: group.claims.length,
|
|
1051
|
+
credibility: group.avgCredibility,
|
|
1052
|
+
sources: group.claims.map(c => c.source)
|
|
1053
|
+
}));
|
|
1054
|
+
}
|
|
1055
|
+
|
|
1056
|
+
compileSupportingEvidence(sources) {
|
|
1057
|
+
return sources
|
|
1058
|
+
.filter(source => source.overallCredibility >= 0.7)
|
|
1059
|
+
.map(source => ({
|
|
1060
|
+
title: source.title,
|
|
1061
|
+
url: source.link,
|
|
1062
|
+
credibility: source.overallCredibility,
|
|
1063
|
+
evidence: source.extractedContent?.substring(0, 300) + '...'
|
|
1064
|
+
}))
|
|
1065
|
+
.slice(0, 15);
|
|
1066
|
+
}
|
|
1067
|
+
|
|
1068
|
+
identifyResearchGaps(claimGroups, topic) {
|
|
1069
|
+
const gaps = [];
|
|
1070
|
+
|
|
1071
|
+
// Identify areas with low claim count or credibility
|
|
1072
|
+
const weakAreas = claimGroups.filter(group =>
|
|
1073
|
+
group.claims.length < 2 || group.avgCredibility < 0.5
|
|
1074
|
+
);
|
|
1075
|
+
|
|
1076
|
+
weakAreas.forEach(area => {
|
|
1077
|
+
gaps.push({
|
|
1078
|
+
area: area.keywords.join(' '),
|
|
1079
|
+
issue: 'Limited reliable sources',
|
|
1080
|
+
suggestion: `More research needed on ${area.keywords.join(' ')} related to ${topic}`
|
|
1081
|
+
});
|
|
1082
|
+
});
|
|
1083
|
+
|
|
1084
|
+
return gaps.slice(0, 5);
|
|
1085
|
+
}
|
|
1086
|
+
|
|
1087
|
+
generateResearchRecommendations(synthesis, topic) {
|
|
1088
|
+
const recommendations = [];
|
|
1089
|
+
|
|
1090
|
+
if (synthesis.conflicts.length > 0) {
|
|
1091
|
+
recommendations.push({
|
|
1092
|
+
type: 'conflict_resolution',
|
|
1093
|
+
priority: 'high',
|
|
1094
|
+
description: `Investigate ${synthesis.conflicts.length} conflicting claims about ${topic}`
|
|
1095
|
+
});
|
|
1096
|
+
}
|
|
1097
|
+
|
|
1098
|
+
if (synthesis.gaps.length > 0) {
|
|
1099
|
+
recommendations.push({
|
|
1100
|
+
type: 'gap_filling',
|
|
1101
|
+
priority: 'medium',
|
|
1102
|
+
description: `Address research gaps in ${synthesis.gaps.map(g => g.area).join(', ')}`
|
|
1103
|
+
});
|
|
1104
|
+
}
|
|
1105
|
+
|
|
1106
|
+
recommendations.push({
|
|
1107
|
+
type: 'validation',
|
|
1108
|
+
priority: 'medium',
|
|
1109
|
+
description: `Validate findings with additional peer-reviewed sources`
|
|
1110
|
+
});
|
|
1111
|
+
|
|
1112
|
+
return recommendations;
|
|
1113
|
+
}
|
|
1114
|
+
|
|
1115
|
+
compileResearchResults(topic, synthesis, options) {
|
|
1116
|
+
const baseResults = {
|
|
1117
|
+
sessionId: this.researchState.sessionId,
|
|
1118
|
+
topic,
|
|
1119
|
+
researchSummary: {
|
|
1120
|
+
totalSources: this.metrics.urlsProcessed,
|
|
1121
|
+
verifiedSources: this.metrics.sourcesVerified,
|
|
1122
|
+
keyFindings: synthesis.keyFindings.length,
|
|
1123
|
+
conflictsFound: synthesis.conflicts.length,
|
|
1124
|
+
consensusAreas: synthesis.consensus.length,
|
|
1125
|
+
llmEnhanced: this.enableLLMFeatures
|
|
1126
|
+
},
|
|
1127
|
+
findings: synthesis.keyFindings,
|
|
1128
|
+
supportingEvidence: synthesis.supportingEvidence,
|
|
1129
|
+
consensus: synthesis.consensus,
|
|
1130
|
+
conflicts: synthesis.conflicts,
|
|
1131
|
+
researchGaps: synthesis.gaps,
|
|
1132
|
+
recommendations: synthesis.recommendations,
|
|
1133
|
+
credibilityAssessment: {
|
|
1134
|
+
highCredibilitySources: Array.from(this.researchState.credibilityScores.entries())
|
|
1135
|
+
.filter(([_, score]) => score >= 0.7)
|
|
1136
|
+
.length,
|
|
1137
|
+
averageCredibility: this.calculateAverageCredibility(),
|
|
1138
|
+
credibilityDistribution: this.getCredibilityDistribution()
|
|
1139
|
+
},
|
|
1140
|
+
activityLog: this.researchState.activityLog,
|
|
1141
|
+
performance: {
|
|
1142
|
+
...this.metrics,
|
|
1143
|
+
timeLimit: this.timeLimit,
|
|
1144
|
+
completedWithinLimit: this.metrics.totalProcessingTime < this.timeLimit
|
|
1145
|
+
},
|
|
1146
|
+
metadata: {
|
|
1147
|
+
generatedAt: new Date().toISOString(),
|
|
1148
|
+
researchDepth: this.researchState.currentDepth,
|
|
1149
|
+
configuration: {
|
|
1150
|
+
maxDepth: this.maxDepth,
|
|
1151
|
+
maxUrls: this.maxUrls,
|
|
1152
|
+
timeLimit: this.timeLimit,
|
|
1153
|
+
llmEnabled: this.enableLLMFeatures
|
|
1154
|
+
}
|
|
1155
|
+
}
|
|
1156
|
+
};
|
|
1157
|
+
|
|
1158
|
+
// Add LLM-specific analysis if available
|
|
1159
|
+
if (this.enableLLMFeatures) {
|
|
1160
|
+
baseResults.llmAnalysis = {
|
|
1161
|
+
synthesis: synthesis.llmSynthesis,
|
|
1162
|
+
relevanceScores: Object.fromEntries(this.researchState.relevanceScores),
|
|
1163
|
+
semanticSimilarities: Object.fromEntries(this.researchState.semanticSimilarities),
|
|
1164
|
+
analysisHistory: this.researchState.synthesisHistory,
|
|
1165
|
+
llmMetrics: {
|
|
1166
|
+
totalLLMCalls: this.metrics.llmAnalysisCalls,
|
|
1167
|
+
semanticAnalysisTime: this.metrics.semanticAnalysisTime,
|
|
1168
|
+
queryExpansionTime: this.metrics.queryExpansionTime,
|
|
1169
|
+
synthesisTime: this.metrics.synthesisTime
|
|
1170
|
+
}
|
|
1171
|
+
};
|
|
1172
|
+
|
|
1173
|
+
// Enhanced insights from LLM synthesis
|
|
1174
|
+
if (synthesis.llmSynthesis && !synthesis.llmSynthesis.error) {
|
|
1175
|
+
baseResults.insights = {
|
|
1176
|
+
aiSummary: synthesis.llmSynthesis.summary,
|
|
1177
|
+
keyThemes: synthesis.llmSynthesis.themes,
|
|
1178
|
+
confidenceLevel: synthesis.llmSynthesis.confidence,
|
|
1179
|
+
intelligentInsights: synthesis.llmSynthesis.keyInsights,
|
|
1180
|
+
aiRecommendations: synthesis.llmSynthesis.recommendations,
|
|
1181
|
+
identifiedGaps: synthesis.llmSynthesis.gaps
|
|
1182
|
+
};
|
|
1183
|
+
}
|
|
1184
|
+
|
|
1185
|
+
// Provenance tracking for LLM-enhanced sources
|
|
1186
|
+
baseResults.provenance = {
|
|
1187
|
+
sourceAnalysis: Array.from(this.researchState.llmAnalysis.entries()).map(([url, analysis]) => ({
|
|
1188
|
+
url,
|
|
1189
|
+
relevanceScore: analysis.relevanceScore,
|
|
1190
|
+
keyPoints: analysis.keyPoints,
|
|
1191
|
+
topicAlignment: analysis.topicAlignment,
|
|
1192
|
+
credibilityIndicators: analysis.credibilityIndicators
|
|
1193
|
+
})),
|
|
1194
|
+
queryExpansion: this.researchState.semanticSimilarities.size > 0 ?
|
|
1195
|
+
Object.fromEntries(this.researchState.semanticSimilarities) : null,
|
|
1196
|
+
totalAnalyzedSources: this.researchState.llmAnalysis.size
|
|
1197
|
+
};
|
|
1198
|
+
}
|
|
1199
|
+
|
|
1200
|
+
return baseResults;
|
|
1201
|
+
}
|
|
1202
|
+
|
|
1203
|
+
/**
 * Build the error report returned when a research run fails.
 *
 * The report carries the error message, whatever partial state has been
 * collected (visited URLs, activity log, metrics) and a single recovery
 * recommendation.
 *
 * @param {Error} error - The failure; only `message` is read.
 * @param {string} topic - Research topic.
 * @param {string} sessionId - Identifier of the failed session.
 * @returns {Object} Error report.
 */
handleResearchError(error, topic, sessionId) {
  const message = error.message;
  const state = this.researchState;

  const partialResults = {
    visitedUrls: [...state.visitedUrls],
    activityLog: state.activityLog,
    metrics: this.metrics
  };

  const recoveryAdvice = {
    type: 'error_recovery',
    priority: 'high',
    description: 'Retry research with reduced scope or increased time limit'
  };

  return {
    sessionId,
    topic,
    error: message,
    partialResults,
    recommendations: [recoveryAdvice],
    generatedAt: new Date().toISOString()
  };
}
|
|
1221
|
+
|
|
1222
|
+
calculateAverageCredibility() {
|
|
1223
|
+
const scores = Array.from(this.researchState.credibilityScores.values());
|
|
1224
|
+
return scores.length > 0 ?
|
|
1225
|
+
scores.reduce((sum, score) => sum + score, 0) / scores.length : 0;
|
|
1226
|
+
}
|
|
1227
|
+
|
|
1228
|
+
getCredibilityDistribution() {
|
|
1229
|
+
const scores = Array.from(this.researchState.credibilityScores.values());
|
|
1230
|
+
const high = scores.filter(s => s >= 0.7).length;
|
|
1231
|
+
const medium = scores.filter(s => s >= 0.4 && s < 0.7).length;
|
|
1232
|
+
const low = scores.filter(s => s < 0.4).length;
|
|
1233
|
+
|
|
1234
|
+
return { high, medium, low };
|
|
1235
|
+
}
|
|
1236
|
+
|
|
1237
|
+
logActivity(type, data) {
|
|
1238
|
+
const activity = {
|
|
1239
|
+
type,
|
|
1240
|
+
timestamp: new Date().toISOString(),
|
|
1241
|
+
data
|
|
1242
|
+
};
|
|
1243
|
+
|
|
1244
|
+
this.researchState.activityLog.push(activity);
|
|
1245
|
+
this.emit('activityLogged', activity);
|
|
1246
|
+
}
|
|
1247
|
+
|
|
1248
|
+
generateSessionId() {
|
|
1249
|
+
return `research_${Date.now()}_${Math.random().toString(36).substring(2, 8)}`;
|
|
1250
|
+
}
|
|
1251
|
+
|
|
1252
|
+
// Public API methods for monitoring and control
|
|
1253
|
+
getResearchState() {
|
|
1254
|
+
return { ...this.researchState };
|
|
1255
|
+
}
|
|
1256
|
+
|
|
1257
|
+
getMetrics() {
|
|
1258
|
+
return { ...this.metrics };
|
|
1259
|
+
}
|
|
1260
|
+
|
|
1261
|
+
pauseResearch() {
|
|
1262
|
+
this.emit('researchPaused', { sessionId: this.researchState.sessionId });
|
|
1263
|
+
}
|
|
1264
|
+
|
|
1265
|
+
resumeResearch() {
|
|
1266
|
+
this.emit('researchResumed', { sessionId: this.researchState.sessionId });
|
|
1267
|
+
}
|
|
1268
|
+
|
|
1269
|
+
stopResearch() {
|
|
1270
|
+
this.emit('researchStopped', { sessionId: this.researchState.sessionId });
|
|
1271
|
+
}
|
|
1272
|
+
|
|
1273
|
+
/**
|
|
1274
|
+
* Cleanup method for proper resource disposal
|
|
1275
|
+
*/
|
|
1276
|
+
async cleanup() {
|
|
1277
|
+
try {
|
|
1278
|
+
// Stop any active research
|
|
1279
|
+
this.stopResearch();
|
|
1280
|
+
|
|
1281
|
+
// Clear cache if available
|
|
1282
|
+
if (this.cache && typeof this.cache.clear === "function") {
|
|
1283
|
+
await this.cache.clear();
|
|
1284
|
+
}
|
|
1285
|
+
|
|
1286
|
+
// Clear all event listeners
|
|
1287
|
+
this.removeAllListeners();
|
|
1288
|
+
|
|
1289
|
+
// Reset research state
|
|
1290
|
+
this.researchState = {
|
|
1291
|
+
sessionId: null,
|
|
1292
|
+
currentDepth: 0,
|
|
1293
|
+
visitedUrls: new Set(),
|
|
1294
|
+
searchResults: new Map(),
|
|
1295
|
+
extractedContent: new Map(),
|
|
1296
|
+
researchFindings: [],
|
|
1297
|
+
credibilityScores: new Map(),
|
|
1298
|
+
conflictMap: new Map(),
|
|
1299
|
+
activityLog: [],
|
|
1300
|
+
llmAnalysis: new Map(),
|
|
1301
|
+
semanticSimilarities: new Map(),
|
|
1302
|
+
relevanceScores: new Map(),
|
|
1303
|
+
synthesisHistory: []
|
|
1304
|
+
};
|
|
1305
|
+
|
|
1306
|
+
// Reset metrics
|
|
1307
|
+
this.metrics = {
|
|
1308
|
+
searchQueries: 0,
|
|
1309
|
+
urlsProcessed: 0,
|
|
1310
|
+
contentExtracted: 0,
|
|
1311
|
+
conflictsDetected: 0,
|
|
1312
|
+
sourcesVerified: 0,
|
|
1313
|
+
cacheHits: 0,
|
|
1314
|
+
totalProcessingTime: 0,
|
|
1315
|
+
llmAnalysisCalls: 0,
|
|
1316
|
+
semanticAnalysisTime: 0,
|
|
1317
|
+
queryExpansionTime: 0,
|
|
1318
|
+
synthesisTime: 0
|
|
1319
|
+
};
|
|
1320
|
+
|
|
1321
|
+
} catch (error) {
|
|
1322
|
+
// Silent cleanup - do not throw errors during cleanup
|
|
1323
|
+
console.warn("Warning during ResearchOrchestrator cleanup:", error.message);
|
|
1324
|
+
}
|
|
1325
|
+
}
|
|
1326
|
+
}
|
|
1327
|
+
|