@elizaos/plugin-research 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +400 -0
- package/dist/index.cjs +9366 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.js +9284 -0
- package/dist/index.js.map +1 -0
- package/package.json +80 -0
- package/src/__tests__/action-chaining.test.ts +532 -0
- package/src/__tests__/actions.test.ts +118 -0
- package/src/__tests__/cache-rate-limiter.test.ts +303 -0
- package/src/__tests__/content-extractors.test.ts +26 -0
- package/src/__tests__/deepresearch-bench-integration.test.ts +520 -0
- package/src/__tests__/deepresearch-bench-simplified.e2e.test.ts +290 -0
- package/src/__tests__/deepresearch-bench.e2e.test.ts +376 -0
- package/src/__tests__/e2e.test.ts +1870 -0
- package/src/__tests__/multi-benchmark-runner.ts +427 -0
- package/src/__tests__/providers.test.ts +156 -0
- package/src/__tests__/real-world.e2e.test.ts +788 -0
- package/src/__tests__/research-scenarios.test.ts +755 -0
- package/src/__tests__/research.e2e.test.ts +704 -0
- package/src/__tests__/research.test.ts +174 -0
- package/src/__tests__/search-providers.test.ts +174 -0
- package/src/__tests__/single-benchmark-runner.ts +735 -0
- package/src/__tests__/test-search-providers.ts +171 -0
- package/src/__tests__/verify-apis.test.ts +82 -0
- package/src/actions.ts +1677 -0
- package/src/benchmark/deepresearch-benchmark.ts +369 -0
- package/src/evaluation/research-evaluator.ts +444 -0
- package/src/examples/api-integration.md +498 -0
- package/src/examples/browserbase-integration.md +132 -0
- package/src/examples/debug-research-query.ts +162 -0
- package/src/examples/defi-code-scenarios.md +536 -0
- package/src/examples/defi-implementation-guide.md +454 -0
- package/src/examples/eliza-research-example.ts +142 -0
- package/src/examples/fix-renewable-energy-research.ts +209 -0
- package/src/examples/research-scenarios.md +408 -0
- package/src/examples/run-complete-renewable-research.ts +303 -0
- package/src/examples/run-deep-research.ts +352 -0
- package/src/examples/run-logged-research.ts +304 -0
- package/src/examples/run-real-research.ts +151 -0
- package/src/examples/save-research-output.ts +133 -0
- package/src/examples/test-file-logging.ts +199 -0
- package/src/examples/test-real-research.ts +67 -0
- package/src/examples/test-renewable-energy-research.ts +229 -0
- package/src/index.ts +28 -0
- package/src/integrations/cache.ts +128 -0
- package/src/integrations/content-extractors/firecrawl.ts +314 -0
- package/src/integrations/content-extractors/pdf-extractor.ts +350 -0
- package/src/integrations/content-extractors/playwright.ts +420 -0
- package/src/integrations/factory.ts +419 -0
- package/src/integrations/index.ts +18 -0
- package/src/integrations/rate-limiter.ts +181 -0
- package/src/integrations/search-providers/academic.ts +290 -0
- package/src/integrations/search-providers/exa.ts +205 -0
- package/src/integrations/search-providers/npm.ts +330 -0
- package/src/integrations/search-providers/pypi.ts +211 -0
- package/src/integrations/search-providers/serpapi.ts +277 -0
- package/src/integrations/search-providers/serper.ts +358 -0
- package/src/integrations/search-providers/stagehand-google.ts +87 -0
- package/src/integrations/search-providers/tavily.ts +187 -0
- package/src/processing/relevance-analyzer.ts +353 -0
- package/src/processing/research-logger.ts +450 -0
- package/src/processing/result-processor.ts +372 -0
- package/src/prompts/research-prompts.ts +419 -0
- package/src/providers/cacheProvider.ts +164 -0
- package/src/providers.ts +173 -0
- package/src/service.ts +2588 -0
- package/src/services/swe-bench.ts +286 -0
- package/src/strategies/research-strategies.ts +790 -0
- package/src/types/pdf-parse.d.ts +34 -0
- package/src/types.ts +551 -0
- package/src/verification/claim-verifier.ts +443 -0
|
@@ -0,0 +1,427 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Multi-Benchmark Test Runner for ElizaOS Research Plugin
|
|
4
|
+
* Tests multiple research scenarios and validates scoring quality
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
// Removed real-runtime import - using simplified approach
|
|
8
|
+
import { ResearchService } from '../service';
|
|
9
|
+
import { ResearchDomain, ResearchDepth } from '../types';
|
|
10
|
+
import { elizaLogger, ModelType, IAgentRuntime } from '@elizaos/core';
|
|
11
|
+
import { DeepResearchBenchmark } from '../benchmark/deepresearch-benchmark';
|
|
12
|
+
import { SearchResultProcessor } from '../processing/result-processor';
|
|
13
|
+
|
|
14
|
+
/**
 * One research scenario to run, together with the targets its outcome is
 * compared against.
 */
interface BenchmarkScenario {
  /** Human-readable label used in console output. */
  name: string;

  /** Research question passed to the research service. */
  query: string;

  /** Research domain passed along when the project is created. */
  domain: ResearchDomain;

  /** Target number of sources for the finished project. */
  expectedSources: number;

  /** Target word count for the generated report. */
  expectedWords: number;

  /** Target RACE score; printed as a percentage, e.g. 0.65 -> 65.0%. */
  expectedRaceScore: number;

  /** Target FACT score; printed as a percentage, e.g. 0.7 -> 70.0%. */
  expectedFactScore: number;

  /** Maximum time, in milliseconds, to wait for the project to finish. */
  timeoutMs: number;
}
|
|
24
|
+
|
|
25
|
+
/** Scenario timeouts, in milliseconds. */
const TEN_MINUTES_MS = 10 * 60 * 1000; // 600000
const SEVEN_AND_A_HALF_MINUTES_MS = 7.5 * 60 * 1000; // 450000

/**
 * The fixed list of scenarios executed by the runner, in execution order.
 */
const BENCHMARK_SCENARIOS: BenchmarkScenario[] = [
  {
    name: "Computer Science - Federated Learning",
    query: "Analyze the security and privacy implications of federated learning in healthcare applications. Compare different privacy-preserving techniques including differential privacy, homomorphic encryption, and secure multi-party computation.",
    domain: ResearchDomain.COMPUTER_SCIENCE,
    expectedSources: 15,
    expectedWords: 3000,
    expectedRaceScore: 0.65,
    expectedFactScore: 0.7,
    timeoutMs: TEN_MINUTES_MS,
  },
  {
    name: "Physics - Quantum Computing",
    query: "What are the latest breakthroughs in quantum computing hardware and their implications for cryptography and computational complexity?",
    domain: ResearchDomain.PHYSICS,
    expectedSources: 12,
    expectedWords: 2500,
    expectedRaceScore: 0.6,
    expectedFactScore: 0.65,
    timeoutMs: SEVEN_AND_A_HALF_MINUTES_MS,
  },
  {
    name: "Medicine - mRNA Vaccines",
    query: "Analyze the effectiveness of mRNA vaccine technology for infectious diseases beyond COVID-19, including safety profiles and future applications.",
    domain: ResearchDomain.MEDICINE,
    expectedSources: 15,
    expectedWords: 3000,
    expectedRaceScore: 0.65,
    expectedFactScore: 0.75,
    timeoutMs: TEN_MINUTES_MS,
  },
  {
    name: "General - Renewable Energy",
    query: "Compare the environmental and economic impacts of different renewable energy storage technologies for grid-scale deployment.",
    domain: ResearchDomain.GENERAL,
    expectedSources: 10,
    expectedWords: 2000,
    expectedRaceScore: 0.55,
    expectedFactScore: 0.6,
    timeoutMs: SEVEN_AND_A_HALF_MINUTES_MS,
  },
  {
    name: "Economics - Digital Currency",
    query: "Evaluate the economic impact of central bank digital currencies (CBDCs) on monetary policy and financial stability.",
    domain: ResearchDomain.ECONOMICS,
    expectedSources: 12,
    expectedWords: 2500,
    expectedRaceScore: 0.6,
    expectedFactScore: 0.65,
    timeoutMs: SEVEN_AND_A_HALF_MINUTES_MS,
  },
];
|
|
77
|
+
|
|
78
|
+
/**
 * Outcome of running a single scenario.
 */
interface BenchmarkResult {
  /** The scenario this result belongs to. */
  scenario: BenchmarkScenario;

  /** True when the project reached completion; false on timeout or error. */
  success: boolean;

  /** Number of sources on the completed project (0 on failure). */
  actualSources: number;

  /** Word count of the report content (0 on failure or when there is no report). */
  actualWords: number;

  /** Overall RACE score, when evaluation metrics were present. */
  raceScore?: number;

  /** FACT citation-accuracy score, when evaluation metrics were present. */
  factScore?: number;

  /** Wall-clock time spent on the scenario, in milliseconds. */
  duration: number;

  /** Failure description; set only when `success` is false. */
  error?: string;

  /** Identifier of the research project that was created. */
  projectId?: string;

  /** Coarse quality rating derived from the targets of the scenario. */
  quality: 'excellent' | 'good' | 'fair' | 'poor';
}
|
|
90
|
+
|
|
91
|
+
/**
 * Runs every entry of BENCHMARK_SCENARIOS through the research service, one
 * scenario at a time, and prints per-scenario results and an aggregate summary
 * to the console.
 */
class MultiBenchmarkRunner {
  private researchService: ResearchService;
  // Constructed here but not used by any method of this class.
  private deepBenchmark: DeepResearchBenchmark;
  // Constructed here but not used by any method of this class.
  private resultProcessor: SearchResultProcessor;

  /**
   * @param runtime Runtime object handed to the ResearchService constructor.
   */
  constructor(private runtime: any) {
    this.researchService = new ResearchService(runtime);
    this.deepBenchmark = new DeepResearchBenchmark();
    this.resultProcessor = new SearchResultProcessor({
      qualityThreshold: 0.4,
      deduplicationThreshold: 0.8,
      maxResults: 50,
      diversityWeight: 0.3,
    });
  }

  /**
   * Runs all scenarios sequentially, pausing 30 seconds between consecutive
   * scenarios.
   *
   * @returns One result per scenario, in scenario order. Failures are reported
   *          as results with `success: false`, not thrown.
   */
  async runAllBenchmarks(): Promise<BenchmarkResult[]> {
    const results: BenchmarkResult[] = [];

    console.log(`🧪 Running ${BENCHMARK_SCENARIOS.length} benchmark scenarios\n`);

    for (let i = 0; i < BENCHMARK_SCENARIOS.length; i++) {
      const scenario = BENCHMARK_SCENARIOS[i];
      console.log(`\n📋 Scenario ${i + 1}/${BENCHMARK_SCENARIOS.length}: ${scenario.name}`);
      console.log(`🔍 Query: ${scenario.query.substring(0, 100)}...`);
      console.log(`📊 Expected: ${scenario.expectedSources} sources, ${scenario.expectedWords} words`);
      console.log(`🎯 Target RACE: ${scenario.expectedRaceScore}, FACT: ${scenario.expectedFactScore}`);

      const result = await this.runSingleBenchmark(scenario);
      results.push(result);

      this.printBenchmarkResult(result);

      // Brief pause between tests (skipped after the last one).
      if (i < BENCHMARK_SCENARIOS.length - 1) {
        console.log('\n⏳ Waiting 30 seconds before next test...');
        await new Promise(resolve => setTimeout(resolve, 30000));
      }
    }

    return results;
  }

  /**
   * Creates a research project for one scenario, waits for it to finish and
   * turns the outcome into a BenchmarkResult.
   *
   * Never throws: a timeout or any error raised while creating or polling the
   * project is converted into a failed result. The project id is attached to
   * failed results whenever the project was created, so the run can be traced.
   */
  private async runSingleBenchmark(scenario: BenchmarkScenario): Promise<BenchmarkResult> {
    const startTime = Date.now();
    // Remembered outside the try block so failure results can carry it too.
    let projectId: string | undefined;

    try {
      // Create research project
      const project = await this.researchService.createResearchProject(scenario.query, {
        domain: scenario.domain,
        researchDepth: ResearchDepth.DEEP,
        maxSearchResults: 30,
        timeout: scenario.timeoutMs,
        enableCitations: true,
        evaluationEnabled: true,
      });
      projectId = project.id;

      console.log(`✅ Project created: ${project.id}`);

      // Wait for completion with timeout
      const completedProject = await this.waitForCompletion(project.id, scenario.timeoutMs);

      const duration = Date.now() - startTime;

      if (!completedProject) {
        return {
          scenario,
          success: false,
          actualSources: 0,
          actualWords: 0,
          duration,
          error: 'Timeout waiting for completion',
          projectId,
          quality: 'poor',
        };
      }

      // Analyze results
      const actualSources = completedProject.sources.length;
      const actualWords = this.countWords(completedProject.report?.content || '');

      // Evaluation scores are optional; they stay undefined when not present.
      let raceScore: number | undefined;
      let factScore: number | undefined;

      if (completedProject.metadata?.evaluationMetrics) {
        raceScore = completedProject.metadata.evaluationMetrics.raceScore?.overall;
        factScore = completedProject.metadata.evaluationMetrics.factScore?.citationAccuracy;
      }

      const quality = this.assessQuality(scenario, actualSources, actualWords, raceScore, factScore);

      return {
        scenario,
        success: true,
        actualSources,
        actualWords,
        raceScore,
        factScore,
        duration,
        projectId: project.id,
        quality,
      };
    } catch (error) {
      const duration = Date.now() - startTime;

      return {
        scenario,
        success: false,
        actualSources: 0,
        actualWords: 0,
        duration,
        error: error instanceof Error ? error.message : String(error),
        projectId,
        quality: 'poor',
      };
    }
  }

  /**
   * Polls the research service until the project completes, fails, or the
   * timeout elapses.
   *
   * The project is always checked at least once, and the wait between polls is
   * capped at the time remaining, so the method does not overshoot the timeout
   * by a full poll interval and performs a final check at the deadline.
   *
   * @param projectId Identifier of the project to poll.
   * @param timeoutMs Maximum time to wait, in milliseconds.
   * @returns The project once its status is 'completed', or null on timeout.
   * @throws Error when the project cannot be found or its status is 'failed'.
   */
  private async waitForCompletion(projectId: string, timeoutMs: number): Promise<any> {
    const pollInterval = 10000; // Check every 10 seconds
    const deadline = Date.now() + timeoutMs;

    for (;;) {
      const project = await this.researchService.getProject(projectId);

      if (!project) {
        throw new Error('Project not found');
      }

      console.log(`📍 Phase: ${project.phase} → Status: ${project.status}`);
      console.log(` Sources: ${project.sources.length}, Findings: ${project.findings.length}`);

      if (project.status === 'completed') {
        console.log('✅ Research completed successfully');
        return project;
      }

      if (project.status === 'failed') {
        throw new Error(`Research failed: ${project.error}`);
      }

      const remaining = deadline - Date.now();
      if (remaining <= 0) {
        return null; // Timeout
      }

      await new Promise(resolve => setTimeout(resolve, Math.min(pollInterval, remaining)));
    }
  }

  /** Counts whitespace-separated, non-empty tokens in `text`. */
  private countWords(text: string): number {
    return text.split(/\s+/).filter(word => word.length > 0).length;
  }

  /**
   * Awards 2 points when `actual` reaches `target`, 1 point when it reaches
   * `target * partialRatio`, and 0 otherwise.
   */
  private tieredPoints(actual: number, target: number, partialRatio: number): number {
    if (actual >= target) return 2;
    if (actual >= target * partialRatio) return 1;
    return 0;
  }

  /**
   * Rates a completed run against the targets of its scenario.
   *
   * Sources and words earn partial credit at 70% of target, RACE and FACT at
   * 80%. Each of the four criteria is worth up to 2 points out of a fixed
   * maximum of 8; a score that is undefined earns no points, so a run without
   * evaluation scores can reach at most 4/8 ('fair').
   *
   * @returns 'excellent' (>= 80% of points), 'good' (>= 60%), 'fair' (>= 40%)
   *          or 'poor'.
   */
  private assessQuality(
    scenario: BenchmarkScenario,
    actualSources: number,
    actualWords: number,
    raceScore?: number,
    factScore?: number
  ): 'excellent' | 'good' | 'fair' | 'poor' {
    let qualityPoints = 0;

    qualityPoints += this.tieredPoints(actualSources, scenario.expectedSources, 0.7);
    qualityPoints += this.tieredPoints(actualWords, scenario.expectedWords, 0.7);

    if (raceScore !== undefined) {
      qualityPoints += this.tieredPoints(raceScore, scenario.expectedRaceScore, 0.8);
    }

    if (factScore !== undefined) {
      qualityPoints += this.tieredPoints(factScore, scenario.expectedFactScore, 0.8);
    }

    const maxPoints = 8;
    const qualityRatio = qualityPoints / maxPoints;

    if (qualityRatio >= 0.8) return 'excellent';
    if (qualityRatio >= 0.6) return 'good';
    if (qualityRatio >= 0.4) return 'fair';
    return 'poor';
  }

  /** Prints the outcome of one scenario to the console. */
  private printBenchmarkResult(result: BenchmarkResult): void {
    const { scenario, success, actualSources, actualWords, raceScore, factScore, duration, quality, error } = result;

    console.log(`\n📊 Results for "${scenario.name}":`);

    if (success) {
      console.log(`✅ SUCCESS (${(duration / 1000).toFixed(1)}s)`);
      console.log(`📚 Sources: ${actualSources} (expected: ${scenario.expectedSources})`);
      console.log(`📝 Words: ${actualWords} (expected: ${scenario.expectedWords})`);

      if (raceScore !== undefined) {
        const raceStatus = raceScore >= scenario.expectedRaceScore ? '✅' : '⚠️';
        console.log(`${raceStatus} RACE Score: ${(raceScore * 100).toFixed(1)}% (target: ${(scenario.expectedRaceScore * 100).toFixed(1)}%)`);
      }

      if (factScore !== undefined) {
        const factStatus = factScore >= scenario.expectedFactScore ? '✅' : '⚠️';
        console.log(`${factStatus} FACT Score: ${(factScore * 100).toFixed(1)}% (target: ${(scenario.expectedFactScore * 100).toFixed(1)}%)`);
      }

      const qualityEmoji = {
        excellent: '🏆',
        good: '👍',
        fair: '👌',
        poor: '👎'
      }[quality];

      console.log(`${qualityEmoji} Overall Quality: ${quality.toUpperCase()}`);
    } else {
      console.log(`❌ FAILED (${(duration / 1000).toFixed(1)}s)`);
      console.log(`💥 Error: ${error}`);
    }
  }

  /**
   * Prints aggregate statistics, the quality distribution and recommendations
   * for a set of results.
   *
   * An empty result list is reported as a 0.0% success rate (not NaN) and
   * produces no timeout recommendation. A RACE or FACT score of exactly 0 is a
   * real score and does trigger the matching recommendation.
   */
  printSummaryReport(results: BenchmarkResult[]): void {
    console.log('\n' + '='.repeat(80));
    console.log('🏆 MULTI-BENCHMARK SUMMARY REPORT');
    console.log('='.repeat(80));

    const successful = results.filter(r => r.success);
    // Guard the division: with no results the rate is defined as 0.
    const successRate = results.length > 0 ? (successful.length / results.length) * 100 : 0;

    console.log(`📊 Success Rate: ${successful.length}/${results.length} (${successRate.toFixed(1)}%)`);

    if (successful.length > 0) {
      const avgSources = successful.reduce((sum, r) => sum + r.actualSources, 0) / successful.length;
      const avgWords = successful.reduce((sum, r) => sum + r.actualWords, 0) / successful.length;
      const avgDuration = successful.reduce((sum, r) => sum + r.duration, 0) / successful.length;

      console.log(`📚 Average Sources: ${avgSources.toFixed(1)}`);
      console.log(`📝 Average Words: ${avgWords.toFixed(0)}`);
      console.log(`⏱️ Average Duration: ${(avgDuration / 1000).toFixed(1)}s`);

      const raceScores = successful.filter(r => r.raceScore !== undefined).map(r => r.raceScore!);
      const factScores = successful.filter(r => r.factScore !== undefined).map(r => r.factScore!);

      if (raceScores.length > 0) {
        const avgRace = raceScores.reduce((sum, s) => sum + s, 0) / raceScores.length;
        console.log(`🎯 Average RACE Score: ${(avgRace * 100).toFixed(1)}%`);
      }

      if (factScores.length > 0) {
        const avgFact = factScores.reduce((sum, s) => sum + s, 0) / factScores.length;
        console.log(`🔍 Average FACT Score: ${(avgFact * 100).toFixed(1)}%`);
      }
    }

    // Quality distribution
    const qualityCounts = results.reduce((counts, r) => {
      counts[r.quality] = (counts[r.quality] || 0) + 1;
      return counts;
    }, {} as Record<string, number>);

    console.log('\n📈 Quality Distribution:');
    console.log(`🏆 Excellent: ${qualityCounts.excellent || 0}`);
    console.log(`👍 Good: ${qualityCounts.good || 0}`);
    console.log(`👌 Fair: ${qualityCounts.fair || 0}`);
    console.log(`👎 Poor: ${qualityCounts.poor || 0}`);

    // Recommendations
    console.log('\n💡 Recommendations:');
    if (results.length > 0 && successRate < 80) {
      console.log('⚠️ Consider increasing timeout limits for complex queries');
    }
    if (successful.some(r => r.actualSources < r.scenario.expectedSources * 0.8)) {
      console.log('⚠️ Some scenarios have insufficient source coverage - consider expanding search providers');
    }
    // Compare against undefined explicitly so a score of 0 is not skipped.
    if (successful.some(r => r.raceScore !== undefined && r.raceScore < 0.6)) {
      console.log('⚠️ RACE scores need improvement - focus on comprehensiveness and depth');
    }
    if (successful.some(r => r.factScore !== undefined && r.factScore < 0.6)) {
      console.log('⚠️ FACT scores need improvement - enhance citation accuracy and verification');
    }

    console.log('\n✅ Multi-benchmark testing complete!');
  }
}
|
|
391
|
+
|
|
392
|
+
/**
 * Script entry point: builds a stub runtime, runs the benchmark suite, prints
 * the summary and exits the process.
 *
 * Exit code is 0 when at least 80% of the scenarios succeeded, otherwise 1
 * (also 1 when anything throws).
 */
async function main() {
  console.log('🧪 ElizaOS Research Plugin - Multi-Benchmark Test Suite\n');

  try {
    // Stubbed model call: random 1536-element vector for embedding requests,
    // a fixed sentence for everything else.
    const stubUseModel = async (modelType: string, params: any) => {
      if (modelType !== ModelType.TEXT_EMBEDDING) {
        return "This is a test response from the model";
      }
      return new Array(1536).fill(0).map(() => Math.random());
    };

    // Settings are read straight from the process environment.
    const readSetting = (key: string) => process.env[key] || null;

    const runtime = {
      useModel: stubUseModel,
      getSetting: readSetting,
    } as any as IAgentRuntime;
    console.log('✅ Test runtime initialized with API integrations');

    const runner = new MultiBenchmarkRunner(runtime);
    const results = await runner.runAllBenchmarks();

    runner.printSummaryReport(results);

    // Translate the overall success ratio into the process exit code.
    const passedCount = results.filter(r => r.success).length;
    const successRate = passedCount / results.length;
    const exitCode = successRate >= 0.8 ? 0 : 1;
    process.exit(exitCode);
  } catch (error) {
    console.error('❌ Multi-benchmark test failed:', error);
    process.exit(1);
  }
}

main().catch(console.error);
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import { describe, it, expect, vi, beforeEach } from 'vitest';
|
|
2
|
+
import { IAgentRuntime, Memory, State } from '@elizaos/core';
|
|
3
|
+
import {
|
|
4
|
+
activeResearchProvider,
|
|
5
|
+
completedResearchProvider,
|
|
6
|
+
researchCapabilitiesProvider
|
|
7
|
+
} from '../providers';
|
|
8
|
+
import { ResearchService } from '../service';
|
|
9
|
+
import { ResearchStatus, ResearchPhase, ResearchDomain, TaskType, ResearchDepth } from '../types';
|
|
10
|
+
|
|
11
|
+
/**
 * Tests for the three research providers, run against a stubbed runtime and a
 * stubbed research service that are rebuilt before every test.
 */
describe('Research Providers', () => {
  let mockRuntime: IAgentRuntime;
  let mockService: ResearchService;
  let mockMemory: Memory;
  let mockState: State;

  /** Replaces one project-listing method of the stubbed service. */
  const stubProjects = (method: 'getActiveProjects' | 'getAllProjects', projects: unknown[]) => {
    (mockService as any)[method] = vi.fn().mockResolvedValue(projects);
  };

  /** Makes the runtime return no service at all. */
  const withoutService = () => {
    mockRuntime.getService = vi.fn().mockReturnValue(null);
  };

  /** Overrides the text of the incoming message. */
  const withMessageText = (text: string) => {
    mockMemory.content.text = text;
  };

  beforeEach(() => {
    // Message and state fixtures
    mockState = {
      values: {},
      data: {},
      text: 'test state',
    } as State;

    mockMemory = {
      id: '00000000-0000-0000-0000-000000000000' as any,
      content: { text: 'research status overview' },
      entityId: 'test-entity',
      roomId: 'test-room',
    } as unknown as Memory;

    // Service stub: every query resolves to "nothing" by default
    mockService = {
      getActiveProjects: vi.fn().mockResolvedValue([]),
      getProject: vi.fn().mockResolvedValue(null),
      getAllProjects: vi.fn().mockResolvedValue([]),
    } as any;

    // Runtime stub that hands out the service stub
    mockRuntime = {
      agentId: 'test-agent',
      getService: vi.fn().mockReturnValue(mockService),
    } as any;
  });

  describe('activeResearchProvider', () => {
    it('should provide active research project information', async () => {
      const activeProject = {
        id: 'project-1',
        query: 'quantum computing',
        status: ResearchStatus.ACTIVE,
        phase: ResearchPhase.SEARCHING,
        createdAt: Date.now(),
        updatedAt: Date.now(),
        findings: [],
        sources: [],
        metadata: {
          domain: ResearchDomain.PHYSICS,
          taskType: TaskType.EXPLORATORY,
          language: 'en',
          depth: ResearchDepth.MODERATE,
        },
      };
      stubProjects('getActiveProjects', [activeProject]);

      const output = await activeResearchProvider.get(mockRuntime, mockMemory, mockState);

      expect(output).toBeDefined();
      expect(output.text).toContain('Research on "quantum computing"');
      expect(output.text).toContain('searching');
    });

    it('should handle no active projects', async () => {
      stubProjects('getActiveProjects', []);

      const output = await activeResearchProvider.get(mockRuntime, mockMemory, mockState);

      expect(output.text).toBe('');
    });

    it('should handle service not available', async () => {
      withoutService();

      const output = await activeResearchProvider.get(mockRuntime, mockMemory, mockState);

      expect(output.text).toBe('');
    });
  });

  describe('completedResearchProvider', () => {
    it('should provide completed research project information', async () => {
      withMessageText('show completed research status');

      const finishedProject = {
        id: 'project-1',
        query: 'AI ethics',
        status: ResearchStatus.COMPLETED,
        phase: ResearchPhase.COMPLETE,
        createdAt: Date.now() - 3600000,
        updatedAt: Date.now() - 1800000,
        completedAt: Date.now() - 1800000,
        findings: Array(15).fill({}),
        sources: Array(20).fill({}),
        report: {
          sections: ['Introduction', 'Analysis', 'Conclusion'],
        },
        metadata: {
          domain: ResearchDomain.PHILOSOPHY,
          taskType: TaskType.EVALUATIVE,
          language: 'en',
          depth: ResearchDepth.DEEP,
        },
      };
      stubProjects('getAllProjects', [finishedProject]);

      const output = await completedResearchProvider.get(mockRuntime, mockMemory, mockState);

      for (const fragment of [
        'Recently Completed Research',
        'AI ethics',
        'Report available',
        '3 sections',
      ]) {
        expect(output.text).toContain(fragment);
      }
    });

    it('should handle no completed projects', async () => {
      stubProjects('getAllProjects', []);

      const output = await completedResearchProvider.get(mockRuntime, mockMemory, mockState);

      expect(output.text).toBe('');
    });
  });

  describe('researchCapabilitiesProvider', () => {
    it('should provide research capabilities information', async () => {
      withMessageText('what are your research capabilities');

      const output = await researchCapabilitiesProvider.get(mockRuntime, mockMemory, mockState);

      for (const fragment of [
        'Research Capabilities',
        'Deep multi-phase internet research',
        'Automatic source collection',
        'Comprehensive report generation',
      ]) {
        expect(output.text).toContain(fragment);
      }
    });

    it('should work even without service', async () => {
      withMessageText('describe research capabilities');
      withoutService();

      const output = await researchCapabilitiesProvider.get(mockRuntime, mockMemory, mockState);

      expect(output.text).toContain('Research Capabilities');
    });
  });
});
|