@elizaos/plugin-research 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +400 -0
- package/dist/index.cjs +9366 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.js +9284 -0
- package/dist/index.js.map +1 -0
- package/package.json +80 -0
- package/src/__tests__/action-chaining.test.ts +532 -0
- package/src/__tests__/actions.test.ts +118 -0
- package/src/__tests__/cache-rate-limiter.test.ts +303 -0
- package/src/__tests__/content-extractors.test.ts +26 -0
- package/src/__tests__/deepresearch-bench-integration.test.ts +520 -0
- package/src/__tests__/deepresearch-bench-simplified.e2e.test.ts +290 -0
- package/src/__tests__/deepresearch-bench.e2e.test.ts +376 -0
- package/src/__tests__/e2e.test.ts +1870 -0
- package/src/__tests__/multi-benchmark-runner.ts +427 -0
- package/src/__tests__/providers.test.ts +156 -0
- package/src/__tests__/real-world.e2e.test.ts +788 -0
- package/src/__tests__/research-scenarios.test.ts +755 -0
- package/src/__tests__/research.e2e.test.ts +704 -0
- package/src/__tests__/research.test.ts +174 -0
- package/src/__tests__/search-providers.test.ts +174 -0
- package/src/__tests__/single-benchmark-runner.ts +735 -0
- package/src/__tests__/test-search-providers.ts +171 -0
- package/src/__tests__/verify-apis.test.ts +82 -0
- package/src/actions.ts +1677 -0
- package/src/benchmark/deepresearch-benchmark.ts +369 -0
- package/src/evaluation/research-evaluator.ts +444 -0
- package/src/examples/api-integration.md +498 -0
- package/src/examples/browserbase-integration.md +132 -0
- package/src/examples/debug-research-query.ts +162 -0
- package/src/examples/defi-code-scenarios.md +536 -0
- package/src/examples/defi-implementation-guide.md +454 -0
- package/src/examples/eliza-research-example.ts +142 -0
- package/src/examples/fix-renewable-energy-research.ts +209 -0
- package/src/examples/research-scenarios.md +408 -0
- package/src/examples/run-complete-renewable-research.ts +303 -0
- package/src/examples/run-deep-research.ts +352 -0
- package/src/examples/run-logged-research.ts +304 -0
- package/src/examples/run-real-research.ts +151 -0
- package/src/examples/save-research-output.ts +133 -0
- package/src/examples/test-file-logging.ts +199 -0
- package/src/examples/test-real-research.ts +67 -0
- package/src/examples/test-renewable-energy-research.ts +229 -0
- package/src/index.ts +28 -0
- package/src/integrations/cache.ts +128 -0
- package/src/integrations/content-extractors/firecrawl.ts +314 -0
- package/src/integrations/content-extractors/pdf-extractor.ts +350 -0
- package/src/integrations/content-extractors/playwright.ts +420 -0
- package/src/integrations/factory.ts +419 -0
- package/src/integrations/index.ts +18 -0
- package/src/integrations/rate-limiter.ts +181 -0
- package/src/integrations/search-providers/academic.ts +290 -0
- package/src/integrations/search-providers/exa.ts +205 -0
- package/src/integrations/search-providers/npm.ts +330 -0
- package/src/integrations/search-providers/pypi.ts +211 -0
- package/src/integrations/search-providers/serpapi.ts +277 -0
- package/src/integrations/search-providers/serper.ts +358 -0
- package/src/integrations/search-providers/stagehand-google.ts +87 -0
- package/src/integrations/search-providers/tavily.ts +187 -0
- package/src/processing/relevance-analyzer.ts +353 -0
- package/src/processing/research-logger.ts +450 -0
- package/src/processing/result-processor.ts +372 -0
- package/src/prompts/research-prompts.ts +419 -0
- package/src/providers/cacheProvider.ts +164 -0
- package/src/providers.ts +173 -0
- package/src/service.ts +2588 -0
- package/src/services/swe-bench.ts +286 -0
- package/src/strategies/research-strategies.ts +790 -0
- package/src/types/pdf-parse.d.ts +34 -0
- package/src/types.ts +551 -0
- package/src/verification/claim-verifier.ts +443 -0
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
import { exec, execFile } from 'child_process';
import { promisify } from 'util';
import fs from 'fs/promises';
import path from 'path';
import { elizaLogger } from '@elizaos/core';
import { ResearchProject, DeepResearchBenchResult } from '../types';
|
|
7
|
+
|
|
8
|
+
// Promise-returning wrapper around `child_process.exec`; the awaited value
// exposes the command's captured `stdout` and `stderr`.
const execAsync = promisify(exec);
|
|
9
|
+
|
|
10
|
+
/**
 * Optional settings accepted by the `DeepResearchBenchmark` constructor.
 * Every field may be omitted; the constructor supplies a default for each.
 */
export interface BenchmarkConfig {
  /** Python interpreter used to run the benchmark. Default: `'python3'`. */
  pythonPath?: string;

  /**
   * Directory holding the benchmark checkout.
   * Default: `<process.cwd()>/deep_research_bench`.
   */
  benchmarkPath?: string;

  /**
   * Directory the result files are read from.
   * Default: `<benchmarkPath>/results`.
   */
  outputDir?: string;

  /**
   * Retry budget. Default: `3`. The constructor stores it; nothing else in
   * this file reads it.
   */
  maxRetries?: number;
}
|
|
16
|
+
|
|
17
|
+
/**
 * Scores for one benchmark evaluation, as read back from the benchmark's
 * result file.
 */
export interface BenchmarkResult {
  /** Value of the "Comprehensiveness" line of the result file. */
  comprehensiveness: number;

  /** Value of the "Insight" line of the result file. */
  insight: number;

  /** Value of the "Instruction Following" line of the result file. */
  instructionFollowing: number;

  /** Value of the "Readability" line of the result file. */
  readability: number;

  /** Value of the "Overall Score" line of the result file. */
  overallScore: number;

  /** ISO-8601 timestamp taken when the result file was parsed. */
  timestamp: string;

  /** Model name recorded alongside the scores. */
  modelName: string;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Outcome of `DeepResearchBenchmark.checkSetup()`.
 * A successful check carries `pythonVersion` and `benchmarkPath`; a failed
 * one carries `error`.
 */
export interface BenchmarkSetupResult {
  /** `true` only when every environment check passed. */
  success: boolean;

  /** Trimmed output of `<python> --version`; set on success. */
  pythonVersion?: string;

  /** Benchmark directory that was verified; set on success. */
  benchmarkPath?: string;

  /** Human-readable description of the first failed check; set on failure. */
  error?: string;
}
|
|
33
|
+
|
|
34
|
+
/**
 * Drives the external DeepResearch Bench (RACE) Python tooling for research
 * projects: verifies the environment, installs dependencies, exports a project
 * as JSONL, runs the evaluation script and reads the scores back.
 *
 * Python is always started through `execFile` with an argument vector, so no
 * shell ever interprets the configured paths or the caller-supplied model name.
 * Consequently `pythonPath` must name an executable, not a shell command line.
 */
export class DeepResearchBenchmark {
  /** Promisified `execFile`; resolves with the captured `stdout` / `stderr`. */
  private static readonly execFileAsync = promisify(execFile);

  /**
   * Result-file label (lower-cased, single spaces) paired with the
   * `BenchmarkResult` field it fills. The order is also the order in which
   * missing fields are reported.
   */
  private static readonly SCORE_FIELDS = [
    ['comprehensiveness', 'comprehensiveness'],
    ['insight', 'insight'],
    ['instruction following', 'instructionFollowing'],
    ['readability', 'readability'],
    ['overall score', 'overallScore'],
  ] as const;

  private readonly pythonPath: string;
  private readonly benchmarkPath: string;
  private readonly outputDir: string;
  /** Stored from the config; not read by any method of this class yet. */
  private readonly maxRetries: number;

  /**
   * @param config Optional overrides; see `BenchmarkConfig` for the defaults.
   */
  constructor(config: BenchmarkConfig = {}) {
    this.pythonPath = config.pythonPath || 'python3';
    this.benchmarkPath = config.benchmarkPath || path.join(process.cwd(), 'deep_research_bench');
    this.outputDir = config.outputDir || path.join(this.benchmarkPath, 'results');
    // `??` instead of `||`: an explicit 0 must not be replaced by the default.
    this.maxRetries = config.maxRetries ?? 3;
  }

  /**
   * Run the configured Python interpreter without a shell.
   *
   * @param args Arguments passed verbatim to the interpreter.
   * @param options Working directory, timeout (ms) and output buffer limit (bytes).
   * @returns The captured standard output and standard error.
   * @throws When the process cannot be spawned, exits non-zero, times out or
   *         exceeds `maxBuffer`.
   */
  private runPython(
    args: string[],
    options: { cwd?: string; timeout?: number; maxBuffer?: number } = {}
  ): Promise<{ stdout: string; stderr: string }> {
    return DeepResearchBenchmark.execFileAsync(this.pythonPath, args, options);
  }

  /**
   * Reject model names that are unsafe as a file name or as a CLI argument.
   *
   * The name becomes `<name>.jsonl` under the benchmark data directory and is
   * passed as a positional argument to the benchmark script, so it must not
   * contain path separators, be a dot-directory, or look like an option.
   *
   * @throws Error when the name is empty or unsafe.
   */
  private assertSafeModelName(modelName: string): void {
    const unsafe =
      modelName.length === 0 ||
      modelName === '.' ||
      modelName === '..' ||
      modelName.startsWith('-') ||
      /[\\/]/.test(modelName);

    if (unsafe) {
      throw new Error(`Invalid benchmark model name: "${modelName}"`);
    }
  }

  /**
   * Check if the benchmark environment is properly set up.
   *
   * Verifies, in order: the interpreter runs, the benchmark directory exists,
   * the main script exists, and the required Python modules import.
   *
   * @returns A result object; this method reports failures through
   *          `success: false` / `error` instead of throwing.
   */
  async checkSetup(): Promise<BenchmarkSetupResult> {
    try {
      // Check Python availability. No `cwd` here: the benchmark directory has
      // not been verified yet and a missing cwd would mask the real problem.
      const { stdout, stderr } = await this.runPython(['--version']);
      // Some old interpreters print the version on stderr.
      const pythonVersion = (stdout || stderr).trim();
      elizaLogger.info(`Python version: ${pythonVersion}`);

      // Check if benchmark directory exists
      try {
        await fs.access(this.benchmarkPath);
      } catch {
        return {
          success: false,
          error: `Benchmark directory not found: ${this.benchmarkPath}`
        };
      }

      // Check if main script exists
      const mainScript = path.join(this.benchmarkPath, 'deepresearch_bench_race.py');
      try {
        await fs.access(mainScript);
      } catch {
        return {
          success: false,
          error: `Main benchmark script not found: ${mainScript}`
        };
      }

      // Check if requirements are installed
      try {
        await this.runPython(['-c', 'import tqdm, openai, requests'], { cwd: this.benchmarkPath });
      } catch {
        return {
          success: false,
          error: `Python dependencies not installed. Run: cd ${this.benchmarkPath} && pip install -r requirements.txt`
        };
      }

      return {
        success: true,
        pythonVersion,
        benchmarkPath: this.benchmarkPath
      };
    } catch (error) {
      return {
        success: false,
        error: `Setup check failed: ${error instanceof Error ? error.message : String(error)}`
      };
    }
  }

  /**
   * Install benchmark dependencies from `requirements.txt` in the benchmark
   * directory.
   *
   * pip is invoked as `<pythonPath> -m pip` so the packages land in the same
   * interpreter that later runs the benchmark.
   *
   * @returns `true` on success; `false` when pip fails, times out (3 minutes)
   *          or reports `ERROR` on stderr. Never throws.
   */
  async setupBenchmark(): Promise<boolean> {
    try {
      elizaLogger.info('Installing DeepResearch benchmark dependencies...');

      const { stderr } = await this.runPython(
        ['-m', 'pip', 'install', '-r', 'requirements.txt'],
        {
          cwd: this.benchmarkPath,
          timeout: 180000, // 3 minutes
          maxBuffer: 1024 * 1024 * 10 // 10MB buffer
        }
      );

      if (stderr && stderr.includes('ERROR')) {
        elizaLogger.error('Pip install errors:', stderr);
        return false;
      }

      elizaLogger.info('Dependencies installed successfully');
      return true;
    } catch (error) {
      elizaLogger.error('Failed to install dependencies:', error);
      return false;
    }
  }

  /**
   * Convert a ResearchProject to DeepResearch benchmark format.
   *
   * The article is every report section rendered as "heading, blank line,
   * content", joined by blank lines. When the report has no evaluation metrics,
   * all-zero RACE / FACT scores are emitted instead.
   *
   * @throws Error when the project has no report.
   */
  private convertProjectToBenchmarkFormat(project: ResearchProject): DeepResearchBenchResult {
    if (!project.report) {
      throw new Error('Project must have a report to be benchmarked');
    }

    // Create the article content from report sections
    const article = project.report.sections
      .map(section => `${section.heading}\n\n${section.content}`)
      .join('\n\n');

    const metrics = project.report.evaluationMetrics;
    const evaluationScores = metrics
      ? {
          race: metrics.raceScore,
          fact: metrics.factScore,
        }
      : {
          race: {
            overall: 0,
            comprehensiveness: 0,
            depth: 0,
            instructionFollowing: 0,
            readability: 0,
            breakdown: [],
          },
          fact: {
            citationAccuracy: 0,
            effectiveCitations: 0,
            totalCitations: 0,
            verifiedCitations: 0,
            disputedCitations: 0,
            citationCoverage: 0,
            sourceCredibility: 0,
            breakdown: [],
          },
        };

    return {
      id: project.id,
      prompt: project.query,
      article,
      metadata: {
        domain: project.metadata.domain,
        taskType: project.metadata.taskType,
        generatedAt: new Date().toISOString(),
        modelVersion: 'elizaos-research-1.0',
        evaluationScores,
      },
    };
  }

  /**
   * Save a research project in the format expected by the benchmark.
   *
   * Writes a single-line JSONL file to
   * `<benchmarkPath>/data/test_data/raw_data/<modelName>.jsonl`, replacing any
   * existing file of that name.
   *
   * @returns Path of the written file.
   * @throws Error when the model name is unsafe or the project has no report;
   *         file-system errors propagate.
   */
  private async saveProjectForBenchmark(project: ResearchProject, modelName: string): Promise<string> {
    this.assertSafeModelName(modelName);
    const benchmarkData = this.convertProjectToBenchmarkFormat(project);

    // Create model directory
    const modelDir = path.join(this.benchmarkPath, 'data', 'test_data', 'raw_data');
    await fs.mkdir(modelDir, { recursive: true });

    // Save as JSONL file (one line per project)
    const outputFile = path.join(modelDir, `${modelName}.jsonl`);
    const jsonLine = JSON.stringify(benchmarkData) + '\n';

    await fs.writeFile(outputFile, jsonLine, 'utf-8');
    elizaLogger.info(`Saved project to benchmark format: ${outputFile}`);

    return outputFile;
  }

  /**
   * Run the RACE evaluation for a single project.
   *
   * @param project Project to evaluate; it must have a report.
   * @param modelName Name used for the exported JSONL file, passed to the
   *        benchmark script and recorded in the returned result.
   * @returns Scores parsed from `<outputDir>/race_result.txt`.
   * @throws Error when the setup check fails, the export fails, the benchmark
   *         process fails or times out (5 minutes), or the result file is
   *         missing or incomplete.
   */
  async evaluateProject(
    project: ResearchProject,
    modelName: string = 'elizaos-research-agent'
  ): Promise<BenchmarkResult> {
    // Check setup first
    const setupResult = await this.checkSetup();
    if (!setupResult.success) {
      throw new Error(`Benchmark setup failed: ${setupResult.error}`);
    }

    try {
      // Save project in benchmark format
      await this.saveProjectForBenchmark(project, modelName);

      // Drop any result left by an earlier run so a benchmark that produces no
      // output is reported as a failure instead of returning stale scores.
      const resultFile = path.join(this.outputDir, 'race_result.txt');
      await fs.rm(resultFile, { force: true });

      // Run the benchmark
      elizaLogger.info(`Running DeepResearch benchmark for model: ${modelName}`);

      const { stderr } = await this.runPython(
        ['deepresearch_bench_race.py', modelName, '--limit', '1', '--only_en'],
        {
          cwd: this.benchmarkPath,
          timeout: 300000, // 5 minutes
          maxBuffer: 1024 * 1024 * 10 // 10MB buffer
        }
      );

      if (stderr && stderr.includes('ERROR')) {
        elizaLogger.error('Benchmark execution errors:', stderr);
      }

      elizaLogger.info('Benchmark completed, parsing results...');

      // Parse results
      const results = await this.parseResults(resultFile, modelName);

      elizaLogger.info('Benchmark results:', results);
      return results;
    } catch (error) {
      elizaLogger.error('Benchmark evaluation failed:', error);
      throw error;
    }
  }

  /**
   * Parse benchmark results from the output file.
   *
   * Each relevant line has the form `Label: number`. Labels are matched
   * case-insensitively, with underscores and runs of whitespace treated as a
   * single space; unrecognised lines are ignored. If a label occurs more than
   * once the last occurrence wins.
   *
   * @param resultFile Path of the result file to read.
   * @param modelName Model name recorded in the returned result.
   * @throws Error (prefixed "Failed to parse benchmark results") when the file
   *         cannot be read or a required score is missing.
   */
  private async parseResults(
    resultFile: string,
    modelName: string = 'elizaos-research-agent'
  ): Promise<BenchmarkResult> {
    try {
      const content = await fs.readFile(resultFile, 'utf-8');
      const scores = new Map<string, number>();

      for (const line of content.split(/\r?\n/)) {
        const separator = line.indexOf(':');
        if (separator === -1) {
          continue;
        }

        const label = line
          .slice(0, separator)
          .trim()
          .toLowerCase()
          .replace(/[\s_]+/g, ' ');
        // Only the text up to the next ':' is the value, as with split(':').
        const rawValue = line.slice(separator + 1).split(':')[0].trim();
        const numValue = parseFloat(rawValue);
        if (Number.isNaN(numValue)) {
          continue;
        }

        const match = DeepResearchBenchmark.SCORE_FIELDS.find(([known]) => known === label);
        if (match) {
          scores.set(match[1], numValue);
        }
      }

      // Validate that we got all required scores
      const requireScore = (field: string): number => {
        const value = scores.get(field);
        if (value === undefined) {
          throw new Error(`Missing required field in benchmark results: ${field}`);
        }
        return value;
      };

      return {
        timestamp: new Date().toISOString(),
        modelName,
        comprehensiveness: requireScore('comprehensiveness'),
        insight: requireScore('insight'),
        instructionFollowing: requireScore('instructionFollowing'),
        readability: requireScore('readability'),
        overallScore: requireScore('overallScore'),
      };
    } catch (error) {
      elizaLogger.error('Failed to parse benchmark results:', error);
      throw new Error(`Failed to parse benchmark results: ${error instanceof Error ? error.message : String(error)}`);
    }
  }

  /**
   * Run benchmark on multiple projects.
   *
   * Projects are evaluated one after another (each run reads the same result
   * file), under the model name `<modelName>-<index>`. A failing project is
   * logged and skipped, so the returned array may be shorter than `projects`.
   */
  async evaluateProjects(
    projects: ResearchProject[],
    modelName: string = 'elizaos-research-agent'
  ): Promise<BenchmarkResult[]> {
    const results: BenchmarkResult[] = [];

    for (const [index, project] of projects.entries()) {
      elizaLogger.info(`Evaluating project ${index + 1}/${projects.length}: ${project.query.substring(0, 50)}...`);

      try {
        results.push(await this.evaluateProject(project, `${modelName}-${index}`));
      } catch (error) {
        elizaLogger.error(`Failed to evaluate project ${index + 1}:`, error);
        // Continue with other projects
      }
    }

    return results;
  }

  /**
   * Get benchmark statistics from `<outputDir>/raw_results.jsonl`.
   *
   * Blank lines are skipped and records with a truthy `error` property are
   * excluded. Averages read the snake_case keys `comprehensiveness`, `insight`,
   * `instruction_following`, `readability` and `overall_score`; a missing or
   * non-numeric value counts as 0.
   *
   * @param modelName Currently unused: every record in the file is aggregated.
   *        NOTE(review): filtering per model needs the record schema — confirm.
   * @throws Error when the file cannot be read, a line is not valid JSON, or
   *         no successful evaluation is found.
   */
  async getBenchmarkStats(modelName: string): Promise<{
    averageScore: number;
    totalEvaluations: number;
    scoreBreakdown: {
      comprehensiveness: number;
      insight: number;
      instructionFollowing: number;
      readability: number;
    };
  }> {
    try {
      const rawResultsFile = path.join(this.outputDir, 'raw_results.jsonl');
      const content = await fs.readFile(rawResultsFile, 'utf-8');

      const records = content
        .split(/\r?\n/)
        .map(line => line.trim())
        .filter(line => line.length > 0) // tolerate blank lines and an empty file
        .map(line => JSON.parse(line) as unknown)
        .filter(
          (record): record is Record<string, unknown> =>
            typeof record === 'object' &&
            record !== null &&
            !(record as Record<string, unknown>).error // Filter out failed evaluations
        );

      if (records.length === 0) {
        throw new Error('No successful evaluations found');
      }

      const mean = (key: string): number => {
        const total = records.reduce((sum, record) => {
          const value = record[key];
          return sum + (typeof value === 'number' && !Number.isNaN(value) ? value : 0);
        }, 0);
        return total / records.length;
      };

      return {
        averageScore: mean('overall_score'),
        totalEvaluations: records.length,
        scoreBreakdown: {
          comprehensiveness: mean('comprehensiveness'),
          insight: mean('insight'),
          instructionFollowing: mean('instruction_following'),
          readability: mean('readability'),
        }
      };
    } catch (error) {
      elizaLogger.error('Failed to get benchmark stats:', error);
      throw error;
    }
  }
}
|
|
367
|
+
|
|
368
|
+
// Default instance
|
|
369
|
+
/**
 * Shared instance created with an empty config, i.e. the constructor defaults:
 * the `python3` interpreter and a `deep_research_bench` directory under the
 * current working directory.
 */
export const deepResearchBenchmark = new DeepResearchBenchmark();
|