@elizaos/plugin-research 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/README.md +400 -0
  2. package/dist/index.cjs +9366 -0
  3. package/dist/index.cjs.map +1 -0
  4. package/dist/index.js +9284 -0
  5. package/dist/index.js.map +1 -0
  6. package/package.json +80 -0
  7. package/src/__tests__/action-chaining.test.ts +532 -0
  8. package/src/__tests__/actions.test.ts +118 -0
  9. package/src/__tests__/cache-rate-limiter.test.ts +303 -0
  10. package/src/__tests__/content-extractors.test.ts +26 -0
  11. package/src/__tests__/deepresearch-bench-integration.test.ts +520 -0
  12. package/src/__tests__/deepresearch-bench-simplified.e2e.test.ts +290 -0
  13. package/src/__tests__/deepresearch-bench.e2e.test.ts +376 -0
  14. package/src/__tests__/e2e.test.ts +1870 -0
  15. package/src/__tests__/multi-benchmark-runner.ts +427 -0
  16. package/src/__tests__/providers.test.ts +156 -0
  17. package/src/__tests__/real-world.e2e.test.ts +788 -0
  18. package/src/__tests__/research-scenarios.test.ts +755 -0
  19. package/src/__tests__/research.e2e.test.ts +704 -0
  20. package/src/__tests__/research.test.ts +174 -0
  21. package/src/__tests__/search-providers.test.ts +174 -0
  22. package/src/__tests__/single-benchmark-runner.ts +735 -0
  23. package/src/__tests__/test-search-providers.ts +171 -0
  24. package/src/__tests__/verify-apis.test.ts +82 -0
  25. package/src/actions.ts +1677 -0
  26. package/src/benchmark/deepresearch-benchmark.ts +369 -0
  27. package/src/evaluation/research-evaluator.ts +444 -0
  28. package/src/examples/api-integration.md +498 -0
  29. package/src/examples/browserbase-integration.md +132 -0
  30. package/src/examples/debug-research-query.ts +162 -0
  31. package/src/examples/defi-code-scenarios.md +536 -0
  32. package/src/examples/defi-implementation-guide.md +454 -0
  33. package/src/examples/eliza-research-example.ts +142 -0
  34. package/src/examples/fix-renewable-energy-research.ts +209 -0
  35. package/src/examples/research-scenarios.md +408 -0
  36. package/src/examples/run-complete-renewable-research.ts +303 -0
  37. package/src/examples/run-deep-research.ts +352 -0
  38. package/src/examples/run-logged-research.ts +304 -0
  39. package/src/examples/run-real-research.ts +151 -0
  40. package/src/examples/save-research-output.ts +133 -0
  41. package/src/examples/test-file-logging.ts +199 -0
  42. package/src/examples/test-real-research.ts +67 -0
  43. package/src/examples/test-renewable-energy-research.ts +229 -0
  44. package/src/index.ts +28 -0
  45. package/src/integrations/cache.ts +128 -0
  46. package/src/integrations/content-extractors/firecrawl.ts +314 -0
  47. package/src/integrations/content-extractors/pdf-extractor.ts +350 -0
  48. package/src/integrations/content-extractors/playwright.ts +420 -0
  49. package/src/integrations/factory.ts +419 -0
  50. package/src/integrations/index.ts +18 -0
  51. package/src/integrations/rate-limiter.ts +181 -0
  52. package/src/integrations/search-providers/academic.ts +290 -0
  53. package/src/integrations/search-providers/exa.ts +205 -0
  54. package/src/integrations/search-providers/npm.ts +330 -0
  55. package/src/integrations/search-providers/pypi.ts +211 -0
  56. package/src/integrations/search-providers/serpapi.ts +277 -0
  57. package/src/integrations/search-providers/serper.ts +358 -0
  58. package/src/integrations/search-providers/stagehand-google.ts +87 -0
  59. package/src/integrations/search-providers/tavily.ts +187 -0
  60. package/src/processing/relevance-analyzer.ts +353 -0
  61. package/src/processing/research-logger.ts +450 -0
  62. package/src/processing/result-processor.ts +372 -0
  63. package/src/prompts/research-prompts.ts +419 -0
  64. package/src/providers/cacheProvider.ts +164 -0
  65. package/src/providers.ts +173 -0
  66. package/src/service.ts +2588 -0
  67. package/src/services/swe-bench.ts +286 -0
  68. package/src/strategies/research-strategies.ts +790 -0
  69. package/src/types/pdf-parse.d.ts +34 -0
  70. package/src/types.ts +551 -0
  71. package/src/verification/claim-verifier.ts +443 -0
@@ -0,0 +1,369 @@
1
+ import { exec } from 'child_process';
2
+ import { promisify } from 'util';
3
+ import fs from 'fs/promises';
4
+ import path from 'path';
5
+ import { elizaLogger } from '@elizaos/core';
6
+ import { ResearchProject, DeepResearchBenchResult } from '../types';
7
+
8
/**
 * Promise-returning form of `child_process.exec`, so shell commands can be
 * awaited and their captured `stdout` / `stderr` destructured from the result.
 */
const execAsync = promisify(exec);
9
+
10
/**
 * Optional overrides accepted by the `DeepResearchBenchmark` constructor.
 * Every field may be omitted; the constructor then applies the default
 * noted on that field.
 */
export interface BenchmarkConfig {
  /** Python executable used to launch the benchmark. Default: `'python3'`. */
  pythonPath?: string;

  /**
   * Directory that holds the benchmark checkout.
   * Default: `deep_research_bench` under the current working directory.
   */
  benchmarkPath?: string;

  /** Directory the result files are read from. Default: `<benchmarkPath>/results`. */
  outputDir?: string;

  /** Retry budget kept on the benchmark instance. Default: `3`. */
  maxRetries?: number;
}
16
+
17
/**
 * Scores produced by one benchmark run, as parsed from the benchmark's
 * result file, plus bookkeeping about when and for which model they were read.
 */
export interface BenchmarkResult {
  /** Value parsed from the `comprehensiveness` line of the result file. */
  comprehensiveness: number;

  /** Value parsed from the `insight` line of the result file. */
  insight: number;

  /** Value parsed from the `instruction following` line of the result file. */
  instructionFollowing: number;

  /** Value parsed from the `readability` line of the result file. */
  readability: number;

  /** Value parsed from the `overall score` line of the result file. */
  overallScore: number;

  /** ISO-8601 timestamp taken when the result file was parsed. */
  timestamp: string;

  /** Name of the model the scores are attributed to. */
  modelName: string;
}
26
+
27
/**
 * Outcome of `DeepResearchBenchmark.checkSetup()`.
 *
 * When `success` is `true`, `pythonVersion` and `benchmarkPath` are filled in;
 * when it is `false`, `error` explains which check failed.
 */
export interface BenchmarkSetupResult {
  /** Whether every environment check passed. */
  success: boolean;

  /** Trimmed output of `<python> --version`; present on success. */
  pythonVersion?: string;

  /** Benchmark directory that was verified; present on success. */
  benchmarkPath?: string;

  /** Human-readable reason for the failure; present when `success` is `false`. */
  error?: string;
}
33
+
34
/**
 * Driver for an external DeepResearch benchmark checkout.
 *
 * The class writes a research project into the benchmark's input directory,
 * runs `deepresearch_bench_race.py` with the configured Python interpreter,
 * and parses the scores the script leaves in the output directory.
 *
 * Note: `maxRetries` is stored from the configuration, but no method of this
 * class currently reads it.
 */
export class DeepResearchBenchmark {
  /**
   * Model names end up both in a file name and on a shell command line, so
   * only a conservative character set is accepted. The leading character must
   * be alphanumeric so that a name can never be read as a command-line option.
   */
  private static readonly SAFE_MODEL_NAME = /^[A-Za-z0-9][A-Za-z0-9._-]*$/;

  /**
   * Result-file labels (lower-cased, underscores/whitespace collapsed to one
   * space) mapped to the `BenchmarkResult` field they populate. Insertion
   * order is also the order in which missing fields are reported.
   */
  private static readonly SCORE_FIELDS: ReadonlyMap<
    string,
    'comprehensiveness' | 'insight' | 'instructionFollowing' | 'readability' | 'overallScore'
  > = new Map([
    ['comprehensiveness', 'comprehensiveness'],
    ['insight', 'insight'],
    ['instruction following', 'instructionFollowing'],
    ['readability', 'readability'],
    ['overall score', 'overallScore'],
  ]);

  private pythonPath: string;
  private benchmarkPath: string;
  private outputDir: string;
  private maxRetries: number;

  /**
   * @param config Optional overrides; unset fields fall back to `python3`,
   *   `<cwd>/deep_research_bench`, `<benchmarkPath>/results` and `3`.
   */
  constructor(config: BenchmarkConfig = {}) {
    this.pythonPath = config.pythonPath || 'python3';
    this.benchmarkPath = config.benchmarkPath || path.join(process.cwd(), 'deep_research_bench');
    this.outputDir = config.outputDir || path.join(this.benchmarkPath, 'results');
    // `??` rather than `||` so an explicit 0 is honoured instead of becoming 3.
    this.maxRetries = config.maxRetries ?? 3;
  }

  /**
   * Throws when `modelName` contains anything other than letters, digits,
   * `.`, `_` or `-`, or does not start with a letter or digit.
   */
  private static assertSafeModelName(modelName: string): void {
    if (!DeepResearchBenchmark.SAFE_MODEL_NAME.test(modelName)) {
      throw new Error(
        `Invalid model name: ${JSON.stringify(modelName)}. ` +
          `Use only letters, digits, '.', '_' and '-', starting with a letter or digit.`
      );
    }
  }

  /** Returns `value` when it is a finite number, otherwise 0. */
  private static toScore(value: unknown): number {
    return typeof value === 'number' && Number.isFinite(value) ? value : 0;
  }

  /**
   * Check if the benchmark environment is properly set up.
   *
   * Verifies, in order: the Python interpreter runs, the benchmark directory
   * exists, the main script exists, and `tqdm`, `openai` and `requests` can be
   * imported. Never rejects; failures are reported through the result object.
   *
   * @returns `success: true` with the Python version and benchmark path, or
   *   `success: false` with an `error` describing the first failed check.
   */
  async checkSetup(): Promise<BenchmarkSetupResult> {
    try {
      // Check Python availability. Python 2 prints its version to stderr,
      // so fall back to it when stdout is empty.
      const { stdout, stderr } = await execAsync(`${this.pythonPath} --version`);
      const pythonVersion = (stdout || stderr).trim();
      elizaLogger.info(`Python version: ${pythonVersion}`);

      // Check if benchmark directory exists
      try {
        await fs.access(this.benchmarkPath);
      } catch {
        return {
          success: false,
          error: `Benchmark directory not found: ${this.benchmarkPath}`
        };
      }

      // Check if main script exists
      const mainScript = path.join(this.benchmarkPath, 'deepresearch_bench_race.py');
      try {
        await fs.access(mainScript);
      } catch {
        return {
          success: false,
          error: `Main benchmark script not found: ${mainScript}`
        };
      }

      // Check if requirements are installed. The working directory is set via
      // `cwd` instead of `cd "<path>" &&` so unusual characters in the path
      // cannot break (or escape) the shell command.
      try {
        await execAsync(`${this.pythonPath} -c "import tqdm, openai, requests"`, {
          cwd: this.benchmarkPath
        });
      } catch {
        return {
          success: false,
          error: `Python dependencies not installed. Run: cd ${this.benchmarkPath} && pip install -r requirements.txt`
        };
      }

      return {
        success: true,
        pythonVersion,
        benchmarkPath: this.benchmarkPath
      };
    } catch (error) {
      return {
        success: false,
        error: `Setup check failed: ${error instanceof Error ? error.message : String(error)}`
      };
    }
  }

  /**
   * Install benchmark dependencies from `requirements.txt` in the benchmark
   * directory (3-minute timeout).
   *
   * pip is invoked as `<pythonPath> -m pip` so packages are installed into the
   * same interpreter that `checkSetup()` and the benchmark run use.
   *
   * @returns `true` on success; `false` when the command fails or pip's
   *   stderr contains `ERROR`. Never rejects.
   */
  async setupBenchmark(): Promise<boolean> {
    try {
      elizaLogger.info('Installing DeepResearch benchmark dependencies...');

      const { stderr } = await execAsync(
        `${this.pythonPath} -m pip install -r requirements.txt`,
        { cwd: this.benchmarkPath, timeout: 180000 } // 3 minutes
      );

      if (stderr && stderr.includes('ERROR')) {
        elizaLogger.error('Pip install errors:', stderr);
        return false;
      }

      elizaLogger.info('Dependencies installed successfully');
      return true;
    } catch (error) {
      elizaLogger.error('Failed to install dependencies:', error);
      return false;
    }
  }

  /**
   * Convert a ResearchProject to DeepResearch benchmark format.
   *
   * The article is every report section rendered as heading, blank line,
   * content, with sections separated by a blank line. When the report carries
   * no evaluation metrics, zeroed RACE/FACT scores are emitted instead.
   *
   * @throws Error when the project has no report.
   */
  private convertProjectToBenchmarkFormat(project: ResearchProject): DeepResearchBenchResult {
    if (!project.report) {
      throw new Error('Project must have a report to be benchmarked');
    }

    // Create the article content from report sections
    const article = project.report.sections
      .map(section => `${section.heading}\n\n${section.content}`)
      .join('\n\n');

    const metrics = project.report.evaluationMetrics;

    return {
      id: project.id,
      prompt: project.query,
      article,
      metadata: {
        domain: project.metadata.domain,
        taskType: project.metadata.taskType,
        generatedAt: new Date().toISOString(),
        modelVersion: 'elizaos-research-1.0',
        evaluationScores: metrics ? {
          race: metrics.raceScore,
          fact: metrics.factScore,
        } : {
          race: {
            overall: 0,
            comprehensiveness: 0,
            depth: 0,
            instructionFollowing: 0,
            readability: 0,
            breakdown: [],
          },
          fact: {
            citationAccuracy: 0,
            effectiveCitations: 0,
            totalCitations: 0,
            verifiedCitations: 0,
            disputedCitations: 0,
            citationCoverage: 0,
            sourceCredibility: 0,
            breakdown: [],
          },
        },
      },
    };
  }

  /**
   * Save a research project in the format expected by the benchmark.
   *
   * Writes (overwriting any previous content) a one-line JSONL file named
   * `<modelName>.jsonl` under `<benchmarkPath>/data/test_data/raw_data`.
   *
   * @returns The path of the written file.
   * @throws Error when `modelName` is not a safe file name, or the project
   *   has no report.
   */
  private async saveProjectForBenchmark(project: ResearchProject, modelName: string): Promise<string> {
    // The name becomes part of a path; reject separators and `..` tricks.
    DeepResearchBenchmark.assertSafeModelName(modelName);

    const benchmarkData = this.convertProjectToBenchmarkFormat(project);

    // Create model directory
    const modelDir = path.join(this.benchmarkPath, 'data', 'test_data', 'raw_data');
    await fs.mkdir(modelDir, { recursive: true });

    // Save as JSONL file (one line per project)
    const outputFile = path.join(modelDir, `${modelName}.jsonl`);
    const jsonLine = JSON.stringify(benchmarkData) + '\n';

    await fs.writeFile(outputFile, jsonLine, 'utf-8');
    elizaLogger.info(`Saved project to benchmark format: ${outputFile}`);

    return outputFile;
  }

  /**
   * Run the RACE evaluation for a single project.
   *
   * Checks the environment, writes the project to the benchmark input
   * directory, runs `deepresearch_bench_race.py <modelName> --limit 1 --only_en`
   * (5-minute timeout, 10 MB output buffer) and parses `race_result.txt` from
   * the output directory.
   *
   * @param project Project to evaluate; must have a report.
   * @param modelName Name used for the input file and passed to the script.
   *   Only letters, digits, `.`, `_` and `-` are accepted.
   * @returns The parsed scores, labelled with `modelName`.
   * @throws Error when `modelName` is unsafe, the setup check fails, the
   *   script fails, or the result file cannot be parsed.
   */
  async evaluateProject(
    project: ResearchProject,
    modelName: string = 'elizaos-research-agent'
  ): Promise<BenchmarkResult> {
    // Validate before anything is spawned: the name is interpolated into a
    // shell command below.
    DeepResearchBenchmark.assertSafeModelName(modelName);

    // Check setup first
    const setupResult = await this.checkSetup();
    if (!setupResult.success) {
      throw new Error(`Benchmark setup failed: ${setupResult.error}`);
    }

    try {
      // Save project in benchmark format
      await this.saveProjectForBenchmark(project, modelName);

      // Run the benchmark
      elizaLogger.info(`Running DeepResearch benchmark for model: ${modelName}`);

      const command = `${this.pythonPath} deepresearch_bench_race.py ${modelName} --limit 1 --only_en`;

      const { stderr } = await execAsync(command, {
        cwd: this.benchmarkPath,
        timeout: 300000, // 5 minutes
        maxBuffer: 1024 * 1024 * 10 // 10MB buffer
      });

      // Logged only: stderr noise does not by itself fail the evaluation.
      if (stderr && stderr.includes('ERROR')) {
        elizaLogger.error('Benchmark execution errors:', stderr);
      }

      elizaLogger.info('Benchmark completed, parsing results...');

      // Parse results
      const resultFile = path.join(this.outputDir, 'race_result.txt');
      const results = await this.parseResults(resultFile, modelName);

      elizaLogger.info('Benchmark results:', results);
      return results;
    } catch (error) {
      elizaLogger.error('Benchmark evaluation failed:', error);
      throw error;
    }
  }

  /**
   * Parse benchmark results from the output file.
   *
   * Each line is expected to look like `<label>: <number>`. Labels are matched
   * case-insensitively, with underscores and runs of whitespace treated as a
   * single space; unknown labels and non-numeric values are ignored.
   *
   * @param resultFile Path of the result file to read.
   * @param modelName Model name recorded on the returned result.
   * @throws Error (prefixed `Failed to parse benchmark results:`) when the
   *   file cannot be read or a required score is missing.
   */
  private async parseResults(
    resultFile: string,
    modelName: string = 'elizaos-research-agent'
  ): Promise<BenchmarkResult> {
    try {
      const content = await fs.readFile(resultFile, 'utf-8');
      const scores: Partial<BenchmarkResult> = {};

      for (const line of content.split(/\r?\n/)) {
        const separator = line.indexOf(':');
        if (separator === -1) {
          continue;
        }

        const label = line
          .slice(0, separator)
          .trim()
          .toLowerCase()
          .replace(/[\s_]+/g, ' ');
        const field = DeepResearchBenchmark.SCORE_FIELDS.get(label);
        if (field === undefined) {
          continue;
        }

        const numValue = Number.parseFloat(line.slice(separator + 1));
        if (!Number.isNaN(numValue)) {
          scores[field] = numValue;
        }
      }

      // Validate that we got all required scores
      for (const field of DeepResearchBenchmark.SCORE_FIELDS.values()) {
        if (scores[field] === undefined) {
          throw new Error(`Missing required field in benchmark results: ${field}`);
        }
      }

      // Every score field was verified above, which the compiler cannot track.
      return {
        ...scores,
        timestamp: new Date().toISOString(),
        modelName
      } as BenchmarkResult;
    } catch (error) {
      elizaLogger.error('Failed to parse benchmark results:', error);
      throw new Error(`Failed to parse benchmark results: ${error instanceof Error ? error.message : String(error)}`);
    }
  }

  /**
   * Run benchmark on multiple projects, one after another.
   *
   * Project `i` is evaluated under the model name `<modelName>-<i>`. A project
   * whose evaluation fails is logged and skipped, so the returned array may be
   * shorter than `projects` and its indices need not line up with the input.
   */
  async evaluateProjects(
    projects: ResearchProject[],
    modelName: string = 'elizaos-research-agent'
  ): Promise<BenchmarkResult[]> {
    const results: BenchmarkResult[] = [];

    for (const [index, project] of projects.entries()) {
      elizaLogger.info(`Evaluating project ${index + 1}/${projects.length}: ${project.query.substring(0, 50)}...`);

      try {
        results.push(await this.evaluateProject(project, `${modelName}-${index}`));
      } catch (error) {
        elizaLogger.error(`Failed to evaluate project ${index + 1}:`, error);
        // Continue with other projects
      }
    }

    return results;
  }

  /**
   * Get benchmark statistics from `raw_results.jsonl` in the output directory.
   *
   * Blank lines are skipped and entries with a truthy `error` property are
   * excluded; the remaining entries are averaged. Missing or non-numeric
   * scores count as 0.
   *
   * @param modelName Currently not used to filter entries: every line in the
   *   file contributes to the averages.
   * @throws Error when the file cannot be read, a non-blank line is not valid
   *   JSON, or no successful evaluation is found.
   */
  async getBenchmarkStats(modelName: string): Promise<{
    averageScore: number;
    totalEvaluations: number;
    scoreBreakdown: {
      comprehensiveness: number;
      insight: number;
      instructionFollowing: number;
      readability: number;
    };
  }> {
    try {
      const rawResultsFile = path.join(this.outputDir, 'raw_results.jsonl');
      const content = await fs.readFile(rawResultsFile, 'utf-8');

      const scores = content
        .split(/\r?\n/)
        .map(line => line.trim())
        .filter(line => line.length > 0) // JSON.parse('') would throw
        .map((line): unknown => JSON.parse(line))
        .filter(
          (entry): entry is Record<string, unknown> =>
            typeof entry === 'object' &&
            entry !== null &&
            !(entry as Record<string, unknown>).error // Filter out failed evaluations
        );

      if (scores.length === 0) {
        throw new Error('No successful evaluations found');
      }

      const mean = (key: string): number =>
        scores.reduce((sum, entry) => sum + DeepResearchBenchmark.toScore(entry[key]), 0) / scores.length;

      return {
        averageScore: mean('overall_score'),
        totalEvaluations: scores.length,
        scoreBreakdown: {
          comprehensiveness: mean('comprehensiveness'),
          insight: mean('insight'),
          instructionFollowing: mean('instruction_following'),
          readability: mean('readability'),
        }
      };
    } catch (error) {
      elizaLogger.error('Failed to get benchmark stats:', error);
      throw error;
    }
  }
}
367
+
368
/**
 * Shared benchmark runner created with the default configuration
 * (no overrides are passed to the constructor).
 */
export const deepResearchBenchmark = new DeepResearchBenchmark();