@elizaos/plugin-research 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71)
  1. package/README.md +400 -0
  2. package/dist/index.cjs +9366 -0
  3. package/dist/index.cjs.map +1 -0
  4. package/dist/index.js +9284 -0
  5. package/dist/index.js.map +1 -0
  6. package/package.json +80 -0
  7. package/src/__tests__/action-chaining.test.ts +532 -0
  8. package/src/__tests__/actions.test.ts +118 -0
  9. package/src/__tests__/cache-rate-limiter.test.ts +303 -0
  10. package/src/__tests__/content-extractors.test.ts +26 -0
  11. package/src/__tests__/deepresearch-bench-integration.test.ts +520 -0
  12. package/src/__tests__/deepresearch-bench-simplified.e2e.test.ts +290 -0
  13. package/src/__tests__/deepresearch-bench.e2e.test.ts +376 -0
  14. package/src/__tests__/e2e.test.ts +1870 -0
  15. package/src/__tests__/multi-benchmark-runner.ts +427 -0
  16. package/src/__tests__/providers.test.ts +156 -0
  17. package/src/__tests__/real-world.e2e.test.ts +788 -0
  18. package/src/__tests__/research-scenarios.test.ts +755 -0
  19. package/src/__tests__/research.e2e.test.ts +704 -0
  20. package/src/__tests__/research.test.ts +174 -0
  21. package/src/__tests__/search-providers.test.ts +174 -0
  22. package/src/__tests__/single-benchmark-runner.ts +735 -0
  23. package/src/__tests__/test-search-providers.ts +171 -0
  24. package/src/__tests__/verify-apis.test.ts +82 -0
  25. package/src/actions.ts +1677 -0
  26. package/src/benchmark/deepresearch-benchmark.ts +369 -0
  27. package/src/evaluation/research-evaluator.ts +444 -0
  28. package/src/examples/api-integration.md +498 -0
  29. package/src/examples/browserbase-integration.md +132 -0
  30. package/src/examples/debug-research-query.ts +162 -0
  31. package/src/examples/defi-code-scenarios.md +536 -0
  32. package/src/examples/defi-implementation-guide.md +454 -0
  33. package/src/examples/eliza-research-example.ts +142 -0
  34. package/src/examples/fix-renewable-energy-research.ts +209 -0
  35. package/src/examples/research-scenarios.md +408 -0
  36. package/src/examples/run-complete-renewable-research.ts +303 -0
  37. package/src/examples/run-deep-research.ts +352 -0
  38. package/src/examples/run-logged-research.ts +304 -0
  39. package/src/examples/run-real-research.ts +151 -0
  40. package/src/examples/save-research-output.ts +133 -0
  41. package/src/examples/test-file-logging.ts +199 -0
  42. package/src/examples/test-real-research.ts +67 -0
  43. package/src/examples/test-renewable-energy-research.ts +229 -0
  44. package/src/index.ts +28 -0
  45. package/src/integrations/cache.ts +128 -0
  46. package/src/integrations/content-extractors/firecrawl.ts +314 -0
  47. package/src/integrations/content-extractors/pdf-extractor.ts +350 -0
  48. package/src/integrations/content-extractors/playwright.ts +420 -0
  49. package/src/integrations/factory.ts +419 -0
  50. package/src/integrations/index.ts +18 -0
  51. package/src/integrations/rate-limiter.ts +181 -0
  52. package/src/integrations/search-providers/academic.ts +290 -0
  53. package/src/integrations/search-providers/exa.ts +205 -0
  54. package/src/integrations/search-providers/npm.ts +330 -0
  55. package/src/integrations/search-providers/pypi.ts +211 -0
  56. package/src/integrations/search-providers/serpapi.ts +277 -0
  57. package/src/integrations/search-providers/serper.ts +358 -0
  58. package/src/integrations/search-providers/stagehand-google.ts +87 -0
  59. package/src/integrations/search-providers/tavily.ts +187 -0
  60. package/src/processing/relevance-analyzer.ts +353 -0
  61. package/src/processing/research-logger.ts +450 -0
  62. package/src/processing/result-processor.ts +372 -0
  63. package/src/prompts/research-prompts.ts +419 -0
  64. package/src/providers/cacheProvider.ts +164 -0
  65. package/src/providers.ts +173 -0
  66. package/src/service.ts +2588 -0
  67. package/src/services/swe-bench.ts +286 -0
  68. package/src/strategies/research-strategies.ts +790 -0
  69. package/src/types/pdf-parse.d.ts +34 -0
  70. package/src/types.ts +551 -0
  71. package/src/verification/claim-verifier.ts +443 -0
@@ -0,0 +1,444 @@
1
+ import { IAgentRuntime, elizaLogger, ModelType } from '@elizaos/core';
2
+ import {
3
+ ResearchProject,
4
+ ResearchReport,
5
+ Citation,
6
+ FactualClaim,
7
+ RACEScore,
8
+ FACTScore,
9
+ EvaluationCriteria,
10
+ EvaluationMetrics,
11
+ VerificationStatus,
12
+ } from '../types';
13
+
14
/**
 * RACE (Reference-based Adaptive Criteria-driven Evaluation) implementation.
 *
 * Asks the runtime's large text model to grade a report on four dimensions
 * (comprehensiveness, depth, instruction following, readability) and combines
 * the per-dimension results using the weights carried by the criteria.
 * Every score produced by this class is on a 0-1 scale.
 */
export class RACEEvaluator {
  constructor(private runtime: IAgentRuntime) {}

  /**
   * Scores `report` on all four RACE dimensions.
   *
   * @param report - Report to grade.
   * @param criteria - Per-dimension description, rubric and weight.
   * @param referenceReport - Optional report shown to the model for comparison.
   * @returns Per-dimension scores plus their weighted sum in `overall`.
   *          `breakdown` is always returned empty.
   */
  async evaluate(
    report: ResearchReport,
    criteria: EvaluationCriteria,
    referenceReport?: ResearchReport
  ): Promise<RACEScore> {
    // Dimensions are graded one after another, in this fixed order.
    const comprehensiveness = await this.evaluateDimension(
      report,
      criteria.comprehensiveness,
      'comprehensiveness',
      referenceReport
    );
    const depth = await this.evaluateDimension(report, criteria.depth, 'depth', referenceReport);
    const instructionFollowing = await this.evaluateDimension(
      report,
      criteria.instructionFollowing,
      'instructionFollowing',
      referenceReport
    );
    const readability = await this.evaluateDimension(
      report,
      criteria.readability,
      'readability',
      referenceReport
    );

    // Calculate weighted overall score
    const overall =
      comprehensiveness * criteria.comprehensiveness.weight +
      depth * criteria.depth.weight +
      instructionFollowing * criteria.instructionFollowing.weight +
      readability * criteria.readability.weight;

    return {
      overall,
      comprehensiveness,
      depth,
      instructionFollowing,
      readability,
      breakdown: [],
    };
  }

  /**
   * Grades a single dimension with the model.
   *
   * Never throws: returns 0.7 when the runtime exposes no `useModel`, and 0.5
   * when the model call fails or no usable score can be read from its reply.
   *
   * @returns A finite score in [0, 1].
   */
  private async evaluateDimension(
    report: ResearchReport,
    criteriaDefinition: any,
    dimension: string,
    referenceReport?: ResearchReport
  ): Promise<number> {
    try {
      // Check if we have useModel available
      if (!this.runtime.useModel) {
        elizaLogger.warn(`[RACEEvaluator] No model available for ${dimension} evaluation, using default score`);
        return 0.7; // Default score when no model available
      }

      const reportContent = this.extractReportContent(report);
      const referenceContent = referenceReport ? this.extractReportContent(referenceReport) : '';
      const rubricText = this.formatRubric(criteriaDefinition.rubric);

      const prompt = `Evaluate this research report on the ${dimension} dimension.

Evaluation Criteria:
${criteriaDefinition.description}

Rubric Items to Check:
${rubricText}

Report to Evaluate (first 5000 chars):
${reportContent.substring(0, 5000)}

${referenceContent ? `Reference Report for Comparison (first 2000 chars):\n${referenceContent.substring(0, 2000)}` : ''}

Provide a score from 0-100 based on how well the report meets the criteria.
Consider each rubric item and provide reasoning for your score.

Respond with JSON:
{
  "score": number (0-100),
  "reasoning": "explanation of score",
  "rubricScores": {
    "item1": score,
    "item2": score,
    ...
  }
}`;

      const response = await this.runtime.useModel(ModelType.TEXT_LARGE, {
        messages: [
          {
            role: 'system',
            content: 'You are an expert research evaluator. Provide a balanced, fair assessment.',
          },
          { role: 'user', content: prompt },
        ],
        temperature: 0.3,
        max_tokens: 1000,
      });

      const rawContent = typeof response === 'string' ? response : (response as any)?.content;
      const content = typeof rawContent === 'string' ? rawContent : '';

      // Try to parse JSON response
      const jsonMatch = content.match(/\{[\s\S]*\}/);
      if (jsonMatch) {
        try {
          const normalized = this.normalizeScore(JSON.parse(jsonMatch[0])?.score);
          if (normalized !== undefined) {
            return normalized;
          }
          // A missing or non-numeric "score" must not leak out as NaN; fall
          // through to the text-based extraction instead.
          elizaLogger.warn(`[RACEEvaluator] JSON response for ${dimension} has no numeric score`);
        } catch (parseError) {
          elizaLogger.warn(`[RACEEvaluator] Failed to parse JSON response for ${dimension}:`, parseError);
        }
      }

      // Fallback: try to extract score from text. The optional quote lets this
      // also recover `"score": 85` from JSON that failed to parse.
      const scoreMatch = content.match(/score["']?[:=\s]+(\d+(?:\.\d+)?)/i);
      if (scoreMatch) {
        const normalized = this.normalizeScore(scoreMatch[1]);
        if (normalized !== undefined) {
          return normalized;
        }
      }

      elizaLogger.error(`[RACEEvaluator] Failed to extract score for ${dimension}`);
      return 0.5; // Default middle score
    } catch (e) {
      elizaLogger.error(`[RACEEvaluator] Failed to evaluate ${dimension}:`, e);
      return 0.5; // Default middle score
    }
  }

  /**
   * Converts a raw 0-100 score (number or numeric string) to a value clamped
   * to [0, 1]. Returns `undefined` when the input is not a finite number.
   */
  private normalizeScore(raw: unknown): number | undefined {
    let value = NaN;
    if (typeof raw === 'number') {
      value = raw;
    } else if (typeof raw === 'string' && raw.trim() !== '') {
      value = Number(raw);
    }
    if (!Number.isFinite(value)) {
      return undefined;
    }
    return Math.max(0, Math.min(1, value / 100));
  }

  /**
   * Renders rubric items as a numbered list, one item per line. Items may be
   * plain strings or objects with a `description` and an optional `score`
   * label. Anything that is not an array yields an empty string.
   */
  private formatRubric(rubric: unknown): string {
    if (!Array.isArray(rubric)) {
      return '';
    }
    return rubric
      .map((item: any, i: number) => {
        if (typeof item === 'string') {
          return `${i + 1}. ${item}`;
        }
        if (item && typeof item === 'object' && item.description) {
          // `??` keeps an explicit score of 0; the positional default is
          // 1-based like the other branches.
          return `${item.score ?? i + 1}. ${item.description}`;
        }
        return `${i + 1}. Criterion ${i + 1}`;
      })
      .join('\n');
  }

  /**
   * Flattens title, summary, sections and subsections into one markdown-like
   * string, truncated to 10000 characters to limit LLM context.
   */
  private extractReportContent(report: ResearchReport): string {
    let content = `Title: ${report.title}\n\nSummary: ${report.summary}\n\n`;

    for (const section of report.sections ?? []) {
      content += `## ${section.heading}\n${section.content}\n\n`;
      if (section.subsections) {
        for (const subsection of section.subsections) {
          content += `### ${subsection.heading}\n${subsection.content}\n\n`;
        }
      }
    }

    return content.substring(0, 10000); // Limit length for LLM context
  }
}
175
+
176
/**
 * FACT (Framework for Factual Abundance and Citation Trustworthiness) implementation.
 *
 * Collects factual claims from a project's report sections (via the model) and
 * from its findings, asks the model whether each claim is supported by its
 * recorded evidence, and summarises the outcome as citation statistics.
 */
export class FACTEvaluator {
  constructor(private runtime: IAgentRuntime) {}

  /**
   * Computes the FACT score for a project.
   *
   * `verifiedCitations` and `effectiveCitations` count verified claims (all,
   * and de-duplicated, respectively), while `totalCitations` counts the
   * report's citations. Because claims can outnumber citations, the two ratio
   * fields are clamped to [0, 1]. `disputedCitations`, `sourceCredibility` and
   * `breakdown` are fixed placeholder values.
   */
  async evaluate(project: ResearchProject): Promise<FACTScore> {
    const allClaims = await this.extractFactualClaims(project);
    const verificationResults = await this.verifyClaims(allClaims);

    const totalCitations = project.report?.citations?.length ?? 0;
    const verifiedCitations = verificationResults.filter((r) => r.verified).length;

    // Deduplicate claims
    const uniqueClaims = this.deduplicateClaims(verificationResults);
    const effectiveCitations = uniqueClaims.filter((c) => c.verified).length;

    return {
      citationAccuracy: this.boundedRatio(verifiedCitations, totalCitations),
      effectiveCitations,
      totalCitations,
      verifiedCitations,
      disputedCitations: 0,
      citationCoverage: this.boundedRatio(effectiveCitations, totalCitations),
      sourceCredibility: 0.8, // Default credibility score
      breakdown: [],
    };
  }

  /** Returns `numerator / denominator` clamped to [0, 1]; 0 when the denominator is not positive. */
  private boundedRatio(numerator: number, denominator: number): number {
    if (!(denominator > 0)) {
      return 0;
    }
    return Math.max(0, Math.min(1, numerator / denominator));
  }

  /**
   * Gathers claims from every top-level report section (model-extracted) and
   * appends the claims already attached to the project's findings.
   * Returns an empty list when the project has no report.
   */
  private async extractFactualClaims(project: ResearchProject): Promise<FactualClaim[]> {
    const claims: FactualClaim[] = [];

    if (!project.report) return claims;

    // Extract claims from report sections
    for (const section of project.report.sections ?? []) {
      const sectionClaims = await this.extractClaimsFromText(section.content, section.citations);
      claims.push(...sectionClaims);
    }

    // Also extract from findings
    for (const finding of project.findings ?? []) {
      if (finding.factualClaims) {
        claims.push(...finding.factualClaims);
      }
    }

    return claims;
  }

  /**
   * Asks the model for up to five factual claims found in `text`, linking each
   * to one of the first ten `citations` by its 1-based number.
   *
   * Never throws: returns an empty list when no model is available, when there
   * is no text to analyse, or when the model call or parsing fails.
   */
  private async extractClaimsFromText(
    text: string,
    citations: Citation[]
  ): Promise<FactualClaim[]> {
    // Check if we have a model available
    if (!this.runtime.useModel) {
      elizaLogger.warn('[FACTEvaluator] No model available for claim extraction');
      return [];
    }

    // Sections may lack content or citations; treat both as empty rather than
    // throwing while the prompt is being built.
    const sourceText = typeof text === 'string' ? text : '';
    if (sourceText.trim() === '') {
      return [];
    }
    // Only these citations are shown to the model, so only they can be referenced.
    const listedCitations = (citations ?? []).slice(0, 10);

    const prompt = `Extract factual claims and their citations from this text.

Text:
${sourceText.substring(0, 3000)}

Available Citations:
${listedCitations.map((c, i) => `[${i + 1}] ${c.source.url}`).join('\n')}

For each factual claim in the text:
1. Extract the exact statement
2. Identify which citation supports it (by number)
3. Note the supporting evidence

Respond with JSON array:
[
  {
    "statement": "exact factual claim",
    "citationIndex": number,
    "supportingEvidence": "relevant quote or context"
  }
]

Extract 3-5 key claims maximum.`;

    try {
      const response = await this.runtime.useModel(ModelType.TEXT_LARGE, {
        messages: [
          {
            role: 'system',
            content: 'You are a fact extraction expert. Extract only clear, verifiable claims.',
          },
          { role: 'user', content: prompt },
        ],
        temperature: 0.2,
        max_tokens: 1500,
      });

      const rawContent = typeof response === 'string' ? response : (response as any)?.content;
      const content = typeof rawContent === 'string' ? rawContent : '';

      // Try to parse JSON array
      const jsonMatch = content.match(/\[[\s\S]*\]/);
      if (jsonMatch) {
        const extracted: unknown = JSON.parse(jsonMatch[0]);
        if (!Array.isArray(extracted)) {
          return [];
        }

        return extracted
          // One malformed entry should not discard the well-formed ones.
          .filter((item: any) => item !== null && typeof item === 'object')
          .slice(0, 5)
          .map((item: any) => {
            const index = Number(item.citationIndex);
            const cited =
              Number.isInteger(index) && index >= 1 && index <= listedCitations.length
                ? listedCitations[index - 1]
                : undefined;
            return {
              id: `claim_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`,
              statement: item.statement || '',
              supportingEvidence: [item.supportingEvidence || ''],
              sourceUrls: cited ? [cited.source.url] : [],
              verificationStatus: VerificationStatus.UNVERIFIED,
              confidenceScore: 0.8,
              relatedClaims: [],
            };
          });
      }
    } catch (e) {
      elizaLogger.error('[FACTEvaluator] Failed to extract claims:', e);
    }

    return [];
  }

  /** Verifies claims one at a time, preserving input order. */
  private async verifyClaims(
    claims: FactualClaim[]
  ): Promise<Array<FactualClaim & { verified: boolean }>> {
    const results: Array<FactualClaim & { verified: boolean }> = [];

    for (const claim of claims) {
      const verified = await this.verifySingleClaim(claim);
      results.push({ ...claim, verified });
    }

    return results;
  }

  /**
   * Asks the model whether the claim's recorded evidence supports it.
   *
   * This is a simplified check: the source URL is passed to the model as text
   * but its content is not fetched. Without a model the claim's own confidence
   * score (> 0.7) decides. Claims without a statement or source URL, and any
   * model failure, count as not verified.
   */
  private async verifySingleClaim(claim: FactualClaim): Promise<boolean> {
    // If no model available, return conservative estimate
    if (!this.runtime.useModel) {
      return claim.confidenceScore > 0.7;
    }

    if (!claim.sourceUrls || claim.sourceUrls.length === 0 || !claim.statement) return false;

    const prompt = `Does this evidence support the claim?

Claim: ${claim.statement}
Evidence: ${(claim.supportingEvidence ?? []).join(' ')}
Source URL: ${claim.sourceUrls[0]}

Answer with just "yes" or "no".`;

    try {
      const response = await this.runtime.useModel(ModelType.TEXT_LARGE, {
        messages: [
          {
            role: 'system',
            content: 'You are a fact verifier. Answer only yes or no.',
          },
          { role: 'user', content: prompt },
        ],
        temperature: 0.1,
        max_tokens: 10,
      });

      const rawAnswer = typeof response === 'string' ? response : (response as any)?.content;
      const answer = typeof rawAnswer === 'string' ? rawAnswer : '';
      // Match "yes" as a whole word so that e.g. "No, yesterday..." is not
      // accepted, and let a leading "no" win over a later "yes".
      return /\byes\b/i.test(answer) && !/^\W*no\b/i.test(answer);
    } catch (e) {
      elizaLogger.error('[FACTEvaluator] Failed to verify claim:', e);
      return false;
    }
  }

  /** Keeps the first occurrence of each (statement, source URLs) pair. */
  private deduplicateClaims(
    claims: Array<FactualClaim & { verified: boolean }>
  ): Array<FactualClaim & { verified: boolean }> {
    const seen = new Set<string>();
    const unique: Array<FactualClaim & { verified: boolean }> = [];

    for (const claim of claims) {
      const key = `${claim.statement}|${claim.sourceUrls?.join(',')}`;
      if (!seen.has(key)) {
        seen.add(key);
        unique.push(claim);
      }
    }

    return unique;
  }
}
372
+
373
/**
 * Combined evaluator for complete research assessment.
 *
 * Runs the RACE (report quality) and FACT (citation quality) evaluators on a
 * project and can render the resulting metrics as a markdown report.
 */
export class ResearchEvaluator {
  private raceEvaluator: RACEEvaluator;
  private factEvaluator: FACTEvaluator;

  constructor(runtime: IAgentRuntime) {
    this.raceEvaluator = new RACEEvaluator(runtime);
    this.factEvaluator = new FACTEvaluator(runtime);
  }

  /**
   * Evaluates a project with both evaluators concurrently.
   *
   * @param project - Project whose `report` is graded.
   * @param criteria - RACE criteria passed through to the RACE evaluator.
   * @param referenceReport - Optional comparison report for RACE.
   * @returns Both scores, the evaluation timestamp (`Date.now()`) and the
   *          evaluator version string.
   * @throws Error when the project has no report.
   */
  async evaluateProject(
    project: ResearchProject,
    criteria: EvaluationCriteria,
    referenceReport?: ResearchReport
  ): Promise<EvaluationMetrics> {
    if (!project.report) {
      throw new Error('Project must have a completed report for evaluation');
    }

    const [raceScore, factScore] = await Promise.all([
      this.raceEvaluator.evaluate(project.report, criteria, referenceReport),
      this.factEvaluator.evaluate(project),
    ]);

    return {
      raceScore,
      factScore,
      timestamp: Date.now(),
      evaluatorVersion: '1.0',
    };
  }

  /**
   * Renders metrics as a markdown report with RACE scores, FACT scores and a
   * one-sentence overall assessment. Ratio fields are shown as percentages
   * with one decimal place.
   *
   * @throws Error when `metrics` lacks `raceScore` or `factScore`.
   */
  formatEvaluationReport(metrics: EvaluationMetrics): string {
    const race = metrics.raceScore;
    const fact = metrics.factScore;
    // Fail with an explicit message instead of an opaque property-access
    // TypeError when a score is absent.
    if (!race || !fact) {
      throw new Error(
        'Evaluation metrics must include both raceScore and factScore to format a report'
      );
    }

    const asPercent = (ratio: number): string => (ratio * 100).toFixed(1);

    return `# Research Evaluation Report

## RACE Scores
- **Overall**: ${asPercent(race.overall)}%
- **Comprehensiveness**: ${asPercent(race.comprehensiveness)}%
- **Depth**: ${asPercent(race.depth)}%
- **Instruction Following**: ${asPercent(race.instructionFollowing)}%
- **Readability**: ${asPercent(race.readability)}%

## FACT Scores
- **Citation Accuracy**: ${asPercent(fact.citationAccuracy)}%
- **Effective Citations**: ${fact.effectiveCitations}
- **Total Citations**: ${fact.totalCitations}
- **Verified Citations**: ${fact.verifiedCitations}

## Overall Assessment
${this.generateAssessment(race, fact)}`;
  }

  /**
   * Maps the overall RACE score and the citation accuracy to a verdict:
   * both >= 0.8 is excellent, both >= 0.6 is good, either >= 0.4 is moderate,
   * and anything else needs significant improvement.
   */
  private generateAssessment(race: RACEScore, fact: FACTScore): string {
    const overallQuality = race.overall;
    const citationQuality = fact.citationAccuracy;

    if (overallQuality >= 0.8 && citationQuality >= 0.8) {
      return 'Excellent research quality with strong factual grounding.';
    }
    if (overallQuality >= 0.6 && citationQuality >= 0.6) {
      return 'Good research quality with adequate citation support.';
    }
    if (overallQuality >= 0.4 || citationQuality >= 0.4) {
      return 'Moderate research quality. Consider improving depth and citation accuracy.';
    }
    return 'Research needs significant improvement in both content quality and factual support.';
  }
}