@elizaos/plugin-research 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/README.md +400 -0
  2. package/dist/index.cjs +9366 -0
  3. package/dist/index.cjs.map +1 -0
  4. package/dist/index.js +9284 -0
  5. package/dist/index.js.map +1 -0
  6. package/package.json +80 -0
  7. package/src/__tests__/action-chaining.test.ts +532 -0
  8. package/src/__tests__/actions.test.ts +118 -0
  9. package/src/__tests__/cache-rate-limiter.test.ts +303 -0
  10. package/src/__tests__/content-extractors.test.ts +26 -0
  11. package/src/__tests__/deepresearch-bench-integration.test.ts +520 -0
  12. package/src/__tests__/deepresearch-bench-simplified.e2e.test.ts +290 -0
  13. package/src/__tests__/deepresearch-bench.e2e.test.ts +376 -0
  14. package/src/__tests__/e2e.test.ts +1870 -0
  15. package/src/__tests__/multi-benchmark-runner.ts +427 -0
  16. package/src/__tests__/providers.test.ts +156 -0
  17. package/src/__tests__/real-world.e2e.test.ts +788 -0
  18. package/src/__tests__/research-scenarios.test.ts +755 -0
  19. package/src/__tests__/research.e2e.test.ts +704 -0
  20. package/src/__tests__/research.test.ts +174 -0
  21. package/src/__tests__/search-providers.test.ts +174 -0
  22. package/src/__tests__/single-benchmark-runner.ts +735 -0
  23. package/src/__tests__/test-search-providers.ts +171 -0
  24. package/src/__tests__/verify-apis.test.ts +82 -0
  25. package/src/actions.ts +1677 -0
  26. package/src/benchmark/deepresearch-benchmark.ts +369 -0
  27. package/src/evaluation/research-evaluator.ts +444 -0
  28. package/src/examples/api-integration.md +498 -0
  29. package/src/examples/browserbase-integration.md +132 -0
  30. package/src/examples/debug-research-query.ts +162 -0
  31. package/src/examples/defi-code-scenarios.md +536 -0
  32. package/src/examples/defi-implementation-guide.md +454 -0
  33. package/src/examples/eliza-research-example.ts +142 -0
  34. package/src/examples/fix-renewable-energy-research.ts +209 -0
  35. package/src/examples/research-scenarios.md +408 -0
  36. package/src/examples/run-complete-renewable-research.ts +303 -0
  37. package/src/examples/run-deep-research.ts +352 -0
  38. package/src/examples/run-logged-research.ts +304 -0
  39. package/src/examples/run-real-research.ts +151 -0
  40. package/src/examples/save-research-output.ts +133 -0
  41. package/src/examples/test-file-logging.ts +199 -0
  42. package/src/examples/test-real-research.ts +67 -0
  43. package/src/examples/test-renewable-energy-research.ts +229 -0
  44. package/src/index.ts +28 -0
  45. package/src/integrations/cache.ts +128 -0
  46. package/src/integrations/content-extractors/firecrawl.ts +314 -0
  47. package/src/integrations/content-extractors/pdf-extractor.ts +350 -0
  48. package/src/integrations/content-extractors/playwright.ts +420 -0
  49. package/src/integrations/factory.ts +419 -0
  50. package/src/integrations/index.ts +18 -0
  51. package/src/integrations/rate-limiter.ts +181 -0
  52. package/src/integrations/search-providers/academic.ts +290 -0
  53. package/src/integrations/search-providers/exa.ts +205 -0
  54. package/src/integrations/search-providers/npm.ts +330 -0
  55. package/src/integrations/search-providers/pypi.ts +211 -0
  56. package/src/integrations/search-providers/serpapi.ts +277 -0
  57. package/src/integrations/search-providers/serper.ts +358 -0
  58. package/src/integrations/search-providers/stagehand-google.ts +87 -0
  59. package/src/integrations/search-providers/tavily.ts +187 -0
  60. package/src/processing/relevance-analyzer.ts +353 -0
  61. package/src/processing/research-logger.ts +450 -0
  62. package/src/processing/result-processor.ts +372 -0
  63. package/src/prompts/research-prompts.ts +419 -0
  64. package/src/providers/cacheProvider.ts +164 -0
  65. package/src/providers.ts +173 -0
  66. package/src/service.ts +2588 -0
  67. package/src/services/swe-bench.ts +286 -0
  68. package/src/strategies/research-strategies.ts +790 -0
  69. package/src/types/pdf-parse.d.ts +34 -0
  70. package/src/types.ts +551 -0
  71. package/src/verification/claim-verifier.ts +443 -0
@@ -0,0 +1,790 @@
1
+ import { IAgentRuntime, elizaLogger, ModelType } from '@elizaos/core';
2
+ import {
3
+ ResearchDomain,
4
+ TaskType,
5
+ ResearchDepth,
6
+ QueryPlan,
7
+ SubQuery,
8
+ SearchStrategy,
9
+ SearchApproach,
10
+ DomainApproach,
11
+ EvaluationCriteria,
12
+ CriteriaDefinition,
13
+ ScoringMethod,
14
+ ResearchMetadata,
15
+ ResultType,
16
+ TemporalFocus,
17
+ SourceType,
18
+ RubricItem,
19
+ } from '../types';
20
+
21
+ // Domain-specific research configurations
22
/**
 * Static research profile for every {@link ResearchDomain}.
 *
 * Each entry supplies the vocabulary, trusted sites, methodology label,
 * evaluation focus areas and preferred source types that the planner,
 * strategy factory and criteria generator in this file look up by domain.
 */
const DOMAIN_CONFIGS: Record<ResearchDomain, DomainConfig> = {
  // --- Natural sciences ---------------------------------------------------
  [ResearchDomain.PHYSICS]: {
    keyTerms: ['quantum', 'relativity', 'particle', 'mechanics', 'thermodynamics', 'electromagnetism'],
    authoritySource: ['arxiv.org', 'physics.aps.org', 'nature.com/nphys', 'science.org'],
    methodology: 'theoretical-experimental',
    evaluationFocus: ['mathematical rigor', 'experimental validation', 'theoretical consistency'],
    preferredSources: [SourceType.ACADEMIC, SourceType.TECHNICAL],
  },
  [ResearchDomain.CHEMISTRY]: {
    keyTerms: ['synthesis', 'reaction', 'compound', 'molecular', 'organic', 'inorganic'],
    authoritySource: ['acs.org', 'rsc.org', 'chemistry.nature.com', 'sciencedirect.com'],
    methodology: 'experimental-analytical',
    evaluationFocus: ['reproducibility', 'yield', 'purity', 'mechanism'],
    preferredSources: [SourceType.ACADEMIC, SourceType.TECHNICAL],
  },
  [ResearchDomain.BIOLOGY]: {
    keyTerms: ['cell', 'gene', 'protein', 'evolution', 'ecology', 'physiology'],
    authoritySource: ['ncbi.nlm.nih.gov', 'nature.com', 'cell.com', 'biology.plos.org'],
    methodology: 'observational-experimental',
    evaluationFocus: ['statistical significance', 'reproducibility', 'biological relevance'],
    preferredSources: [SourceType.ACADEMIC, SourceType.GOVERNMENT],
  },
  [ResearchDomain.ENVIRONMENTAL_SCIENCE]: {
    keyTerms: ['climate', 'ecosystem', 'pollution', 'sustainability', 'biodiversity', 'conservation'],
    authoritySource: ['ipcc.ch', 'epa.gov', 'nature.com/nclimate', 'unep.org'],
    methodology: 'observational-modeling',
    evaluationFocus: ['data quality', 'model accuracy', 'policy implications'],
    preferredSources: [SourceType.GOVERNMENT, SourceType.ACADEMIC, SourceType.ORGANIZATION],
  },

  // --- Engineering and formal sciences -------------------------------------
  [ResearchDomain.ENGINEERING]: {
    keyTerms: ['design', 'optimization', 'materials', 'systems', 'control', 'manufacturing'],
    authoritySource: ['ieee.org', 'asme.org', 'engineeringvillage.com', 'asce.org'],
    methodology: 'design-testing',
    evaluationFocus: ['performance', 'efficiency', 'cost-effectiveness', 'safety'],
    preferredSources: [SourceType.TECHNICAL, SourceType.ACADEMIC],
  },
  [ResearchDomain.COMPUTER_SCIENCE]: {
    keyTerms: ['algorithm', 'data structure', 'machine learning', 'network', 'security', 'software'],
    authoritySource: ['acm.org', 'ieee.org', 'arxiv.org/cs', 'github.com'],
    methodology: 'theoretical-implementation',
    evaluationFocus: ['complexity', 'correctness', 'scalability', 'performance'],
    preferredSources: [SourceType.TECHNICAL, SourceType.ACADEMIC, SourceType.WEB],
  },
  [ResearchDomain.MATHEMATICS]: {
    keyTerms: ['theorem', 'proof', 'equation', 'topology', 'algebra', 'analysis'],
    authoritySource: ['ams.org', 'arxiv.org/math', 'mathscinet.ams.org', 'zbmath.org'],
    methodology: 'theoretical-proof',
    evaluationFocus: ['rigor', 'generality', 'elegance', 'applicability'],
    preferredSources: [SourceType.ACADEMIC],
  },

  // --- Health and behavioural sciences --------------------------------------
  [ResearchDomain.MEDICINE]: {
    keyTerms: ['diagnosis', 'treatment', 'clinical', 'pathology', 'pharmacology', 'epidemiology'],
    authoritySource: ['pubmed.ncbi.nlm.nih.gov', 'nejm.org', 'thelancet.com', 'who.int'],
    methodology: 'clinical-evidence',
    evaluationFocus: ['clinical significance', 'safety', 'efficacy', 'evidence level'],
    preferredSources: [SourceType.ACADEMIC, SourceType.GOVERNMENT],
  },
  [ResearchDomain.PSYCHOLOGY]: {
    keyTerms: ['behavior', 'cognition', 'emotion', 'development', 'personality', 'disorder'],
    authoritySource: ['apa.org', 'psychologicalscience.org', 'nature.com/nathumbehav', 'ncbi.nlm.nih.gov'],
    methodology: 'empirical-theoretical',
    evaluationFocus: ['validity', 'reliability', 'generalizability', 'ethical considerations'],
    preferredSources: [SourceType.ACADEMIC, SourceType.ORGANIZATION],
  },

  // --- Economy and organisations -------------------------------------------
  [ResearchDomain.ECONOMICS]: {
    keyTerms: ['market', 'policy', 'growth', 'inflation', 'trade', 'behavioral'],
    authoritySource: ['nber.org', 'imf.org', 'worldbank.org', 'aeaweb.org'],
    methodology: 'theoretical-empirical',
    evaluationFocus: ['model validity', 'data quality', 'policy relevance', 'predictive power'],
    preferredSources: [SourceType.ACADEMIC, SourceType.GOVERNMENT, SourceType.ORGANIZATION],
  },
  [ResearchDomain.FINANCE]: {
    keyTerms: ['investment', 'risk', 'portfolio', 'derivatives', 'banking', 'cryptocurrency'],
    authoritySource: ['bloomberg.com', 'reuters.com', 'ft.com', 'wsj.com'],
    methodology: 'quantitative-analytical',
    evaluationFocus: ['return', 'risk assessment', 'market efficiency', 'regulatory compliance'],
    preferredSources: [SourceType.NEWS, SourceType.TECHNICAL, SourceType.GOVERNMENT],
  },
  [ResearchDomain.BUSINESS]: {
    keyTerms: ['strategy', 'management', 'innovation', 'leadership', 'operations', 'entrepreneurship'],
    authoritySource: ['hbr.org', 'mckinsey.com', 'bcg.com', 'forbes.com'],
    methodology: 'case-analytical',
    evaluationFocus: ['practicality', 'ROI', 'scalability', 'competitive advantage'],
    preferredSources: [SourceType.NEWS, SourceType.ORGANIZATION, SourceType.ACADEMIC],
  },
  [ResearchDomain.MARKETING]: {
    keyTerms: ['consumer', 'branding', 'digital', 'segmentation', 'campaign', 'analytics'],
    authoritySource: ['marketingland.com', 'adweek.com', 'warc.com', 'ama.org'],
    methodology: 'empirical-creative',
    evaluationFocus: ['ROI', 'engagement', 'conversion', 'brand impact'],
    preferredSources: [SourceType.NEWS, SourceType.WEB, SourceType.ORGANIZATION],
  },
  [ResearchDomain.HUMAN_RESOURCES]: {
    keyTerms: ['recruitment', 'performance', 'culture', 'compensation', 'development', 'retention'],
    authoritySource: ['shrm.org', 'cipd.co.uk', 'hbr.org', 'gallup.com'],
    methodology: 'empirical-practical',
    evaluationFocus: ['employee satisfaction', 'productivity', 'retention', 'compliance'],
    preferredSources: [SourceType.ORGANIZATION, SourceType.NEWS, SourceType.ACADEMIC],
  },

  // --- Law, politics and humanities -----------------------------------------
  [ResearchDomain.LAW]: {
    keyTerms: ['statute', 'precedent', 'jurisdiction', 'litigation', 'compliance', 'regulation'],
    authoritySource: ['westlaw.com', 'lexisnexis.com', 'law.cornell.edu', 'supremecourt.gov'],
    methodology: 'precedent-analytical',
    evaluationFocus: ['legal validity', 'precedent', 'jurisdiction', 'practical application'],
    preferredSources: [SourceType.GOVERNMENT, SourceType.ACADEMIC],
  },
  [ResearchDomain.POLITICS]: {
    keyTerms: ['policy', 'election', 'governance', 'ideology', 'diplomacy', 'legislation'],
    authoritySource: ['politico.com', 'foreignaffairs.com', 'brookings.edu', 'cfr.org'],
    methodology: 'analytical-comparative',
    evaluationFocus: ['objectivity', 'source diversity', 'historical context', 'impact analysis'],
    preferredSources: [SourceType.NEWS, SourceType.ORGANIZATION, SourceType.GOVERNMENT],
  },
  [ResearchDomain.HISTORY]: {
    keyTerms: ['period', 'civilization', 'event', 'source', 'interpretation', 'archaeology'],
    authoritySource: ['jstor.org', 'archives.gov', 'history.com', 'britannica.com'],
    methodology: 'source-analytical',
    evaluationFocus: ['source reliability', 'historiography', 'context', 'multiple perspectives'],
    preferredSources: [SourceType.ACADEMIC, SourceType.BOOK, SourceType.GOVERNMENT],
  },
  [ResearchDomain.PHILOSOPHY]: {
    keyTerms: ['ethics', 'metaphysics', 'epistemology', 'logic', 'aesthetics', 'phenomenology'],
    authoritySource: ['plato.stanford.edu', 'iep.utm.edu', 'philpapers.org', 'jstor.org'],
    methodology: 'analytical-dialectical',
    evaluationFocus: ['logical consistency', 'clarity', 'originality', 'practical implications'],
    preferredSources: [SourceType.ACADEMIC, SourceType.BOOK],
  },

  // --- Culture and media -------------------------------------------------------
  [ResearchDomain.ART_DESIGN]: {
    keyTerms: ['aesthetic', 'composition', 'medium', 'movement', 'technique', 'critique'],
    authoritySource: ['artforum.com', 'moma.org', 'tate.org.uk', 'designboom.com'],
    methodology: 'critical-creative',
    evaluationFocus: ['originality', 'technique', 'cultural impact', 'aesthetic value'],
    preferredSources: [SourceType.WEB, SourceType.ORGANIZATION, SourceType.BOOK],
  },
  [ResearchDomain.ENTERTAINMENT]: {
    keyTerms: ['media', 'audience', 'production', 'distribution', 'content', 'platform'],
    authoritySource: ['variety.com', 'hollywoodreporter.com', 'rottentomatoes.com', 'imdb.com'],
    methodology: 'analytical-critical',
    evaluationFocus: ['audience reception', 'critical analysis', 'commercial success', 'cultural impact'],
    preferredSources: [SourceType.NEWS, SourceType.WEB],
  },

  // --- Infrastructure ----------------------------------------------------------
  [ResearchDomain.TRANSPORTATION]: {
    keyTerms: ['mobility', 'infrastructure', 'logistics', 'autonomous', 'sustainability', 'urban'],
    authoritySource: ['transportation.gov', 'itf-oecd.org', 'apta.com', 'railway-technology.com'],
    methodology: 'systems-analytical',
    evaluationFocus: ['efficiency', 'safety', 'sustainability', 'cost-effectiveness'],
    preferredSources: [SourceType.GOVERNMENT, SourceType.TECHNICAL, SourceType.ORGANIZATION],
  },

  // --- Catch-all: deliberately has no key terms, so no domain-specific
  // --- fallback sub-query is generated for it ---------------------------------
  [ResearchDomain.GENERAL]: {
    keyTerms: [],
    authoritySource: ['wikipedia.org', 'britannica.com', 'scholar.google.com'],
    methodology: 'mixed-methods',
    evaluationFocus: ['accuracy', 'comprehensiveness', 'clarity', 'source diversity'],
    preferredSources: [SourceType.WEB, SourceType.ACADEMIC, SourceType.NEWS],
  },
};
178
+
179
/** Shape of one per-domain entry in `DOMAIN_CONFIGS`. */
interface DomainConfig {
  /**
   * Vocabulary typical of the domain. Listed in the sub-query generation
   * prompt; the first term is appended to fallback search queries.
   */
  keyTerms: string[];

  /** Site names treated as authoritative; copied into the query plan as-is. */
  authoritySource: string[];

  /** Free-form methodology label; copied into the query plan as-is. */
  methodology: string;

  /** Focus areas that each become one domain-specific evaluation criterion. */
  evaluationFocus: string[];

  /** Source types handed to the search strategy as its `sourceTypes`. */
  preferredSources: SourceType[];
}
186
+
187
/**
 * Builds the {@link SearchStrategy} for a research query.
 *
 * The strategy combines the static per-domain configuration from
 * `DOMAIN_CONFIGS` with two model-assisted analyses of the query text: its
 * temporal focus and the geographic regions it mentions.
 */
export class ResearchStrategyFactory {
  constructor(private runtime: IAgentRuntime) {}

  /**
   * Creates a search strategy for `query`.
   *
   * @param query - The user's research question.
   * @param domain - Research domain; unknown values fall back to the general config.
   * @param taskType - Kind of research task; drives approach and diversity requirement.
   * @param depth - Requested depth; drives the quality threshold.
   * @returns The assembled strategy. Model failures never reject: they degrade to
   *   `temporalFocus: undefined` and `geographicScope: []`.
   */
  async createStrategy(
    query: string,
    domain: ResearchDomain,
    taskType: TaskType,
    depth: ResearchDepth
  ): Promise<SearchStrategy> {
    // Guard against domain values that are not present in the table at runtime.
    const domainConfig = DOMAIN_CONFIGS[domain] ?? DOMAIN_CONFIGS[ResearchDomain.GENERAL];

    // Determine search approach based on task type and depth
    const approach = this.determineSearchApproach(taskType, depth);

    // The two analyses are independent, so run them concurrently. Both handle
    // their own errors, which means Promise.all cannot reject here.
    const [temporalFocus, geographicScope] = await Promise.all([
      this.analyzeTemporalFocus(query),
      this.extractGeographicScope(query),
    ]);

    return {
      approach,
      sourceTypes: domainConfig.preferredSources,
      qualityThreshold: this.getQualityThreshold(depth),
      diversityRequirement: taskType === TaskType.COMPARATIVE || taskType === TaskType.EVALUATIVE,
      temporalFocus,
      geographicScope,
      languagePreferences: ['en'], // Can be extended based on query
    };
  }

  /**
   * Normalises a model response to plain text.
   *
   * Accepts either a raw string or an object with a string `content` property;
   * anything else yields an empty string.
   */
  private static responseText(response: unknown): string {
    if (typeof response === 'string') {
      return response;
    }
    if (response !== null && typeof response === 'object' && 'content' in response) {
      const content = (response as { content?: unknown }).content;
      return typeof content === 'string' ? content : '';
    }
    return '';
  }

  /** Picks the search approach; PhD-level depth always uses citation chaining. */
  private determineSearchApproach(taskType: TaskType, depth: ResearchDepth): SearchApproach {
    if (depth === ResearchDepth.PHD_LEVEL) {
      return SearchApproach.CITATION_CHAINING;
    }

    switch (taskType) {
      case TaskType.EXPLORATORY:
        return SearchApproach.BREADTH_FIRST;
      case TaskType.ANALYTICAL:
      case TaskType.SYNTHETIC:
        return SearchApproach.DEPTH_FIRST;
      case TaskType.COMPARATIVE:
      case TaskType.EVALUATIVE:
        return SearchApproach.ITERATIVE_REFINEMENT;
      case TaskType.PREDICTIVE:
        return SearchApproach.HYPOTHESIS_DRIVEN;
      default:
        return SearchApproach.BREADTH_FIRST;
    }
  }

  /** Minimum source-quality score for a depth; unknown depths use the moderate value. */
  private getQualityThreshold(depth: ResearchDepth): number {
    switch (depth) {
      case ResearchDepth.SURFACE:
        return 0.6;
      case ResearchDepth.MODERATE:
        return 0.7;
      case ResearchDepth.DEEP:
        return 0.8;
      case ResearchDepth.PHD_LEVEL:
        return 0.9;
      default:
        // Previously fell through and returned undefined for unknown depths.
        return 0.7;
    }
  }

  /**
   * Asks the model which temporal focus the query has.
   *
   * @returns The matching {@link TemporalFocus}, or `undefined` when the reply
   *   names none of the options or the model call fails.
   */
  private async analyzeTemporalFocus(query: string): Promise<TemporalFocus | undefined> {
    const prompt = [
      'Analyze this research query and determine its temporal focus:',
      `Query: "${query}"`,
      '',
      'Options:',
      '- historical: Focus on past events, history, origins',
      '- current: Focus on present state, current situation',
      '- recent: Focus on recent developments (last 1-2 years)',
      '- future-oriented: Focus on predictions, trends, future scenarios',
      '',
      'Respond with just the option name.',
    ].join('\n');

    try {
      const response = await this.runtime.useModel(ModelType.TEXT_LARGE, {
        messages: [
          {
            role: 'system',
            content:
              'You are a temporal focus analyzer. Respond with only the temporal focus option, nothing else.',
          },
          { role: 'user', content: prompt },
        ],
        temperature: 0.3,
      });

      const focus = ResearchStrategyFactory.responseText(response).trim().toLowerCase();

      // Substring checks tolerate extra words around the option name.
      if (focus.includes('historical')) return TemporalFocus.HISTORICAL;
      if (focus.includes('current')) return TemporalFocus.CURRENT;
      if (focus.includes('recent')) return TemporalFocus.RECENT;
      if (focus.includes('future')) return TemporalFocus.FUTURE_ORIENTED;

      return undefined;
    } catch (error) {
      elizaLogger.error('Error analyzing temporal focus:', error);
      return undefined;
    }
  }

  /**
   * Asks the model for the geographic locations mentioned in the query.
   *
   * @returns The location names, or an empty array when the model answers with
   *   the "global" sentinel (in any letter case, optionally quoted) or fails.
   */
  private async extractGeographicScope(query: string): Promise<string[]> {
    const prompt = [
      'Extract any geographic locations or regions mentioned in this query:',
      `Query: "${query}"`,
      '',
      'List any countries, regions, cities, or geographic areas mentioned. If none, return "global".',
      'Respond with a comma-separated list.',
    ].join('\n');

    try {
      const response = await this.runtime.useModel(ModelType.TEXT_LARGE, {
        messages: [
          {
            role: 'system',
            content:
              'You are a geographic scope extractor. Return only a comma-separated list of locations or "global".',
          },
          { role: 'user', content: prompt },
        ],
        temperature: 0.3,
      });

      return ResearchStrategyFactory.responseText(response)
        .split(',')
        // The prompt shows the sentinel in quotes, so models often echo them.
        .map((part) => part.trim().replace(/^["']+|["']+$/g, '').trim())
        .filter((part) => part !== '' && !ResearchStrategyFactory.isGlobalSentinel(part));
    } catch (error) {
      elizaLogger.error('Error extracting geographic scope:', error);
      return [];
    }
  }

  /** True for the "no specific location" marker, ignoring case and a trailing period. */
  private static isGlobalSentinel(value: string): boolean {
    return value.toLowerCase().replace(/\.+$/, '') === 'global';
  }
}
318
+
319
/**
 * Turns a research question into a {@link QueryPlan}: a set of sub-queries,
 * a search strategy, and depth-dependent limits.
 *
 * Sub-queries are generated by the language model when one is available and
 * fall back to simple task-type templates otherwise.
 */
export class QueryPlanner {
  constructor(private runtime: IAgentRuntime) {}

  /**
   * Builds the full plan for `mainQuery`.
   *
   * @param mainQuery - The user's research question.
   * @param metadata - Optional domain, task type and depth. Missing values
   *   default to GENERAL / EXPLORATORY / MODERATE.
   * @returns The query plan, including the domain's methodology, key terms,
   *   authority sources and evaluation focus.
   */
  async createQueryPlan(
    mainQuery: string,
    metadata: Partial<ResearchMetadata>
  ): Promise<QueryPlan> {
    const domain = metadata.domain || ResearchDomain.GENERAL;
    const taskType = metadata.taskType || TaskType.EXPLORATORY;
    const depth = metadata.depth || ResearchDepth.MODERATE;

    // Guard against domain values that are not present in the table at runtime.
    const domainConfig = DOMAIN_CONFIGS[domain] ?? DOMAIN_CONFIGS[ResearchDomain.GENERAL];

    // Generate sub-queries based on domain and task type
    const subQueries = await this.generateSubQueries(mainQuery, domain, taskType, domainConfig);

    // Create search strategy
    const strategyFactory = new ResearchStrategyFactory(this.runtime);
    const searchStrategy = await strategyFactory.createStrategy(mainQuery, domain, taskType, depth);

    return {
      mainQuery,
      subQueries,
      searchStrategy,
      expectedSources: this.getExpectedSources(depth),
      iterationCount: this.getIterationCount(depth),
      adaptiveRefinement: depth === ResearchDepth.DEEP || depth === ResearchDepth.PHD_LEVEL,
      domainSpecificApproach: {
        methodology: domainConfig.methodology,
        keyTerms: domainConfig.keyTerms,
        authoritySource: domainConfig.authoritySource,
        evaluationFocus: domainConfig.evaluationFocus,
      },
    };
  }

  /**
   * Normalises a model response to plain text.
   *
   * Accepts either a raw string or an object with a string `content` property;
   * anything else yields an empty string.
   */
  private static responseText(response: unknown): string {
    if (typeof response === 'string') {
      return response;
    }
    if (response !== null && typeof response === 'object' && 'content' in response) {
      const content = (response as { content?: unknown }).content;
      return typeof content === 'string' ? content : '';
    }
    return '';
  }

  /**
   * Extracts the value of a `LABEL: value` line from one sub-query block.
   *
   * The match is anchored to the start of a line so that a label occurring
   * inside another field's text (e.g. "prototype: ...") is not picked up.
   * Markdown emphasis around the label and one pair of enclosing square
   * brackets around the value are removed.
   */
  private static readField(block: string, label: string): string | undefined {
    const pattern = new RegExp(`^[\\s*#>\\-\\d.]*${label}\\**\\s*:\\**\\s*(.+)$`, 'im');
    const match = pattern.exec(block);
    if (!match) {
      return undefined;
    }

    let value = match[1].trim().replace(/\*+$/, '').trim();
    if (value.startsWith('[') && value.endsWith(']')) {
      value = value.slice(1, -1).trim();
    }
    return value === '' ? undefined : value;
  }

  /**
   * Generates sub-queries with the model, falling back to templates when the
   * model is unavailable, fails, or returns nothing parseable.
   */
  private async generateSubQueries(
    mainQuery: string,
    domain: ResearchDomain,
    taskType: TaskType,
    domainConfig: DomainConfig
  ): Promise<SubQuery[]> {
    // Try to use AI model if available
    if (this.runtime.useModel) {
      try {
        const prompt = [
          'Generate sub-queries for this research task:',
          `Main Query: "${mainQuery}"`,
          `Domain: ${domain}`,
          `Task Type: ${taskType}`,
          `Key Terms: ${domainConfig.keyTerms.join(', ')}`,
          '',
          'Generate 3-7 specific sub-queries that will help answer the main query comprehensively.',
          'Consider different aspects based on the task type:',
          '- Exploratory: broad coverage of the topic',
          '- Comparative: queries for each item being compared',
          '- Analytical: queries for different analytical dimensions',
          '- Synthetic: queries for different perspectives to synthesize',
          '- Evaluative: queries for criteria and evidence',
          '- Predictive: queries for historical patterns and indicators',
          '',
          'Format each sub-query as:',
          'PURPOSE: [why this query is needed]',
          'QUERY: [the actual search query]',
          'TYPE: [factual/statistical/theoretical/practical/comparative]',
          'PRIORITY: [high/medium/low]',
          '',
          'Separate each sub-query with ---',
        ].join('\n');

        const response = await this.runtime.useModel(ModelType.TEXT_LARGE, {
          messages: [
            {
              role: 'system',
              content:
                'You are an expert research query planner. Generate detailed sub-queries following the exact format requested.',
            },
            { role: 'user', content: prompt },
          ],
          temperature: 0.7,
        });

        const blocks = QueryPlanner.responseText(response)
          .split('---')
          .filter((block) => block.trim());

        const subQueries: SubQuery[] = [];

        for (const [index, block] of blocks.entries()) {
          const query = QueryPlanner.readField(block, 'QUERY');
          const purpose = QueryPlanner.readField(block, 'PURPOSE');
          // A block without both a query and a purpose is not a usable sub-query.
          if (!query || !purpose) {
            continue;
          }

          const resultType = this.parseResultType(QueryPlanner.readField(block, 'TYPE') ?? 'factual');
          const priority = this.parsePriority(QueryPlanner.readField(block, 'PRIORITY') ?? 'medium');

          subQueries.push({
            id: `sq_${index + 1}`,
            query,
            purpose,
            priority,
            dependsOn: this.determineDependencies(index, subQueries),
            searchProviders: this.selectSearchProviders(resultType, domain),
            expectedResultType: resultType,
            completed: false,
          });
        }

        if (subQueries.length > 0) {
          return subQueries;
        }
      } catch (error) {
        elizaLogger.warn('Error generating sub-queries with AI, using fallback:', error);
      }
    }

    // Fallback: Generate sub-queries based on task type
    return this.generateFallbackSubQueries(mainQuery, domain, taskType, domainConfig);
  }

  /**
   * Template-based sub-queries used when the model path yields nothing.
   *
   * Always returns at least one sub-query.
   *
   * NOTE(review): primary queries here get priority 1 and the domain query 2,
   * whereas the model path maps high -> 3 and low -> 1. Confirm which ordering
   * downstream consumers expect.
   */
  private generateFallbackSubQueries(
    mainQuery: string,
    domain: ResearchDomain,
    taskType: TaskType,
    domainConfig: DomainConfig
  ): SubQuery[] {
    const subQueries: SubQuery[] = [];

    // Appends a priority-1 sub-query with the next sequential id.
    const addPrimary = (
      query: string,
      purpose: string,
      searchProviders: string[],
      expectedResultType: ResultType
    ): void => {
      subQueries.push({
        id: `sq_${subQueries.length + 1}`,
        query,
        purpose,
        priority: 1,
        dependsOn: [],
        searchProviders,
        expectedResultType,
        completed: false,
      });
    };

    const addGeneric = (): void =>
      addPrimary(mainQuery, 'General search for main topic', ['web'], ResultType.FACTUAL);

    // Base sub-queries based on task type
    switch (taskType) {
      case TaskType.COMPARATIVE: {
        // Extract items to compare from query
        const items = mainQuery.match(/compare\s+(\w+)\s+and\s+(\w+)/i);
        if (items) {
          const angle = domainConfig.keyTerms[0] || 'overview';
          addPrimary(
            `${items[1]} ${angle}`,
            `Understand first item: ${items[1]}`,
            ['web'],
            ResultType.FACTUAL
          );
          addPrimary(
            `${items[2]} ${angle}`,
            `Understand second item: ${items[2]}`,
            ['web'],
            ResultType.FACTUAL
          );
        } else {
          // The pattern only recognises "compare X and Y" with single-word
          // items. Without this branch a non-matching query produced no
          // primary sub-query at all (and an empty plan for GENERAL).
          addGeneric();
        }
        break;
      }

      case TaskType.ANALYTICAL:
        addPrimary(
          `${mainQuery} analysis`,
          'Find analytical perspectives',
          ['web', 'academic'],
          ResultType.THEORETICAL
        );
        break;

      default:
        // Generic sub-queries
        addGeneric();
    }

    // Add domain-specific sub-query
    if (domainConfig.keyTerms.length > 0) {
      subQueries.push({
        id: `sq_${subQueries.length + 1}`,
        query: `${mainQuery} ${domainConfig.keyTerms[0]}`,
        purpose: `Domain-specific search for ${domain}`,
        priority: 2,
        dependsOn: [],
        searchProviders: this.selectSearchProviders(ResultType.THEORETICAL, domain),
        expectedResultType: ResultType.THEORETICAL,
        completed: false,
      });
    }

    return subQueries;
  }

  /** Maps a free-text type label to a {@link ResultType}; defaults to FACTUAL. */
  private parseResultType(type: string): ResultType {
    const normalized = type.toLowerCase().trim();
    if (normalized.includes('statistical')) return ResultType.STATISTICAL;
    if (normalized.includes('theoretical')) return ResultType.THEORETICAL;
    if (normalized.includes('practical')) return ResultType.PRACTICAL;
    if (normalized.includes('comparative')) return ResultType.COMPARATIVE;
    return ResultType.FACTUAL;
  }

  /**
   * Maps a free-text priority label to a number: high = 3, low = 1, otherwise 2.
   *
   * Uses substring matching, like `parseResultType`, so decorated replies such
   * as "[high]" or "High." are recognised instead of degrading to medium.
   */
  private parsePriority(priority: string): number {
    const normalized = priority.toLowerCase().trim();
    if (normalized.includes('high')) return 3;
    if (normalized.includes('low')) return 1;
    return 2; // medium
  }

  /**
   * Returns the ids of earlier sub-queries this one depends on.
   *
   * Simplified rule: every preceding high-priority (> 2) sub-query is treated
   * as a prerequisite.
   */
  private determineDependencies(index: number, existingQueries: SubQuery[]): string[] {
    // First query has no dependencies
    if (index === 0) return [];

    return existingQueries
      .slice(0, index)
      .filter((candidate) => candidate.priority > 2)
      .map((candidate) => candidate.id);
  }

  /** Chooses search provider names from the expected result type and domain. */
  private selectSearchProviders(resultType: ResultType, domain: ResearchDomain): string[] {
    const providers: string[] = ['web']; // Always include general web search

    // Add specialized providers based on result type and domain
    if (resultType === ResultType.STATISTICAL || domain === ResearchDomain.ECONOMICS) {
      providers.push('statistics');
    }

    if (
      [ResearchDomain.PHYSICS, ResearchDomain.MATHEMATICS, ResearchDomain.COMPUTER_SCIENCE].includes(
        domain
      )
    ) {
      providers.push('arxiv');
    }

    if ([ResearchDomain.MEDICINE, ResearchDomain.BIOLOGY, ResearchDomain.PSYCHOLOGY].includes(domain)) {
      providers.push('pubmed');
    }

    if (resultType === ResultType.COMPARATIVE || resultType === ResultType.PRACTICAL) {
      providers.push('news');
    }

    return providers;
  }

  /** Number of refinement iterations for a depth; unknown depths use the moderate value. */
  private getIterationCount(depth: ResearchDepth): number {
    switch (depth) {
      case ResearchDepth.SURFACE:
        return 1;
      case ResearchDepth.MODERATE:
        return 2;
      case ResearchDepth.DEEP:
        return 3;
      case ResearchDepth.PHD_LEVEL:
        return 5;
      default:
        return 2;
    }
  }

  /** Target number of sources for a depth; unknown depths use the moderate value. */
  private getExpectedSources(depth: ResearchDepth): number {
    switch (depth) {
      case ResearchDepth.SURFACE:
        return 10;
      case ResearchDepth.MODERATE:
        return 25;
      case ResearchDepth.DEEP:
        return 50;
      case ResearchDepth.PHD_LEVEL:
        return 100;
      default:
        return 25;
    }
  }

  /**
   * Asks the model for follow-up queries based on what has been found so far.
   *
   * @param originalQuery - The original research question.
   * @param currentFindings - Summaries of findings gathered so far.
   * @param iteration - Zero-based index of the iteration just completed.
   * @returns One refined query per non-empty response line, with any leading
   *   list marker ("1.", "2)", "-", "*", "•") removed.
   * @throws Whatever `runtime.useModel` rejects with; this method does not catch it.
   */
  async refineQuery(
    originalQuery: string,
    currentFindings: string[],
    iteration: number
  ): Promise<string[]> {
    const prompt = [
      `Based on the current research findings, generate refined search queries for iteration ${iteration + 1}.`,
      '',
      `Original Query: "${originalQuery}"`,
      '',
      'Current Findings Summary:',
      currentFindings.join('\n\n'),
      '',
      'Generate 2-4 refined queries that:',
      '1. Address gaps in the current findings',
      '2. Explore new angles or perspectives',
      '3. Seek more specific or technical information',
      '4. Verify or challenge existing findings',
      '',
      'Format: One query per line',
    ].join('\n');

    const response = await this.runtime.useModel(ModelType.TEXT_LARGE, {
      messages: [
        {
          role: 'system',
          content:
            'You are a research query refinement expert. Generate refined queries based on current findings.',
        },
        { role: 'user', content: prompt },
      ],
      temperature: 0.7,
    });

    return QueryPlanner.responseText(response)
      .split('\n')
      // Strip the list marker but keep the query. The previous filter dropped
      // every numbered line, so a numbered reply produced no queries at all.
      // Whitespace is required after the marker so "3.5 GHz ..." stays intact.
      .map((line) => line.trim().replace(/^(?:\d+[.)]|[-*•])\s+/, '').trim())
      .filter((line) => line !== '');
  }
}
653
+
654
/**
 * Produces the {@link EvaluationCriteria} used to score a research report:
 * four base criteria plus, for non-general domains, one criterion per entry
 * in the domain's `evaluationFocus`.
 */
export class EvaluationCriteriaGenerator {
  constructor(private runtime: IAgentRuntime) {}

  /**
   * Generates the evaluation criteria for a query.
   *
   * @param query - The research question (currently not used when building rubrics).
   * @param domain - Optional research domain; omitted or unknown values use the
   *   general configuration.
   * @returns Base criteria weighted 0.25 each and, for non-general domains,
   *   `domainSpecific` criteria weighted 0.2 each.
   *
   * NOTE(review): the domain-specific weights are added on top of base weights
   * that already sum to 1.0 — confirm the evaluator normalises them.
   */
  async generateCriteria(
    query: string,
    domain?: ResearchDomain
  ): Promise<EvaluationCriteria> {
    const generalConfig = DOMAIN_CONFIGS[ResearchDomain.GENERAL];
    const domainConfig = domain ? DOMAIN_CONFIGS[domain] ?? generalConfig : generalConfig;

    // The rubric requests are independent, so issue them concurrently.
    // generateCriterion handles its own errors and therefore never rejects.
    const [comprehensiveness, depth, instructionFollowing, readability] = await Promise.all([
      this.generateCriterion(
        'Comprehensiveness',
        'How thoroughly the research covers all relevant aspects of the query',
        0.25
      ),
      this.generateCriterion(
        'Depth',
        'The level of detail and expertise demonstrated in the analysis',
        0.25
      ),
      this.generateCriterion(
        'Instruction Following',
        'How well the research addresses the specific requirements of the query',
        0.25
      ),
      this.generateCriterion(
        'Readability',
        'The clarity, organization, and accessibility of the research report',
        0.25
      ),
    ]);

    const criteria: EvaluationCriteria = {
      comprehensiveness,
      depth,
      instructionFollowing,
      readability,
    };

    // Add domain-specific criteria if applicable
    if (domain && domain !== ResearchDomain.GENERAL) {
      const focuses = domainConfig.evaluationFocus;
      const definitions = await Promise.all(
        focuses.map((focus) =>
          this.generateCriterion(
            focus,
            `Domain-specific evaluation of ${focus} for ${domain} research`,
            0.2
          )
        )
      );

      // Insert in the configured order so key order matches evaluationFocus.
      const domainSpecific: Record<string, CriteriaDefinition> = {};
      for (let i = 0; i < focuses.length; i++) {
        domainSpecific[focuses[i]] = definitions[i];
      }
      criteria.domainSpecific = domainSpecific;
    }

    return criteria;
  }

  /**
   * Normalises a model response to plain text.
   *
   * Accepts either a raw string or an object with a string `content` property;
   * anything else yields an empty string.
   */
  private static responseText(response: unknown): string {
    if (typeof response === 'string') {
      return response;
    }
    if (response !== null && typeof response === 'object' && 'content' in response) {
      const content = (response as { content?: unknown }).content;
      return typeof content === 'string' ? content : '';
    }
    return '';
  }

  /**
   * Builds one criterion with a 0-4 rubric.
   *
   * Uses the model to write the rubric when available; on any failure (or
   * when the runtime has no `useModel`) a generic rubric is returned instead.
   */
  private async generateCriterion(
    name: string,
    description: string,
    weight: number
  ): Promise<CriteriaDefinition> {
    // Try to use AI model if available
    if (this.runtime.useModel) {
      try {
        const prompt = [
          'Generate a detailed evaluation rubric for the following criterion:',
          `Name: ${name}`,
          `Description: ${description}`,
          '',
          'Create a 5-point rubric (0-4) with specific descriptions for each score level.',
          'Format:',
          '0: [Description of failing/missing]',
          '1: [Description of poor/minimal]',
          '2: [Description of adequate/satisfactory]',
          '3: [Description of good/strong]',
          '4: [Description of excellent/exceptional]',
        ].join('\n');

        const response = await this.runtime.useModel(ModelType.TEXT_LARGE, {
          messages: [
            {
              role: 'system',
              content:
                'You are an evaluation criteria expert. Generate a detailed rubric following the exact format requested.',
            },
            { role: 'user', content: prompt },
          ],
          temperature: 0.5,
        });

        return {
          name,
          description,
          weight,
          rubric: this.parseRubric(EvaluationCriteriaGenerator.responseText(response)),
          scoringMethod: ScoringMethod.RUBRIC,
        };
      } catch (error) {
        elizaLogger.warn('Error generating criterion with AI, using fallback:', error);
      }
    }

    // Fallback: Generate default rubric
    return {
      name,
      description,
      weight,
      rubric: [
        { score: 0, description: `${name} is completely missing or fails to meet any requirements` },
        { score: 1, description: `${name} shows minimal effort with significant gaps` },
        { score: 2, description: `${name} meets basic requirements but lacks depth` },
        { score: 3, description: `${name} is good with solid coverage and analysis` },
        { score: 4, description: `${name} is excellent with comprehensive and insightful treatment` },
      ],
      scoringMethod: ScoringMethod.RUBRIC,
    };
  }

  /**
   * Parses `N: description` lines into a rubric.
   *
   * @param rubricText - Raw model output.
   * @returns Exactly five items, scores 0-4 in ascending order. Lines may be
   *   indented or carry Markdown bullets/emphasis. Scores outside 0-4 are
   *   ignored, the first description seen for a score wins, and missing
   *   levels get the placeholder `Score level N`.
   */
  private parseRubric(rubricText: string): RubricItem[] {
    const maxScore = 4;
    const rubricLine = /^[\s*\-]*(\d)\**\s*:\s*\**\s*(.+)$/;
    const descriptions = new Map<number, string>();

    for (const line of rubricText.split('\n')) {
      const match = rubricLine.exec(line);
      if (!match) {
        continue;
      }

      const score = Number.parseInt(match[1], 10);
      // Previously scores 5-9 and repeated scores were all appended, so the
      // rubric could contain more than the five expected levels.
      if (score > maxScore || descriptions.has(score)) {
        continue;
      }
      descriptions.set(score, match[2].trim());
    }

    // Ensure we have all scores 0-4
    return Array.from({ length: maxScore + 1 }, (_, score) => ({
      score,
      description: descriptions.get(score) ?? `Score level ${score}`,
    }));
  }
}