@elizaos/plugin-research 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +400 -0
- package/dist/index.cjs +9366 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.js +9284 -0
- package/dist/index.js.map +1 -0
- package/package.json +80 -0
- package/src/__tests__/action-chaining.test.ts +532 -0
- package/src/__tests__/actions.test.ts +118 -0
- package/src/__tests__/cache-rate-limiter.test.ts +303 -0
- package/src/__tests__/content-extractors.test.ts +26 -0
- package/src/__tests__/deepresearch-bench-integration.test.ts +520 -0
- package/src/__tests__/deepresearch-bench-simplified.e2e.test.ts +290 -0
- package/src/__tests__/deepresearch-bench.e2e.test.ts +376 -0
- package/src/__tests__/e2e.test.ts +1870 -0
- package/src/__tests__/multi-benchmark-runner.ts +427 -0
- package/src/__tests__/providers.test.ts +156 -0
- package/src/__tests__/real-world.e2e.test.ts +788 -0
- package/src/__tests__/research-scenarios.test.ts +755 -0
- package/src/__tests__/research.e2e.test.ts +704 -0
- package/src/__tests__/research.test.ts +174 -0
- package/src/__tests__/search-providers.test.ts +174 -0
- package/src/__tests__/single-benchmark-runner.ts +735 -0
- package/src/__tests__/test-search-providers.ts +171 -0
- package/src/__tests__/verify-apis.test.ts +82 -0
- package/src/actions.ts +1677 -0
- package/src/benchmark/deepresearch-benchmark.ts +369 -0
- package/src/evaluation/research-evaluator.ts +444 -0
- package/src/examples/api-integration.md +498 -0
- package/src/examples/browserbase-integration.md +132 -0
- package/src/examples/debug-research-query.ts +162 -0
- package/src/examples/defi-code-scenarios.md +536 -0
- package/src/examples/defi-implementation-guide.md +454 -0
- package/src/examples/eliza-research-example.ts +142 -0
- package/src/examples/fix-renewable-energy-research.ts +209 -0
- package/src/examples/research-scenarios.md +408 -0
- package/src/examples/run-complete-renewable-research.ts +303 -0
- package/src/examples/run-deep-research.ts +352 -0
- package/src/examples/run-logged-research.ts +304 -0
- package/src/examples/run-real-research.ts +151 -0
- package/src/examples/save-research-output.ts +133 -0
- package/src/examples/test-file-logging.ts +199 -0
- package/src/examples/test-real-research.ts +67 -0
- package/src/examples/test-renewable-energy-research.ts +229 -0
- package/src/index.ts +28 -0
- package/src/integrations/cache.ts +128 -0
- package/src/integrations/content-extractors/firecrawl.ts +314 -0
- package/src/integrations/content-extractors/pdf-extractor.ts +350 -0
- package/src/integrations/content-extractors/playwright.ts +420 -0
- package/src/integrations/factory.ts +419 -0
- package/src/integrations/index.ts +18 -0
- package/src/integrations/rate-limiter.ts +181 -0
- package/src/integrations/search-providers/academic.ts +290 -0
- package/src/integrations/search-providers/exa.ts +205 -0
- package/src/integrations/search-providers/npm.ts +330 -0
- package/src/integrations/search-providers/pypi.ts +211 -0
- package/src/integrations/search-providers/serpapi.ts +277 -0
- package/src/integrations/search-providers/serper.ts +358 -0
- package/src/integrations/search-providers/stagehand-google.ts +87 -0
- package/src/integrations/search-providers/tavily.ts +187 -0
- package/src/processing/relevance-analyzer.ts +353 -0
- package/src/processing/research-logger.ts +450 -0
- package/src/processing/result-processor.ts +372 -0
- package/src/prompts/research-prompts.ts +419 -0
- package/src/providers/cacheProvider.ts +164 -0
- package/src/providers.ts +173 -0
- package/src/service.ts +2588 -0
- package/src/services/swe-bench.ts +286 -0
- package/src/strategies/research-strategies.ts +790 -0
- package/src/types/pdf-parse.d.ts +34 -0
- package/src/types.ts +551 -0
- package/src/verification/claim-verifier.ts +443 -0
|
@@ -0,0 +1,444 @@
|
|
|
1
|
+
import { IAgentRuntime, elizaLogger, ModelType } from '@elizaos/core';
|
|
2
|
+
import {
|
|
3
|
+
ResearchProject,
|
|
4
|
+
ResearchReport,
|
|
5
|
+
Citation,
|
|
6
|
+
FactualClaim,
|
|
7
|
+
RACEScore,
|
|
8
|
+
FACTScore,
|
|
9
|
+
EvaluationCriteria,
|
|
10
|
+
EvaluationMetrics,
|
|
11
|
+
VerificationStatus,
|
|
12
|
+
} from '../types';
|
|
13
|
+
|
|
14
|
+
/**
 * RACE (Reference-based Adaptive Criteria-driven Evaluation) implementation.
 *
 * Grades a research report on four dimensions (comprehensiveness, depth,
 * instruction following, readability) by prompting the runtime's large text
 * model, then combines the per-dimension scores using the weights carried by
 * the evaluation criteria. All scores are normalized to the 0-1 range.
 */
export class RACEEvaluator {
  /** Score used for a dimension when the runtime exposes no `useModel`. */
  private static readonly NO_MODEL_SCORE = 0.7;

  /** Score used when the model call fails or no usable score can be recovered. */
  private static readonly FALLBACK_SCORE = 0.5;

  constructor(private runtime: IAgentRuntime) {}

  /**
   * Evaluates a report on every RACE dimension.
   *
   * Dimensions are graded one after another (one model call each), in the
   * order comprehensiveness, depth, instructionFollowing, readability.
   *
   * @param report - Report to grade.
   * @param criteria - Per-dimension description, rubric and weight.
   * @param referenceReport - Optional reference report shown to the model for comparison.
   * @returns Per-dimension scores in 0-1 plus the weighted sum as `overall`.
   *          `overall` stays within 0-1 only when the weights sum to 1;
   *          weights are used as given and are not normalized here.
   */
  async evaluate(
    report: ResearchReport,
    criteria: EvaluationCriteria,
    referenceReport?: ResearchReport
  ): Promise<RACEScore> {
    const comprehensiveness = await this.evaluateDimension(
      report,
      criteria.comprehensiveness,
      'comprehensiveness',
      referenceReport
    );
    const depth = await this.evaluateDimension(report, criteria.depth, 'depth', referenceReport);
    const instructionFollowing = await this.evaluateDimension(
      report,
      criteria.instructionFollowing,
      'instructionFollowing',
      referenceReport
    );
    const readability = await this.evaluateDimension(
      report,
      criteria.readability,
      'readability',
      referenceReport
    );

    // Weighted overall score.
    const overall =
      comprehensiveness * criteria.comprehensiveness.weight +
      depth * criteria.depth.weight +
      instructionFollowing * criteria.instructionFollowing.weight +
      readability * criteria.readability.weight;

    return {
      overall,
      comprehensiveness,
      depth,
      instructionFollowing,
      readability,
      breakdown: [],
    };
  }

  /**
   * Grades a single dimension with one model call.
   *
   * Never throws: any failure (model error, unexpected response shape) is
   * logged and mapped to the fallback score so one bad dimension cannot abort
   * the whole evaluation.
   *
   * @returns A finite score in 0-1.
   */
  private async evaluateDimension(
    report: ResearchReport,
    criteriaDefinition: any,
    dimension: string,
    referenceReport?: ResearchReport
  ): Promise<number> {
    try {
      if (!this.runtime.useModel) {
        elizaLogger.warn(`[RACEEvaluator] No model available for ${dimension} evaluation, using default score`);
        return RACEEvaluator.NO_MODEL_SCORE;
      }

      const reportContent = this.extractReportContent(report);
      const referenceContent = referenceReport ? this.extractReportContent(referenceReport) : '';
      const rubricText = this.formatRubric(criteriaDefinition.rubric);

      const prompt = `Evaluate this research report on the ${dimension} dimension.

Evaluation Criteria:
${criteriaDefinition.description}

Rubric Items to Check:
${rubricText}

Report to Evaluate (first 5000 chars):
${reportContent.substring(0, 5000)}

${referenceContent ? `Reference Report for Comparison (first 2000 chars):\n${referenceContent.substring(0, 2000)}` : ''}

Provide a score from 0-100 based on how well the report meets the criteria.
Consider each rubric item and provide reasoning for your score.

Respond with JSON:
{
  "score": number (0-100),
  "reasoning": "explanation of score",
  "rubricScores": {
    "item1": score,
    "item2": score,
    ...
  }
}`;

      const response = await this.runtime.useModel(ModelType.TEXT_LARGE, {
        messages: [
          {
            role: 'system',
            content: 'You are an expert research evaluator. Provide a balanced, fair assessment.',
          },
          { role: 'user', content: prompt },
        ],
        temperature: 0.3,
        max_tokens: 1000,
      });

      // The model may hand back a bare string or an object carrying `content`.
      const content =
        typeof response === 'string' ? response : String((response as any)?.content ?? '');

      const score = this.parseScore(content, dimension);
      if (score !== null) {
        return score;
      }

      elizaLogger.error(`[RACEEvaluator] Failed to extract score for ${dimension}`);
      return RACEEvaluator.FALLBACK_SCORE;
    } catch (e) {
      elizaLogger.error(`[RACEEvaluator] Failed to evaluate ${dimension}:`, e);
      return RACEEvaluator.FALLBACK_SCORE;
    }
  }

  /**
   * Renders the rubric as a numbered list for the prompt.
   *
   * String items and items without a usable description are numbered by their
   * 1-based position; object items use their own `score` as the label when
   * present (including 0) and fall back to the 1-based position otherwise.
   */
  private formatRubric(rubric: unknown): string {
    if (!Array.isArray(rubric)) {
      return '';
    }

    return rubric
      .map((item: unknown, index: number) => {
        const position = index + 1;
        if (typeof item === 'string') {
          return `${position}. ${item}`;
        }
        const entry = item as { score?: unknown; description?: unknown } | null | undefined;
        if (entry?.description) {
          return `${entry.score ?? position}. ${entry.description}`;
        }
        return `${position}. Criterion ${position}`;
      })
      .join('\n');
  }

  /**
   * Recovers a 0-1 score from the model's reply.
   *
   * First tries the JSON object embedded in the reply; if that is missing,
   * malformed, or lacks a numeric `score`, falls back to scanning the raw text
   * for a `score: <number>` pattern (also matching the quoted JSON key, so a
   * reply truncated before its closing brace is still usable).
   *
   * @returns The clamped, normalized score, or `null` when none can be found.
   */
  private parseScore(content: string, dimension: string): number | null {
    const jsonMatch = content.match(/\{[\s\S]*\}/);
    if (jsonMatch) {
      try {
        const parsed: unknown = JSON.parse(jsonMatch[0]);
        const rawScore = (parsed as { score?: unknown } | null)?.score;
        const numeric =
          typeof rawScore === 'number'
            ? rawScore
            : typeof rawScore === 'string'
              ? Number.parseFloat(rawScore)
              : Number.NaN;
        // Only accept real numbers; NaN would otherwise leak through min/max.
        if (Number.isFinite(numeric)) {
          return this.normalizeScore(numeric);
        }
      } catch (parseError) {
        elizaLogger.warn(`[RACEEvaluator] Failed to parse JSON response for ${dimension}:`, parseError);
      }
    }

    const scoreMatch = content.match(/score["']?[:=\s]+["']?(\d+(?:\.\d+)?)/i);
    if (scoreMatch) {
      return this.normalizeScore(Number.parseFloat(scoreMatch[1]));
    }

    return null;
  }

  /** Converts a 0-100 score to 0-1, clamping out-of-range values. */
  private normalizeScore(scoreOutOf100: number): number {
    return Math.max(0, Math.min(1, scoreOutOf100 / 100));
  }

  /**
   * Flattens a report into markdown-like text (title, summary, sections and
   * subsections), capped at 10000 characters to bound the prompt size.
   */
  private extractReportContent(report: ResearchReport): string {
    let content = `Title: ${report.title}\n\nSummary: ${report.summary}\n\n`;

    for (const section of report.sections ?? []) {
      content += `## ${section.heading}\n${section.content}\n\n`;
      if (section.subsections) {
        for (const subsection of section.subsections) {
          content += `### ${subsection.heading}\n${subsection.content}\n\n`;
        }
      }
    }

    return content.substring(0, 10000); // Limit length for LLM context
  }
}
|
|
175
|
+
|
|
176
|
+
/**
 * FACT (Framework for Factual Abundance and Citation Trustworthiness) implementation.
 *
 * Extracts factual claims from a project's report sections (via the runtime's
 * large text model) and from its findings, asks the model whether each claim's
 * recorded evidence supports it, and summarizes the outcome as citation
 * statistics.
 */
export class FACTEvaluator {
  constructor(private runtime: IAgentRuntime) {}

  /**
   * Computes the FACT score for a project.
   *
   * - `verifiedCitations`: number of verified claims, duplicates included.
   * - `effectiveCitations`: number of verified claims after de-duplicating on
   *   statement + source URLs.
   * - `citationAccuracy`: verified unique claims / unique claims checked.
   *   Always within 0-1; 0 when no claims were checked.
   * - `citationCoverage`: effective citations relative to the number of report
   *   citations, capped at 1; 0 when the report has no citations.
   * - `disputedCitations` and `sourceCredibility` are fixed placeholders (0 and 0.8).
   */
  async evaluate(project: ResearchProject): Promise<FACTScore> {
    const allClaims = await this.extractFactualClaims(project);
    const verificationResults = await this.verifyClaims(allClaims);
    const uniqueClaims = this.deduplicateClaims(verificationResults);

    const totalCitations = project.report?.citations?.length ?? 0;
    const verifiedCitations = verificationResults.filter((r) => r.verified).length;
    const effectiveCitations = uniqueClaims.filter((c) => c.verified).length;

    // Accuracy compares claims with claims. Dividing verified claims by the
    // number of report citations mixes units and can exceed 100%.
    const citationAccuracy =
      uniqueClaims.length > 0 ? effectiveCitations / uniqueClaims.length : 0;

    // A report can yield more verified claims than it has citations, so cap at 1.
    const citationCoverage =
      totalCitations > 0 ? Math.min(1, effectiveCitations / totalCitations) : 0;

    return {
      citationAccuracy,
      effectiveCitations,
      totalCitations,
      verifiedCitations,
      disputedCitations: 0,
      citationCoverage,
      sourceCredibility: 0.8, // Default credibility score
      breakdown: [],
    };
  }

  /**
   * Collects claims from every report section (model-extracted) followed by
   * the claims already attached to the project's findings.
   * Returns an empty list when the project has no report.
   */
  private async extractFactualClaims(project: ResearchProject): Promise<FactualClaim[]> {
    const claims: FactualClaim[] = [];

    if (!project.report) return claims;

    // Sections are processed sequentially: one model call per section.
    for (const section of project.report.sections ?? []) {
      const sectionClaims = await this.extractClaimsFromText(section.content, section.citations);
      claims.push(...sectionClaims);
    }

    for (const finding of project.findings ?? []) {
      if (finding.factualClaims) {
        claims.push(...finding.factualClaims);
      }
    }

    return claims;
  }

  /**
   * Asks the model for up to five factual claims in `text`, each linked to one
   * of the listed citations by its 1-based number.
   *
   * Never throws: a missing model, a failed call or an unparseable reply all
   * yield an empty list. A missing `citations` array is treated as empty, and
   * entries without a non-empty string `statement` are discarded.
   *
   * @param text - Section text; only the first 3000 characters are sent.
   * @param citations - Citations of the section; only the first 10 are offered
   *                    to the model and used to resolve `citationIndex`.
   */
  private async extractClaimsFromText(
    text: string,
    citations: Citation[]
  ): Promise<FactualClaim[]> {
    if (!this.runtime.useModel) {
      elizaLogger.warn('[FACTEvaluator] No model available for claim extraction');
      return [];
    }

    try {
      // Sections may arrive without citations or content; treat both as empty.
      const listedCitations = (citations ?? []).slice(0, 10);
      const sourceText = String(text ?? '');

      const prompt = `Extract factual claims and their citations from this text.

Text:
${sourceText.substring(0, 3000)}

Available Citations:
${listedCitations.map((c, i) => `[${i + 1}] ${c.source?.url}`).join('\n')}

For each factual claim in the text:
1. Extract the exact statement
2. Identify which citation supports it (by number)
3. Note the supporting evidence

Respond with JSON array:
[
  {
    "statement": "exact factual claim",
    "citationIndex": number,
    "supportingEvidence": "relevant quote or context"
  }
]

Extract 3-5 key claims maximum.`;

      const response = await this.runtime.useModel(ModelType.TEXT_LARGE, {
        messages: [
          {
            role: 'system',
            content: 'You are a fact extraction expert. Extract only clear, verifiable claims.',
          },
          { role: 'user', content: prompt },
        ],
        temperature: 0.2,
        max_tokens: 1500,
      });

      const content =
        typeof response === 'string' ? response : String((response as any)?.content ?? '');

      const jsonMatch = content.match(/\[[\s\S]*\]/);
      if (!jsonMatch) {
        return [];
      }

      const extracted: unknown = JSON.parse(jsonMatch[0]);
      if (!Array.isArray(extracted)) {
        return [];
      }

      return extracted
        .filter(
          (item: any) =>
            item !== null &&
            typeof item === 'object' &&
            typeof item.statement === 'string' &&
            item.statement.trim() !== ''
        )
        .slice(0, 5)
        .map((item: any) => {
          // citationIndex is 1-based in the prompt; anything unresolvable maps to no source.
          const citedUrl = listedCitations[Number(item.citationIndex) - 1]?.source?.url;
          return {
            id: `claim_${Date.now()}_${Math.random().toString(36).slice(2, 11)}`,
            statement: item.statement,
            supportingEvidence: [item.supportingEvidence || ''],
            sourceUrls: citedUrl ? [citedUrl] : [],
            verificationStatus: VerificationStatus.UNVERIFIED,
            confidenceScore: 0.8,
            relatedClaims: [],
          };
        });
    } catch (e) {
      elizaLogger.error('[FACTEvaluator] Failed to extract claims:', e);
      return [];
    }
  }

  /** Verifies claims one at a time, preserving input order. */
  private async verifyClaims(
    claims: FactualClaim[]
  ): Promise<Array<FactualClaim & { verified: boolean }>> {
    const results: Array<FactualClaim & { verified: boolean }> = [];

    for (const claim of claims) {
      const verified = await this.verifySingleClaim(claim);
      results.push({ ...claim, verified });
    }

    return results;
  }

  /**
   * Decides whether a single claim counts as verified.
   *
   * Without a model, falls back to `confidenceScore > 0.7`. With a model, a
   * claim lacking a statement or any source URL is unverified; otherwise the
   * model is asked a yes/no question about the claim's recorded evidence.
   *
   * NOTE: this is a simplified check. The source URL content is not fetched;
   * only the evidence text stored on the claim is shown to the model.
   */
  private async verifySingleClaim(claim: FactualClaim): Promise<boolean> {
    if (!this.runtime.useModel) {
      return claim.confidenceScore > 0.7;
    }

    if (!claim.sourceUrls || claim.sourceUrls.length === 0 || !claim.statement) return false;

    const prompt = `Does this evidence support the claim?

Claim: ${claim.statement}
Evidence: ${claim.supportingEvidence?.join(' ') ?? ''}
Source URL: ${claim.sourceUrls[0]}

Answer with just "yes" or "no".`;

    try {
      const response = await this.runtime.useModel(ModelType.TEXT_LARGE, {
        messages: [
          {
            role: 'system',
            content: 'You are a fact verifier. Answer only yes or no.',
          },
          { role: 'user', content: prompt },
        ],
        temperature: 0.1,
        max_tokens: 10,
      });

      const answer =
        typeof response === 'string' ? response : String((response as any)?.content ?? '');

      // Match "yes" as a whole word so replies such as "No - yesterday's data
      // disagrees" are not mistaken for confirmation.
      return /\byes\b/i.test(answer);
    } catch (e) {
      elizaLogger.error('[FACTEvaluator] Failed to verify claim:', e);
      return false;
    }
  }

  /**
   * Drops repeated claims, keeping the first occurrence of each
   * statement + source-URL combination.
   */
  private deduplicateClaims(
    claims: Array<FactualClaim & { verified: boolean }>
  ): Array<FactualClaim & { verified: boolean }> {
    const seen = new Set<string>();
    const unique: Array<FactualClaim & { verified: boolean }> = [];

    for (const claim of claims) {
      const key = `${claim.statement}|${claim.sourceUrls?.join(',')}`;
      if (!seen.has(key)) {
        seen.add(key);
        unique.push(claim);
      }
    }

    return unique;
  }
}
|
|
372
|
+
|
|
373
|
+
/**
 * Combined evaluator for complete research assessment.
 *
 * Runs the RACE (report quality) and FACT (citation quality) evaluators on a
 * project and renders the resulting metrics as a markdown report.
 */
export class ResearchEvaluator {
  private raceEvaluator: RACEEvaluator;
  private factEvaluator: FACTEvaluator;

  constructor(runtime: IAgentRuntime) {
    this.raceEvaluator = new RACEEvaluator(runtime);
    this.factEvaluator = new FACTEvaluator(runtime);
  }

  /**
   * Evaluates a project with both evaluators, running them concurrently.
   *
   * @param project - Project to evaluate; must carry a report.
   * @param criteria - RACE criteria (description, rubric and weight per dimension).
   * @param referenceReport - Optional reference report for the RACE comparison.
   * @returns Both scores, stamped with the current time and evaluator version '1.0'.
   * @throws Error when `project.report` is missing.
   */
  async evaluateProject(
    project: ResearchProject,
    criteria: EvaluationCriteria,
    referenceReport?: ResearchReport
  ): Promise<EvaluationMetrics> {
    if (!project.report) {
      throw new Error('Project must have a completed report for evaluation');
    }

    const [raceScore, factScore] = await Promise.all([
      this.raceEvaluator.evaluate(project.report, criteria, referenceReport),
      this.factEvaluator.evaluate(project),
    ]);

    return {
      raceScore,
      factScore,
      timestamp: Date.now(),
      evaluatorVersion: '1.0',
    };
  }

  /**
   * Renders evaluation metrics as a markdown report.
   *
   * When a score is absent its section reads "Not available" and the overall
   * assessment says so, instead of failing on a missing property. For complete
   * metrics the output is identical to the previous template-based rendering.
   */
  formatEvaluationReport(metrics: EvaluationMetrics): string {
    const race = metrics.raceScore;
    const fact = metrics.factScore;

    const raceLines = race
      ? [
          `- **Overall**: ${this.formatPercent(race.overall)}`,
          `- **Comprehensiveness**: ${this.formatPercent(race.comprehensiveness)}`,
          `- **Depth**: ${this.formatPercent(race.depth)}`,
          `- **Instruction Following**: ${this.formatPercent(race.instructionFollowing)}`,
          `- **Readability**: ${this.formatPercent(race.readability)}`,
        ]
      : ['- Not available'];

    const factLines = fact
      ? [
          `- **Citation Accuracy**: ${this.formatPercent(fact.citationAccuracy)}`,
          `- **Effective Citations**: ${fact.effectiveCitations}`,
          `- **Total Citations**: ${fact.totalCitations}`,
          `- **Verified Citations**: ${fact.verifiedCitations}`,
        ]
      : ['- Not available'];

    const assessment =
      race && fact
        ? this.generateAssessment(race, fact)
        : 'Assessment unavailable: RACE and FACT scores are both required.';

    return [
      '# Research Evaluation Report',
      '',
      '## RACE Scores',
      ...raceLines,
      '',
      '## FACT Scores',
      ...factLines,
      '',
      '## Overall Assessment',
      assessment,
    ].join('\n');
  }

  /** Formats a 0-1 ratio as a percentage with one decimal; 'N/A' for non-finite values. */
  private formatPercent(ratio: number): string {
    return Number.isFinite(ratio) ? `${(ratio * 100).toFixed(1)}%` : 'N/A';
  }

  /**
   * Maps the overall RACE score and the citation accuracy to a one-line verdict.
   *
   * Both must reach 0.8 for "excellent" and 0.6 for "good"; either one
   * reaching 0.4 yields "moderate"; otherwise both are below 0.4.
   */
  private generateAssessment(race: RACEScore, fact: FACTScore): string {
    const overallQuality = race.overall;
    const citationQuality = fact.citationAccuracy;

    if (overallQuality >= 0.8 && citationQuality >= 0.8) {
      return 'Excellent research quality with strong factual grounding.';
    }
    if (overallQuality >= 0.6 && citationQuality >= 0.6) {
      return 'Good research quality with adequate citation support.';
    }
    if (overallQuality >= 0.4 || citationQuality >= 0.4) {
      return 'Moderate research quality. Consider improving depth and citation accuracy.';
    }
    return 'Research needs significant improvement in both content quality and factual support.';
  }
}
|