@agentv/core 0.22.1 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +595 -252
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +61 -22
- package/dist/index.d.ts +61 -22
- package/dist/index.js +594 -251
- package/dist/index.js.map +1 -1
- package/package.json +4 -8
package/dist/index.js CHANGED

@@ -51,7 +51,7 @@ function isTestMessage(value) {
   }
   return candidate.content.every(isJsonObject);
 }
-var EVALUATOR_KIND_VALUES = ["
+var EVALUATOR_KIND_VALUES = ["code_judge", "llm_judge", "rubric", "composite"];
 var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
 function isEvaluatorKind(value) {
   return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
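
Note: EVALUATOR_KIND_VALUES now enumerates four evaluator kinds, and isEvaluatorKind is a plain membership check against that list. A small illustrative sketch (the sample strings are invented; isEvaluatorKind is module-internal and shown only to describe its behavior):

// Illustrative only: membership check against the new kind list.
isEvaluatorKind("code_judge"); // true
isEvaluatorKind("composite");  // true (new composite kind)
isEvaluatorKind("quality");    // false - unknown strings are rejected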
@@ -403,10 +403,10 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
       continue;
     }
-    if (typeValue === "
+    if (typeValue === "code_judge") {
       const script = asString2(rawEvaluator.script);
       if (!script) {
-        logWarning2(`Skipping
+        logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
         continue;
       }
       const cwd = asString2(rawEvaluator.cwd);
@@ -417,7 +417,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
          resolvedCwd = path3.resolve(resolved.resolvedPath);
        } else {
          logWarning2(
-            `
+            `Code_judge evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
            resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
          );
        }
@@ -433,6 +433,105 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       });
       continue;
     }
+    if (typeValue === "composite") {
+      const rawMembers = rawEvaluator.evaluators;
+      if (!Array.isArray(rawMembers)) {
+        logWarning2(
+          `Skipping composite evaluator '${name}' in '${evalId}': missing evaluators array`
+        );
+        continue;
+      }
+      const rawAggregator = rawEvaluator.aggregator;
+      if (!isJsonObject2(rawAggregator)) {
+        logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
+        continue;
+      }
+      const aggregatorType = asString2(rawAggregator.type);
+      if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
+        logWarning2(
+          `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
+        );
+        continue;
+      }
+      const memberEvaluators = [];
+      for (const rawMember of rawMembers) {
+        if (!isJsonObject2(rawMember)) {
+          logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
+          continue;
+        }
+        const memberName = asString2(rawMember.name);
+        const memberType = rawMember.type;
+        if (!memberName || !isEvaluatorKind(memberType)) {
+          logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
+          continue;
+        }
+        const memberConfigs = await parseEvaluators(
+          { evaluators: [rawMember] },
+          void 0,
+          searchRoots,
+          `${evalId}:${name}:${memberName}`
+        );
+        if (memberConfigs && memberConfigs.length > 0) {
+          memberEvaluators.push(memberConfigs[0]);
+        }
+      }
+      if (memberEvaluators.length === 0) {
+        logWarning2(
+          `Skipping composite evaluator '${name}' in '${evalId}': no valid member evaluators`
+        );
+        continue;
+      }
+      let aggregator;
+      if (aggregatorType === "weighted_average") {
+        const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
+        const parsedWeights = {};
+        if (weights) {
+          for (const [key, value] of Object.entries(weights)) {
+            if (typeof value === "number") {
+              parsedWeights[key] = value;
+            }
+          }
+        }
+        aggregator = {
+          type: "weighted_average",
+          ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
+        };
+      } else if (aggregatorType === "code_judge") {
+        const aggregatorPath = asString2(rawAggregator.path);
+        if (!aggregatorPath) {
+          logWarning2(
+            `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
+          );
+          continue;
+        }
+        aggregator = {
+          type: "code_judge",
+          path: aggregatorPath,
+          cwd: searchRoots[0]
+        };
+      } else {
+        const aggregatorPrompt = asString2(rawAggregator.prompt);
+        let promptPath2;
+        if (aggregatorPrompt) {
+          const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
+          if (resolved.resolvedPath) {
+            promptPath2 = path3.resolve(resolved.resolvedPath);
+          }
+        }
+        aggregator = {
+          type: "llm_judge",
+          ...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
+          ...promptPath2 ? { promptPath: promptPath2 } : {}
+        };
+      }
+      evaluators.push({
+        name,
+        type: "composite",
+        evaluators: memberEvaluators,
+        aggregator
+      });
+      continue;
+    }
     const prompt = asString2(rawEvaluator.prompt);
     let promptPath;
     if (prompt) {
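
Note: a rough sketch of the evaluator entry shape this new "composite" branch accepts, inferred from the reads on rawEvaluator above. The member names, script path, and weights are invented for illustration; the field names and allowed aggregator types come from the parser.

// Hypothetical composite entry as it might appear in an eval case's `evaluators` list.
const compositeEntry = {
  name: "overall",
  type: "composite",
  evaluators: [
    // Each member is re-parsed through parseEvaluators with a scoped eval id.
    { name: "style", type: "llm_judge", prompt: "Judge the writing style." },
    { name: "tests", type: "code_judge", script: "./judge.mjs" }
  ],
  aggregator: {
    // One of "weighted_average", "code_judge", or "llm_judge".
    type: "weighted_average",
    weights: { style: 1, tests: 2 } // non-numeric weights are silently dropped
  }
};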
@@ -453,25 +552,25 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       }
     }
     const _model = asString2(rawEvaluator.model);
+    const rawRubrics = rawEvaluator.rubrics;
+    const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
+      id: asString2(rubric.id) ?? `rubric-${index + 1}`,
+      description: asString2(rubric.description) ?? "",
+      weight: typeof rubric.weight === "number" ? rubric.weight : 1,
+      required: typeof rubric.required === "boolean" ? rubric.required : true
+    })).filter((r) => r.description.length > 0) : void 0;
     if (typeValue === "rubric") {
-
-      if (!Array.isArray(rubrics)) {
+      if (!parsedRubrics) {
         logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
         continue;
       }
-      const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
-        id: asString2(rubric.id) ?? `rubric-${index + 1}`,
-        description: asString2(rubric.description) ?? "",
-        weight: typeof rubric.weight === "number" ? rubric.weight : 1,
-        required: typeof rubric.required === "boolean" ? rubric.required : true
-      })).filter((r) => r.description.length > 0);
       if (parsedRubrics.length === 0) {
         logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
         continue;
       }
       evaluators.push({
         name,
-        type: "
+        type: "llm_judge",
         rubrics: parsedRubrics
       });
       continue;
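
Note: rubric items are now parsed once, before the type dispatch, and a `type: "rubric"` entry is emitted as an "llm_judge" config carrying those rubrics. A hedged sketch of a single rubric item and the defaults the parser applies (the id and description are invented):

// Hypothetical rubric item; omitted fields take the defaults noted below.
const rubricItem = {
  id: "cites-sources",                                   // default: "rubric-<index + 1>"
  description: "The answer cites at least one source.",  // items with empty descriptions are filtered out
  weight: 2,                                             // default: 1
  required: true                                         // default: true
};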
@@ -480,7 +579,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       name,
       type: "llm_judge",
       prompt,
-      promptPath
+      promptPath,
+      ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
     });
   }
   return evaluators.length > 0 ? evaluators : void 0;
@@ -1031,7 +1131,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
   if (rubricItems.length > 0) {
     const rubricEvaluator = {
       name: "rubric",
-      type: "
+      type: "llm_judge",
       rubrics: rubricItems
     };
     evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
@@ -2928,149 +3028,9 @@ function resolveAndCreateProvider(definition, env = process.env) {
   return createProvider(resolved);
 }

-// src/evaluation/evaluators
+// src/evaluation/evaluators.ts
 import { generateText as generateText2 } from "ai";
 import { z } from "zod";
-var rubricCheckResultSchema = z.object({
-  id: z.string().describe("The ID of the rubric item being checked"),
-  satisfied: z.boolean().describe("Whether this rubric requirement is met"),
-  reasoning: z.string().describe("Brief explanation (1-2 sentences) for this check")
-});
-var rubricEvaluationSchema = z.object({
-  checks: z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
-  overall_reasoning: z.string().describe("Overall assessment summary (1-2 sentences)")
-});
-var RubricEvaluator = class {
-  kind = "rubric";
-  config;
-  resolveJudgeProvider;
-  constructor(options) {
-    this.config = options.config;
-    this.resolveJudgeProvider = options.resolveJudgeProvider;
-  }
-  async evaluate(context) {
-    const judgeProvider = await this.resolveJudgeProvider(context);
-    if (!judgeProvider) {
-      throw new Error("No judge provider available for rubric evaluation");
-    }
-    if (!this.config.rubrics || this.config.rubrics.length === 0) {
-      throw new Error(
-        `No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
-      );
-    }
-    const prompt = this.buildPrompt(context, this.config.rubrics);
-    const model = judgeProvider.asLanguageModel?.();
-    if (!model) {
-      throw new Error("Judge provider does not support language model interface");
-    }
-    const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
-You must return a valid JSON object matching this schema:
-{
-  "checks": [
-    {
-      "id": "string (rubric id)",
-      "satisfied": boolean,
-      "reasoning": "string (brief explanation)"
-    }
-  ],
-  "overall_reasoning": "string (summary)"
-}`;
-    let result;
-    let lastError;
-    for (let attempt = 1; attempt <= 3; attempt++) {
-      try {
-        const { text } = await generateText2({
-          model,
-          system,
-          prompt
-        });
-        const cleaned = text.replace(/```json\n?|```/g, "").trim();
-        result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
-        break;
-      } catch (e) {
-        lastError = e instanceof Error ? e : new Error(String(e));
-      }
-    }
-    if (!result) {
-      throw new Error(
-        `Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
-      );
-    }
-    const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
-    return {
-      score,
-      verdict,
-      hits,
-      misses,
-      expectedAspectCount: this.config.rubrics.length,
-      reasoning: result.overall_reasoning,
-      evaluatorRawRequest: {
-        prompt
-      }
-    };
-  }
-  buildPrompt(context, rubrics) {
-    const parts = [
-      "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
-      "",
-      "[[ ## question ## ]]",
-      context.evalCase.question,
-      "",
-      "[[ ## expected_outcome ## ]]",
-      context.evalCase.expected_outcome,
-      ""
-    ];
-    if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
-      parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
-    }
-    parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
-    for (const rubric of rubrics) {
-      const requiredLabel = rubric.required ? " (REQUIRED)" : "";
-      const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
-      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
-    }
-    parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
-    return parts.join("\n");
-  }
-  calculateScore(result, rubrics) {
-    const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
-    const hits = [];
-    const misses = [];
-    let totalWeight = 0;
-    let earnedWeight = 0;
-    let failedRequired = false;
-    for (const check of result.checks) {
-      const rubric = rubricMap.get(check.id);
-      if (!rubric) {
-        continue;
-      }
-      totalWeight += rubric.weight;
-      if (check.satisfied) {
-        earnedWeight += rubric.weight;
-        hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
-      } else {
-        misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
-        if (rubric.required) {
-          failedRequired = true;
-        }
-      }
-    }
-    const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
-    let verdict;
-    if (failedRequired) {
-      verdict = "fail";
-    } else if (score >= 0.8) {
-      verdict = "pass";
-    } else if (score >= 0.6) {
-      verdict = "borderline";
-    } else {
-      verdict = "fail";
-    }
-    return { score, verdict, hits, misses };
-  }
-};
-
-// src/evaluation/evaluators.ts
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.

 Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -3088,6 +3048,21 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r

 [[ ## candidate_answer ## ]]
 {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
+var freeformEvaluationSchema = z.object({
+  score: z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
+  hits: z.array(z.string()).describe("Brief specific achievements").optional(),
+  misses: z.array(z.string()).describe("Brief failures or omissions").optional(),
+  reasoning: z.string().describe("Concise explanation (1-2 sentences)").optional()
+});
+var rubricCheckResultSchema = z.object({
+  id: z.string().describe("The ID of the rubric item being checked"),
+  satisfied: z.boolean().describe("Whether this rubric requirement is met"),
+  reasoning: z.string().describe("Brief explanation (1-2 sentences) for this check")
+});
+var rubricEvaluationSchema = z.object({
+  checks: z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
+  overall_reasoning: z.string().describe("Overall assessment summary (1-2 sentences)")
+});
 var LlmJudgeEvaluator = class {
   kind = "llm_judge";
   resolveJudgeProvider;
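
Note: freeformEvaluationSchema is the shape the judge reply is validated against for non-rubric grading. A sample reply that would pass validation (the wording is invented; only the field names and the 0-1 score range come from the schema above):

// Illustrative judge reply; hits, misses, and reasoning are optional.
const sampleFreeformReply = {
  score: 0.85,
  hits: ["Addresses the expected outcome directly"],
  misses: ["Omits the edge case mentioned in the reference answer"],
  reasoning: "Covers the key points but skips one requirement."
};
freeformEvaluationSchema.parse(sampleFreeformReply); // passes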
@@ -3105,9 +3080,13 @@ var LlmJudgeEvaluator = class {
     if (!judgeProvider) {
       throw new Error("No judge provider available for LLM grading");
     }
-
+    const config = context.evaluator;
+    if (config?.type === "llm_judge" && config.rubrics && config.rubrics.length > 0) {
+      return this.evaluateWithRubrics(context, judgeProvider, config.rubrics);
+    }
+    return this.evaluateFreeform(context, judgeProvider);
   }
-  async
+  async evaluateFreeform(context, judgeProvider) {
     const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
     const variables = {
       [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
@@ -3124,34 +3103,132 @@ var LlmJudgeEvaluator = class {
     const systemPrompt = buildOutputSchema();
     const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
     const userPrompt = substituteVariables(evaluatorTemplate, variables);
-    const response = await judgeProvider.invoke({
-      question: userPrompt,
-      systemPrompt,
-      evalCaseId: context.evalCase.id,
-      attempt: context.attempt,
-      maxOutputTokens: this.maxOutputTokens,
-      temperature: this.temperature
-    });
-    const parsed = parseQualityResponse(response);
-    const score = clampScore(parsed.score ?? 0);
-    const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
-    const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
-    const reasoning = parsed.reasoning ?? response.reasoning;
-    const expectedAspectCount = Math.max(hits.length + misses.length, 1);
     const evaluatorRawRequest = {
       userPrompt,
       systemPrompt,
       target: judgeProvider.targetName
     };
+    try {
+      const { data, providerResponse } = await this.runWithRetry({
+        context,
+        judgeProvider,
+        systemPrompt,
+        userPrompt,
+        schema: freeformEvaluationSchema
+      });
+      const score = clampScore(data.score);
+      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
+      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
+      const reasoning = data.reasoning ?? providerResponse?.reasoning;
+      const expectedAspectCount = Math.max(hits.length + misses.length, 1);
+      return {
+        score,
+        verdict: scoreToVerdict(score),
+        hits,
+        misses,
+        expectedAspectCount,
+        reasoning,
+        evaluatorRawRequest
+      };
+    } catch {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [],
+        expectedAspectCount: 1,
+        evaluatorRawRequest
+      };
+    }
+  }
+  async evaluateWithRubrics(context, judgeProvider, rubrics) {
+    if (!rubrics || rubrics.length === 0) {
+      throw new Error(
+        `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
+      );
+    }
+    const prompt = this.buildRubricPrompt(context, rubrics);
+    const systemPrompt = buildRubricOutputSchema();
+    const evaluatorRawRequest = {
+      userPrompt: prompt,
+      systemPrompt,
+      target: judgeProvider.targetName
+    };
+    const { data } = await this.runWithRetry({
+      context,
+      judgeProvider,
+      systemPrompt,
+      userPrompt: prompt,
+      schema: rubricEvaluationSchema
+    });
+    const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
     return {
       score,
+      verdict,
       hits,
       misses,
-      expectedAspectCount,
-      reasoning,
+      expectedAspectCount: rubrics.length,
+      reasoning: data.overall_reasoning,
       evaluatorRawRequest
     };
   }
+  buildRubricPrompt(context, rubrics) {
+    const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
+    const parts = [
+      "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
+      "",
+      "[[ ## question ## ]]",
+      formattedQuestion,
+      "",
+      "[[ ## expected_outcome ## ]]",
+      context.evalCase.expected_outcome,
+      ""
+    ];
+    if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
+      parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
+    }
+    parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
+    for (const rubric of rubrics) {
+      const requiredLabel = rubric.required ? " (REQUIRED)" : "";
+      const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
+      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+    }
+    parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
+    return parts.join("\n");
+  }
+  async runWithRetry(options) {
+    const { context, judgeProvider, systemPrompt, userPrompt, schema } = options;
+    let lastError;
+    for (let attempt = 1; attempt <= 3; attempt++) {
+      try {
+        const model = judgeProvider.asLanguageModel?.();
+        if (model) {
+          const { text } = await generateText2({
+            model,
+            system: systemPrompt,
+            prompt: userPrompt,
+            ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
+            ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
+          });
+          const data2 = schema.parse(parseJsonFromText(text));
+          return { data: data2 };
+        }
+        const response = await judgeProvider.invoke({
+          question: userPrompt,
+          systemPrompt,
+          evalCaseId: context.evalCase.id,
+          attempt: context.attempt,
+          maxOutputTokens: this.maxOutputTokens,
+          temperature: this.temperature
+        });
+        const data = schema.parse(parseJsonFromText(response.text ?? ""));
+        return { data, providerResponse: response };
+      } catch (e) {
+        lastError = e instanceof Error ? e : new Error(String(e));
+      }
+    }
+    throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
+  }
 };
 function buildOutputSchema() {
   return [
@@ -3165,6 +3242,29 @@ function buildOutputSchema() {
     "}"
   ].join("\n");
 }
+function buildRubricOutputSchema() {
+  return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
+You must return a valid JSON object matching this schema:
+{
+  "checks": [
+    {
+      "id": "string (rubric id)",
+      "satisfied": boolean,
+      "reasoning": "string (brief explanation)"
+    }
+  ],
+  "overall_reasoning": "string (summary)"
+}`;
+}
+function scoreToVerdict(score) {
+  if (score >= 0.8) {
+    return "pass";
+  }
+  if (score >= 0.6) {
+    return "borderline";
+  }
+  return "fail";
+}
 function clampScore(value) {
   if (Number.isNaN(value) || !Number.isFinite(value)) {
     return 0;
@@ -3177,71 +3277,15 @@ function clampScore(value) {
   }
   return value;
 }
-function parseQualityResponse(response) {
-  const text = typeof response.text === "string" ? response.text.trim() : "";
-  if (text.length === 0) {
-    return {};
-  }
-  const direct = attemptParseJson(text);
-  if (direct && validateQualityJson(direct)) {
-    return direct;
-  }
-  const extracted = extractJsonBlob(text);
-  if (extracted) {
-    const parsed = attemptParseJson(extracted);
-    if (parsed && validateQualityJson(parsed)) {
-      return parsed;
-    }
-  }
-  return {};
-}
-function attemptParseJson(text) {
-  try {
-    const parsed = JSON.parse(text);
-    const score = typeof parsed.score === "number" ? parsed.score : void 0;
-    const hits = parsed.hits;
-    const misses = parsed.misses;
-    const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
-    return { score, hits, misses, reasoning };
-  } catch {
-    return void 0;
-  }
-}
-function validateQualityJson(parsed) {
-  if (typeof parsed.score !== "number") {
-    return false;
-  }
-  if (Number.isNaN(parsed.score) || !Number.isFinite(parsed.score)) {
-    return false;
-  }
-  if (parsed.score < 0 || parsed.score > 1) {
-    return false;
-  }
-  if (parsed.hits !== void 0) {
-    if (!Array.isArray(parsed.hits)) {
-      return false;
-    }
-    if (!parsed.hits.every((item) => typeof item === "string")) {
-      return false;
-    }
-  }
-  if (parsed.misses !== void 0) {
-    if (!Array.isArray(parsed.misses)) {
-      return false;
-    }
-    if (!parsed.misses.every((item) => typeof item === "string")) {
-      return false;
-    }
-  }
-  if (parsed.reasoning !== void 0 && typeof parsed.reasoning !== "string") {
-    return false;
-  }
-  return true;
-}
 function extractJsonBlob(text) {
   const match = text.match(/\{[\s\S]*\}/);
   return match?.[0];
 }
+function parseJsonFromText(text) {
+  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
+  const blob = extractJsonBlob(cleaned) ?? cleaned;
+  return JSON.parse(blob);
+}
 function isNonEmptyString(value) {
   return typeof value === "string" && value.trim().length > 0;
 }
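
Note: parseJsonFromText replaces the old parseQualityResponse/attemptParseJson/validateQualityJson trio; it strips ```json fences, falls back to the first {...} blob via extractJsonBlob, and leaves validation to the zod schema in runWithRetry. A small illustration with an invented input (the helper is module-internal):

// Illustrative input: a fenced judge reply with leading chatter.
const reply = 'Here is my verdict:\n```json\n{"score": 0.7, "hits": [], "misses": []}\n```';
parseJsonFromText(reply); // -> { score: 0.7, hits: [], misses: [] }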
@@ -3278,6 +3322,7 @@ var CodeEvaluator = class {
       const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
       return {
         score,
+        verdict: scoreToVerdict(score),
         hits,
         misses,
         expectedAspectCount: hits.length + misses.length || 1,
@@ -3291,6 +3336,7 @@ var CodeEvaluator = class {
       const message = error instanceof Error ? error.message : String(error);
       return {
         score: 0,
+        verdict: "fail",
         hits: [],
         misses: [`Code evaluator failed: ${message}`],
         expectedAspectCount: 1,
@@ -3304,6 +3350,33 @@ var CodeEvaluator = class {
     }
   }
 };
+function calculateRubricScore(result, rubrics) {
+  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
+  const hits = [];
+  const misses = [];
+  let totalWeight = 0;
+  let earnedWeight = 0;
+  let failedRequired = false;
+  for (const check of result.checks) {
+    const rubric = rubricMap.get(check.id);
+    if (!rubric) {
+      continue;
+    }
+    totalWeight += rubric.weight;
+    if (check.satisfied) {
+      earnedWeight += rubric.weight;
+      hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+    } else {
+      misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      if (rubric.required) {
+        failedRequired = true;
+      }
+    }
+  }
+  const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
+  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
+  return { score, verdict, hits, misses };
+}
 async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
   const { spawn: spawn2 } = await import("node:child_process");
   return await new Promise((resolve, reject) => {
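
Note: a worked example of calculateRubricScore with invented rubric ids. Two of three weight units are earned, so the score is 2/3 ≈ 0.67, and because the missed item is not required the verdict lands in the borderline band (>= 0.6 and < 0.8). The function is module-internal; the call is shown only to illustrate the arithmetic.

// Illustrative inputs only.
const rubrics = [
  { id: "r1", description: "States the final answer", weight: 2, required: true },
  { id: "r2", description: "Shows intermediate steps", weight: 1, required: false }
];
const judgeResult = {
  checks: [
    { id: "r1", satisfied: true, reasoning: "Answer stated clearly." },
    { id: "r2", satisfied: false, reasoning: "No working shown." }
  ],
  overall_reasoning: "Correct result, thin justification."
};
calculateRubricScore(judgeResult, rubrics);
// -> { score: 0.666..., verdict: "borderline", hits: ["[r1] ..."], misses: ["[r2] ..."] }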
@@ -3355,6 +3428,228 @@ function substituteVariables(template, variables) {
     return variables[varName] ?? match;
   });
 }
+var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
+{{EVALUATOR_RESULTS_JSON}}
+
+Decide the final score and verdict based on all evaluator results.
+Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
+var CompositeEvaluator = class {
+  kind = "composite";
+  config;
+  evaluatorFactory;
+  cwd;
+  constructor(options) {
+    this.config = options.config;
+    this.evaluatorFactory = options.evaluatorFactory;
+    this.cwd = options.cwd;
+  }
+  async evaluate(context) {
+    const memberResults = await Promise.all(
+      this.config.evaluators.map(async (memberConfig) => {
+        const evaluator = this.evaluatorFactory.create(memberConfig, context);
+        return {
+          id: memberConfig.name,
+          type: memberConfig.type,
+          result: await evaluator.evaluate(context)
+        };
+      })
+    );
+    return this.aggregate(memberResults, context);
+  }
+  async aggregate(results, context) {
+    const aggregator = this.config.aggregator;
+    switch (aggregator.type) {
+      case "code_judge":
+        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
+      case "llm_judge":
+        return this.runLlmAggregator(results, context, aggregator);
+      default:
+        return this.runWeightedAverage(results, aggregator.weights);
+    }
+  }
+  runWeightedAverage(results, weights) {
+    let totalWeight = 0;
+    let weightedSum = 0;
+    const allHits = [];
+    const allMisses = [];
+    const reasoningParts = [];
+    const evaluatorResults = [];
+    for (const member of results) {
+      const weight = weights?.[member.id] ?? 1;
+      totalWeight += weight;
+      weightedSum += member.result.score * weight;
+      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
+      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
+      if (member.result.reasoning) {
+        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
+      }
+      evaluatorResults.push({
+        name: member.id,
+        type: member.type,
+        score: member.result.score,
+        weight,
+        verdict: member.result.verdict,
+        hits: [...member.result.hits],
+        misses: [...member.result.misses],
+        reasoning: member.result.reasoning,
+        evaluatorRawRequest: member.result.evaluatorRawRequest,
+        evaluatorResults: member.result.evaluatorResults
+      });
+    }
+    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
+    return {
+      score: clampScore(finalScore),
+      verdict: scoreToVerdict(finalScore),
+      hits: allHits,
+      misses: allMisses,
+      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
+      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
+      evaluatorRawRequest: {
+        aggregator: "weighted_average",
+        ...weights ? { weights } : {}
+      },
+      evaluatorResults
+    };
+  }
+  async runCodeAggregator(results, scriptPath, cwd, weights) {
+    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
+    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
+    const evaluatorResults = results.map((member) => ({
+      name: member.id,
+      type: member.type,
+      score: member.result.score,
+      weight: weights?.[member.id] ?? 1,
+      verdict: member.result.verdict,
+      hits: [...member.result.hits],
+      misses: [...member.result.misses],
+      reasoning: member.result.reasoning,
+      evaluatorRawRequest: member.result.evaluatorRawRequest,
+      evaluatorResults: member.result.evaluatorResults
+    }));
+    try {
+      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
+      const parsed = parseJsonSafe(stdout);
+      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
+      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
+      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
+      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
+      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
+      return {
+        score,
+        verdict,
+        hits,
+        misses,
+        expectedAspectCount: hits.length + misses.length || 1,
+        reasoning,
+        evaluatorRawRequest: {
+          aggregator: "code_judge",
+          script: scriptPath
+        },
+        evaluatorResults
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [`Code aggregator failed: ${message}`],
+        expectedAspectCount: 1,
+        reasoning: message,
+        evaluatorRawRequest: {
+          aggregator: "code_judge",
+          script: scriptPath,
+          error: message
+        },
+        evaluatorResults
+      };
+    }
+  }
+  async runLlmAggregator(results, context, config) {
+    const judgeProvider = context.judgeProvider;
+    if (!judgeProvider) {
+      throw new Error("No judge provider available for LLM aggregation");
+    }
+    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
+    const resultsJson = JSON.stringify(resultsObject, null, 2);
+    const evaluatorResults = results.map((member) => ({
+      name: member.id,
+      type: member.type,
+      score: member.result.score,
+      verdict: member.result.verdict,
+      hits: [...member.result.hits],
+      misses: [...member.result.misses],
+      reasoning: member.result.reasoning,
+      evaluatorRawRequest: member.result.evaluatorRawRequest,
+      evaluatorResults: member.result.evaluatorResults
+    }));
+    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
+    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
+    const systemPrompt = buildOutputSchema();
+    const evaluatorRawRequest = {
+      aggregator: "llm_judge",
+      userPrompt,
+      systemPrompt,
+      target: judgeProvider.targetName
+    };
+    try {
+      const model = judgeProvider.asLanguageModel?.();
+      if (model) {
+        const { text } = await generateText2({
+          model,
+          system: systemPrompt,
+          prompt: userPrompt
+        });
+        const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
+        const score2 = clampScore(data2.score);
+        const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
+        const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
+        const reasoning2 = data2.reasoning;
+        return {
+          score: score2,
+          verdict: scoreToVerdict(score2),
+          hits: hits2,
+          misses: misses2,
+          expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
+          reasoning: reasoning2,
+          evaluatorRawRequest,
+          evaluatorResults
+        };
+      }
+      const response = await judgeProvider.invoke({
+        question: userPrompt,
+        systemPrompt,
+        evalCaseId: context.evalCase.id,
+        attempt: context.attempt
+      });
+      const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
+      const score = clampScore(data.score);
+      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
+      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
+      const reasoning = data.reasoning ?? response.reasoning;
+      return {
+        score,
+        verdict: scoreToVerdict(score),
+        hits,
+        misses,
+        expectedAspectCount: Math.max(hits.length + misses.length, 1),
+        reasoning,
+        evaluatorRawRequest,
+        evaluatorResults
+      };
+    } catch {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [],
+        expectedAspectCount: 1,
+        evaluatorRawRequest,
+        evaluatorResults
+      };
+    }
+  }
+};

 // src/evaluation/orchestrator.ts
 import { createHash, randomUUID as randomUUID2 } from "node:crypto";
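
Note: a minimal usage sketch of the new CompositeEvaluator. It assumes a parsed composite config (`compositeConfig`), an already-constructed member evaluator (`llmJudge`), and an evaluation `context` exist; the real factory wiring lives in runEvaluatorList further down in this diff.

// Hypothetical wiring; compositeConfig, llmJudge, and context are assumed to exist.
const composite = new CompositeEvaluator({
  config: compositeConfig, // { name, type: "composite", evaluators, aggregator }
  cwd: process.cwd(),
  evaluatorFactory: {
    // The factory maps each member config to a concrete evaluator instance.
    create: (memberConfig, ctx) => llmJudge
  }
});
const result = await composite.evaluate(context);
// result.evaluatorResults holds one entry per member; the aggregator decides score and verdict.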
@@ -4040,7 +4335,6 @@ async function runEvaluatorList(options) {
         reasoning: score2.reasoning,
         evaluator_provider_request: score2.evaluatorRawRequest
       });
-      continue;
     }
     if (evaluator.type === "code") {
       const codeEvaluator = new CodeEvaluator({
@@ -4057,10 +4351,10 @@ async function runEvaluatorList(options) {
         promptInputs,
         now
       });
-      scored.push({ score: score2, name: evaluator.name, type:
+      scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
       evaluatorResults.push({
         name: evaluator.name,
-        type:
+        type: "code_judge",
         score: score2.score,
         verdict: score2.verdict,
         hits: score2.hits,
@@ -4068,19 +4362,37 @@ async function runEvaluatorList(options) {
         reasoning: score2.reasoning,
         evaluator_provider_request: score2.evaluatorRawRequest
       });
-      continue;
     }
-    if (evaluator.type === "
-      const
-
-
-
-      return
+    if (evaluator.type === "composite") {
+      const evalFileDir = evalCase.guideline_paths[0] ? path12.dirname(evalCase.guideline_paths[0]) : process.cwd();
+      const createEvaluator = (memberConfig) => {
+        switch (memberConfig.type) {
+          case "llm_judge":
+            return evaluatorRegistry.llm_judge;
+          case "code":
+            return new CodeEvaluator({
+              script: memberConfig.script,
+              cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
+              agentTimeoutMs
+            });
+          case "composite":
+            return new CompositeEvaluator({
+              config: memberConfig,
+              cwd: evalFileDir,
+              evaluatorFactory: { create: createEvaluator }
+            });
+          default: {
+            const unknownConfig = memberConfig;
+            throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
           }
-      return judgeProvider;
         }
+      };
+      const compositeEvaluator = new CompositeEvaluator({
+        config: evaluator,
+        cwd: evalFileDir,
+        evaluatorFactory: { create: createEvaluator }
       });
-      const score2 = await
+      const score2 = await compositeEvaluator.evaluate({
         evalCase,
         candidate,
         target,
@@ -4099,27 +4411,31 @@ async function runEvaluatorList(options) {
         hits: score2.hits,
         misses: score2.misses,
         reasoning: score2.reasoning,
-        evaluator_provider_request: score2.evaluatorRawRequest
+        evaluator_provider_request: score2.evaluatorRawRequest,
+        evaluator_results: mapChildResults(score2.evaluatorResults)
       });
     }
   } catch (error) {
     const message = error instanceof Error ? error.message : String(error);
     const fallbackScore = {
       score: 0,
+      verdict: "fail",
      hits: [],
      misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
      expectedAspectCount: 1,
      reasoning: message
    };
+    const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
    scored.push({
      score: fallbackScore,
      name: evaluator.name ?? "unknown",
-      type:
+      type: resultType ?? "llm_judge"
    });
    evaluatorResults.push({
      name: evaluator.name ?? "unknown",
-      type:
+      type: resultType ?? "llm_judge",
      score: 0,
+      verdict: "fail",
      hits: [],
      misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
      reasoning: message
@@ -4138,6 +4454,7 @@ async function runEvaluatorList(options) {
   const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
   const score = {
     score: aggregateScore,
+    verdict: scoreToVerdict2(aggregateScore),
     hits,
     misses,
     expectedAspectCount,
@@ -4188,6 +4505,15 @@ async function resolveCustomPrompt(config) {
 function isNonEmptyString2(value) {
   return typeof value === "string" && value.trim().length > 0;
 }
+function scoreToVerdict2(score) {
+  if (score >= 0.8) {
+    return "pass";
+  }
+  if (score >= 0.6) {
+    return "borderline";
+  }
+  return "fail";
+}
 function filterEvalCases(evalCases, evalId) {
   if (!evalId) {
     return evalCases;
@@ -4325,6 +4651,23 @@ function isTimeoutLike(error) {
   const value = String(error).toLowerCase();
   return value.includes("timeout");
 }
+function mapChildResults(children) {
+  if (!children || children.length === 0) {
+    return void 0;
+  }
+  return children.map((child) => ({
+    name: child.name,
+    type: child.type,
+    score: child.score,
+    weight: child.weight,
+    verdict: child.verdict,
+    hits: child.hits,
+    misses: child.misses,
+    reasoning: child.reasoning,
+    evaluator_provider_request: child.evaluatorRawRequest,
+    evaluator_results: mapChildResults(child.evaluatorResults)
+  }));
+}

 // src/evaluation/generators/rubric-generator.ts
 import { generateText as generateText3 } from "ai";
@@ -4412,8 +4755,8 @@ function createAgentKernel() {
 }
 export {
   CodeEvaluator,
+  CompositeEvaluator,
   LlmJudgeEvaluator,
-  RubricEvaluator,
   TEST_MESSAGE_ROLES,
   buildDirectoryChain,
   buildPromptInputs,