@agentv/core 0.22.1 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +595 -252
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +61 -22
- package/dist/index.d.ts +61 -22
- package/dist/index.js +594 -251
- package/dist/index.js.map +1 -1
- package/package.json +4 -8
package/dist/index.cjs
CHANGED
@@ -31,8 +31,8 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
 var index_exports = {};
 __export(index_exports, {
   CodeEvaluator: () => CodeEvaluator,
+  CompositeEvaluator: () => CompositeEvaluator,
   LlmJudgeEvaluator: () => LlmJudgeEvaluator,
-  RubricEvaluator: () => RubricEvaluator,
   TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
   buildDirectoryChain: () => buildDirectoryChain2,
   buildPromptInputs: () => buildPromptInputs,
@@ -108,7 +108,7 @@ function isTestMessage(value) {
   }
   return candidate.content.every(isJsonObject);
 }
-var EVALUATOR_KIND_VALUES = ["
+var EVALUATOR_KIND_VALUES = ["code_judge", "llm_judge", "rubric", "composite"];
 var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
 function isEvaluatorKind(value) {
   return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
@@ -460,10 +460,10 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
       continue;
     }
-    if (typeValue === "
+    if (typeValue === "code_judge") {
       const script = asString2(rawEvaluator.script);
       if (!script) {
-        logWarning2(`Skipping
+        logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
         continue;
       }
       const cwd = asString2(rawEvaluator.cwd);
@@ -474,7 +474,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
         resolvedCwd = import_node_path3.default.resolve(resolved.resolvedPath);
       } else {
         logWarning2(
-          `
+          `Code_judge evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
           resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
         );
       }
@@ -490,6 +490,105 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       });
       continue;
     }
+    if (typeValue === "composite") {
+      const rawMembers = rawEvaluator.evaluators;
+      if (!Array.isArray(rawMembers)) {
+        logWarning2(
+          `Skipping composite evaluator '${name}' in '${evalId}': missing evaluators array`
+        );
+        continue;
+      }
+      const rawAggregator = rawEvaluator.aggregator;
+      if (!isJsonObject2(rawAggregator)) {
+        logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
+        continue;
+      }
+      const aggregatorType = asString2(rawAggregator.type);
+      if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
+        logWarning2(
+          `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
+        );
+        continue;
+      }
+      const memberEvaluators = [];
+      for (const rawMember of rawMembers) {
+        if (!isJsonObject2(rawMember)) {
+          logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
+          continue;
+        }
+        const memberName = asString2(rawMember.name);
+        const memberType = rawMember.type;
+        if (!memberName || !isEvaluatorKind(memberType)) {
+          logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
+          continue;
+        }
+        const memberConfigs = await parseEvaluators(
+          { evaluators: [rawMember] },
+          void 0,
+          searchRoots,
+          `${evalId}:${name}:${memberName}`
+        );
+        if (memberConfigs && memberConfigs.length > 0) {
+          memberEvaluators.push(memberConfigs[0]);
+        }
+      }
+      if (memberEvaluators.length === 0) {
+        logWarning2(
+          `Skipping composite evaluator '${name}' in '${evalId}': no valid member evaluators`
+        );
+        continue;
+      }
+      let aggregator;
+      if (aggregatorType === "weighted_average") {
+        const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
+        const parsedWeights = {};
+        if (weights) {
+          for (const [key, value] of Object.entries(weights)) {
+            if (typeof value === "number") {
+              parsedWeights[key] = value;
+            }
+          }
+        }
+        aggregator = {
+          type: "weighted_average",
+          ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
+        };
+      } else if (aggregatorType === "code_judge") {
+        const aggregatorPath = asString2(rawAggregator.path);
+        if (!aggregatorPath) {
+          logWarning2(
+            `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
+          );
+          continue;
+        }
+        aggregator = {
+          type: "code_judge",
+          path: aggregatorPath,
+          cwd: searchRoots[0]
+        };
+      } else {
+        const aggregatorPrompt = asString2(rawAggregator.prompt);
+        let promptPath2;
+        if (aggregatorPrompt) {
+          const resolved = await resolveFileReference(aggregatorPrompt, searchRoots);
+          if (resolved.resolvedPath) {
+            promptPath2 = import_node_path3.default.resolve(resolved.resolvedPath);
+          }
+        }
+        aggregator = {
+          type: "llm_judge",
+          ...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
+          ...promptPath2 ? { promptPath: promptPath2 } : {}
+        };
+      }
+      evaluators.push({
+        name,
+        type: "composite",
+        evaluators: memberEvaluators,
+        aggregator
+      });
+      continue;
+    }
     const prompt = asString2(rawEvaluator.prompt);
     let promptPath;
     if (prompt) {
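An illustrative sketch (not taken from the package or its docs): the composite branch added above accepts a raw evaluator object of roughly the following shape. The values are hypothetical; only the keys it reads (name, type, evaluators, aggregator, weights, path, prompt) come from the parsing code shown.

const rawCompositeEvaluator = {
  name: "overall_quality", // hypothetical name
  type: "composite",
  evaluators: [
    { name: "accuracy", type: "llm_judge", prompt: "Grade factual accuracy." },
    { name: "lint", type: "code_judge", script: "./scripts/lint-check.js" }
  ],
  // aggregator.type must be "weighted_average", "code_judge", or "llm_judge"
  aggregator: { type: "weighted_average", weights: { accuracy: 2, lint: 1 } }
};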
@@ -510,25 +609,25 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       }
     }
     const _model = asString2(rawEvaluator.model);
+    const rawRubrics = rawEvaluator.rubrics;
+    const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
+      id: asString2(rubric.id) ?? `rubric-${index + 1}`,
+      description: asString2(rubric.description) ?? "",
+      weight: typeof rubric.weight === "number" ? rubric.weight : 1,
+      required: typeof rubric.required === "boolean" ? rubric.required : true
+    })).filter((r) => r.description.length > 0) : void 0;
     if (typeValue === "rubric") {
-
-      if (!Array.isArray(rubrics)) {
+      if (!parsedRubrics) {
         logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': missing rubrics array`);
         continue;
       }
-      const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
-        id: asString2(rubric.id) ?? `rubric-${index + 1}`,
-        description: asString2(rubric.description) ?? "",
-        weight: typeof rubric.weight === "number" ? rubric.weight : 1,
-        required: typeof rubric.required === "boolean" ? rubric.required : true
-      })).filter((r) => r.description.length > 0);
       if (parsedRubrics.length === 0) {
         logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
         continue;
       }
       evaluators.push({
         name,
-        type: "
+        type: "llm_judge",
         rubrics: parsedRubrics
       });
       continue;
@@ -537,7 +636,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       name,
       type: "llm_judge",
       prompt,
-      promptPath
+      promptPath,
+      ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
     });
   }
   return evaluators.length > 0 ? evaluators : void 0;
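An illustrative sketch (not taken from the package): after the changes above, a rubric-style evaluator is normalized to an llm_judge config carrying a rubrics array. The values here are hypothetical; the defaults (id rubric-N, weight 1, required true) follow the mapping code shown.

const parsedRubricEvaluator = {
  name: "rubric",
  type: "llm_judge",
  rubrics: [
    { id: "rubric-1", description: "Answers the question directly", weight: 1, required: true },
    { id: "rubric-2", description: "Cites at least one source", weight: 2, required: false }
  ]
};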
@@ -1088,7 +1188,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
   if (rubricItems.length > 0) {
     const rubricEvaluator = {
       name: "rubric",
-      type: "
+      type: "llm_judge",
       rubrics: rubricItems
     };
     evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
@@ -3621,149 +3721,9 @@ function resolveAndCreateProvider(definition, env = process.env) {
   return createProvider(resolved);
 }
 
-// src/evaluation/evaluators
+// src/evaluation/evaluators.ts
 var import_ai2 = require("ai");
 var import_zod2 = require("zod");
-var rubricCheckResultSchema = import_zod2.z.object({
-  id: import_zod2.z.string().describe("The ID of the rubric item being checked"),
-  satisfied: import_zod2.z.boolean().describe("Whether this rubric requirement is met"),
-  reasoning: import_zod2.z.string().describe("Brief explanation (1-2 sentences) for this check")
-});
-var rubricEvaluationSchema = import_zod2.z.object({
-  checks: import_zod2.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
-  overall_reasoning: import_zod2.z.string().describe("Overall assessment summary (1-2 sentences)")
-});
-var RubricEvaluator = class {
-  kind = "rubric";
-  config;
-  resolveJudgeProvider;
-  constructor(options) {
-    this.config = options.config;
-    this.resolveJudgeProvider = options.resolveJudgeProvider;
-  }
-  async evaluate(context) {
-    const judgeProvider = await this.resolveJudgeProvider(context);
-    if (!judgeProvider) {
-      throw new Error("No judge provider available for rubric evaluation");
-    }
-    if (!this.config.rubrics || this.config.rubrics.length === 0) {
-      throw new Error(
-        `No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
-      );
-    }
-    const prompt = this.buildPrompt(context, this.config.rubrics);
-    const model = judgeProvider.asLanguageModel?.();
-    if (!model) {
-      throw new Error("Judge provider does not support language model interface");
-    }
-    const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
-You must return a valid JSON object matching this schema:
-{
-  "checks": [
-    {
-      "id": "string (rubric id)",
-      "satisfied": boolean,
-      "reasoning": "string (brief explanation)"
-    }
-  ],
-  "overall_reasoning": "string (summary)"
-}`;
-    let result;
-    let lastError;
-    for (let attempt = 1; attempt <= 3; attempt++) {
-      try {
-        const { text } = await (0, import_ai2.generateText)({
-          model,
-          system,
-          prompt
-        });
-        const cleaned = text.replace(/```json\n?|```/g, "").trim();
-        result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
-        break;
-      } catch (e) {
-        lastError = e instanceof Error ? e : new Error(String(e));
-      }
-    }
-    if (!result) {
-      throw new Error(
-        `Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
-      );
-    }
-    const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
-    return {
-      score,
-      verdict,
-      hits,
-      misses,
-      expectedAspectCount: this.config.rubrics.length,
-      reasoning: result.overall_reasoning,
-      evaluatorRawRequest: {
-        prompt
-      }
-    };
-  }
-  buildPrompt(context, rubrics) {
-    const parts = [
-      "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
-      "",
-      "[[ ## question ## ]]",
-      context.evalCase.question,
-      "",
-      "[[ ## expected_outcome ## ]]",
-      context.evalCase.expected_outcome,
-      ""
-    ];
-    if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
-      parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
-    }
-    parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
-    for (const rubric of rubrics) {
-      const requiredLabel = rubric.required ? " (REQUIRED)" : "";
-      const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
-      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
-    }
-    parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
-    return parts.join("\n");
-  }
-  calculateScore(result, rubrics) {
-    const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
-    const hits = [];
-    const misses = [];
-    let totalWeight = 0;
-    let earnedWeight = 0;
-    let failedRequired = false;
-    for (const check of result.checks) {
-      const rubric = rubricMap.get(check.id);
-      if (!rubric) {
-        continue;
-      }
-      totalWeight += rubric.weight;
-      if (check.satisfied) {
-        earnedWeight += rubric.weight;
-        hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
-      } else {
-        misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
-        if (rubric.required) {
-          failedRequired = true;
-        }
-      }
-    }
-    const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
-    let verdict;
-    if (failedRequired) {
-      verdict = "fail";
-    } else if (score >= 0.8) {
-      verdict = "pass";
-    } else if (score >= 0.6) {
-      verdict = "borderline";
-    } else {
-      verdict = "fail";
-    }
-    return { score, verdict, hits, misses };
-  }
-};
-
-// src/evaluation/evaluators.ts
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
 
 Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -3781,6 +3741,21 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
 
 [[ ## candidate_answer ## ]]
 {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
+var freeformEvaluationSchema = import_zod2.z.object({
+  score: import_zod2.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
+  hits: import_zod2.z.array(import_zod2.z.string()).describe("Brief specific achievements").optional(),
+  misses: import_zod2.z.array(import_zod2.z.string()).describe("Brief failures or omissions").optional(),
+  reasoning: import_zod2.z.string().describe("Concise explanation (1-2 sentences)").optional()
+});
+var rubricCheckResultSchema = import_zod2.z.object({
+  id: import_zod2.z.string().describe("The ID of the rubric item being checked"),
+  satisfied: import_zod2.z.boolean().describe("Whether this rubric requirement is met"),
+  reasoning: import_zod2.z.string().describe("Brief explanation (1-2 sentences) for this check")
+});
+var rubricEvaluationSchema = import_zod2.z.object({
+  checks: import_zod2.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
+  overall_reasoning: import_zod2.z.string().describe("Overall assessment summary (1-2 sentences)")
+});
 var LlmJudgeEvaluator = class {
   kind = "llm_judge";
   resolveJudgeProvider;
@@ -3798,9 +3773,13 @@ var LlmJudgeEvaluator = class {
     if (!judgeProvider) {
       throw new Error("No judge provider available for LLM grading");
     }
-
+    const config = context.evaluator;
+    if (config?.type === "llm_judge" && config.rubrics && config.rubrics.length > 0) {
+      return this.evaluateWithRubrics(context, judgeProvider, config.rubrics);
+    }
+    return this.evaluateFreeform(context, judgeProvider);
   }
-  async
+  async evaluateFreeform(context, judgeProvider) {
     const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
     const variables = {
       [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
@@ -3817,34 +3796,132 @@ var LlmJudgeEvaluator = class {
     const systemPrompt = buildOutputSchema();
     const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
     const userPrompt = substituteVariables(evaluatorTemplate, variables);
-    const response = await judgeProvider.invoke({
-      question: userPrompt,
-      systemPrompt,
-      evalCaseId: context.evalCase.id,
-      attempt: context.attempt,
-      maxOutputTokens: this.maxOutputTokens,
-      temperature: this.temperature
-    });
-    const parsed = parseQualityResponse(response);
-    const score = clampScore(parsed.score ?? 0);
-    const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
-    const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
-    const reasoning = parsed.reasoning ?? response.reasoning;
-    const expectedAspectCount = Math.max(hits.length + misses.length, 1);
     const evaluatorRawRequest = {
       userPrompt,
       systemPrompt,
       target: judgeProvider.targetName
     };
+    try {
+      const { data, providerResponse } = await this.runWithRetry({
+        context,
+        judgeProvider,
+        systemPrompt,
+        userPrompt,
+        schema: freeformEvaluationSchema
+      });
+      const score = clampScore(data.score);
+      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
+      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
+      const reasoning = data.reasoning ?? providerResponse?.reasoning;
+      const expectedAspectCount = Math.max(hits.length + misses.length, 1);
+      return {
+        score,
+        verdict: scoreToVerdict(score),
+        hits,
+        misses,
+        expectedAspectCount,
+        reasoning,
+        evaluatorRawRequest
+      };
+    } catch {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [],
+        expectedAspectCount: 1,
+        evaluatorRawRequest
+      };
+    }
+  }
+  async evaluateWithRubrics(context, judgeProvider, rubrics) {
+    if (!rubrics || rubrics.length === 0) {
+      throw new Error(
+        `No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
+      );
+    }
+    const prompt = this.buildRubricPrompt(context, rubrics);
+    const systemPrompt = buildRubricOutputSchema();
+    const evaluatorRawRequest = {
+      userPrompt: prompt,
+      systemPrompt,
+      target: judgeProvider.targetName
+    };
+    const { data } = await this.runWithRetry({
+      context,
+      judgeProvider,
+      systemPrompt,
+      userPrompt: prompt,
+      schema: rubricEvaluationSchema
+    });
+    const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
     return {
       score,
+      verdict,
       hits,
       misses,
-      expectedAspectCount,
-      reasoning,
+      expectedAspectCount: rubrics.length,
+      reasoning: data.overall_reasoning,
       evaluatorRawRequest
     };
   }
+  buildRubricPrompt(context, rubrics) {
+    const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
+    const parts = [
+      "You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
+      "",
+      "[[ ## question ## ]]",
+      formattedQuestion,
+      "",
+      "[[ ## expected_outcome ## ]]",
+      context.evalCase.expected_outcome,
+      ""
+    ];
+    if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
+      parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
+    }
+    parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
+    for (const rubric of rubrics) {
+      const requiredLabel = rubric.required ? " (REQUIRED)" : "";
+      const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
+      parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
+    }
+    parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
+    return parts.join("\n");
+  }
+  async runWithRetry(options) {
+    const { context, judgeProvider, systemPrompt, userPrompt, schema } = options;
+    let lastError;
+    for (let attempt = 1; attempt <= 3; attempt++) {
+      try {
+        const model = judgeProvider.asLanguageModel?.();
+        if (model) {
+          const { text } = await (0, import_ai2.generateText)({
+            model,
+            system: systemPrompt,
+            prompt: userPrompt,
+            ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
+            ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
+          });
+          const data2 = schema.parse(parseJsonFromText(text));
+          return { data: data2 };
+        }
+        const response = await judgeProvider.invoke({
+          question: userPrompt,
+          systemPrompt,
+          evalCaseId: context.evalCase.id,
+          attempt: context.attempt,
+          maxOutputTokens: this.maxOutputTokens,
+          temperature: this.temperature
+        });
+        const data = schema.parse(parseJsonFromText(response.text ?? ""));
+        return { data, providerResponse: response };
+      } catch (e) {
+        lastError = e instanceof Error ? e : new Error(String(e));
+      }
+    }
+    throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
+  }
 };
 function buildOutputSchema() {
   return [
@@ -3858,6 +3935,29 @@ function buildOutputSchema() {
     "}"
   ].join("\n");
 }
+function buildRubricOutputSchema() {
+  return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
+You must return a valid JSON object matching this schema:
+{
+  "checks": [
+    {
+      "id": "string (rubric id)",
+      "satisfied": boolean,
+      "reasoning": "string (brief explanation)"
+    }
+  ],
+  "overall_reasoning": "string (summary)"
+}`;
+}
+function scoreToVerdict(score) {
+  if (score >= 0.8) {
+    return "pass";
+  }
+  if (score >= 0.6) {
+    return "borderline";
+  }
+  return "fail";
+}
 function clampScore(value) {
   if (Number.isNaN(value) || !Number.isFinite(value)) {
     return 0;
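For reference (illustrative, not part of the diff), scoreToVerdict added above maps a numeric score to a verdict with fixed thresholds:

scoreToVerdict(0.85); // "pass" (score >= 0.8)
scoreToVerdict(0.7);  // "borderline" (0.6 <= score < 0.8)
scoreToVerdict(0.5);  // "fail"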
@@ -3870,71 +3970,15 @@ function clampScore(value) {
   }
   return value;
 }
-function parseQualityResponse(response) {
-  const text = typeof response.text === "string" ? response.text.trim() : "";
-  if (text.length === 0) {
-    return {};
-  }
-  const direct = attemptParseJson(text);
-  if (direct && validateQualityJson(direct)) {
-    return direct;
-  }
-  const extracted = extractJsonBlob(text);
-  if (extracted) {
-    const parsed = attemptParseJson(extracted);
-    if (parsed && validateQualityJson(parsed)) {
-      return parsed;
-    }
-  }
-  return {};
-}
-function attemptParseJson(text) {
-  try {
-    const parsed = JSON.parse(text);
-    const score = typeof parsed.score === "number" ? parsed.score : void 0;
-    const hits = parsed.hits;
-    const misses = parsed.misses;
-    const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
-    return { score, hits, misses, reasoning };
-  } catch {
-    return void 0;
-  }
-}
-function validateQualityJson(parsed) {
-  if (typeof parsed.score !== "number") {
-    return false;
-  }
-  if (Number.isNaN(parsed.score) || !Number.isFinite(parsed.score)) {
-    return false;
-  }
-  if (parsed.score < 0 || parsed.score > 1) {
-    return false;
-  }
-  if (parsed.hits !== void 0) {
-    if (!Array.isArray(parsed.hits)) {
-      return false;
-    }
-    if (!parsed.hits.every((item) => typeof item === "string")) {
-      return false;
-    }
-  }
-  if (parsed.misses !== void 0) {
-    if (!Array.isArray(parsed.misses)) {
-      return false;
-    }
-    if (!parsed.misses.every((item) => typeof item === "string")) {
-      return false;
-    }
-  }
-  if (parsed.reasoning !== void 0 && typeof parsed.reasoning !== "string") {
-    return false;
-  }
-  return true;
-}
 function extractJsonBlob(text) {
   const match = text.match(/\{[\s\S]*\}/);
   return match?.[0];
 }
+function parseJsonFromText(text) {
+  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
+  const blob = extractJsonBlob(cleaned) ?? cleaned;
+  return JSON.parse(blob);
+}
 function isNonEmptyString(value) {
   return typeof value === "string" && value.trim().length > 0;
 }
@@ -3971,6 +4015,7 @@ var CodeEvaluator = class {
       const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
       return {
         score,
+        verdict: scoreToVerdict(score),
        hits,
        misses,
        expectedAspectCount: hits.length + misses.length || 1,
@@ -3984,6 +4029,7 @@ var CodeEvaluator = class {
       const message = error instanceof Error ? error.message : String(error);
       return {
         score: 0,
+        verdict: "fail",
        hits: [],
        misses: [`Code evaluator failed: ${message}`],
        expectedAspectCount: 1,
@@ -3997,6 +4043,33 @@ var CodeEvaluator = class {
     }
   }
 };
+function calculateRubricScore(result, rubrics) {
+  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
+  const hits = [];
+  const misses = [];
+  let totalWeight = 0;
+  let earnedWeight = 0;
+  let failedRequired = false;
+  for (const check of result.checks) {
+    const rubric = rubricMap.get(check.id);
+    if (!rubric) {
+      continue;
+    }
+    totalWeight += rubric.weight;
+    if (check.satisfied) {
+      earnedWeight += rubric.weight;
+      hits.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+    } else {
+      misses.push(`[${rubric.id}] ${rubric.description}: ${check.reasoning}`);
+      if (rubric.required) {
+        failedRequired = true;
+      }
+    }
+  }
+  const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
+  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
+  return { score, verdict, hits, misses };
+}
 async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
   const { spawn: spawn2 } = await import("child_process");
   return await new Promise((resolve, reject) => {
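A worked example of the weighted arithmetic in calculateRubricScore above (illustrative only; the rubrics and checks are hypothetical, the math comes from the function):

const exampleRubrics = [
  { id: "r1", description: "Cites the source", weight: 2, required: true },
  { id: "r2", description: "Mentions limitations", weight: 1, required: false }
];
const exampleResult = {
  checks: [
    { id: "r1", satisfied: true, reasoning: "Source cited." },
    { id: "r2", satisfied: false, reasoning: "No limitations discussed." }
  ],
  overall_reasoning: "Mostly correct."
};
// earnedWeight = 2, totalWeight = 3, score = 2/3 ≈ 0.67; no required rubric failed,
// so the verdict is scoreToVerdict(0.67) = "borderline".
calculateRubricScore(exampleResult, exampleRubrics);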
@@ -4048,6 +4121,228 @@ function substituteVariables(template, variables) {
     return variables[varName] ?? match;
   });
 }
+var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
+{{EVALUATOR_RESULTS_JSON}}
+
+Decide the final score and verdict based on all evaluator results.
+Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;
+var CompositeEvaluator = class {
+  kind = "composite";
+  config;
+  evaluatorFactory;
+  cwd;
+  constructor(options) {
+    this.config = options.config;
+    this.evaluatorFactory = options.evaluatorFactory;
+    this.cwd = options.cwd;
+  }
+  async evaluate(context) {
+    const memberResults = await Promise.all(
+      this.config.evaluators.map(async (memberConfig) => {
+        const evaluator = this.evaluatorFactory.create(memberConfig, context);
+        return {
+          id: memberConfig.name,
+          type: memberConfig.type,
+          result: await evaluator.evaluate(context)
+        };
+      })
+    );
+    return this.aggregate(memberResults, context);
+  }
+  async aggregate(results, context) {
+    const aggregator = this.config.aggregator;
+    switch (aggregator.type) {
+      case "code_judge":
+        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
+      case "llm_judge":
+        return this.runLlmAggregator(results, context, aggregator);
+      default:
+        return this.runWeightedAverage(results, aggregator.weights);
+    }
+  }
+  runWeightedAverage(results, weights) {
+    let totalWeight = 0;
+    let weightedSum = 0;
+    const allHits = [];
+    const allMisses = [];
+    const reasoningParts = [];
+    const evaluatorResults = [];
+    for (const member of results) {
+      const weight = weights?.[member.id] ?? 1;
+      totalWeight += weight;
+      weightedSum += member.result.score * weight;
+      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
+      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
+      if (member.result.reasoning) {
+        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
+      }
+      evaluatorResults.push({
+        name: member.id,
+        type: member.type,
+        score: member.result.score,
+        weight,
+        verdict: member.result.verdict,
+        hits: [...member.result.hits],
+        misses: [...member.result.misses],
+        reasoning: member.result.reasoning,
+        evaluatorRawRequest: member.result.evaluatorRawRequest,
+        evaluatorResults: member.result.evaluatorResults
+      });
+    }
+    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
+    return {
+      score: clampScore(finalScore),
+      verdict: scoreToVerdict(finalScore),
+      hits: allHits,
+      misses: allMisses,
+      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
+      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
+      evaluatorRawRequest: {
+        aggregator: "weighted_average",
+        ...weights ? { weights } : {}
+      },
+      evaluatorResults
+    };
+  }
+  async runCodeAggregator(results, scriptPath, cwd, weights) {
+    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
+    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
+    const evaluatorResults = results.map((member) => ({
+      name: member.id,
+      type: member.type,
+      score: member.result.score,
+      weight: weights?.[member.id] ?? 1,
+      verdict: member.result.verdict,
+      hits: [...member.result.hits],
+      misses: [...member.result.misses],
+      reasoning: member.result.reasoning,
+      evaluatorRawRequest: member.result.evaluatorRawRequest,
+      evaluatorResults: member.result.evaluatorResults
+    }));
+    try {
+      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
+      const parsed = parseJsonSafe(stdout);
+      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
+      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
+      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
+      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
+      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
+      return {
+        score,
+        verdict,
+        hits,
+        misses,
+        expectedAspectCount: hits.length + misses.length || 1,
+        reasoning,
+        evaluatorRawRequest: {
+          aggregator: "code_judge",
+          script: scriptPath
+        },
+        evaluatorResults
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [`Code aggregator failed: ${message}`],
+        expectedAspectCount: 1,
+        reasoning: message,
+        evaluatorRawRequest: {
+          aggregator: "code_judge",
+          script: scriptPath,
+          error: message
+        },
+        evaluatorResults
+      };
+    }
+  }
+  async runLlmAggregator(results, context, config) {
+    const judgeProvider = context.judgeProvider;
+    if (!judgeProvider) {
+      throw new Error("No judge provider available for LLM aggregation");
+    }
+    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
+    const resultsJson = JSON.stringify(resultsObject, null, 2);
+    const evaluatorResults = results.map((member) => ({
+      name: member.id,
+      type: member.type,
+      score: member.result.score,
+      verdict: member.result.verdict,
+      hits: [...member.result.hits],
+      misses: [...member.result.misses],
+      reasoning: member.result.reasoning,
+      evaluatorRawRequest: member.result.evaluatorRawRequest,
+      evaluatorResults: member.result.evaluatorResults
+    }));
+    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
+    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, resultsJson);
+    const systemPrompt = buildOutputSchema();
+    const evaluatorRawRequest = {
+      aggregator: "llm_judge",
+      userPrompt,
+      systemPrompt,
+      target: judgeProvider.targetName
+    };
+    try {
+      const model = judgeProvider.asLanguageModel?.();
+      if (model) {
+        const { text } = await (0, import_ai2.generateText)({
+          model,
+          system: systemPrompt,
+          prompt: userPrompt
+        });
+        const data2 = freeformEvaluationSchema.parse(parseJsonFromText(text));
+        const score2 = clampScore(data2.score);
+        const hits2 = Array.isArray(data2.hits) ? data2.hits.filter(isNonEmptyString).slice(0, 4) : [];
+        const misses2 = Array.isArray(data2.misses) ? data2.misses.filter(isNonEmptyString).slice(0, 4) : [];
+        const reasoning2 = data2.reasoning;
+        return {
+          score: score2,
+          verdict: scoreToVerdict(score2),
+          hits: hits2,
+          misses: misses2,
+          expectedAspectCount: Math.max(hits2.length + misses2.length, 1),
+          reasoning: reasoning2,
+          evaluatorRawRequest,
+          evaluatorResults
+        };
+      }
+      const response = await judgeProvider.invoke({
+        question: userPrompt,
+        systemPrompt,
+        evalCaseId: context.evalCase.id,
+        attempt: context.attempt
+      });
+      const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
+      const score = clampScore(data.score);
+      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
+      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
+      const reasoning = data.reasoning ?? response.reasoning;
+      return {
+        score,
+        verdict: scoreToVerdict(score),
+        hits,
+        misses,
+        expectedAspectCount: Math.max(hits.length + misses.length, 1),
+        reasoning,
+        evaluatorRawRequest,
+        evaluatorResults
+      };
+    } catch {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: [],
+        expectedAspectCount: 1,
+        evaluatorRawRequest,
+        evaluatorResults
+      };
+    }
+  }
+};
 
 // src/evaluation/orchestrator.ts
 var import_node_crypto2 = require("crypto");
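A minimal usage sketch for the CompositeEvaluator class added above (not an excerpt from the package). memberConfigs, createEvaluator, and context are assumed to come from parseEvaluators and the orchestrator, as runEvaluatorList does further down; the names and weights are hypothetical.

const composite = new CompositeEvaluator({
  config: {
    name: "overall_quality",
    type: "composite",
    evaluators: memberConfigs, // parsed member evaluator configs (assumed)
    aggregator: { type: "weighted_average", weights: { accuracy: 2, lint: 1 } }
  },
  cwd: process.cwd(),
  evaluatorFactory: { create: (memberConfig) => createEvaluator(memberConfig) }
});
const result = await composite.evaluate(context);
// result: { score, verdict, hits, misses, expectedAspectCount, reasoning, evaluatorResults, ... }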
@@ -4743,7 +5038,6 @@ async function runEvaluatorList(options) {
           reasoning: score2.reasoning,
           evaluator_provider_request: score2.evaluatorRawRequest
         });
-        continue;
       }
       if (evaluator.type === "code") {
         const codeEvaluator = new CodeEvaluator({
@@ -4760,10 +5054,10 @@ async function runEvaluatorList(options) {
           promptInputs,
           now
         });
-        scored.push({ score: score2, name: evaluator.name, type:
+        scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
         evaluatorResults.push({
           name: evaluator.name,
-          type:
+          type: "code_judge",
           score: score2.score,
           verdict: score2.verdict,
           hits: score2.hits,
@@ -4771,19 +5065,37 @@ async function runEvaluatorList(options) {
           reasoning: score2.reasoning,
           evaluator_provider_request: score2.evaluatorRawRequest
         });
-        continue;
       }
-      if (evaluator.type === "
-        const
-
-
-
-          return
+      if (evaluator.type === "composite") {
+        const evalFileDir = evalCase.guideline_paths[0] ? import_node_path13.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
+        const createEvaluator = (memberConfig) => {
+          switch (memberConfig.type) {
+            case "llm_judge":
+              return evaluatorRegistry.llm_judge;
+            case "code":
+              return new CodeEvaluator({
+                script: memberConfig.script,
+                cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
+                agentTimeoutMs
+              });
+            case "composite":
+              return new CompositeEvaluator({
+                config: memberConfig,
+                cwd: evalFileDir,
+                evaluatorFactory: { create: createEvaluator }
+              });
+            default: {
+              const unknownConfig = memberConfig;
+              throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
             }
-        return judgeProvider;
           }
+        };
+        const compositeEvaluator = new CompositeEvaluator({
+          config: evaluator,
+          cwd: evalFileDir,
+          evaluatorFactory: { create: createEvaluator }
         });
-        const score2 = await
+        const score2 = await compositeEvaluator.evaluate({
          evalCase,
          candidate,
          target,
@@ -4802,27 +5114,31 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluator_provider_request: score2.evaluatorRawRequest
+          evaluator_provider_request: score2.evaluatorRawRequest,
+          evaluator_results: mapChildResults(score2.evaluatorResults)
         });
       }
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       const fallbackScore = {
         score: 0,
+        verdict: "fail",
         hits: [],
         misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
         expectedAspectCount: 1,
         reasoning: message
       };
+      const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
       scored.push({
         score: fallbackScore,
         name: evaluator.name ?? "unknown",
-        type:
+        type: resultType ?? "llm_judge"
       });
       evaluatorResults.push({
         name: evaluator.name ?? "unknown",
-        type:
+        type: resultType ?? "llm_judge",
         score: 0,
+        verdict: "fail",
         hits: [],
         misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
         reasoning: message
@@ -4841,6 +5157,7 @@ async function runEvaluatorList(options) {
   const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
   const score = {
     score: aggregateScore,
+    verdict: scoreToVerdict2(aggregateScore),
    hits,
    misses,
    expectedAspectCount,
@@ -4891,6 +5208,15 @@ async function resolveCustomPrompt(config) {
 function isNonEmptyString2(value) {
   return typeof value === "string" && value.trim().length > 0;
 }
+function scoreToVerdict2(score) {
+  if (score >= 0.8) {
+    return "pass";
+  }
+  if (score >= 0.6) {
+    return "borderline";
+  }
+  return "fail";
+}
 function filterEvalCases(evalCases, evalId) {
   if (!evalId) {
     return evalCases;
@@ -5028,6 +5354,23 @@ function isTimeoutLike(error) {
   const value = String(error).toLowerCase();
   return value.includes("timeout");
 }
+function mapChildResults(children) {
+  if (!children || children.length === 0) {
+    return void 0;
+  }
+  return children.map((child) => ({
+    name: child.name,
+    type: child.type,
+    score: child.score,
+    weight: child.weight,
+    verdict: child.verdict,
+    hits: child.hits,
+    misses: child.misses,
+    reasoning: child.reasoning,
+    evaluator_provider_request: child.evaluatorRawRequest,
+    evaluator_results: mapChildResults(child.evaluatorResults)
+  }));
+}
 
 // src/evaluation/generators/rubric-generator.ts
 var import_ai3 = require("ai");
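An illustrative shape (hypothetical values, not from the package) of one entry in the recursive evaluator_results output that mapChildResults above produces:

[
  {
    name: "accuracy",
    type: "llm_judge",
    score: 0.9,
    weight: 2,
    verdict: "pass",
    hits: ["[rubric-1] Cites the source: Source cited."],
    misses: [],
    reasoning: "Strong factual grounding.",
    evaluator_provider_request: { userPrompt: "...", systemPrompt: "...", target: "judge" },
    evaluator_results: undefined
  }
]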
@@ -5116,8 +5459,8 @@ function createAgentKernel() {
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {
   CodeEvaluator,
+  CompositeEvaluator,
   LlmJudgeEvaluator,
-  RubricEvaluator,
   TEST_MESSAGE_ROLES,
   buildDirectoryChain,
   buildPromptInputs,