agentv 0.21.2 → 0.22.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-WOCXZEH4.js → chunk-QRY42RAP.js} +595 -250
- package/dist/chunk-QRY42RAP.js.map +1 -0
- package/dist/cli.js +1 -1
- package/dist/cli.js.map +1 -1
- package/dist/index.js +1 -1
- package/package.json +5 -9
- package/dist/chunk-WOCXZEH4.js.map +0 -1
- package/dist/templates/agentv/.env.template +0 -23
|
@@ -34562,7 +34562,7 @@ function isTestMessage(value) {
|
|
|
34562
34562
|
}
|
|
34563
34563
|
return candidate.content.every(isJsonObject);
|
|
34564
34564
|
}
|
|
34565
|
-
var EVALUATOR_KIND_VALUES = ["
|
|
34565
|
+
var EVALUATOR_KIND_VALUES = ["code_judge", "llm_judge", "rubric", "composite"];
|
|
34566
34566
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
34567
34567
|
function isEvaluatorKind(value) {
|
|
34568
34568
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
@@ -34879,10 +34879,10 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
34879
34879
|
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
34880
34880
|
continue;
|
|
34881
34881
|
}
|
|
34882
|
-
if (typeValue === "
|
|
34882
|
+
if (typeValue === "code_judge") {
|
|
34883
34883
|
const script = asString2(rawEvaluator.script);
|
|
34884
34884
|
if (!script) {
|
|
34885
|
-
logWarning2(`Skipping
|
|
34885
|
+
logWarning2(`Skipping code_judge evaluator '${name16}' in '${evalId}': missing script`);
|
|
34886
34886
|
continue;
|
|
34887
34887
|
}
|
|
34888
34888
|
const cwd = asString2(rawEvaluator.cwd);
|
|
@@ -34893,7 +34893,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
34893
34893
|
resolvedCwd = path32.resolve(resolved.resolvedPath);
|
|
34894
34894
|
} else {
|
|
34895
34895
|
logWarning2(
|
|
34896
|
-
`
|
|
34896
|
+
`Code_judge evaluator '${name16}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
34897
34897
|
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
34898
34898
|
);
|
|
34899
34899
|
}
|
|
@@ -34909,6 +34909,105 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
34909
34909
|
});
|
|
34910
34910
|
continue;
|
|
34911
34911
|
}
|
|
34912
|
+
if (typeValue === "composite") {
|
|
34913
|
+
const rawMembers = rawEvaluator.evaluators;
|
|
34914
|
+
if (!Array.isArray(rawMembers)) {
|
|
34915
|
+
logWarning2(
|
|
34916
|
+
`Skipping composite evaluator '${name16}' in '${evalId}': missing evaluators array`
|
|
34917
|
+
);
|
|
34918
|
+
continue;
|
|
34919
|
+
}
|
|
34920
|
+
const rawAggregator = rawEvaluator.aggregator;
|
|
34921
|
+
if (!isJsonObject2(rawAggregator)) {
|
|
34922
|
+
logWarning2(`Skipping composite evaluator '${name16}' in '${evalId}': missing aggregator`);
|
|
34923
|
+
continue;
|
|
34924
|
+
}
|
|
34925
|
+
const aggregatorType = asString2(rawAggregator.type);
|
|
34926
|
+
if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
|
|
34927
|
+
logWarning2(
|
|
34928
|
+
`Skipping composite evaluator '${name16}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
34929
|
+
);
|
|
34930
|
+
continue;
|
|
34931
|
+
}
|
|
34932
|
+
const memberEvaluators = [];
|
|
34933
|
+
for (const rawMember of rawMembers) {
|
|
34934
|
+
if (!isJsonObject2(rawMember)) {
|
|
34935
|
+
logWarning2(`Skipping invalid member evaluator in composite '${name16}' (expected object)`);
|
|
34936
|
+
continue;
|
|
34937
|
+
}
|
|
34938
|
+
const memberName = asString2(rawMember.name);
|
|
34939
|
+
const memberType = rawMember.type;
|
|
34940
|
+
if (!memberName || !isEvaluatorKind(memberType)) {
|
|
34941
|
+
logWarning2(`Skipping member evaluator with invalid name/type in composite '${name16}'`);
|
|
34942
|
+
continue;
|
|
34943
|
+
}
|
|
34944
|
+
const memberConfigs = await parseEvaluators(
|
|
34945
|
+
{ evaluators: [rawMember] },
|
|
34946
|
+
void 0,
|
|
34947
|
+
searchRoots,
|
|
34948
|
+
`${evalId}:${name16}:${memberName}`
|
|
34949
|
+
);
|
|
34950
|
+
if (memberConfigs && memberConfigs.length > 0) {
|
|
34951
|
+
memberEvaluators.push(memberConfigs[0]);
|
|
34952
|
+
}
|
|
34953
|
+
}
|
|
34954
|
+
if (memberEvaluators.length === 0) {
|
|
34955
|
+
logWarning2(
|
|
34956
|
+
`Skipping composite evaluator '${name16}' in '${evalId}': no valid member evaluators`
|
|
34957
|
+
);
|
|
34958
|
+
continue;
|
|
34959
|
+
}
|
|
34960
|
+
let aggregator;
|
|
34961
|
+
if (aggregatorType === "weighted_average") {
|
|
34962
|
+
const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
|
|
34963
|
+
const parsedWeights = {};
|
|
34964
|
+
if (weights) {
|
|
34965
|
+
for (const [key2, value] of Object.entries(weights)) {
|
|
34966
|
+
if (typeof value === "number") {
|
|
34967
|
+
parsedWeights[key2] = value;
|
|
34968
|
+
}
|
|
34969
|
+
}
|
|
34970
|
+
}
|
|
34971
|
+
aggregator = {
|
|
34972
|
+
type: "weighted_average",
|
|
34973
|
+
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
34974
|
+
};
|
|
34975
|
+
} else if (aggregatorType === "code_judge") {
|
|
34976
|
+
const aggregatorPath = asString2(rawAggregator.path);
|
|
34977
|
+
if (!aggregatorPath) {
|
|
34978
|
+
logWarning2(
|
|
34979
|
+
`Skipping composite evaluator '${name16}' in '${evalId}': code_judge aggregator missing path`
|
|
34980
|
+
);
|
|
34981
|
+
continue;
|
|
34982
|
+
}
|
|
34983
|
+
aggregator = {
|
|
34984
|
+
type: "code_judge",
|
|
34985
|
+
path: aggregatorPath,
|
|
34986
|
+
cwd: searchRoots[0]
|
|
34987
|
+
};
|
|
34988
|
+
} else {
|
|
34989
|
+
const aggregatorPrompt = asString2(rawAggregator.prompt);
|
|
34990
|
+
let promptPath2;
|
|
34991
|
+
if (aggregatorPrompt) {
|
|
34992
|
+
const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
|
|
34993
|
+
if (resolved.resolvedPath) {
|
|
34994
|
+
promptPath2 = path32.resolve(resolved.resolvedPath);
|
|
34995
|
+
}
|
|
34996
|
+
}
|
|
34997
|
+
aggregator = {
|
|
34998
|
+
type: "llm_judge",
|
|
34999
|
+
...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
|
|
35000
|
+
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
35001
|
+
};
|
|
35002
|
+
}
|
|
35003
|
+
evaluators.push({
|
|
35004
|
+
name: name16,
|
|
35005
|
+
type: "composite",
|
|
35006
|
+
evaluators: memberEvaluators,
|
|
35007
|
+
aggregator
|
|
35008
|
+
});
|
|
35009
|
+
continue;
|
|
35010
|
+
}
|
|
34912
35011
|
const prompt = asString2(rawEvaluator.prompt);
|
|
34913
35012
|
let promptPath;
|
|
34914
35013
|
if (prompt) {
|
|
@@ -34929,25 +35028,25 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
34929
35028
|
}
|
|
34930
35029
|
}
|
|
34931
35030
|
const _model = asString2(rawEvaluator.model);
|
|
35031
|
+
const rawRubrics = rawEvaluator.rubrics;
|
|
35032
|
+
const parsedRubrics = Array.isArray(rawRubrics) ? rawRubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
|
|
35033
|
+
id: asString2(rubric.id) ?? `rubric-${index + 1}`,
|
|
35034
|
+
description: asString2(rubric.description) ?? "",
|
|
35035
|
+
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
35036
|
+
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
35037
|
+
})).filter((r) => r.description.length > 0) : void 0;
|
|
34932
35038
|
if (typeValue === "rubric") {
|
|
34933
|
-
|
|
34934
|
-
if (!Array.isArray(rubrics)) {
|
|
35039
|
+
if (!parsedRubrics) {
|
|
34935
35040
|
logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': missing rubrics array`);
|
|
34936
35041
|
continue;
|
|
34937
35042
|
}
|
|
34938
|
-
const parsedRubrics = rubrics.filter((r) => isJsonObject2(r)).map((rubric, index) => ({
|
|
34939
|
-
id: asString2(rubric.id) ?? `rubric-${index + 1}`,
|
|
34940
|
-
description: asString2(rubric.description) ?? "",
|
|
34941
|
-
weight: typeof rubric.weight === "number" ? rubric.weight : 1,
|
|
34942
|
-
required: typeof rubric.required === "boolean" ? rubric.required : true
|
|
34943
|
-
})).filter((r) => r.description.length > 0);
|
|
34944
35043
|
if (parsedRubrics.length === 0) {
|
|
34945
35044
|
logWarning2(`Skipping rubric evaluator '${name16}' in '${evalId}': no valid rubrics found`);
|
|
34946
35045
|
continue;
|
|
34947
35046
|
}
|
|
34948
35047
|
evaluators.push({
|
|
34949
35048
|
name: name16,
|
|
34950
|
-
type: "
|
|
35049
|
+
type: "llm_judge",
|
|
34951
35050
|
rubrics: parsedRubrics
|
|
34952
35051
|
});
|
|
34953
35052
|
continue;
|
|
@@ -34956,7 +35055,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
34956
35055
|
name: name16,
|
|
34957
35056
|
type: "llm_judge",
|
|
34958
35057
|
prompt,
|
|
34959
|
-
promptPath
|
|
35058
|
+
promptPath,
|
|
35059
|
+
...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
|
|
34960
35060
|
});
|
|
34961
35061
|
}
|
|
34962
35062
|
return evaluators.length > 0 ? evaluators : void 0;
|
|
@@ -35497,7 +35597,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
|
|
|
35497
35597
|
if (rubricItems.length > 0) {
|
|
35498
35598
|
const rubricEvaluator = {
|
|
35499
35599
|
name: "rubric",
|
|
35500
|
-
type: "
|
|
35600
|
+
type: "llm_judge",
|
|
35501
35601
|
rubrics: rubricItems
|
|
35502
35602
|
};
|
|
35503
35603
|
evaluators = evaluators ? [rubricEvaluator, ...evaluators] : [rubricEvaluator];
|
|
@@ -37330,144 +37430,6 @@ function createProvider(target) {
|
|
|
37330
37430
|
}
|
|
37331
37431
|
}
|
|
37332
37432
|
}
|
|
37333
|
-
var rubricCheckResultSchema = external_exports.object({
|
|
37334
|
-
id: external_exports.string().describe("The ID of the rubric item being checked"),
|
|
37335
|
-
satisfied: external_exports.boolean().describe("Whether this rubric requirement is met"),
|
|
37336
|
-
reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this check")
|
|
37337
|
-
});
|
|
37338
|
-
var rubricEvaluationSchema = external_exports.object({
|
|
37339
|
-
checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
|
|
37340
|
-
overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
|
|
37341
|
-
});
|
|
37342
|
-
var RubricEvaluator = class {
|
|
37343
|
-
kind = "rubric";
|
|
37344
|
-
config;
|
|
37345
|
-
resolveJudgeProvider;
|
|
37346
|
-
constructor(options) {
|
|
37347
|
-
this.config = options.config;
|
|
37348
|
-
this.resolveJudgeProvider = options.resolveJudgeProvider;
|
|
37349
|
-
}
|
|
37350
|
-
async evaluate(context) {
|
|
37351
|
-
const judgeProvider = await this.resolveJudgeProvider(context);
|
|
37352
|
-
if (!judgeProvider) {
|
|
37353
|
-
throw new Error("No judge provider available for rubric evaluation");
|
|
37354
|
-
}
|
|
37355
|
-
if (!this.config.rubrics || this.config.rubrics.length === 0) {
|
|
37356
|
-
throw new Error(
|
|
37357
|
-
`No rubrics found for evaluator "${this.config.name}". Run "agentv generate rubrics" first.`
|
|
37358
|
-
);
|
|
37359
|
-
}
|
|
37360
|
-
const prompt = this.buildPrompt(context, this.config.rubrics);
|
|
37361
|
-
const model = judgeProvider.asLanguageModel?.();
|
|
37362
|
-
if (!model) {
|
|
37363
|
-
throw new Error("Judge provider does not support language model interface");
|
|
37364
|
-
}
|
|
37365
|
-
const system = `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
|
|
37366
|
-
You must return a valid JSON object matching this schema:
|
|
37367
|
-
{
|
|
37368
|
-
"checks": [
|
|
37369
|
-
{
|
|
37370
|
-
"id": "string (rubric id)",
|
|
37371
|
-
"satisfied": boolean,
|
|
37372
|
-
"reasoning": "string (brief explanation)"
|
|
37373
|
-
}
|
|
37374
|
-
],
|
|
37375
|
-
"overall_reasoning": "string (summary)"
|
|
37376
|
-
}`;
|
|
37377
|
-
let result;
|
|
37378
|
-
let lastError;
|
|
37379
|
-
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
37380
|
-
try {
|
|
37381
|
-
const { text: text2 } = await generateText({
|
|
37382
|
-
model,
|
|
37383
|
-
system,
|
|
37384
|
-
prompt
|
|
37385
|
-
});
|
|
37386
|
-
const cleaned = text2.replace(/```json\n?|```/g, "").trim();
|
|
37387
|
-
result = rubricEvaluationSchema.parse(JSON.parse(cleaned));
|
|
37388
|
-
break;
|
|
37389
|
-
} catch (e) {
|
|
37390
|
-
lastError = e instanceof Error ? e : new Error(String(e));
|
|
37391
|
-
}
|
|
37392
|
-
}
|
|
37393
|
-
if (!result) {
|
|
37394
|
-
throw new Error(
|
|
37395
|
-
`Failed to parse rubric evaluation result after 3 attempts: ${lastError?.message}`
|
|
37396
|
-
);
|
|
37397
|
-
}
|
|
37398
|
-
const { score, verdict, hits, misses } = this.calculateScore(result, this.config.rubrics);
|
|
37399
|
-
return {
|
|
37400
|
-
score,
|
|
37401
|
-
verdict,
|
|
37402
|
-
hits,
|
|
37403
|
-
misses,
|
|
37404
|
-
expectedAspectCount: this.config.rubrics.length,
|
|
37405
|
-
reasoning: result.overall_reasoning,
|
|
37406
|
-
evaluatorRawRequest: {
|
|
37407
|
-
prompt
|
|
37408
|
-
}
|
|
37409
|
-
};
|
|
37410
|
-
}
|
|
37411
|
-
buildPrompt(context, rubrics) {
|
|
37412
|
-
const parts = [
|
|
37413
|
-
"You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
|
|
37414
|
-
"",
|
|
37415
|
-
"[[ ## question ## ]]",
|
|
37416
|
-
context.evalCase.question,
|
|
37417
|
-
"",
|
|
37418
|
-
"[[ ## expected_outcome ## ]]",
|
|
37419
|
-
context.evalCase.expected_outcome,
|
|
37420
|
-
""
|
|
37421
|
-
];
|
|
37422
|
-
if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
|
|
37423
|
-
parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
|
|
37424
|
-
}
|
|
37425
|
-
parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
|
|
37426
|
-
for (const rubric of rubrics) {
|
|
37427
|
-
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
37428
|
-
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
37429
|
-
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
|
|
37430
|
-
}
|
|
37431
|
-
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
37432
|
-
return parts.join("\n");
|
|
37433
|
-
}
|
|
37434
|
-
calculateScore(result, rubrics) {
|
|
37435
|
-
const rubricMap = new Map(rubrics.map((r) => [r.id, r]));
|
|
37436
|
-
const hits = [];
|
|
37437
|
-
const misses = [];
|
|
37438
|
-
let totalWeight = 0;
|
|
37439
|
-
let earnedWeight = 0;
|
|
37440
|
-
let failedRequired = false;
|
|
37441
|
-
for (const check2 of result.checks) {
|
|
37442
|
-
const rubric = rubricMap.get(check2.id);
|
|
37443
|
-
if (!rubric) {
|
|
37444
|
-
continue;
|
|
37445
|
-
}
|
|
37446
|
-
totalWeight += rubric.weight;
|
|
37447
|
-
if (check2.satisfied) {
|
|
37448
|
-
earnedWeight += rubric.weight;
|
|
37449
|
-
hits.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
|
|
37450
|
-
} else {
|
|
37451
|
-
misses.push(`[${rubric.id}] ${rubric.description}: ${check2.reasoning}`);
|
|
37452
|
-
if (rubric.required) {
|
|
37453
|
-
failedRequired = true;
|
|
37454
|
-
}
|
|
37455
|
-
}
|
|
37456
|
-
}
|
|
37457
|
-
const score = totalWeight > 0 ? Math.min(1, Math.max(0, earnedWeight / totalWeight)) : 0;
|
|
37458
|
-
let verdict;
|
|
37459
|
-
if (failedRequired) {
|
|
37460
|
-
verdict = "fail";
|
|
37461
|
-
} else if (score >= 0.8) {
|
|
37462
|
-
verdict = "pass";
|
|
37463
|
-
} else if (score >= 0.6) {
|
|
37464
|
-
verdict = "borderline";
|
|
37465
|
-
} else {
|
|
37466
|
-
verdict = "fail";
|
|
37467
|
-
}
|
|
37468
|
-
return { score, verdict, hits, misses };
|
|
37469
|
-
}
|
|
37470
|
-
};
|
|
37471
37433
|
var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
|
|
37472
37434
|
|
|
37473
37435
|
Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
|
|
@@ -37485,6 +37447,21 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
|
|
|
37485
37447
|
|
|
37486
37448
|
[[ ## candidate_answer ## ]]
|
|
37487
37449
|
{{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
|
|
37450
|
+
var freeformEvaluationSchema = external_exports.object({
|
|
37451
|
+
score: external_exports.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
|
|
37452
|
+
hits: external_exports.array(external_exports.string()).describe("Brief specific achievements").optional(),
|
|
37453
|
+
misses: external_exports.array(external_exports.string()).describe("Brief failures or omissions").optional(),
|
|
37454
|
+
reasoning: external_exports.string().describe("Concise explanation (1-2 sentences)").optional()
|
|
37455
|
+
});
|
|
37456
|
+
var rubricCheckResultSchema = external_exports.object({
|
|
37457
|
+
id: external_exports.string().describe("The ID of the rubric item being checked"),
|
|
37458
|
+
satisfied: external_exports.boolean().describe("Whether this rubric requirement is met"),
|
|
37459
|
+
reasoning: external_exports.string().describe("Brief explanation (1-2 sentences) for this check")
|
|
37460
|
+
});
|
|
37461
|
+
var rubricEvaluationSchema = external_exports.object({
|
|
37462
|
+
checks: external_exports.array(rubricCheckResultSchema).describe("Results for each rubric item"),
|
|
37463
|
+
overall_reasoning: external_exports.string().describe("Overall assessment summary (1-2 sentences)")
|
|
37464
|
+
});
|
|
37488
37465
|
var LlmJudgeEvaluator = class {
|
|
37489
37466
|
kind = "llm_judge";
|
|
37490
37467
|
resolveJudgeProvider;
|
|
@@ -37502,9 +37479,13 @@ var LlmJudgeEvaluator = class {
|
|
|
37502
37479
|
if (!judgeProvider) {
|
|
37503
37480
|
throw new Error("No judge provider available for LLM grading");
|
|
37504
37481
|
}
|
|
37505
|
-
|
|
37482
|
+
const config2 = context.evaluator;
|
|
37483
|
+
if (config2?.type === "llm_judge" && config2.rubrics && config2.rubrics.length > 0) {
|
|
37484
|
+
return this.evaluateWithRubrics(context, judgeProvider, config2.rubrics);
|
|
37485
|
+
}
|
|
37486
|
+
return this.evaluateFreeform(context, judgeProvider);
|
|
37506
37487
|
}
|
|
37507
|
-
async
|
|
37488
|
+
async evaluateFreeform(context, judgeProvider) {
|
|
37508
37489
|
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
37509
37490
|
const variables = {
|
|
37510
37491
|
[TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
|
|
@@ -37521,34 +37502,132 @@ var LlmJudgeEvaluator = class {
|
|
|
37521
37502
|
const systemPrompt = buildOutputSchema();
|
|
37522
37503
|
const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
|
|
37523
37504
|
const userPrompt = substituteVariables(evaluatorTemplate, variables);
|
|
37524
|
-
const response = await judgeProvider.invoke({
|
|
37525
|
-
question: userPrompt,
|
|
37526
|
-
systemPrompt,
|
|
37527
|
-
evalCaseId: context.evalCase.id,
|
|
37528
|
-
attempt: context.attempt,
|
|
37529
|
-
maxOutputTokens: this.maxOutputTokens,
|
|
37530
|
-
temperature: this.temperature
|
|
37531
|
-
});
|
|
37532
|
-
const parsed = parseQualityResponse(response);
|
|
37533
|
-
const score = clampScore(parsed.score ?? 0);
|
|
37534
|
-
const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
37535
|
-
const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
37536
|
-
const reasoning = parsed.reasoning ?? response.reasoning;
|
|
37537
|
-
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
37538
37505
|
const evaluatorRawRequest = {
|
|
37539
37506
|
userPrompt,
|
|
37540
37507
|
systemPrompt,
|
|
37541
37508
|
target: judgeProvider.targetName
|
|
37542
37509
|
};
|
|
37510
|
+
try {
|
|
37511
|
+
const { data, providerResponse } = await this.runWithRetry({
|
|
37512
|
+
context,
|
|
37513
|
+
judgeProvider,
|
|
37514
|
+
systemPrompt,
|
|
37515
|
+
userPrompt,
|
|
37516
|
+
schema: freeformEvaluationSchema
|
|
37517
|
+
});
|
|
37518
|
+
const score = clampScore(data.score);
|
|
37519
|
+
const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
|
|
37520
|
+
const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
|
|
37521
|
+
const reasoning = data.reasoning ?? providerResponse?.reasoning;
|
|
37522
|
+
const expectedAspectCount = Math.max(hits.length + misses.length, 1);
|
|
37523
|
+
return {
|
|
37524
|
+
score,
|
|
37525
|
+
verdict: scoreToVerdict(score),
|
|
37526
|
+
hits,
|
|
37527
|
+
misses,
|
|
37528
|
+
expectedAspectCount,
|
|
37529
|
+
reasoning,
|
|
37530
|
+
evaluatorRawRequest
|
|
37531
|
+
};
|
|
37532
|
+
} catch {
|
|
37533
|
+
return {
|
|
37534
|
+
score: 0,
|
|
37535
|
+
verdict: "fail",
|
|
37536
|
+
hits: [],
|
|
37537
|
+
misses: [],
|
|
37538
|
+
expectedAspectCount: 1,
|
|
37539
|
+
evaluatorRawRequest
|
|
37540
|
+
};
|
|
37541
|
+
}
|
|
37542
|
+
}
|
|
37543
|
+
async evaluateWithRubrics(context, judgeProvider, rubrics) {
|
|
37544
|
+
if (!rubrics || rubrics.length === 0) {
|
|
37545
|
+
throw new Error(
|
|
37546
|
+
`No rubrics found for evaluator "${context.evaluator?.name ?? "llm_judge"}". Run "agentv generate rubrics" first.`
|
|
37547
|
+
);
|
|
37548
|
+
}
|
|
37549
|
+
const prompt = this.buildRubricPrompt(context, rubrics);
|
|
37550
|
+
const systemPrompt = buildRubricOutputSchema();
|
|
37551
|
+
const evaluatorRawRequest = {
|
|
37552
|
+
userPrompt: prompt,
|
|
37553
|
+
systemPrompt,
|
|
37554
|
+
target: judgeProvider.targetName
|
|
37555
|
+
};
|
|
37556
|
+
const { data } = await this.runWithRetry({
|
|
37557
|
+
context,
|
|
37558
|
+
judgeProvider,
|
|
37559
|
+
systemPrompt,
|
|
37560
|
+
userPrompt: prompt,
|
|
37561
|
+
schema: rubricEvaluationSchema
|
|
37562
|
+
});
|
|
37563
|
+
const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics);
|
|
37543
37564
|
return {
|
|
37544
37565
|
score,
|
|
37566
|
+
verdict,
|
|
37545
37567
|
hits,
|
|
37546
37568
|
misses,
|
|
37547
|
-
expectedAspectCount,
|
|
37548
|
-
reasoning,
|
|
37569
|
+
expectedAspectCount: rubrics.length,
|
|
37570
|
+
reasoning: data.overall_reasoning,
|
|
37549
37571
|
evaluatorRawRequest
|
|
37550
37572
|
};
|
|
37551
37573
|
}
|
|
37574
|
+
buildRubricPrompt(context, rubrics) {
|
|
37575
|
+
const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
|
|
37576
|
+
const parts = [
|
|
37577
|
+
"You are an expert evaluator. Evaluate the candidate answer against each rubric item below.",
|
|
37578
|
+
"",
|
|
37579
|
+
"[[ ## question ## ]]",
|
|
37580
|
+
formattedQuestion,
|
|
37581
|
+
"",
|
|
37582
|
+
"[[ ## expected_outcome ## ]]",
|
|
37583
|
+
context.evalCase.expected_outcome,
|
|
37584
|
+
""
|
|
37585
|
+
];
|
|
37586
|
+
if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) {
|
|
37587
|
+
parts.push("[[ ## reference_answer ## ]]", context.evalCase.reference_answer, "");
|
|
37588
|
+
}
|
|
37589
|
+
parts.push("[[ ## candidate_answer ## ]]", context.candidate, "", "[[ ## rubrics ## ]]");
|
|
37590
|
+
for (const rubric of rubrics) {
|
|
37591
|
+
const requiredLabel = rubric.required ? " (REQUIRED)" : "";
|
|
37592
|
+
const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
|
|
37593
|
+
parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.description}`);
|
|
37594
|
+
}
|
|
37595
|
+
parts.push("", "For each rubric, determine if it is satisfied and provide brief reasoning.");
|
|
37596
|
+
return parts.join("\n");
|
|
37597
|
+
}
|
|
37598
|
+
async runWithRetry(options) {
|
|
37599
|
+
const { context, judgeProvider, systemPrompt, userPrompt, schema } = options;
|
|
37600
|
+
let lastError;
|
|
37601
|
+
for (let attempt = 1; attempt <= 3; attempt++) {
|
|
37602
|
+
try {
|
|
37603
|
+
const model = judgeProvider.asLanguageModel?.();
|
|
37604
|
+
if (model) {
|
|
37605
|
+
const { text: text2 } = await generateText({
|
|
37606
|
+
model,
|
|
37607
|
+
system: systemPrompt,
|
|
37608
|
+
prompt: userPrompt,
|
|
37609
|
+
...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
|
|
37610
|
+
...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
|
|
37611
|
+
});
|
|
37612
|
+
const data2 = schema.parse(parseJsonFromText(text2));
|
|
37613
|
+
return { data: data2 };
|
|
37614
|
+
}
|
|
37615
|
+
const response = await judgeProvider.invoke({
|
|
37616
|
+
question: userPrompt,
|
|
37617
|
+
systemPrompt,
|
|
37618
|
+
evalCaseId: context.evalCase.id,
|
|
37619
|
+
attempt: context.attempt,
|
|
37620
|
+
maxOutputTokens: this.maxOutputTokens,
|
|
37621
|
+
temperature: this.temperature
|
|
37622
|
+
});
|
|
37623
|
+
const data = schema.parse(parseJsonFromText(response.text ?? ""));
|
|
37624
|
+
return { data, providerResponse: response };
|
|
37625
|
+
} catch (e) {
|
|
37626
|
+
lastError = e instanceof Error ? e : new Error(String(e));
|
|
37627
|
+
}
|
|
37628
|
+
}
|
|
37629
|
+
throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
|
|
37630
|
+
}
|
|
37552
37631
|
};
|
|
37553
37632
|
function buildOutputSchema() {
|
|
37554
37633
|
return [
|
|
@@ -37562,6 +37641,29 @@ function buildOutputSchema() {
|
|
|
37562
37641
|
"}"
|
|
37563
37642
|
].join("\n");
|
|
37564
37643
|
}
|
|
37644
|
+
function buildRubricOutputSchema() {
|
|
37645
|
+
return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
|
|
37646
|
+
You must return a valid JSON object matching this schema:
|
|
37647
|
+
{
|
|
37648
|
+
"checks": [
|
|
37649
|
+
{
|
|
37650
|
+
"id": "string (rubric id)",
|
|
37651
|
+
"satisfied": boolean,
|
|
37652
|
+
"reasoning": "string (brief explanation)"
|
|
37653
|
+
}
|
|
37654
|
+
],
|
|
37655
|
+
"overall_reasoning": "string (summary)"
|
|
37656
|
+
}`;
|
|
37657
|
+
}
|
|
37658
|
+
function scoreToVerdict(score) {
|
|
37659
|
+
if (score >= 0.8) {
|
|
37660
|
+
return "pass";
|
|
37661
|
+
}
|
|
37662
|
+
if (score >= 0.6) {
|
|
37663
|
+
return "borderline";
|
|
37664
|
+
}
|
|
37665
|
+
return "fail";
|
|
37666
|
+
}
|
|
37565
37667
|
function clampScore(value) {
|
|
37566
37668
|
if (Number.isNaN(value) || !Number.isFinite(value)) {
|
|
37567
37669
|
return 0;
|
|
@@ -37574,71 +37676,15 @@ function clampScore(value) {
|
|
|
37574
37676
|
}
|
|
37575
37677
|
return value;
|
|
37576
37678
|
}
|
|
37577
|
-
function parseQualityResponse(response) {
|
|
37578
|
-
const text2 = typeof response.text === "string" ? response.text.trim() : "";
|
|
37579
|
-
if (text2.length === 0) {
|
|
37580
|
-
return {};
|
|
37581
|
-
}
|
|
37582
|
-
const direct = attemptParseJson(text2);
|
|
37583
|
-
if (direct && validateQualityJson(direct)) {
|
|
37584
|
-
return direct;
|
|
37585
|
-
}
|
|
37586
|
-
const extracted = extractJsonBlob(text2);
|
|
37587
|
-
if (extracted) {
|
|
37588
|
-
const parsed = attemptParseJson(extracted);
|
|
37589
|
-
if (parsed && validateQualityJson(parsed)) {
|
|
37590
|
-
return parsed;
|
|
37591
|
-
}
|
|
37592
|
-
}
|
|
37593
|
-
return {};
|
|
37594
|
-
}
|
|
37595
|
-
function attemptParseJson(text2) {
|
|
37596
|
-
try {
|
|
37597
|
-
const parsed = JSON.parse(text2);
|
|
37598
|
-
const score = typeof parsed.score === "number" ? parsed.score : void 0;
|
|
37599
|
-
const hits = parsed.hits;
|
|
37600
|
-
const misses = parsed.misses;
|
|
37601
|
-
const reasoning = typeof parsed.reasoning === "string" ? parsed.reasoning : void 0;
|
|
37602
|
-
return { score, hits, misses, reasoning };
|
|
37603
|
-
} catch {
|
|
37604
|
-
return void 0;
|
|
37605
|
-
}
|
|
37606
|
-
}
|
|
37607
|
-
function validateQualityJson(parsed) {
|
|
37608
|
-
if (typeof parsed.score !== "number") {
|
|
37609
|
-
return false;
|
|
37610
|
-
}
|
|
37611
|
-
if (Number.isNaN(parsed.score) || !Number.isFinite(parsed.score)) {
|
|
37612
|
-
return false;
|
|
37613
|
-
}
|
|
37614
|
-
if (parsed.score < 0 || parsed.score > 1) {
|
|
37615
|
-
return false;
|
|
37616
|
-
}
|
|
37617
|
-
if (parsed.hits !== void 0) {
|
|
37618
|
-
if (!Array.isArray(parsed.hits)) {
|
|
37619
|
-
return false;
|
|
37620
|
-
}
|
|
37621
|
-
if (!parsed.hits.every((item) => typeof item === "string")) {
|
|
37622
|
-
return false;
|
|
37623
|
-
}
|
|
37624
|
-
}
|
|
37625
|
-
if (parsed.misses !== void 0) {
|
|
37626
|
-
if (!Array.isArray(parsed.misses)) {
|
|
37627
|
-
return false;
|
|
37628
|
-
}
|
|
37629
|
-
if (!parsed.misses.every((item) => typeof item === "string")) {
|
|
37630
|
-
return false;
|
|
37631
|
-
}
|
|
37632
|
-
}
|
|
37633
|
-
if (parsed.reasoning !== void 0 && typeof parsed.reasoning !== "string") {
|
|
37634
|
-
return false;
|
|
37635
|
-
}
|
|
37636
|
-
return true;
|
|
37637
|
-
}
|
|
37638
37679
|
function extractJsonBlob(text2) {
|
|
37639
37680
|
const match = text2.match(/\{[\s\S]*\}/);
|
|
37640
37681
|
return match?.[0];
|
|
37641
37682
|
}
|
|
37683
|
+
function parseJsonFromText(text2) {
|
|
37684
|
+
const cleaned = typeof text2 === "string" ? text2.replace(/```json\n?|```/g, "").trim() : "";
|
|
37685
|
+
const blob = extractJsonBlob(cleaned) ?? cleaned;
|
|
37686
|
+
return JSON.parse(blob);
|
|
37687
|
+
}
|
|
37642
37688
|
function isNonEmptyString(value) {
|
|
37643
37689
|
return typeof value === "string" && value.trim().length > 0;
|
|
37644
37690
|
}
|
|
@@ -37675,6 +37721,7 @@ var CodeEvaluator = class {
|
|
|
37675
37721
|
const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
|
|
37676
37722
|
return {
|
|
37677
37723
|
score,
|
|
37724
|
+
verdict: scoreToVerdict(score),
|
|
37678
37725
|
hits,
|
|
37679
37726
|
misses,
|
|
37680
37727
|
expectedAspectCount: hits.length + misses.length || 1,
|
|
@@ -37688,6 +37735,7 @@ var CodeEvaluator = class {
|
|
|
37688
37735
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
37689
37736
|
return {
|
|
37690
37737
|
score: 0,
|
|
37738
|
+
verdict: "fail",
|
|
37691
37739
|
hits: [],
|
|
37692
37740
|
misses: [`Code evaluator failed: ${message}`],
|
|
37693
37741
|
expectedAspectCount: 1,
|
|
@@ -37701,6 +37749,33 @@ var CodeEvaluator = class {
|
|
|
37701
37749
|
}
|
|
37702
37750
|
}
|
|
37703
37751
|
};
|
|
37752
|
+
/**
 * Turns per-rubric check outcomes into a weighted score and verdict.
 *
 * Each entry of `result.checks` is matched to a rubric by `id`; checks whose
 * id matches no rubric are ignored. The score is the earned weight divided
 * by the total weight of the matched rubrics, limited to the range 0..1
 * (0 when no weight was matched).
 *
 * @param {{checks: Iterable<{id: string, satisfied: boolean, reasoning: string}>}} result
 *   Judge output holding one check per evaluated rubric.
 * @param {Array<{id: string, description: string, weight: number, required?: boolean}>} rubrics
 *   Rubric definitions the checks refer to.
 * @returns {{score: number, verdict: string, hits: string[], misses: string[]}}
 *   Score, verdict and formatted hit/miss lines.
 */
function calculateRubricScore(result, rubrics) {
  const rubricsById = new Map();
  for (const rubric of rubrics) {
    rubricsById.set(rubric.id, rubric);
  }

  const hits = [];
  const misses = [];
  let possibleWeight = 0;
  let achievedWeight = 0;
  let failedRequired = false;

  for (const entry of result.checks) {
    const rubric = rubricsById.get(entry.id);
    if (!rubric) {
      continue;
    }
    possibleWeight += rubric.weight;
    const line = `[${rubric.id}] ${rubric.description}: ${entry.reasoning}`;
    if (entry.satisfied) {
      achievedWeight += rubric.weight;
      hits.push(line);
      continue;
    }
    misses.push(line);
    if (rubric.required) {
      failedRequired = true;
    }
  }

  let score = 0;
  if (possibleWeight > 0) {
    score = Math.max(0, Math.min(1, achievedWeight / possibleWeight));
  }

  // A missed required rubric forces a failing verdict regardless of score.
  const verdict = failedRequired ? "fail" : scoreToVerdict(score);
  return { score, verdict, hits, misses };
}
|
|
37704
37779
|
async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
|
|
37705
37780
|
const { spawn: spawn22 } = await import("node:child_process");
|
|
37706
37781
|
return await new Promise((resolve2, reject) => {
|
|
@@ -37752,6 +37827,228 @@ function substituteVariables(template, variables) {
|
|
|
37752
37827
|
return variables[varName] ?? match;
|
|
37753
37828
|
});
|
|
37754
37829
|
}
|
|
37830
|
+
/**
 * Prompt used by the `llm_judge` aggregator when the aggregator config has
 * no `prompt` of its own. `{{EVALUATOR_RESULTS_JSON}}` is replaced with the
 * serialized member results before the prompt is sent.
 */
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
{{EVALUATOR_RESULTS_JSON}}

Decide the final score and verdict based on all evaluator results.
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;

/**
 * Evaluator that runs a list of member evaluators and merges their results
 * into one score.
 *
 * The merge strategy is chosen by `config.aggregator.type`:
 *   - `"code_judge"`: an external script receives the member results.
 *   - `"llm_judge"`: the judge provider receives the member results.
 *   - anything else, or no aggregator at all: weighted average of the scores.
 */
var CompositeEvaluator = class {
  kind = "composite";
  config;
  evaluatorFactory;
  cwd;

  /**
   * @param {{config: object, evaluatorFactory: {create: Function}, cwd?: string}} options
   *   `config` is the composite evaluator configuration (`evaluators`,
   *   `aggregator`), `evaluatorFactory.create(memberConfig, context)` builds
   *   a member evaluator, and `cwd` is the fallback working directory for a
   *   code aggregator.
   */
  constructor(options) {
    this.config = options.config;
    this.evaluatorFactory = options.evaluatorFactory;
    this.cwd = options.cwd;
  }

  /**
   * Runs every member evaluator concurrently and aggregates the outcomes.
   * A rejection from any member rejects the whole evaluation.
   *
   * @param {object} context - Evaluation context handed to each member.
   * @returns {Promise<object>} The aggregated evaluation result.
   */
  async evaluate(context) {
    const memberResults = await Promise.all(
      this.config.evaluators.map(async (memberConfig) => {
        const evaluator = this.evaluatorFactory.create(memberConfig, context);
        return {
          id: memberConfig.name,
          type: memberConfig.type,
          result: await evaluator.evaluate(context)
        };
      })
    );
    return this.aggregate(memberResults, context);
  }

  /**
   * Dispatches to the aggregation strategy named by the config.
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {object} context
   * @returns {Promise<object>} The aggregated evaluation result.
   */
  async aggregate(results, context) {
    const aggregator = this.config.aggregator;
    // A missing aggregator falls through to the weighted average rather
    // than throwing on the property access.
    switch (aggregator?.type) {
      case "code_judge":
        return this.runCodeAggregator(
          results,
          aggregator.path,
          aggregator.cwd ?? this.cwd,
          aggregator.weights
        );
      case "llm_judge":
        return this.runLlmAggregator(results, context, aggregator);
      default:
        return this.runWeightedAverage(results, aggregator?.weights);
    }
  }

  /**
   * Builds the per-member record stored in `evaluatorResults`.
   * The `weight` key is only present when a weight is supplied.
   */
  #summarizeMember(member, weight) {
    const { result } = member;
    return {
      name: member.id,
      type: member.type,
      score: result.score,
      ...(weight === undefined ? {} : { weight }),
      verdict: result.verdict,
      hits: [...result.hits],
      misses: [...result.misses],
      reasoning: result.reasoning,
      evaluatorRawRequest: result.evaluatorRawRequest,
      evaluatorResults: result.evaluatorResults
    };
  }

  /**
   * Combines member scores as a weighted average. Members without an entry
   * in `weights` count with weight 1. Hits and misses are prefixed with the
   * member id; member reasonings are joined with "; ".
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {Record<string, number>} [weights] - Weight per member id.
   * @returns {object} The aggregated evaluation result.
   */
  runWeightedAverage(results, weights) {
    let totalWeight = 0;
    let weightedSum = 0;
    const allHits = [];
    const allMisses = [];
    const reasoningParts = [];
    const evaluatorResults = [];
    for (const member of results) {
      const weight = weights?.[member.id] ?? 1;
      totalWeight += weight;
      weightedSum += member.result.score * weight;
      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
      if (member.result.reasoning) {
        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
      }
      evaluatorResults.push(this.#summarizeMember(member, weight));
    }
    // Derive the verdict from the same clamped value that is reported.
    const finalScore = clampScore(totalWeight > 0 ? weightedSum / totalWeight : 0);
    return {
      score: finalScore,
      verdict: scoreToVerdict(finalScore),
      hits: allHits,
      misses: allMisses,
      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
      evaluatorRawRequest: {
        aggregator: "weighted_average",
        ...weights ? { weights } : {}
      },
      evaluatorResults
    };
  }

  /**
   * Passes the member results as a JSON payload to an external script and
   * reads the final score from the script's output. A script-supplied
   * verdict is honoured only when it is "pass", "fail" or "borderline";
   * otherwise the verdict is derived from the score. Any failure yields a
   * zero-score "fail" result that carries the error message.
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {string} scriptPath - Script handed to `executeScript`.
   * @param {string} [cwd] - Working directory handed to `executeScript`.
   * @param {Record<string, number>} [weights] - Weights recorded on the
   *   member summaries (default 1 per member).
   * @returns {Promise<object>} The aggregated evaluation result.
   */
  async runCodeAggregator(results, scriptPath, cwd, weights) {
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
    const evaluatorResults = results.map(
      (member) => this.#summarizeMember(member, weights?.[member.id] ?? 1)
    );
    try {
      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
      const parsed = parseJsonSafe(stdout);
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
      const declared = parsed?.verdict;
      const verdict = declared === "pass" || declared === "fail" || declared === "borderline" ? declared : scoreToVerdict(score);
      return {
        score,
        verdict,
        hits,
        misses,
        expectedAspectCount: hits.length + misses.length || 1,
        reasoning,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath
        },
        evaluatorResults
      };
    } catch (error40) {
      const message = error40 instanceof Error ? error40.message : String(error40);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath,
          error: message
        },
        evaluatorResults
      };
    }
  }

  /**
   * Asks the judge provider for the final score. The provider's language
   * model is used when `asLanguageModel()` returns one; otherwise the
   * provider is invoked directly. Hits and misses are limited to four
   * entries each. A failed request or unparseable response yields a
   * zero-score "fail" result that reports the error message.
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {object} context - Must carry `judgeProvider`; `evalCase.id` and
   *   `attempt` are forwarded when the provider is invoked directly.
   * @param {{prompt?: string}} config2 - Aggregator config; `prompt`
   *   overrides the default prompt template.
   * @returns {Promise<object>} The aggregated evaluation result.
   * @throws {Error} When the context has no judge provider.
   */
  async runLlmAggregator(results, context, config2) {
    const judgeProvider = context.judgeProvider;
    if (!judgeProvider) {
      throw new Error("No judge provider available for LLM aggregation");
    }
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const resultsJson = JSON.stringify(resultsObject, null, 2);
    const evaluatorResults = results.map((member) => this.#summarizeMember(member));
    const promptTemplate = config2.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
    // A replacer function inserts the JSON verbatim; a plain replacement
    // string would expand "$&", "$1", "$$" sequences found in the results.
    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, () => resultsJson);
    const systemPrompt = buildOutputSchema();
    const evaluatorRawRequest = {
      aggregator: "llm_judge",
      userPrompt,
      systemPrompt,
      target: judgeProvider.targetName
    };
    try {
      let responseText;
      let fallbackReasoning;
      const model = judgeProvider.asLanguageModel?.();
      if (model) {
        const { text: text2 } = await generateText({
          model,
          system: systemPrompt,
          prompt: userPrompt
        });
        responseText = text2;
      } else {
        const response = await judgeProvider.invoke({
          question: userPrompt,
          systemPrompt,
          evalCaseId: context.evalCase.id,
          attempt: context.attempt
        });
        responseText = response.text ?? "";
        fallbackReasoning = response.reasoning;
      }
      const data = freeformEvaluationSchema.parse(parseJsonFromText(responseText));
      const score = clampScore(data.score);
      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
      return {
        score,
        verdict: scoreToVerdict(score),
        hits,
        misses,
        expectedAspectCount: Math.max(hits.length + misses.length, 1),
        reasoning: data.reasoning ?? fallbackReasoning,
        evaluatorRawRequest,
        evaluatorResults
      };
    } catch (error40) {
      // Report the failure the same way the code aggregator does instead of
      // returning an unexplained zero score.
      const message = error40 instanceof Error ? error40.message : String(error40);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`LLM aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: { ...evaluatorRawRequest, error: message },
        evaluatorResults
      };
    }
  }
};
|
|
37755
38052
|
var Node = class {
|
|
37756
38053
|
value;
|
|
37757
38054
|
next;
|
|
@@ -38426,7 +38723,6 @@ async function runEvaluatorList(options) {
|
|
|
38426
38723
|
reasoning: score2.reasoning,
|
|
38427
38724
|
evaluator_provider_request: score2.evaluatorRawRequest
|
|
38428
38725
|
});
|
|
38429
|
-
continue;
|
|
38430
38726
|
}
|
|
38431
38727
|
if (evaluator.type === "code") {
|
|
38432
38728
|
const codeEvaluator = new CodeEvaluator({
|
|
@@ -38443,10 +38739,10 @@ async function runEvaluatorList(options) {
|
|
|
38443
38739
|
promptInputs,
|
|
38444
38740
|
now
|
|
38445
38741
|
});
|
|
38446
|
-
scored.push({ score: score2, name: evaluator.name, type:
|
|
38742
|
+
scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
|
|
38447
38743
|
evaluatorResults.push({
|
|
38448
38744
|
name: evaluator.name,
|
|
38449
|
-
type:
|
|
38745
|
+
type: "code_judge",
|
|
38450
38746
|
score: score2.score,
|
|
38451
38747
|
verdict: score2.verdict,
|
|
38452
38748
|
hits: score2.hits,
|
|
@@ -38454,19 +38750,37 @@ async function runEvaluatorList(options) {
|
|
|
38454
38750
|
reasoning: score2.reasoning,
|
|
38455
38751
|
evaluator_provider_request: score2.evaluatorRawRequest
|
|
38456
38752
|
});
|
|
38457
|
-
continue;
|
|
38458
38753
|
}
|
|
38459
|
-
if (evaluator.type === "
|
|
38460
|
-
const
|
|
38461
|
-
|
|
38462
|
-
|
|
38463
|
-
|
|
38464
|
-
return
|
|
38754
|
+
if (evaluator.type === "composite") {
|
|
38755
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path122.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
38756
|
+
const createEvaluator = (memberConfig) => {
|
|
38757
|
+
switch (memberConfig.type) {
|
|
38758
|
+
case "llm_judge":
|
|
38759
|
+
return evaluatorRegistry.llm_judge;
|
|
38760
|
+
case "code":
|
|
38761
|
+
return new CodeEvaluator({
|
|
38762
|
+
script: memberConfig.script,
|
|
38763
|
+
cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
|
|
38764
|
+
agentTimeoutMs
|
|
38765
|
+
});
|
|
38766
|
+
case "composite":
|
|
38767
|
+
return new CompositeEvaluator({
|
|
38768
|
+
config: memberConfig,
|
|
38769
|
+
cwd: evalFileDir,
|
|
38770
|
+
evaluatorFactory: { create: createEvaluator }
|
|
38771
|
+
});
|
|
38772
|
+
default: {
|
|
38773
|
+
const unknownConfig = memberConfig;
|
|
38774
|
+
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
38465
38775
|
}
|
|
38466
|
-
return judgeProvider;
|
|
38467
38776
|
}
|
|
38777
|
+
};
|
|
38778
|
+
const compositeEvaluator = new CompositeEvaluator({
|
|
38779
|
+
config: evaluator,
|
|
38780
|
+
cwd: evalFileDir,
|
|
38781
|
+
evaluatorFactory: { create: createEvaluator }
|
|
38468
38782
|
});
|
|
38469
|
-
const score2 = await
|
|
38783
|
+
const score2 = await compositeEvaluator.evaluate({
|
|
38470
38784
|
evalCase,
|
|
38471
38785
|
candidate,
|
|
38472
38786
|
target,
|
|
@@ -38485,27 +38799,31 @@ async function runEvaluatorList(options) {
|
|
|
38485
38799
|
hits: score2.hits,
|
|
38486
38800
|
misses: score2.misses,
|
|
38487
38801
|
reasoning: score2.reasoning,
|
|
38488
|
-
evaluator_provider_request: score2.evaluatorRawRequest
|
|
38802
|
+
evaluator_provider_request: score2.evaluatorRawRequest,
|
|
38803
|
+
evaluator_results: mapChildResults(score2.evaluatorResults)
|
|
38489
38804
|
});
|
|
38490
38805
|
}
|
|
38491
38806
|
} catch (error40) {
|
|
38492
38807
|
const message = error40 instanceof Error ? error40.message : String(error40);
|
|
38493
38808
|
const fallbackScore = {
|
|
38494
38809
|
score: 0,
|
|
38810
|
+
verdict: "fail",
|
|
38495
38811
|
hits: [],
|
|
38496
38812
|
misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
|
|
38497
38813
|
expectedAspectCount: 1,
|
|
38498
38814
|
reasoning: message
|
|
38499
38815
|
};
|
|
38816
|
+
const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
|
|
38500
38817
|
scored.push({
|
|
38501
38818
|
score: fallbackScore,
|
|
38502
38819
|
name: evaluator.name ?? "unknown",
|
|
38503
|
-
type:
|
|
38820
|
+
type: resultType ?? "llm_judge"
|
|
38504
38821
|
});
|
|
38505
38822
|
evaluatorResults.push({
|
|
38506
38823
|
name: evaluator.name ?? "unknown",
|
|
38507
|
-
type:
|
|
38824
|
+
type: resultType ?? "llm_judge",
|
|
38508
38825
|
score: 0,
|
|
38826
|
+
verdict: "fail",
|
|
38509
38827
|
hits: [],
|
|
38510
38828
|
misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
|
|
38511
38829
|
reasoning: message
|
|
@@ -38524,6 +38842,7 @@ async function runEvaluatorList(options) {
|
|
|
38524
38842
|
const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
|
|
38525
38843
|
const score = {
|
|
38526
38844
|
score: aggregateScore,
|
|
38845
|
+
verdict: scoreToVerdict2(aggregateScore),
|
|
38527
38846
|
hits,
|
|
38528
38847
|
misses,
|
|
38529
38848
|
expectedAspectCount,
|
|
@@ -38574,6 +38893,15 @@ async function resolveCustomPrompt(config2) {
|
|
|
38574
38893
|
/**
 * Reports whether `value` is a string with at least one non-whitespace
 * character.
 *
 * @param {unknown} value - Candidate value of any type.
 * @returns {boolean} `false` for non-strings and for blank strings.
 */
function isNonEmptyString2(value) {
  if (typeof value !== "string") {
    return false;
  }
  const trimmed = value.trim();
  return trimmed.length !== 0;
}
|
|
38896
|
+
/**
 * Maps a numeric score onto a verdict label.
 *
 * @param {number} score - Score to classify.
 * @returns {"pass"|"borderline"|"fail"} "pass" from 0.8 upward,
 *   "borderline" from 0.6 up to (not including) 0.8, "fail" otherwise
 *   (including NaN).
 */
function scoreToVerdict2(score) {
  // Written as a negated >= so that NaN lands in the "fail" bucket.
  if (!(score >= 0.6)) {
    return "fail";
  }
  return score < 0.8 ? "borderline" : "pass";
}
|
|
38577
38905
|
function filterEvalCases(evalCases, evalId) {
|
|
38578
38906
|
if (!evalId) {
|
|
38579
38907
|
return evalCases;
|
|
@@ -38711,6 +39039,23 @@ function isTimeoutLike(error40) {
|
|
|
38711
39039
|
const value = String(error40).toLowerCase();
|
|
38712
39040
|
return value.includes("timeout");
|
|
38713
39041
|
}
|
|
39042
|
+
/**
 * Converts nested evaluator results into the snake_case record shape,
 * recursing through each child's own `evaluatorResults`.
 *
 * @param {Array<object>|undefined|null} children - Child evaluator results.
 * @returns {Array<object>|undefined} The converted records, or `undefined`
 *   when there are no children.
 */
function mapChildResults(children) {
  const nothingToMap = !children || children.length === 0;
  if (nothingToMap) {
    return undefined;
  }
  return children.map((child) => {
    const { name, type, score, weight, verdict, hits, misses, reasoning } = child;
    return {
      name,
      type,
      score,
      weight,
      verdict,
      hits,
      misses,
      reasoning,
      evaluator_provider_request: child.evaluatorRawRequest,
      evaluator_results: mapChildResults(child.evaluatorResults)
    };
  });
}
|
|
38714
39059
|
var rubricItemSchema = external_exports.object({
|
|
38715
39060
|
id: external_exports.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
|
|
38716
39061
|
description: external_exports.string().describe("What this rubric checks for"),
|
|
@@ -40964,8 +41309,8 @@ var evalCommand = command({
|
|
|
40964
41309
|
workers: option({
|
|
40965
41310
|
type: number4,
|
|
40966
41311
|
long: "workers",
|
|
40967
|
-
description: "Number of parallel workers (default:
|
|
40968
|
-
defaultValue: () =>
|
|
41312
|
+
description: "Number of parallel workers (default: 3, max: 50). Can also be set per-target in targets.yaml",
|
|
41313
|
+
defaultValue: () => 3
|
|
40969
41314
|
}),
|
|
40970
41315
|
out: option({
|
|
40971
41316
|
type: optional2(string4),
|
|
@@ -41713,4 +42058,4 @@ export {
|
|
|
41713
42058
|
app,
|
|
41714
42059
|
runCli
|
|
41715
42060
|
};
|
|
41716
|
-
//# sourceMappingURL=chunk-
|
|
42061
|
+
//# sourceMappingURL=chunk-QRY42RAP.js.map
|