@agentv/core 0.22.2 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +400 -7
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +57 -3
- package/dist/index.d.ts +57 -3
- package/dist/index.js +399 -7
- package/dist/index.js.map +1 -1
- package/package.json +4 -8
package/dist/index.js
CHANGED
|
@@ -51,7 +51,7 @@ function isTestMessage(value) {
|
|
|
51
51
|
}
|
|
52
52
|
return candidate.content.every(isJsonObject);
|
|
53
53
|
}
|
|
54
|
-
var EVALUATOR_KIND_VALUES = ["
|
|
54
|
+
var EVALUATOR_KIND_VALUES = ["code_judge", "llm_judge", "rubric", "composite"];
|
|
55
55
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
56
56
|
function isEvaluatorKind(value) {
|
|
57
57
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
@@ -403,10 +403,10 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
403
403
|
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
404
404
|
continue;
|
|
405
405
|
}
|
|
406
|
-
if (typeValue === "
|
|
406
|
+
if (typeValue === "code_judge") {
|
|
407
407
|
const script = asString2(rawEvaluator.script);
|
|
408
408
|
if (!script) {
|
|
409
|
-
logWarning2(`Skipping
|
|
409
|
+
logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
|
|
410
410
|
continue;
|
|
411
411
|
}
|
|
412
412
|
const cwd = asString2(rawEvaluator.cwd);
|
|
@@ -417,7 +417,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
417
417
|
resolvedCwd = path3.resolve(resolved.resolvedPath);
|
|
418
418
|
} else {
|
|
419
419
|
logWarning2(
|
|
420
|
-
`
|
|
420
|
+
`Code_judge evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
421
421
|
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
422
422
|
);
|
|
423
423
|
}
|
|
@@ -433,6 +433,105 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
433
433
|
});
|
|
434
434
|
continue;
|
|
435
435
|
}
|
|
436
|
+
if (typeValue === "composite") {
|
|
437
|
+
const rawMembers = rawEvaluator.evaluators;
|
|
438
|
+
if (!Array.isArray(rawMembers)) {
|
|
439
|
+
logWarning2(
|
|
440
|
+
`Skipping composite evaluator '${name}' in '${evalId}': missing evaluators array`
|
|
441
|
+
);
|
|
442
|
+
continue;
|
|
443
|
+
}
|
|
444
|
+
const rawAggregator = rawEvaluator.aggregator;
|
|
445
|
+
if (!isJsonObject2(rawAggregator)) {
|
|
446
|
+
logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
|
|
447
|
+
continue;
|
|
448
|
+
}
|
|
449
|
+
const aggregatorType = asString2(rawAggregator.type);
|
|
450
|
+
if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
|
|
451
|
+
logWarning2(
|
|
452
|
+
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
453
|
+
);
|
|
454
|
+
continue;
|
|
455
|
+
}
|
|
456
|
+
const memberEvaluators = [];
|
|
457
|
+
for (const rawMember of rawMembers) {
|
|
458
|
+
if (!isJsonObject2(rawMember)) {
|
|
459
|
+
logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
|
|
460
|
+
continue;
|
|
461
|
+
}
|
|
462
|
+
const memberName = asString2(rawMember.name);
|
|
463
|
+
const memberType = rawMember.type;
|
|
464
|
+
if (!memberName || !isEvaluatorKind(memberType)) {
|
|
465
|
+
logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
|
|
466
|
+
continue;
|
|
467
|
+
}
|
|
468
|
+
const memberConfigs = await parseEvaluators(
|
|
469
|
+
{ evaluators: [rawMember] },
|
|
470
|
+
void 0,
|
|
471
|
+
searchRoots,
|
|
472
|
+
`${evalId}:${name}:${memberName}`
|
|
473
|
+
);
|
|
474
|
+
if (memberConfigs && memberConfigs.length > 0) {
|
|
475
|
+
memberEvaluators.push(memberConfigs[0]);
|
|
476
|
+
}
|
|
477
|
+
}
|
|
478
|
+
if (memberEvaluators.length === 0) {
|
|
479
|
+
logWarning2(
|
|
480
|
+
`Skipping composite evaluator '${name}' in '${evalId}': no valid member evaluators`
|
|
481
|
+
);
|
|
482
|
+
continue;
|
|
483
|
+
}
|
|
484
|
+
let aggregator;
|
|
485
|
+
if (aggregatorType === "weighted_average") {
|
|
486
|
+
const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
|
|
487
|
+
const parsedWeights = {};
|
|
488
|
+
if (weights) {
|
|
489
|
+
for (const [key, value] of Object.entries(weights)) {
|
|
490
|
+
if (typeof value === "number") {
|
|
491
|
+
parsedWeights[key] = value;
|
|
492
|
+
}
|
|
493
|
+
}
|
|
494
|
+
}
|
|
495
|
+
aggregator = {
|
|
496
|
+
type: "weighted_average",
|
|
497
|
+
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
498
|
+
};
|
|
499
|
+
} else if (aggregatorType === "code_judge") {
|
|
500
|
+
const aggregatorPath = asString2(rawAggregator.path);
|
|
501
|
+
if (!aggregatorPath) {
|
|
502
|
+
logWarning2(
|
|
503
|
+
`Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
|
|
504
|
+
);
|
|
505
|
+
continue;
|
|
506
|
+
}
|
|
507
|
+
aggregator = {
|
|
508
|
+
type: "code_judge",
|
|
509
|
+
path: aggregatorPath,
|
|
510
|
+
cwd: searchRoots[0]
|
|
511
|
+
};
|
|
512
|
+
} else {
|
|
513
|
+
const aggregatorPrompt = asString2(rawAggregator.prompt);
|
|
514
|
+
let promptPath2;
|
|
515
|
+
if (aggregatorPrompt) {
|
|
516
|
+
const resolved = await resolveFileReference2(aggregatorPrompt, searchRoots);
|
|
517
|
+
if (resolved.resolvedPath) {
|
|
518
|
+
promptPath2 = path3.resolve(resolved.resolvedPath);
|
|
519
|
+
}
|
|
520
|
+
}
|
|
521
|
+
aggregator = {
|
|
522
|
+
type: "llm_judge",
|
|
523
|
+
...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
|
|
524
|
+
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
525
|
+
};
|
|
526
|
+
}
|
|
527
|
+
evaluators.push({
|
|
528
|
+
name,
|
|
529
|
+
type: "composite",
|
|
530
|
+
evaluators: memberEvaluators,
|
|
531
|
+
aggregator
|
|
532
|
+
});
|
|
533
|
+
continue;
|
|
534
|
+
}
|
|
436
535
|
const prompt = asString2(rawEvaluator.prompt);
|
|
437
536
|
let promptPath;
|
|
438
537
|
if (prompt) {
|
|
@@ -3329,6 +3428,228 @@ function substituteVariables(template, variables) {
|
|
|
3329
3428
|
return variables[varName] ?? match;
|
|
3330
3429
|
});
|
|
3331
3430
|
}
|
|
3431
|
+
// Default user prompt for the llm_judge aggregator. The
// {{EVALUATOR_RESULTS_JSON}} placeholder is replaced with the pretty-printed
// JSON of all member results before the prompt is sent to the judge.
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
{{EVALUATOR_RESULTS_JSON}}

Decide the final score and verdict based on all evaluator results.
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;

/**
 * Builds the child-result record reported for one member of a composite.
 *
 * @param {{id: string, type: string, result: object}} member - Member id/type plus its evaluation result.
 * @param {number} [weight] - Weight to record; the `weight` key is omitted when not supplied.
 * @returns {object} Copy of the member result (hits/misses are shallow-copied arrays).
 */
function toCompositeChildResult(member, weight) {
  const { result } = member;
  return {
    name: member.id,
    type: member.type,
    score: result.score,
    ...(weight === undefined ? {} : { weight }),
    verdict: result.verdict,
    hits: [...result.hits],
    misses: [...result.misses],
    reasoning: result.reasoning,
    evaluatorRawRequest: result.evaluatorRawRequest,
    evaluatorResults: result.evaluatorResults
  };
}

/**
 * Evaluator that runs a list of member evaluators and combines their results
 * with one of three aggregators: weighted average (default), a code_judge
 * script, or an llm_judge prompt.
 */
var CompositeEvaluator = class {
  kind = "composite";
  config;
  evaluatorFactory;
  cwd;

  /**
   * @param {{config: object, evaluatorFactory: {create: Function}, cwd?: string}} options
   *   `config` holds `evaluators` (member configs) and `aggregator`;
   *   `evaluatorFactory.create(memberConfig, context)` builds each member;
   *   `cwd` is the fallback working directory for a code_judge aggregator.
   */
  constructor(options) {
    this.config = options.config;
    this.evaluatorFactory = options.evaluatorFactory;
    this.cwd = options.cwd;
  }

  /**
   * Runs every member evaluator concurrently, then aggregates the results.
   * A member that throws (or a factory failure) rejects the whole evaluation.
   *
   * @param {object} context - Evaluation context forwarded unchanged to each member.
   * @returns {Promise<object>} The aggregated evaluation result.
   */
  async evaluate(context) {
    const memberResults = await Promise.all(
      this.config.evaluators.map(async (memberConfig) => {
        const evaluator = this.evaluatorFactory.create(memberConfig, context);
        return {
          id: memberConfig.name,
          type: memberConfig.type,
          result: await evaluator.evaluate(context)
        };
      })
    );
    return this.aggregate(memberResults, context);
  }

  /**
   * Dispatches to the aggregator selected by `config.aggregator.type`.
   * Any type other than "code_judge" or "llm_judge" falls back to the
   * weighted average.
   */
  async aggregate(results, context) {
    const aggregator = this.config.aggregator;
    switch (aggregator.type) {
      case "code_judge":
        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
      case "llm_judge":
        return this.runLlmAggregator(results, context, aggregator);
      default:
        return this.runWeightedAverage(results, aggregator.weights);
    }
  }

  /**
   * Combines member scores as a weighted mean.
   *
   * @param {Array<{id: string, type: string, result: object}>} results - Member results.
   * @param {Record<string, number>} [weights] - Per-member weights keyed by member id; missing entries default to 1.
   * @returns {object} Result whose hits/misses are prefixed with `[memberId]`
   *   and whose score is 0 when the total weight is not positive.
   */
  runWeightedAverage(results, weights) {
    let totalWeight = 0;
    let weightedSum = 0;
    const allHits = [];
    const allMisses = [];
    const reasoningParts = [];
    const evaluatorResults = [];
    for (const member of results) {
      const weight = weights?.[member.id] ?? 1;
      totalWeight += weight;
      weightedSum += member.result.score * weight;
      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
      if (member.result.reasoning) {
        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
      }
      evaluatorResults.push(toCompositeChildResult(member, weight));
    }
    // Avoid dividing by zero when there are no members or all weights cancel out.
    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
    return {
      score: clampScore(finalScore),
      verdict: scoreToVerdict(finalScore),
      hits: allHits,
      misses: allMisses,
      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
      evaluatorRawRequest: {
        aggregator: "weighted_average",
        ...weights ? { weights } : {}
      },
      evaluatorResults
    };
  }

  /**
   * Delegates aggregation to an external script. The script receives
   * `{ results: { [memberId]: result } }` as JSON and its stdout is parsed
   * for `score`, `hits`, `misses`, `reasoning` and `verdict`.
   *
   * @param {Array<object>} results - Member results.
   * @param {string} scriptPath - Script handed to `executeScript`.
   * @param {string} [cwd] - Working directory for the script.
   * @param {Record<string, number>} [weights] - Optional weights recorded on child results (default 1).
   * @returns {Promise<object>} Aggregated result; a failing script yields a
   *   score-0 "fail" result carrying the error message instead of throwing.
   */
  async runCodeAggregator(results, scriptPath, cwd, weights) {
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
    const evaluatorResults = results.map(
      (member) => toCompositeChildResult(member, weights?.[member.id] ?? 1)
    );
    try {
      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
      const parsed = parseJsonSafe(stdout);
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
      // Only the three known verdicts are trusted; anything else is derived from the score.
      const verdict = typeof parsed?.verdict === "string" && (parsed.verdict === "pass" || parsed.verdict === "fail" || parsed.verdict === "borderline") ? parsed.verdict : scoreToVerdict(score);
      return {
        score,
        verdict,
        hits,
        misses,
        expectedAspectCount: Math.max(hits.length + misses.length, 1),
        reasoning,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath
        },
        evaluatorResults
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath,
          error: message
        },
        evaluatorResults
      };
    }
  }

  /**
   * Asks the judge provider to decide the final score from all member results.
   *
   * @param {Array<object>} results - Member results.
   * @param {object} context - Must carry `judgeProvider`; `evalCase.id` and
   *   `attempt` are forwarded when the provider is called through `invoke`.
   * @param {{prompt?: string}} config - Aggregator config; `prompt` overrides the default template.
   * @returns {Promise<object>} Aggregated result (hits/misses capped at 4 each).
   *   A provider or parsing failure yields a score-0 "fail" result that
   *   reports the error message.
   * @throws {Error} When `context.judgeProvider` is missing.
   */
  async runLlmAggregator(results, context, config) {
    const judgeProvider = context.judgeProvider;
    if (!judgeProvider) {
      throw new Error("No judge provider available for LLM aggregation");
    }
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const resultsJson = JSON.stringify(resultsObject, null, 2);
    const evaluatorResults = results.map((member) => toCompositeChildResult(member));
    // NOTE(review): only config.prompt is read here; a resolved promptPath on
    // the aggregator config is not consulted in this method — confirm intended.
    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
    // Use a replacer function so "$$", "$&", "$1"... inside the JSON are
    // inserted literally instead of being read as replacement patterns.
    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, () => resultsJson);
    const systemPrompt = buildOutputSchema();
    const evaluatorRawRequest = {
      aggregator: "llm_judge",
      userPrompt,
      systemPrompt,
      target: judgeProvider.targetName
    };
    try {
      const model = judgeProvider.asLanguageModel?.();
      let text;
      let fallbackReasoning;
      if (model) {
        ({ text } = await generateText2({
          model,
          system: systemPrompt,
          prompt: userPrompt
        }));
      } else {
        const response = await judgeProvider.invoke({
          question: userPrompt,
          systemPrompt,
          evalCaseId: context.evalCase.id,
          attempt: context.attempt
        });
        text = response.text ?? "";
        // Only the invoke path exposes provider-level reasoning to fall back on.
        fallbackReasoning = response.reasoning;
      }
      const data = freeformEvaluationSchema.parse(parseJsonFromText(text));
      const score = clampScore(data.score);
      const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
      const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
      return {
        score,
        verdict: scoreToVerdict(score),
        hits,
        misses,
        expectedAspectCount: Math.max(hits.length + misses.length, 1),
        reasoning: data.reasoning ?? fallbackReasoning,
        evaluatorRawRequest,
        evaluatorResults
      };
    } catch (error) {
      // Report why aggregation failed (mirrors runCodeAggregator) rather than
      // returning an unexplained zero score.
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`LLM aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          ...evaluatorRawRequest,
          error: message
        },
        evaluatorResults
      };
    }
  }
};
|
|
3332
3653
|
|
|
3333
3654
|
// src/evaluation/orchestrator.ts
|
|
3334
3655
|
import { createHash, randomUUID as randomUUID2 } from "node:crypto";
|
|
@@ -4030,6 +4351,57 @@ async function runEvaluatorList(options) {
|
|
|
4030
4351
|
promptInputs,
|
|
4031
4352
|
now
|
|
4032
4353
|
});
|
|
4354
|
+
scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
|
|
4355
|
+
evaluatorResults.push({
|
|
4356
|
+
name: evaluator.name,
|
|
4357
|
+
type: "code_judge",
|
|
4358
|
+
score: score2.score,
|
|
4359
|
+
verdict: score2.verdict,
|
|
4360
|
+
hits: score2.hits,
|
|
4361
|
+
misses: score2.misses,
|
|
4362
|
+
reasoning: score2.reasoning,
|
|
4363
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4364
|
+
});
|
|
4365
|
+
}
|
|
4366
|
+
if (evaluator.type === "composite") {
|
|
4367
|
+
const evalFileDir = evalCase.guideline_paths[0] ? path12.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
4368
|
+
// Factory handed to CompositeEvaluator: maps a member config to the evaluator
// instance that should run it. Nested composites reuse this same factory, so
// composites can contain composites.
const createEvaluator = (memberConfig) => {
  const kind = memberConfig.type;
  if (kind === "llm_judge") {
    return evaluatorRegistry.llm_judge;
  }
  if (kind === "code") {
    // Prefer the already-resolved working directory when one is present.
    const scriptCwd = memberConfig.resolvedCwd ?? memberConfig.cwd;
    return new CodeEvaluator({
      script: memberConfig.script,
      cwd: scriptCwd,
      agentTimeoutMs
    });
  }
  if (kind === "composite") {
    const evaluatorFactory = { create: createEvaluator };
    return new CompositeEvaluator({
      config: memberConfig,
      cwd: evalFileDir,
      evaluatorFactory
    });
  }
  // Any other member type cannot be run inside a composite.
  throw new Error(`Unsupported evaluator type in composite: ${kind}`);
};
|
|
4390
|
+
const compositeEvaluator = new CompositeEvaluator({
|
|
4391
|
+
config: evaluator,
|
|
4392
|
+
cwd: evalFileDir,
|
|
4393
|
+
evaluatorFactory: { create: createEvaluator }
|
|
4394
|
+
});
|
|
4395
|
+
const score2 = await compositeEvaluator.evaluate({
|
|
4396
|
+
evalCase,
|
|
4397
|
+
candidate,
|
|
4398
|
+
target,
|
|
4399
|
+
provider,
|
|
4400
|
+
attempt,
|
|
4401
|
+
promptInputs,
|
|
4402
|
+
now,
|
|
4403
|
+
judgeProvider
|
|
4404
|
+
});
|
|
4033
4405
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
4034
4406
|
evaluatorResults.push({
|
|
4035
4407
|
name: evaluator.name,
|
|
@@ -4039,7 +4411,8 @@ async function runEvaluatorList(options) {
|
|
|
4039
4411
|
hits: score2.hits,
|
|
4040
4412
|
misses: score2.misses,
|
|
4041
4413
|
reasoning: score2.reasoning,
|
|
4042
|
-
evaluator_provider_request: score2.evaluatorRawRequest
|
|
4414
|
+
evaluator_provider_request: score2.evaluatorRawRequest,
|
|
4415
|
+
evaluator_results: mapChildResults(score2.evaluatorResults)
|
|
4043
4416
|
});
|
|
4044
4417
|
}
|
|
4045
4418
|
} catch (error) {
|
|
@@ -4052,14 +4425,15 @@ async function runEvaluatorList(options) {
|
|
|
4052
4425
|
expectedAspectCount: 1,
|
|
4053
4426
|
reasoning: message
|
|
4054
4427
|
};
|
|
4428
|
+
const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
|
|
4055
4429
|
scored.push({
|
|
4056
4430
|
score: fallbackScore,
|
|
4057
4431
|
name: evaluator.name ?? "unknown",
|
|
4058
|
-
type:
|
|
4432
|
+
type: resultType ?? "llm_judge"
|
|
4059
4433
|
});
|
|
4060
4434
|
evaluatorResults.push({
|
|
4061
4435
|
name: evaluator.name ?? "unknown",
|
|
4062
|
-
type:
|
|
4436
|
+
type: resultType ?? "llm_judge",
|
|
4063
4437
|
score: 0,
|
|
4064
4438
|
verdict: "fail",
|
|
4065
4439
|
hits: [],
|
|
@@ -4277,6 +4651,23 @@ function isTimeoutLike(error) {
|
|
|
4277
4651
|
const value = String(error).toLowerCase();
|
|
4278
4652
|
return value.includes("timeout");
|
|
4279
4653
|
}
|
|
4654
|
+
/**
 * Recursively converts child evaluator results into the snake_case shape used
 * in reported results (`evaluatorRawRequest` -> `evaluator_provider_request`,
 * `evaluatorResults` -> `evaluator_results`).
 *
 * @param {Array<object>|undefined|null} children - Child results of a composite evaluation.
 * @returns {Array<object>|undefined} Mapped children, or `undefined` when
 *   `children` is missing, empty, or not an array.
 */
function mapChildResults(children) {
  // Guard against non-array values so a malformed result cannot crash
  // reporting with "children.map is not a function".
  if (!Array.isArray(children) || children.length === 0) {
    return void 0;
  }
  return children.map((child) => ({
    name: child.name,
    type: child.type,
    score: child.score,
    weight: child.weight,
    verdict: child.verdict,
    hits: child.hits,
    misses: child.misses,
    reasoning: child.reasoning,
    evaluator_provider_request: child.evaluatorRawRequest,
    evaluator_results: mapChildResults(child.evaluatorResults)
  }));
}
|
|
4280
4671
|
|
|
4281
4672
|
// src/evaluation/generators/rubric-generator.ts
|
|
4282
4673
|
import { generateText as generateText3 } from "ai";
|
|
@@ -4364,6 +4755,7 @@ function createAgentKernel() {
|
|
|
4364
4755
|
}
|
|
4365
4756
|
export {
|
|
4366
4757
|
CodeEvaluator,
|
|
4758
|
+
CompositeEvaluator,
|
|
4367
4759
|
LlmJudgeEvaluator,
|
|
4368
4760
|
TEST_MESSAGE_ROLES,
|
|
4369
4761
|
buildDirectoryChain,
|