@agentv/core 0.22.2 → 0.23.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/evaluation/validation/index.cjs.map +1 -1
- package/dist/evaluation/validation/index.js.map +1 -1
- package/dist/index.cjs +400 -7
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +57 -3
- package/dist/index.d.ts +57 -3
- package/dist/index.js +399 -7
- package/dist/index.js.map +1 -1
- package/package.json +4 -8
package/dist/index.cjs
CHANGED
|
@@ -31,6 +31,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
|
|
|
31
31
|
var index_exports = {};
|
|
32
32
|
__export(index_exports, {
|
|
33
33
|
CodeEvaluator: () => CodeEvaluator,
|
|
34
|
+
CompositeEvaluator: () => CompositeEvaluator,
|
|
34
35
|
LlmJudgeEvaluator: () => LlmJudgeEvaluator,
|
|
35
36
|
TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
|
|
36
37
|
buildDirectoryChain: () => buildDirectoryChain2,
|
|
@@ -107,7 +108,7 @@ function isTestMessage(value) {
|
|
|
107
108
|
}
|
|
108
109
|
return candidate.content.every(isJsonObject);
|
|
109
110
|
}
|
|
110
|
-
var EVALUATOR_KIND_VALUES = ["
|
|
111
|
+
var EVALUATOR_KIND_VALUES = ["code_judge", "llm_judge", "rubric", "composite"];
|
|
111
112
|
var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
|
|
112
113
|
function isEvaluatorKind(value) {
|
|
113
114
|
return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
|
|
@@ -459,10 +460,10 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
459
460
|
logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
|
|
460
461
|
continue;
|
|
461
462
|
}
|
|
462
|
-
if (typeValue === "
|
|
463
|
+
if (typeValue === "code_judge") {
|
|
463
464
|
const script = asString2(rawEvaluator.script);
|
|
464
465
|
if (!script) {
|
|
465
|
-
logWarning2(`Skipping
|
|
466
|
+
logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
|
|
466
467
|
continue;
|
|
467
468
|
}
|
|
468
469
|
const cwd = asString2(rawEvaluator.cwd);
|
|
@@ -473,7 +474,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
473
474
|
resolvedCwd = import_node_path3.default.resolve(resolved.resolvedPath);
|
|
474
475
|
} else {
|
|
475
476
|
logWarning2(
|
|
476
|
-
`
|
|
477
|
+
`Code_judge evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
|
|
477
478
|
resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
|
|
478
479
|
);
|
|
479
480
|
}
|
|
@@ -489,6 +490,105 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
|
|
|
489
490
|
});
|
|
490
491
|
continue;
|
|
491
492
|
}
|
|
493
|
+
if (typeValue === "composite") {
|
|
494
|
+
const rawMembers = rawEvaluator.evaluators;
|
|
495
|
+
if (!Array.isArray(rawMembers)) {
|
|
496
|
+
logWarning2(
|
|
497
|
+
`Skipping composite evaluator '${name}' in '${evalId}': missing evaluators array`
|
|
498
|
+
);
|
|
499
|
+
continue;
|
|
500
|
+
}
|
|
501
|
+
const rawAggregator = rawEvaluator.aggregator;
|
|
502
|
+
if (!isJsonObject2(rawAggregator)) {
|
|
503
|
+
logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
|
|
504
|
+
continue;
|
|
505
|
+
}
|
|
506
|
+
const aggregatorType = asString2(rawAggregator.type);
|
|
507
|
+
if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
|
|
508
|
+
logWarning2(
|
|
509
|
+
`Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
|
|
510
|
+
);
|
|
511
|
+
continue;
|
|
512
|
+
}
|
|
513
|
+
const memberEvaluators = [];
|
|
514
|
+
for (const rawMember of rawMembers) {
|
|
515
|
+
if (!isJsonObject2(rawMember)) {
|
|
516
|
+
logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
|
|
517
|
+
continue;
|
|
518
|
+
}
|
|
519
|
+
const memberName = asString2(rawMember.name);
|
|
520
|
+
const memberType = rawMember.type;
|
|
521
|
+
if (!memberName || !isEvaluatorKind(memberType)) {
|
|
522
|
+
logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
|
|
523
|
+
continue;
|
|
524
|
+
}
|
|
525
|
+
const memberConfigs = await parseEvaluators(
|
|
526
|
+
{ evaluators: [rawMember] },
|
|
527
|
+
void 0,
|
|
528
|
+
searchRoots,
|
|
529
|
+
`${evalId}:${name}:${memberName}`
|
|
530
|
+
);
|
|
531
|
+
if (memberConfigs && memberConfigs.length > 0) {
|
|
532
|
+
memberEvaluators.push(memberConfigs[0]);
|
|
533
|
+
}
|
|
534
|
+
}
|
|
535
|
+
if (memberEvaluators.length === 0) {
|
|
536
|
+
logWarning2(
|
|
537
|
+
`Skipping composite evaluator '${name}' in '${evalId}': no valid member evaluators`
|
|
538
|
+
);
|
|
539
|
+
continue;
|
|
540
|
+
}
|
|
541
|
+
let aggregator;
|
|
542
|
+
if (aggregatorType === "weighted_average") {
|
|
543
|
+
const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
|
|
544
|
+
const parsedWeights = {};
|
|
545
|
+
if (weights) {
|
|
546
|
+
for (const [key, value] of Object.entries(weights)) {
|
|
547
|
+
if (typeof value === "number") {
|
|
548
|
+
parsedWeights[key] = value;
|
|
549
|
+
}
|
|
550
|
+
}
|
|
551
|
+
}
|
|
552
|
+
aggregator = {
|
|
553
|
+
type: "weighted_average",
|
|
554
|
+
...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
|
|
555
|
+
};
|
|
556
|
+
} else if (aggregatorType === "code_judge") {
|
|
557
|
+
const aggregatorPath = asString2(rawAggregator.path);
|
|
558
|
+
if (!aggregatorPath) {
|
|
559
|
+
logWarning2(
|
|
560
|
+
`Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
|
|
561
|
+
);
|
|
562
|
+
continue;
|
|
563
|
+
}
|
|
564
|
+
aggregator = {
|
|
565
|
+
type: "code_judge",
|
|
566
|
+
path: aggregatorPath,
|
|
567
|
+
cwd: searchRoots[0]
|
|
568
|
+
};
|
|
569
|
+
} else {
|
|
570
|
+
const aggregatorPrompt = asString2(rawAggregator.prompt);
|
|
571
|
+
let promptPath2;
|
|
572
|
+
if (aggregatorPrompt) {
|
|
573
|
+
const resolved = await resolveFileReference(aggregatorPrompt, searchRoots);
|
|
574
|
+
if (resolved.resolvedPath) {
|
|
575
|
+
promptPath2 = import_node_path3.default.resolve(resolved.resolvedPath);
|
|
576
|
+
}
|
|
577
|
+
}
|
|
578
|
+
aggregator = {
|
|
579
|
+
type: "llm_judge",
|
|
580
|
+
...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
|
|
581
|
+
...promptPath2 ? { promptPath: promptPath2 } : {}
|
|
582
|
+
};
|
|
583
|
+
}
|
|
584
|
+
evaluators.push({
|
|
585
|
+
name,
|
|
586
|
+
type: "composite",
|
|
587
|
+
evaluators: memberEvaluators,
|
|
588
|
+
aggregator
|
|
589
|
+
});
|
|
590
|
+
continue;
|
|
591
|
+
}
|
|
492
592
|
const prompt = asString2(rawEvaluator.prompt);
|
|
493
593
|
let promptPath;
|
|
494
594
|
if (prompt) {
|
|
@@ -4021,6 +4121,228 @@ function substituteVariables(template, variables) {
|
|
|
4021
4121
|
return variables[varName] ?? match;
|
|
4022
4122
|
});
|
|
4023
4123
|
}
|
|
4124
|
+
// Prompt template the composite LLM aggregator falls back to when its config
// carries no prompt. The {{EVALUATOR_RESULTS_JSON}} placeholder is substituted
// with the serialized member results before the prompt is sent to the judge.
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = [
  "Review the following evaluation results:",
  "{{EVALUATOR_RESULTS_JSON}}",
  "",
  "Decide the final score and verdict based on all evaluator results.",
  "Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning."
].join("\n");
|
|
4129
|
+
/**
 * Evaluator that runs a list of member evaluators and merges their results
 * into a single score using the configured aggregator:
 *   - "code_judge": pipes the member results to an external script,
 *   - "llm_judge":  asks the judge provider to decide,
 *   - anything else: weighted average of the member scores.
 */
var CompositeEvaluator = class {
  kind = "composite";
  config;
  evaluatorFactory;
  cwd;

  /**
   * @param {object} options
   * @param {object} options.config - Composite config; `evaluators` and `aggregator` are read.
   * @param {{ create: Function }} options.evaluatorFactory - Creates an evaluator for a member config.
   * @param {string} [options.cwd] - Working directory used by the code aggregator when the
   *   aggregator config does not carry its own `cwd`.
   */
  constructor(options) {
    this.config = options.config;
    this.evaluatorFactory = options.evaluatorFactory;
    this.cwd = options.cwd;
  }

  /**
   * Runs every member evaluator concurrently and aggregates the outcomes.
   * A rejection from any member rejects the whole evaluation (Promise.all).
   *
   * @param {object} context - Evaluation context handed unchanged to each member.
   * @returns {Promise<object>} The aggregated evaluation result.
   */
  async evaluate(context) {
    const memberResults = await Promise.all(
      this.config.evaluators.map(async (memberConfig) => {
        const evaluator = this.evaluatorFactory.create(memberConfig, context);
        return {
          id: memberConfig.name,
          type: memberConfig.type,
          result: await evaluator.evaluate(context)
        };
      })
    );
    return this.aggregate(memberResults, context);
  }

  /**
   * Dispatches to the aggregator selected by `config.aggregator.type`.
   *
   * @param {Array<{id: string, type: string, result: object}>} results - Member outcomes.
   * @param {object} context - Evaluation context (only the LLM aggregator uses it).
   * @returns {Promise<object>} The aggregated evaluation result.
   */
  async aggregate(results, context) {
    const aggregator = this.config.aggregator;
    switch (aggregator.type) {
      case "code_judge":
        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
      case "llm_judge":
        return this.runLlmAggregator(results, context, aggregator);
      default:
        return this.runWeightedAverage(results, aggregator.weights);
    }
  }

  /**
   * Combines member scores as a weighted average. Members without an entry
   * in `weights` count with weight 1. Hits and misses are prefixed with the
   * member id so their origin stays visible in the merged lists.
   *
   * @param {Array<{id: string, type: string, result: object}>} results - Member outcomes.
   * @param {Record<string, number>} [weights] - Weight per member id.
   * @returns {object} The aggregated evaluation result.
   */
  runWeightedAverage(results, weights) {
    let totalWeight = 0;
    let weightedSum = 0;
    const allHits = [];
    const allMisses = [];
    const reasoningParts = [];
    const evaluatorResults = [];
    for (const member of results) {
      const weight = weights?.[member.id] ?? 1;
      totalWeight += weight;
      weightedSum += member.result.score * weight;
      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
      if (member.result.reasoning) {
        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
      }
      evaluatorResults.push(this.#toChildResult(member, weight));
    }
    // Clamp once and derive the verdict from the clamped value, so score and
    // verdict can never disagree (the other aggregators already work this way).
    const score = clampScore(totalWeight > 0 ? weightedSum / totalWeight : 0);
    return {
      score,
      verdict: scoreToVerdict(score),
      hits: allHits,
      misses: allMisses,
      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
      evaluatorRawRequest: {
        aggregator: "weighted_average",
        ...weights ? { weights } : {}
      },
      evaluatorResults
    };
  }

  /**
   * Delegates aggregation to an external script. The script receives
   * `{ results: { [memberId]: result } }` as JSON input and its stdout is
   * parsed as JSON with `score`, `hits`, `misses`, `reasoning`, `verdict`.
   * Any failure is turned into a failing result that carries the message.
   *
   * @param {Array<{id: string, type: string, result: object}>} results - Member outcomes.
   * @param {string} scriptPath - Script passed to `executeScript`.
   * @param {string} [cwd] - Working directory for the script.
   * @param {Record<string, number>} [weights] - Only used to annotate the child results.
   * @returns {Promise<object>} The aggregated evaluation result.
   */
  async runCodeAggregator(results, scriptPath, cwd, weights) {
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
    const evaluatorResults = results.map(
      (member) => this.#toChildResult(member, weights?.[member.id] ?? 1)
    );
    try {
      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
      const parsed = parseJsonSafe(stdout);
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
      // Trust the script's verdict only when it is one of the known values.
      const hasKnownVerdict = parsed?.verdict === "pass" || parsed?.verdict === "fail" || parsed?.verdict === "borderline";
      const verdict = hasKnownVerdict ? parsed.verdict : scoreToVerdict(score);
      return {
        score,
        verdict,
        hits,
        misses,
        expectedAspectCount: Math.max(hits.length + misses.length, 1),
        reasoning,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath
        },
        evaluatorResults
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath,
          error: message
        },
        evaluatorResults
      };
    }
  }

  /**
   * Asks the judge provider to decide the final score from the member
   * results. Uses the provider's language model when `asLanguageModel()`
   * yields one, otherwise falls back to `judgeProvider.invoke`.
   *
   * @param {Array<{id: string, type: string, result: object}>} results - Member outcomes.
   * @param {object} context - Must carry `judgeProvider`; `evalCase.id` and `attempt`
   *   are forwarded on the `invoke` path.
   * @param {{ prompt?: string }} config - Aggregator config; `prompt` overrides the default template.
   * @returns {Promise<object>} The aggregated evaluation result.
   * @throws {Error} When the context has no judge provider.
   */
  async runLlmAggregator(results, context, config) {
    const judgeProvider = context.judgeProvider;
    if (!judgeProvider) {
      throw new Error("No judge provider available for LLM aggregation");
    }
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const resultsJson = JSON.stringify(resultsObject, null, 2);
    const evaluatorResults = results.map((member) => this.#toChildResult(member));
    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
    // Replacer function instead of a replacement string: the JSON contains
    // free-form evaluator text, and sequences such as "$&" or "$'" in a string
    // replacement would be expanded as patterns and corrupt the prompt.
    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, () => resultsJson);
    const systemPrompt = buildOutputSchema();
    const evaluatorRawRequest = {
      aggregator: "llm_judge",
      userPrompt,
      systemPrompt,
      target: judgeProvider.targetName
    };
    try {
      const model = judgeProvider.asLanguageModel?.();
      if (model) {
        const { text } = await (0, import_ai2.generateText)({
          model,
          system: systemPrompt,
          prompt: userPrompt
        });
        const data = freeformEvaluationSchema.parse(parseJsonFromText(text));
        return this.#toJudgeResult(data, data.reasoning, evaluatorRawRequest, evaluatorResults);
      }
      const response = await judgeProvider.invoke({
        question: userPrompt,
        systemPrompt,
        evalCaseId: context.evalCase.id,
        attempt: context.attempt
      });
      const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
      return this.#toJudgeResult(
        data,
        data.reasoning ?? response.reasoning,
        evaluatorRawRequest,
        evaluatorResults
      );
    } catch (error) {
      // Report the failure instead of swallowing it, mirroring the code
      // aggregator, so a failed aggregation is diagnosable from the result.
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`LLM aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: { ...evaluatorRawRequest, error: message },
        evaluatorResults
      };
    }
  }

  /**
   * Snapshot of one member outcome for the `evaluatorResults` list. Hits and
   * misses are copied so later changes to the member result do not leak in.
   * The `weight` key is only present when a weight is supplied.
   */
  #toChildResult(member, weight) {
    const { result } = member;
    return {
      name: member.id,
      type: member.type,
      score: result.score,
      ...weight === void 0 ? {} : { weight },
      verdict: result.verdict,
      hits: [...result.hits],
      misses: [...result.misses],
      reasoning: result.reasoning,
      evaluatorRawRequest: result.evaluatorRawRequest,
      evaluatorResults: result.evaluatorResults
    };
  }

  /**
   * Builds the final result from a parsed judge answer. At most four hits and
   * four misses are kept; the verdict is derived from the clamped score.
   */
  #toJudgeResult(data, reasoning, evaluatorRawRequest, evaluatorResults) {
    const score = clampScore(data.score);
    const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
    const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: Math.max(hits.length + misses.length, 1),
      reasoning,
      evaluatorRawRequest,
      evaluatorResults
    };
  }
};
|
|
4024
4346
|
|
|
4025
4347
|
// src/evaluation/orchestrator.ts
|
|
4026
4348
|
var import_node_crypto2 = require("crypto");
|
|
@@ -4732,6 +5054,57 @@ async function runEvaluatorList(options) {
|
|
|
4732
5054
|
promptInputs,
|
|
4733
5055
|
now
|
|
4734
5056
|
});
|
|
5057
|
+
scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
|
|
5058
|
+
evaluatorResults.push({
|
|
5059
|
+
name: evaluator.name,
|
|
5060
|
+
type: "code_judge",
|
|
5061
|
+
score: score2.score,
|
|
5062
|
+
verdict: score2.verdict,
|
|
5063
|
+
hits: score2.hits,
|
|
5064
|
+
misses: score2.misses,
|
|
5065
|
+
reasoning: score2.reasoning,
|
|
5066
|
+
evaluator_provider_request: score2.evaluatorRawRequest
|
|
5067
|
+
});
|
|
5068
|
+
}
|
|
5069
|
+
if (evaluator.type === "composite") {
|
|
5070
|
+
const evalFileDir = evalCase.guideline_paths[0] ? import_node_path13.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
|
|
5071
|
+
const createEvaluator = (memberConfig) => {
|
|
5072
|
+
switch (memberConfig.type) {
|
|
5073
|
+
case "llm_judge":
|
|
5074
|
+
return evaluatorRegistry.llm_judge;
|
|
5075
|
+
case "code":
|
|
5076
|
+
return new CodeEvaluator({
|
|
5077
|
+
script: memberConfig.script,
|
|
5078
|
+
cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
|
|
5079
|
+
agentTimeoutMs
|
|
5080
|
+
});
|
|
5081
|
+
case "composite":
|
|
5082
|
+
return new CompositeEvaluator({
|
|
5083
|
+
config: memberConfig,
|
|
5084
|
+
cwd: evalFileDir,
|
|
5085
|
+
evaluatorFactory: { create: createEvaluator }
|
|
5086
|
+
});
|
|
5087
|
+
default: {
|
|
5088
|
+
const unknownConfig = memberConfig;
|
|
5089
|
+
throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
|
|
5090
|
+
}
|
|
5091
|
+
}
|
|
5092
|
+
};
|
|
5093
|
+
const compositeEvaluator = new CompositeEvaluator({
|
|
5094
|
+
config: evaluator,
|
|
5095
|
+
cwd: evalFileDir,
|
|
5096
|
+
evaluatorFactory: { create: createEvaluator }
|
|
5097
|
+
});
|
|
5098
|
+
const score2 = await compositeEvaluator.evaluate({
|
|
5099
|
+
evalCase,
|
|
5100
|
+
candidate,
|
|
5101
|
+
target,
|
|
5102
|
+
provider,
|
|
5103
|
+
attempt,
|
|
5104
|
+
promptInputs,
|
|
5105
|
+
now,
|
|
5106
|
+
judgeProvider
|
|
5107
|
+
});
|
|
4735
5108
|
scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
|
|
4736
5109
|
evaluatorResults.push({
|
|
4737
5110
|
name: evaluator.name,
|
|
@@ -4741,7 +5114,8 @@ async function runEvaluatorList(options) {
|
|
|
4741
5114
|
hits: score2.hits,
|
|
4742
5115
|
misses: score2.misses,
|
|
4743
5116
|
reasoning: score2.reasoning,
|
|
4744
|
-
evaluator_provider_request: score2.evaluatorRawRequest
|
|
5117
|
+
evaluator_provider_request: score2.evaluatorRawRequest,
|
|
5118
|
+
evaluator_results: mapChildResults(score2.evaluatorResults)
|
|
4745
5119
|
});
|
|
4746
5120
|
}
|
|
4747
5121
|
} catch (error) {
|
|
@@ -4754,14 +5128,15 @@ async function runEvaluatorList(options) {
|
|
|
4754
5128
|
expectedAspectCount: 1,
|
|
4755
5129
|
reasoning: message
|
|
4756
5130
|
};
|
|
5131
|
+
const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
|
|
4757
5132
|
scored.push({
|
|
4758
5133
|
score: fallbackScore,
|
|
4759
5134
|
name: evaluator.name ?? "unknown",
|
|
4760
|
-
type:
|
|
5135
|
+
type: resultType ?? "llm_judge"
|
|
4761
5136
|
});
|
|
4762
5137
|
evaluatorResults.push({
|
|
4763
5138
|
name: evaluator.name ?? "unknown",
|
|
4764
|
-
type:
|
|
5139
|
+
type: resultType ?? "llm_judge",
|
|
4765
5140
|
score: 0,
|
|
4766
5141
|
verdict: "fail",
|
|
4767
5142
|
hits: [],
|
|
@@ -4979,6 +5354,23 @@ function isTimeoutLike(error) {
|
|
|
4979
5354
|
const value = String(error).toLowerCase();
|
|
4980
5355
|
return value.includes("timeout");
|
|
4981
5356
|
}
|
|
5357
|
+
/**
 * Converts the nested child results of a composite score into the
 * snake_case shape used in reported evaluator results, recursing into each
 * child's own children.
 *
 * @param {Array<object>|undefined} children - Child results, if any.
 * @returns {Array<object>|undefined} Mapped children, or undefined when
 *   there are none.
 */
function mapChildResults(children) {
  const isEmpty = !children || children.length === 0;
  if (isEmpty) {
    return void 0;
  }
  const toReported = ({
    name,
    type,
    score,
    weight,
    verdict,
    hits,
    misses,
    reasoning,
    evaluatorRawRequest,
    evaluatorResults
  }) => ({
    name,
    type,
    score,
    weight,
    verdict,
    hits,
    misses,
    reasoning,
    evaluator_provider_request: evaluatorRawRequest,
    evaluator_results: mapChildResults(evaluatorResults)
  });
  return children.map((child) => toReported(child));
}
|
|
4982
5374
|
|
|
4983
5375
|
// src/evaluation/generators/rubric-generator.ts
|
|
4984
5376
|
var import_ai3 = require("ai");
|
|
@@ -5067,6 +5459,7 @@ function createAgentKernel() {
|
|
|
5067
5459
|
// Annotate the CommonJS export names for ESM import in node:
|
|
5068
5460
|
0 && (module.exports = {
|
|
5069
5461
|
CodeEvaluator,
|
|
5462
|
+
CompositeEvaluator,
|
|
5070
5463
|
LlmJudgeEvaluator,
|
|
5071
5464
|
TEST_MESSAGE_ROLES,
|
|
5072
5465
|
buildDirectoryChain,
|