@agentv/core 0.22.2 → 0.23.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -31,6 +31,7 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
31
31
  var index_exports = {};
32
32
  __export(index_exports, {
33
33
  CodeEvaluator: () => CodeEvaluator,
34
+ CompositeEvaluator: () => CompositeEvaluator,
34
35
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
35
36
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
36
37
  buildDirectoryChain: () => buildDirectoryChain2,
@@ -107,7 +108,7 @@ function isTestMessage(value) {
107
108
  }
108
109
  return candidate.content.every(isJsonObject);
109
110
  }
110
- var EVALUATOR_KIND_VALUES = ["code", "llm_judge", "rubric"];
111
+ var EVALUATOR_KIND_VALUES = ["code_judge", "llm_judge", "rubric", "composite"];
111
112
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
112
113
  function isEvaluatorKind(value) {
113
114
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
@@ -459,10 +460,10 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
459
460
  logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
460
461
  continue;
461
462
  }
462
- if (typeValue === "code") {
463
+ if (typeValue === "code_judge") {
463
464
  const script = asString2(rawEvaluator.script);
464
465
  if (!script) {
465
- logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
466
+ logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
466
467
  continue;
467
468
  }
468
469
  const cwd = asString2(rawEvaluator.cwd);
@@ -473,7 +474,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
473
474
  resolvedCwd = import_node_path3.default.resolve(resolved.resolvedPath);
474
475
  } else {
475
476
  logWarning2(
476
- `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
477
+ `Code_judge evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
477
478
  resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => ` Tried: ${attempt}`) : void 0
478
479
  );
479
480
  }
@@ -489,6 +490,105 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
489
490
  });
490
491
  continue;
491
492
  }
493
+ if (typeValue === "composite") {
494
+ const rawMembers = rawEvaluator.evaluators;
495
+ if (!Array.isArray(rawMembers)) {
496
+ logWarning2(
497
+ `Skipping composite evaluator '${name}' in '${evalId}': missing evaluators array`
498
+ );
499
+ continue;
500
+ }
501
+ const rawAggregator = rawEvaluator.aggregator;
502
+ if (!isJsonObject2(rawAggregator)) {
503
+ logWarning2(`Skipping composite evaluator '${name}' in '${evalId}': missing aggregator`);
504
+ continue;
505
+ }
506
+ const aggregatorType = asString2(rawAggregator.type);
507
+ if (aggregatorType !== "weighted_average" && aggregatorType !== "code_judge" && aggregatorType !== "llm_judge") {
508
+ logWarning2(
509
+ `Skipping composite evaluator '${name}' in '${evalId}': invalid aggregator type '${aggregatorType}'`
510
+ );
511
+ continue;
512
+ }
513
+ const memberEvaluators = [];
514
+ for (const rawMember of rawMembers) {
515
+ if (!isJsonObject2(rawMember)) {
516
+ logWarning2(`Skipping invalid member evaluator in composite '${name}' (expected object)`);
517
+ continue;
518
+ }
519
+ const memberName = asString2(rawMember.name);
520
+ const memberType = rawMember.type;
521
+ if (!memberName || !isEvaluatorKind(memberType)) {
522
+ logWarning2(`Skipping member evaluator with invalid name/type in composite '${name}'`);
523
+ continue;
524
+ }
525
+ const memberConfigs = await parseEvaluators(
526
+ { evaluators: [rawMember] },
527
+ void 0,
528
+ searchRoots,
529
+ `${evalId}:${name}:${memberName}`
530
+ );
531
+ if (memberConfigs && memberConfigs.length > 0) {
532
+ memberEvaluators.push(memberConfigs[0]);
533
+ }
534
+ }
535
+ if (memberEvaluators.length === 0) {
536
+ logWarning2(
537
+ `Skipping composite evaluator '${name}' in '${evalId}': no valid member evaluators`
538
+ );
539
+ continue;
540
+ }
541
+ let aggregator;
542
+ if (aggregatorType === "weighted_average") {
543
+ const weights = isJsonObject2(rawAggregator.weights) ? rawAggregator.weights : void 0;
544
+ const parsedWeights = {};
545
+ if (weights) {
546
+ for (const [key, value] of Object.entries(weights)) {
547
+ if (typeof value === "number") {
548
+ parsedWeights[key] = value;
549
+ }
550
+ }
551
+ }
552
+ aggregator = {
553
+ type: "weighted_average",
554
+ ...Object.keys(parsedWeights).length > 0 ? { weights: parsedWeights } : {}
555
+ };
556
+ } else if (aggregatorType === "code_judge") {
557
+ const aggregatorPath = asString2(rawAggregator.path);
558
+ if (!aggregatorPath) {
559
+ logWarning2(
560
+ `Skipping composite evaluator '${name}' in '${evalId}': code_judge aggregator missing path`
561
+ );
562
+ continue;
563
+ }
564
+ aggregator = {
565
+ type: "code_judge",
566
+ path: aggregatorPath,
567
+ cwd: searchRoots[0]
568
+ };
569
+ } else {
570
+ const aggregatorPrompt = asString2(rawAggregator.prompt);
571
+ let promptPath2;
572
+ if (aggregatorPrompt) {
573
+ const resolved = await resolveFileReference(aggregatorPrompt, searchRoots);
574
+ if (resolved.resolvedPath) {
575
+ promptPath2 = import_node_path3.default.resolve(resolved.resolvedPath);
576
+ }
577
+ }
578
+ aggregator = {
579
+ type: "llm_judge",
580
+ ...aggregatorPrompt ? { prompt: aggregatorPrompt } : {},
581
+ ...promptPath2 ? { promptPath: promptPath2 } : {}
582
+ };
583
+ }
584
+ evaluators.push({
585
+ name,
586
+ type: "composite",
587
+ evaluators: memberEvaluators,
588
+ aggregator
589
+ });
590
+ continue;
591
+ }
492
592
  const prompt = asString2(rawEvaluator.prompt);
493
593
  let promptPath;
494
594
  if (prompt) {
@@ -4021,6 +4121,228 @@ function substituteVariables(template, variables) {
4021
4121
  return variables[varName] ?? match;
4022
4122
  });
4023
4123
  }
4124
// Default user prompt for the LLM aggregator. The {{EVALUATOR_RESULTS_JSON}}
// placeholder is replaced with the pretty-printed member results.
var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
{{EVALUATOR_RESULTS_JSON}}

Decide the final score and verdict based on all evaluator results.
Return a JSON object with: score (0.0-1.0), verdict (pass/fail/borderline), and reasoning.`;

/**
 * Evaluator that runs a list of member evaluators and folds their results
 * into a single score using the configured aggregator:
 * "weighted_average" (default), "code_judge" (external script) or
 * "llm_judge" (judge provider).
 */
var CompositeEvaluator = class {
  kind = "composite";
  config;
  evaluatorFactory;
  cwd;

  /**
   * @param {object} options
   * @param {object} options.config - Composite config; reads `evaluators` and `aggregator`.
   * @param {{ create: Function }} options.evaluatorFactory - Builds an evaluator for each member config.
   * @param {string} [options.cwd] - Fallback working directory for a code_judge aggregator.
   */
  constructor(options) {
    this.config = options.config;
    this.evaluatorFactory = options.evaluatorFactory;
    this.cwd = options.cwd;
  }

  /**
   * Runs every member evaluator concurrently and aggregates the outcomes.
   * A rejection from any member (or from the factory) rejects the whole call.
   *
   * @param {object} context - Evaluation context, forwarded unchanged to each member.
   * @returns {Promise<object>} The aggregated evaluation score.
   */
  async evaluate(context) {
    const memberResults = await Promise.all(
      this.config.evaluators.map(async (memberConfig) => {
        const evaluator = this.evaluatorFactory.create(memberConfig, context);
        return {
          id: memberConfig.name,
          type: memberConfig.type,
          result: await evaluator.evaluate(context)
        };
      })
    );
    return this.aggregate(memberResults, context);
  }

  /**
   * Dispatches to the aggregator selected by `config.aggregator.type`.
   * Any type other than "code_judge" / "llm_judge" uses the weighted average.
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {object} context
   * @returns {Promise<object>|object}
   */
  async aggregate(results, context) {
    const aggregator = this.config.aggregator;
    switch (aggregator.type) {
      case "code_judge":
        return this.runCodeAggregator(results, aggregator.path, aggregator.cwd ?? this.cwd);
      case "llm_judge":
        return this.runLlmAggregator(results, context, aggregator);
      default:
        return this.runWeightedAverage(results, aggregator.weights);
    }
  }

  /**
   * Combines member scores as a weighted mean. Members without an entry in
   * `weights` count with weight 1. Hits and misses are prefixed with the
   * member id so their origin stays visible.
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {Record<string, number>} [weights] - Weight per member id.
   * @returns {object} Aggregated score; 0 when the total weight is not positive.
   */
  runWeightedAverage(results, weights) {
    const weightOf = (member) => weights?.[member.id] ?? 1;
    let totalWeight = 0;
    let weightedSum = 0;
    const allHits = [];
    const allMisses = [];
    const reasoningParts = [];
    for (const member of results) {
      const weight = weightOf(member);
      totalWeight += weight;
      weightedSum += member.result.score * weight;
      allHits.push(...member.result.hits.map((h) => `[${member.id}] ${h}`));
      allMisses.push(...member.result.misses.map((m) => `[${member.id}] ${m}`));
      if (member.result.reasoning) {
        reasoningParts.push(`${member.id}: ${member.result.reasoning}`);
      }
    }
    const finalScore = totalWeight > 0 ? weightedSum / totalWeight : 0;
    // Derive the verdict from the clamped score, as the other aggregators do,
    // so the reported score and verdict can never disagree.
    const score = clampScore(finalScore);
    return {
      score,
      verdict: scoreToVerdict(score),
      hits: allHits,
      misses: allMisses,
      expectedAspectCount: Math.max(allHits.length + allMisses.length, 1),
      reasoning: reasoningParts.length > 0 ? reasoningParts.join("; ") : void 0,
      evaluatorRawRequest: {
        aggregator: "weighted_average",
        ...weights ? { weights } : {}
      },
      evaluatorResults: this.#summarizeMembers(results, weightOf)
    };
  }

  /**
   * Delegates aggregation to an external script. The script receives
   * `{ results: { [memberId]: result } }` as JSON through `executeScript`
   * and its stdout is parsed as JSON with optional `score`, `hits`,
   * `misses`, `reasoning` and `verdict` fields.
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {string} scriptPath - Script handed to `executeScript`.
   * @param {string} [cwd] - Working directory handed to `executeScript`.
   * @param {Record<string, number>} [weights] - Only used to annotate child results.
   * @returns {Promise<object>} Aggregated score; a failing score carrying the
   *   error message when the script throws.
   */
  async runCodeAggregator(results, scriptPath, cwd, weights) {
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const inputPayload = JSON.stringify({ results: resultsObject }, null, 2);
    const evaluatorResults = this.#summarizeMembers(
      results,
      (member) => weights?.[member.id] ?? 1
    );
    try {
      const stdout = await executeScript(scriptPath, inputPayload, void 0, cwd);
      const parsed = parseJsonSafe(stdout);
      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
      const declaredVerdict = parsed?.verdict;
      // Trust the script's verdict only when it is one of the known values.
      const verdict = declaredVerdict === "pass" || declaredVerdict === "fail" || declaredVerdict === "borderline" ? declaredVerdict : scoreToVerdict(score);
      return {
        score,
        verdict,
        hits,
        misses,
        expectedAspectCount: Math.max(hits.length + misses.length, 1),
        reasoning,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath
        },
        evaluatorResults
      };
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`Code aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          aggregator: "code_judge",
          script: scriptPath,
          error: message
        },
        evaluatorResults
      };
    }
  }

  /**
   * Asks the judge provider to decide the final score from the member results.
   * Uses the provider's language model when `asLanguageModel` yields one,
   * otherwise falls back to `judgeProvider.invoke`.
   *
   * @param {Array<{id: string, type: string, result: object}>} results
   * @param {object} context - Must carry `judgeProvider`; `evalCase.id` and
   *   `attempt` are forwarded on the invoke path.
   * @param {object} config - Aggregator config; `prompt` overrides the default template.
   * @returns {Promise<object>} Aggregated score; a failing score carrying the
   *   error message when the judge call or response parsing fails.
   * @throws {Error} When the context has no judge provider.
   */
  async runLlmAggregator(results, context, config) {
    const judgeProvider = context.judgeProvider;
    if (!judgeProvider) {
      throw new Error("No judge provider available for LLM aggregation");
    }
    const resultsObject = Object.fromEntries(results.map((r) => [r.id, r.result]));
    const resultsJson = JSON.stringify(resultsObject, null, 2);
    const evaluatorResults = this.#summarizeMembers(results);
    // NOTE(review): only `config.prompt` is read here; a `promptPath` on the
    // aggregator config is never consulted in this class. Confirm whether a
    // file-reference prompt is meant to be loaded before use.
    const promptTemplate = config.prompt ?? DEFAULT_COMPOSITE_AGGREGATOR_PROMPT;
    // A replacer function keeps "$$", "$&", "$`" and "$'" inside the JSON
    // literal; a replacement *string* would interpret them as patterns.
    const userPrompt = promptTemplate.replace(/\{\{EVALUATOR_RESULTS_JSON\}\}/g, () => resultsJson);
    const systemPrompt = buildOutputSchema();
    const evaluatorRawRequest = {
      aggregator: "llm_judge",
      userPrompt,
      systemPrompt,
      target: judgeProvider.targetName
    };
    try {
      const model = judgeProvider.asLanguageModel?.();
      if (model) {
        const { text } = await (0, import_ai2.generateText)({
          model,
          system: systemPrompt,
          prompt: userPrompt
        });
        const modelData = freeformEvaluationSchema.parse(parseJsonFromText(text));
        return this.#scoreFromJudgeData(modelData, void 0, evaluatorRawRequest, evaluatorResults);
      }
      const response = await judgeProvider.invoke({
        question: userPrompt,
        systemPrompt,
        evalCaseId: context.evalCase.id,
        attempt: context.attempt
      });
      const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
      return this.#scoreFromJudgeData(data, response.reasoning, evaluatorRawRequest, evaluatorResults);
    } catch (error) {
      // Surface the failure instead of returning an unexplained zero score,
      // mirroring what the code aggregator reports.
      const message = error instanceof Error ? error.message : String(error);
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: [`LLM aggregator failed: ${message}`],
        expectedAspectCount: 1,
        reasoning: message,
        evaluatorRawRequest: {
          ...evaluatorRawRequest,
          error: message
        },
        evaluatorResults
      };
    }
  }

  // Flattens member results into the child-result records attached to the
  // aggregate score. A `weight` field is added only when a resolver is given.
  #summarizeMembers(results, resolveWeight) {
    return results.map((member) => ({
      name: member.id,
      type: member.type,
      score: member.result.score,
      ...resolveWeight ? { weight: resolveWeight(member) } : {},
      verdict: member.result.verdict,
      hits: [...member.result.hits],
      misses: [...member.result.misses],
      reasoning: member.result.reasoning,
      evaluatorRawRequest: member.result.evaluatorRawRequest,
      evaluatorResults: member.result.evaluatorResults
    }));
  }

  // Shapes a schema-validated judge response into an evaluation score,
  // keeping at most four hits and four misses.
  #scoreFromJudgeData(data, fallbackReasoning, evaluatorRawRequest, evaluatorResults) {
    const score = clampScore(data.score);
    const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
    const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: Math.max(hits.length + misses.length, 1),
      reasoning: data.reasoning ?? fallbackReasoning,
      evaluatorRawRequest,
      evaluatorResults
    };
  }
};
4024
4346
 
4025
4347
  // src/evaluation/orchestrator.ts
4026
4348
  var import_node_crypto2 = require("crypto");
@@ -4732,6 +5054,57 @@ async function runEvaluatorList(options) {
4732
5054
  promptInputs,
4733
5055
  now
4734
5056
  });
5057
+ scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
5058
+ evaluatorResults.push({
5059
+ name: evaluator.name,
5060
+ type: "code_judge",
5061
+ score: score2.score,
5062
+ verdict: score2.verdict,
5063
+ hits: score2.hits,
5064
+ misses: score2.misses,
5065
+ reasoning: score2.reasoning,
5066
+ evaluator_provider_request: score2.evaluatorRawRequest
5067
+ });
5068
+ }
5069
+ if (evaluator.type === "composite") {
5070
+ const evalFileDir = evalCase.guideline_paths[0] ? import_node_path13.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
5071
+ const createEvaluator = (memberConfig) => {
5072
+ switch (memberConfig.type) {
5073
+ case "llm_judge":
5074
+ return evaluatorRegistry.llm_judge;
5075
+ case "code":
5076
+ return new CodeEvaluator({
5077
+ script: memberConfig.script,
5078
+ cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
5079
+ agentTimeoutMs
5080
+ });
5081
+ case "composite":
5082
+ return new CompositeEvaluator({
5083
+ config: memberConfig,
5084
+ cwd: evalFileDir,
5085
+ evaluatorFactory: { create: createEvaluator }
5086
+ });
5087
+ default: {
5088
+ const unknownConfig = memberConfig;
5089
+ throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
5090
+ }
5091
+ }
5092
+ };
5093
+ const compositeEvaluator = new CompositeEvaluator({
5094
+ config: evaluator,
5095
+ cwd: evalFileDir,
5096
+ evaluatorFactory: { create: createEvaluator }
5097
+ });
5098
+ const score2 = await compositeEvaluator.evaluate({
5099
+ evalCase,
5100
+ candidate,
5101
+ target,
5102
+ provider,
5103
+ attempt,
5104
+ promptInputs,
5105
+ now,
5106
+ judgeProvider
5107
+ });
4735
5108
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4736
5109
  evaluatorResults.push({
4737
5110
  name: evaluator.name,
@@ -4741,7 +5114,8 @@ async function runEvaluatorList(options) {
4741
5114
  hits: score2.hits,
4742
5115
  misses: score2.misses,
4743
5116
  reasoning: score2.reasoning,
4744
- evaluator_provider_request: score2.evaluatorRawRequest
5117
+ evaluator_provider_request: score2.evaluatorRawRequest,
5118
+ evaluator_results: mapChildResults(score2.evaluatorResults)
4745
5119
  });
4746
5120
  }
4747
5121
  } catch (error) {
@@ -4754,14 +5128,15 @@ async function runEvaluatorList(options) {
4754
5128
  expectedAspectCount: 1,
4755
5129
  reasoning: message
4756
5130
  };
5131
+ const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
4757
5132
  scored.push({
4758
5133
  score: fallbackScore,
4759
5134
  name: evaluator.name ?? "unknown",
4760
- type: evaluator.type ?? "unknown"
5135
+ type: resultType ?? "llm_judge"
4761
5136
  });
4762
5137
  evaluatorResults.push({
4763
5138
  name: evaluator.name ?? "unknown",
4764
- type: evaluator.type ?? "unknown",
5139
+ type: resultType ?? "llm_judge",
4765
5140
  score: 0,
4766
5141
  verdict: "fail",
4767
5142
  hits: [],
@@ -4979,6 +5354,23 @@ function isTimeoutLike(error) {
4979
5354
  const value = String(error).toLowerCase();
4980
5355
  return value.includes("timeout");
4981
5356
  }
5357
/**
 * Recursively converts child evaluator results to the snake_case shape used
 * in reported results (`evaluatorRawRequest` -> `evaluator_provider_request`,
 * `evaluatorResults` -> `evaluator_results`).
 *
 * @param {Array<object>|undefined} children - Child results of a composite score.
 * @returns {Array<object>|undefined} Converted children, or undefined when
 *   there are none.
 */
function mapChildResults(children) {
  const hasChildren = Boolean(children) && children.length !== 0;
  if (!hasChildren) {
    return void 0;
  }
  const toReportedChild = (child) => {
    const nested = mapChildResults(child.evaluatorResults);
    return {
      name: child.name,
      type: child.type,
      score: child.score,
      weight: child.weight,
      verdict: child.verdict,
      hits: child.hits,
      misses: child.misses,
      reasoning: child.reasoning,
      evaluator_provider_request: child.evaluatorRawRequest,
      evaluator_results: nested
    };
  };
  return children.map(toReportedChild);
}
4982
5374
 
4983
5375
  // src/evaluation/generators/rubric-generator.ts
4984
5376
  var import_ai3 = require("ai");
@@ -5067,6 +5459,7 @@ function createAgentKernel() {
5067
5459
  // Annotate the CommonJS export names for ESM import in node:
5068
5460
  0 && (module.exports = {
5069
5461
  CodeEvaluator,
5462
+ CompositeEvaluator,
5070
5463
  LlmJudgeEvaluator,
5071
5464
  TEST_MESSAGE_ROLES,
5072
5465
  buildDirectoryChain,