@agentv/core 0.26.0 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -32,7 +32,6 @@ var index_exports = {};
32
32
  __export(index_exports, {
33
33
  CodeEvaluator: () => CodeEvaluator,
34
34
  CompositeEvaluator: () => CompositeEvaluator,
35
- ExpectedMessagesEvaluator: () => ExpectedMessagesEvaluator,
36
35
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
37
36
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
38
37
  ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
@@ -50,7 +49,6 @@ __export(index_exports, {
50
49
  generateRubrics: () => generateRubrics,
51
50
  getHitCount: () => getHitCount,
52
51
  isEvaluatorKind: () => isEvaluatorKind,
53
- isExpectedToolCall: () => isExpectedToolCall,
54
52
  isGuidelineFile: () => isGuidelineFile,
55
53
  isJsonObject: () => isJsonObject,
56
54
  isJsonValue: () => isJsonValue,
@@ -110,18 +108,23 @@ function isTestMessage(value) {
110
108
  if (typeof candidate.content === "string") {
111
109
  return true;
112
110
  }
113
- if (!Array.isArray(candidate.content)) {
114
- return false;
111
+ if (Array.isArray(candidate.content) && candidate.content.every(isJsonObject)) {
112
+ return true;
113
+ }
114
+ if (Array.isArray(candidate.tool_calls) && candidate.tool_calls.length > 0) {
115
+ return true;
115
116
  }
116
- return candidate.content.every(isJsonObject);
117
+ if (isJsonObject(candidate.content)) {
118
+ return true;
119
+ }
120
+ return false;
117
121
  }
118
122
  var EVALUATOR_KIND_VALUES = [
119
123
  "code_judge",
120
124
  "llm_judge",
121
125
  "rubric",
122
126
  "composite",
123
- "tool_trajectory",
124
- "expected_messages"
127
+ "tool_trajectory"
125
128
  ];
126
129
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
127
130
  function isEvaluatorKind(value) {
@@ -142,13 +145,6 @@ function isTraceEvent(value) {
142
145
  const candidate = value;
143
146
  return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
144
147
  }
145
- function isExpectedToolCall(value) {
146
- if (typeof value !== "object" || value === null) {
147
- return false;
148
- }
149
- const candidate = value;
150
- return typeof candidate.tool === "string";
151
- }
152
148
  function computeTraceSummary(trace) {
153
149
  const toolCallCounts = {};
154
150
  let errorCount = 0;
@@ -645,15 +641,6 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
645
641
  });
646
642
  continue;
647
643
  }
648
- if (typeValue === "expected_messages") {
649
- const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
650
- evaluators.push({
651
- name,
652
- type: "expected_messages",
653
- ...weight2 !== void 0 ? { weight: weight2 } : {}
654
- });
655
- continue;
656
- }
657
644
  if (typeValue === "tool_trajectory") {
658
645
  const mode = asString2(rawEvaluator.mode);
659
646
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
@@ -908,63 +895,6 @@ async function processMessages(options) {
908
895
  }
909
896
  return segments;
910
897
  }
911
- async function resolveAssistantContent(content, searchRoots, verbose) {
912
- if (typeof content === "string") {
913
- return content;
914
- }
915
- if (!content) {
916
- return "";
917
- }
918
- const parts = [];
919
- for (const entry of content) {
920
- if (typeof entry === "string") {
921
- parts.push({ content: entry, isFile: false });
922
- continue;
923
- }
924
- if (!isJsonObject(entry)) {
925
- continue;
926
- }
927
- const segmentType = asString3(entry.type);
928
- if (segmentType === "file") {
929
- const rawValue = asString3(entry.value);
930
- if (!rawValue) {
931
- continue;
932
- }
933
- const { displayPath, resolvedPath, attempted } = await resolveFileReference(
934
- rawValue,
935
- searchRoots
936
- );
937
- if (!resolvedPath) {
938
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
939
- logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
940
- continue;
941
- }
942
- try {
943
- const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
944
- parts.push({ content: fileContent, isFile: true, displayPath });
945
- if (verbose) {
946
- console.log(` [Expected Assistant File] Found: ${displayPath}`);
947
- console.log(` Resolved to: ${resolvedPath}`);
948
- }
949
- } catch (error) {
950
- logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
951
- }
952
- continue;
953
- }
954
- const textValue = asString3(entry.text);
955
- if (typeof textValue === "string") {
956
- parts.push({ content: textValue, isFile: false });
957
- continue;
958
- }
959
- const valueValue = asString3(entry.value);
960
- if (typeof valueValue === "string") {
961
- parts.push({ content: valueValue, isFile: false });
962
- continue;
963
- }
964
- parts.push({ content: JSON.stringify(entry), isFile: false });
965
- }
966
- return formatFileContents(parts);
967
- }
968
898
  function asString3(value) {
969
899
  return typeof value === "string" ? value : void 0;
970
900
  }
@@ -997,14 +927,15 @@ ${detailBlock}${ANSI_RESET4}`);
997
927
  }
998
928
  }
999
929
  async function processExpectedMessages(options) {
1000
- const { messages, searchRoots, repoRootPath, verbose } = options;
930
+ const { messages, searchRoots, verbose } = options;
1001
931
  const segments = [];
1002
932
  for (const message of messages) {
933
+ const extendedMessage = message;
1003
934
  const segment = {
1004
935
  role: message.role
1005
936
  };
1006
- if (message.role === "assistant" && message.tool_calls !== void 0) {
1007
- segment.tool_calls = message.tool_calls;
937
+ if (extendedMessage.name) {
938
+ segment.name = extendedMessage.name;
1008
939
  }
1009
940
  const content = message.content;
1010
941
  if (typeof content === "string") {
@@ -1052,6 +983,13 @@ async function processExpectedMessages(options) {
1052
983
  processedContent.push(cloneJsonObject(rawSegment));
1053
984
  }
1054
985
  segment.content = processedContent;
986
+ } else if (isJsonObject(content)) {
987
+ segment.content = cloneJsonObject(content);
988
+ }
989
+ if (extendedMessage.tool_calls && Array.isArray(extendedMessage.tool_calls)) {
990
+ segment.tool_calls = extendedMessage.tool_calls.map(
991
+ (tc) => isJsonObject(tc) ? cloneJsonObject(tc) : tc
992
+ );
1055
993
  }
1056
994
  segments.push(segment);
1057
995
  }
@@ -1346,9 +1284,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1346
1284
  logError(`No valid expected message found for eval case: ${id}`);
1347
1285
  continue;
1348
1286
  }
1349
- if (expectedMessages.length > 1) {
1350
- logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
1351
- }
1352
1287
  const guidelinePaths = [];
1353
1288
  const inputTextParts = [];
1354
1289
  const inputSegments = await processMessages({
@@ -1368,8 +1303,19 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1368
1303
  verbose
1369
1304
  }) : [];
1370
1305
  const codeSnippets = extractCodeBlocks(inputSegments);
1371
- const expectedContent = expectedMessages[0]?.content;
1372
- const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
1306
+ let referenceAnswer = "";
1307
+ if (outputSegments.length > 1) {
1308
+ referenceAnswer = JSON.stringify(outputSegments, null, 2);
1309
+ } else if (outputSegments.length === 1) {
1310
+ const singleMessage = outputSegments[0];
1311
+ if (typeof singleMessage.content === "string") {
1312
+ referenceAnswer = singleMessage.content;
1313
+ } else if (singleMessage.content) {
1314
+ referenceAnswer = JSON.stringify(singleMessage, null, 2);
1315
+ } else if (singleMessage.tool_calls) {
1316
+ referenceAnswer = JSON.stringify(singleMessage, null, 2);
1317
+ }
1318
+ }
1373
1319
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
1374
1320
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
1375
1321
  let evaluators;
@@ -1424,7 +1370,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1424
1370
  question,
1425
1371
  input_messages: inputMessages,
1426
1372
  input_segments: inputSegments,
1427
- expected_segments: outputSegments,
1373
+ expected_messages: outputSegments,
1428
1374
  reference_answer: referenceAnswer,
1429
1375
  guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
1430
1376
  guideline_patterns: guidelinePatterns,
@@ -3979,7 +3925,7 @@ var import_ai2 = require("ai");
3979
3925
  var import_zod2 = require("zod");
3980
3926
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
3981
3927
 
3982
- Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
3928
+ Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
3983
3929
 
3984
3930
  Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
3985
3931
 
@@ -4037,7 +3983,7 @@ var LlmJudgeEvaluator = class {
4037
3983
  const variables = {
4038
3984
  [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
4039
3985
  [TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
4040
- context.evalCase.expected_segments,
3986
+ context.evalCase.expected_messages,
4041
3987
  null,
4042
3988
  2
4043
3989
  ),
@@ -4256,7 +4202,9 @@ var CodeEvaluator = class {
4256
4202
  input_files: context.evalCase.file_paths.filter(
4257
4203
  (path15) => !context.evalCase.guideline_paths.includes(path15)
4258
4204
  ),
4259
- input_messages: context.evalCase.input_messages
4205
+ input_messages: context.evalCase.input_messages,
4206
+ candidate_trace_file: context.candidateTraceRef ?? null,
4207
+ candidate_trace_summary: context.candidateTraceSummary ?? null
4260
4208
  },
4261
4209
  null,
4262
4210
  2
@@ -4522,105 +4470,6 @@ var ToolTrajectoryEvaluator = class {
4522
4470
  };
4523
4471
  }
4524
4472
  };
4525
- var ExpectedMessagesEvaluator = class {
4526
- kind = "expected_messages";
4527
- evaluate(context) {
4528
- const { candidateTrace, evalCase } = context;
4529
- const expectedSegments = evalCase.expected_segments;
4530
- const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
4531
- if (expectedToolCalls.length === 0) {
4532
- return {
4533
- score: 1,
4534
- verdict: "pass",
4535
- hits: ["No tool_calls specified in expected_messages"],
4536
- misses: [],
4537
- expectedAspectCount: 1
4538
- };
4539
- }
4540
- if (!candidateTrace || candidateTrace.length === 0) {
4541
- return {
4542
- score: 0,
4543
- verdict: "fail",
4544
- hits: [],
4545
- misses: ["No trace available to validate tool_calls"],
4546
- expectedAspectCount: expectedToolCalls.length
4547
- };
4548
- }
4549
- const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
4550
- return this.validateToolCalls(expectedToolCalls, actualToolCalls);
4551
- }
4552
- extractExpectedToolCalls(segments) {
4553
- if (!segments) {
4554
- return [];
4555
- }
4556
- const toolCalls = [];
4557
- for (const segment of segments) {
4558
- const role = segment.role;
4559
- const segmentToolCalls = segment.tool_calls;
4560
- if (role === "assistant" && Array.isArray(segmentToolCalls)) {
4561
- for (const tc of segmentToolCalls) {
4562
- if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
4563
- const toolCall = tc;
4564
- toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
4565
- }
4566
- }
4567
- }
4568
- }
4569
- return toolCalls;
4570
- }
4571
- validateToolCalls(expected, actual) {
4572
- const hits = [];
4573
- const misses = [];
4574
- for (let i = 0; i < expected.length; i++) {
4575
- const expectedCall = expected[i];
4576
- const actualCall = actual[i];
4577
- if (!actualCall) {
4578
- misses.push(
4579
- `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
4580
- );
4581
- continue;
4582
- }
4583
- if (actualCall.name !== expectedCall.tool) {
4584
- misses.push(
4585
- `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
4586
- );
4587
- continue;
4588
- }
4589
- if (expectedCall.input !== void 0) {
4590
- if (!this.deepEquals(expectedCall.input, actualCall.input)) {
4591
- misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
4592
- continue;
4593
- }
4594
- }
4595
- hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
4596
- }
4597
- const totalChecks = expected.length || 1;
4598
- const score = hits.length / totalChecks;
4599
- return {
4600
- score,
4601
- verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
4602
- hits,
4603
- misses,
4604
- expectedAspectCount: totalChecks
4605
- };
4606
- }
4607
- deepEquals(a, b) {
4608
- if (a === b) return true;
4609
- if (typeof a !== typeof b) return false;
4610
- if (typeof a !== "object" || a === null || b === null) return false;
4611
- if (Array.isArray(a) && Array.isArray(b)) {
4612
- if (a.length !== b.length) return false;
4613
- return a.every((val, i) => this.deepEquals(val, b[i]));
4614
- }
4615
- if (Array.isArray(a) || Array.isArray(b)) return false;
4616
- const aObj = a;
4617
- const bObj = b;
4618
- const aKeys = Object.keys(aObj);
4619
- const bKeys = Object.keys(bObj);
4620
- if (aKeys.length !== bKeys.length) return false;
4621
- return aKeys.every((key) => this.deepEquals(aObj[key], bObj[key]));
4622
- }
4623
- };
4624
4473
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
4625
4474
  {{EVALUATOR_RESULTS_JSON}}
4626
4475
 
@@ -5392,6 +5241,7 @@ async function runEvalCase(options) {
5392
5241
  judgeProvider,
5393
5242
  agentTimeoutMs,
5394
5243
  candidateTrace,
5244
+ candidateTraceRef: providerResponse.traceRef,
5395
5245
  candidateTraceSummary
5396
5246
  });
5397
5247
  } catch (error) {
@@ -5411,6 +5261,7 @@ async function evaluateCandidate(options) {
5411
5261
  judgeProvider,
5412
5262
  agentTimeoutMs,
5413
5263
  candidateTrace,
5264
+ candidateTraceRef,
5414
5265
  candidateTraceSummary
5415
5266
  } = options;
5416
5267
  const gradeTimestamp = nowFn();
@@ -5426,6 +5277,7 @@ async function evaluateCandidate(options) {
5426
5277
  judgeProvider,
5427
5278
  agentTimeoutMs,
5428
5279
  candidateTrace,
5280
+ candidateTraceRef,
5429
5281
  candidateTraceSummary
5430
5282
  });
5431
5283
  const completedAt = nowFn();
@@ -5480,6 +5332,7 @@ async function runEvaluatorsForCase(options) {
5480
5332
  judgeProvider,
5481
5333
  agentTimeoutMs,
5482
5334
  candidateTrace,
5335
+ candidateTraceRef,
5483
5336
  candidateTraceSummary
5484
5337
  } = options;
5485
5338
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
@@ -5496,6 +5349,7 @@ async function runEvaluatorsForCase(options) {
5496
5349
  judgeProvider,
5497
5350
  agentTimeoutMs,
5498
5351
  candidateTrace,
5352
+ candidateTraceRef,
5499
5353
  candidateTraceSummary
5500
5354
  });
5501
5355
  }
@@ -5514,6 +5368,7 @@ async function runEvaluatorsForCase(options) {
5514
5368
  now,
5515
5369
  judgeProvider,
5516
5370
  candidateTrace,
5371
+ candidateTraceRef,
5517
5372
  candidateTraceSummary
5518
5373
  });
5519
5374
  return { score };
@@ -5532,6 +5387,7 @@ async function runEvaluatorList(options) {
5532
5387
  judgeProvider,
5533
5388
  agentTimeoutMs,
5534
5389
  candidateTrace,
5390
+ candidateTraceRef,
5535
5391
  candidateTraceSummary
5536
5392
  } = options;
5537
5393
  const scored = [];
@@ -5578,7 +5434,9 @@ async function runEvaluatorList(options) {
5578
5434
  provider,
5579
5435
  attempt,
5580
5436
  promptInputs,
5581
- now
5437
+ now,
5438
+ candidateTraceRef,
5439
+ candidateTraceSummary
5582
5440
  });
5583
5441
  const weight = evaluator.weight ?? 1;
5584
5442
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -5616,8 +5474,6 @@ async function runEvaluatorList(options) {
5616
5474
  return new ToolTrajectoryEvaluator({
5617
5475
  config: memberConfig
5618
5476
  });
5619
- case "expected_messages":
5620
- return new ExpectedMessagesEvaluator();
5621
5477
  default: {
5622
5478
  const unknownConfig = memberConfig;
5623
5479
  throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -5667,32 +5523,7 @@ async function runEvaluatorList(options) {
5667
5523
  promptInputs,
5668
5524
  now,
5669
5525
  candidateTrace,
5670
- candidateTraceSummary
5671
- });
5672
- const weight = evaluator.weight ?? 1;
5673
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
5674
- evaluatorResults.push({
5675
- name: evaluator.name,
5676
- type: evaluator.type,
5677
- score: score2.score,
5678
- weight,
5679
- verdict: score2.verdict,
5680
- hits: score2.hits,
5681
- misses: score2.misses,
5682
- reasoning: score2.reasoning
5683
- });
5684
- }
5685
- if (evaluator.type === "expected_messages") {
5686
- const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
5687
- const score2 = expectedMessagesEvaluator.evaluate({
5688
- evalCase,
5689
- candidate,
5690
- target,
5691
- provider,
5692
- attempt,
5693
- promptInputs,
5694
- now,
5695
- candidateTrace,
5526
+ candidateTraceRef,
5696
5527
  candidateTraceSummary
5697
5528
  });
5698
5529
  const weight = evaluator.weight ?? 1;
@@ -6065,7 +5896,6 @@ function createAgentKernel() {
6065
5896
  0 && (module.exports = {
6066
5897
  CodeEvaluator,
6067
5898
  CompositeEvaluator,
6068
- ExpectedMessagesEvaluator,
6069
5899
  LlmJudgeEvaluator,
6070
5900
  TEST_MESSAGE_ROLES,
6071
5901
  ToolTrajectoryEvaluator,
@@ -6083,7 +5913,6 @@ function createAgentKernel() {
6083
5913
  generateRubrics,
6084
5914
  getHitCount,
6085
5915
  isEvaluatorKind,
6086
- isExpectedToolCall,
6087
5916
  isGuidelineFile,
6088
5917
  isJsonObject,
6089
5918
  isJsonValue,