@agentv/core 0.26.0 → 1.2.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -9,7 +9,7 @@ import {
9
9
  readTextFile,
10
10
  resolveFileReference,
11
11
  resolveTargetDefinition
12
- } from "./chunk-NDEN3H2B.js";
12
+ } from "./chunk-V3JCB3HI.js";
13
13
 
14
14
  // src/evaluation/types.ts
15
15
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -47,18 +47,23 @@ function isTestMessage(value) {
47
47
  if (typeof candidate.content === "string") {
48
48
  return true;
49
49
  }
50
- if (!Array.isArray(candidate.content)) {
51
- return false;
50
+ if (Array.isArray(candidate.content) && candidate.content.every(isJsonObject)) {
51
+ return true;
52
+ }
53
+ if (Array.isArray(candidate.tool_calls) && candidate.tool_calls.length > 0) {
54
+ return true;
52
55
  }
53
- return candidate.content.every(isJsonObject);
56
+ if (isJsonObject(candidate.content)) {
57
+ return true;
58
+ }
59
+ return false;
54
60
  }
55
61
  var EVALUATOR_KIND_VALUES = [
56
62
  "code_judge",
57
63
  "llm_judge",
58
64
  "rubric",
59
65
  "composite",
60
- "tool_trajectory",
61
- "expected_messages"
66
+ "tool_trajectory"
62
67
  ];
63
68
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
64
69
  function isEvaluatorKind(value) {
@@ -79,13 +84,6 @@ function isTraceEvent(value) {
79
84
  const candidate = value;
80
85
  return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
81
86
  }
82
- function isExpectedToolCall(value) {
83
- if (typeof value !== "object" || value === null) {
84
- return false;
85
- }
86
- const candidate = value;
87
- return typeof candidate.tool === "string";
88
- }
89
87
  function computeTraceSummary(trace) {
90
88
  const toolCallCounts = {};
91
89
  let errorCount = 0;
@@ -582,15 +580,6 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
582
580
  });
583
581
  continue;
584
582
  }
585
- if (typeValue === "expected_messages") {
586
- const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
587
- evaluators.push({
588
- name,
589
- type: "expected_messages",
590
- ...weight2 !== void 0 ? { weight: weight2 } : {}
591
- });
592
- continue;
593
- }
594
583
  if (typeValue === "tool_trajectory") {
595
584
  const mode = asString2(rawEvaluator.mode);
596
585
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
@@ -845,63 +834,6 @@ async function processMessages(options) {
845
834
  }
846
835
  return segments;
847
836
  }
848
- async function resolveAssistantContent(content, searchRoots, verbose) {
849
- if (typeof content === "string") {
850
- return content;
851
- }
852
- if (!content) {
853
- return "";
854
- }
855
- const parts = [];
856
- for (const entry of content) {
857
- if (typeof entry === "string") {
858
- parts.push({ content: entry, isFile: false });
859
- continue;
860
- }
861
- if (!isJsonObject(entry)) {
862
- continue;
863
- }
864
- const segmentType = asString3(entry.type);
865
- if (segmentType === "file") {
866
- const rawValue = asString3(entry.value);
867
- if (!rawValue) {
868
- continue;
869
- }
870
- const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
871
- rawValue,
872
- searchRoots
873
- );
874
- if (!resolvedPath) {
875
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
876
- logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
877
- continue;
878
- }
879
- try {
880
- const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
881
- parts.push({ content: fileContent, isFile: true, displayPath });
882
- if (verbose) {
883
- console.log(` [Expected Assistant File] Found: ${displayPath}`);
884
- console.log(` Resolved to: ${resolvedPath}`);
885
- }
886
- } catch (error) {
887
- logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
888
- }
889
- continue;
890
- }
891
- const textValue = asString3(entry.text);
892
- if (typeof textValue === "string") {
893
- parts.push({ content: textValue, isFile: false });
894
- continue;
895
- }
896
- const valueValue = asString3(entry.value);
897
- if (typeof valueValue === "string") {
898
- parts.push({ content: valueValue, isFile: false });
899
- continue;
900
- }
901
- parts.push({ content: JSON.stringify(entry), isFile: false });
902
- }
903
- return formatFileContents(parts);
904
- }
905
837
  function asString3(value) {
906
838
  return typeof value === "string" ? value : void 0;
907
839
  }
@@ -934,14 +866,15 @@ ${detailBlock}${ANSI_RESET4}`);
934
866
  }
935
867
  }
936
868
  async function processExpectedMessages(options) {
937
- const { messages, searchRoots, repoRootPath, verbose } = options;
869
+ const { messages, searchRoots, verbose } = options;
938
870
  const segments = [];
939
871
  for (const message of messages) {
872
+ const extendedMessage = message;
940
873
  const segment = {
941
874
  role: message.role
942
875
  };
943
- if (message.role === "assistant" && message.tool_calls !== void 0) {
944
- segment.tool_calls = message.tool_calls;
876
+ if (extendedMessage.name) {
877
+ segment.name = extendedMessage.name;
945
878
  }
946
879
  const content = message.content;
947
880
  if (typeof content === "string") {
@@ -989,6 +922,13 @@ async function processExpectedMessages(options) {
989
922
  processedContent.push(cloneJsonObject(rawSegment));
990
923
  }
991
924
  segment.content = processedContent;
925
+ } else if (isJsonObject(content)) {
926
+ segment.content = cloneJsonObject(content);
927
+ }
928
+ if (extendedMessage.tool_calls && Array.isArray(extendedMessage.tool_calls)) {
929
+ segment.tool_calls = extendedMessage.tool_calls.map(
930
+ (tc) => isJsonObject(tc) ? cloneJsonObject(tc) : tc
931
+ );
992
932
  }
993
933
  segments.push(segment);
994
934
  }
@@ -1283,9 +1223,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1283
1223
  logError(`No valid expected message found for eval case: ${id}`);
1284
1224
  continue;
1285
1225
  }
1286
- if (expectedMessages.length > 1) {
1287
- logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
1288
- }
1289
1226
  const guidelinePaths = [];
1290
1227
  const inputTextParts = [];
1291
1228
  const inputSegments = await processMessages({
@@ -1305,8 +1242,19 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1305
1242
  verbose
1306
1243
  }) : [];
1307
1244
  const codeSnippets = extractCodeBlocks(inputSegments);
1308
- const expectedContent = expectedMessages[0]?.content;
1309
- const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
1245
+ let referenceAnswer = "";
1246
+ if (outputSegments.length > 1) {
1247
+ referenceAnswer = JSON.stringify(outputSegments, null, 2);
1248
+ } else if (outputSegments.length === 1) {
1249
+ const singleMessage = outputSegments[0];
1250
+ if (typeof singleMessage.content === "string") {
1251
+ referenceAnswer = singleMessage.content;
1252
+ } else if (singleMessage.content) {
1253
+ referenceAnswer = JSON.stringify(singleMessage, null, 2);
1254
+ } else if (singleMessage.tool_calls) {
1255
+ referenceAnswer = JSON.stringify(singleMessage, null, 2);
1256
+ }
1257
+ }
1310
1258
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
1311
1259
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
1312
1260
  let evaluators;
@@ -1361,7 +1309,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1361
1309
  question,
1362
1310
  input_messages: inputMessages,
1363
1311
  input_segments: inputSegments,
1364
- expected_segments: outputSegments,
1312
+ expected_messages: outputSegments,
1365
1313
  reference_answer: referenceAnswer,
1366
1314
  guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
1367
1315
  guideline_patterns: guidelinePatterns,
@@ -3270,7 +3218,7 @@ import { generateText as generateText2 } from "ai";
3270
3218
  import { z } from "zod";
3271
3219
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
3272
3220
 
3273
- Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
3221
+ Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
3274
3222
 
3275
3223
  Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
3276
3224
 
@@ -3328,7 +3276,7 @@ var LlmJudgeEvaluator = class {
3328
3276
  const variables = {
3329
3277
  [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
3330
3278
  [TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
3331
- context.evalCase.expected_segments,
3279
+ context.evalCase.expected_messages,
3332
3280
  null,
3333
3281
  2
3334
3282
  ),
@@ -3547,7 +3495,9 @@ var CodeEvaluator = class {
3547
3495
  input_files: context.evalCase.file_paths.filter(
3548
3496
  (path13) => !context.evalCase.guideline_paths.includes(path13)
3549
3497
  ),
3550
- input_messages: context.evalCase.input_messages
3498
+ input_messages: context.evalCase.input_messages,
3499
+ candidate_trace_file: context.candidateTraceRef ?? null,
3500
+ candidate_trace_summary: context.candidateTraceSummary ?? null
3551
3501
  },
3552
3502
  null,
3553
3503
  2
@@ -3813,105 +3763,6 @@ var ToolTrajectoryEvaluator = class {
3813
3763
  };
3814
3764
  }
3815
3765
  };
3816
- var ExpectedMessagesEvaluator = class {
3817
- kind = "expected_messages";
3818
- evaluate(context) {
3819
- const { candidateTrace, evalCase } = context;
3820
- const expectedSegments = evalCase.expected_segments;
3821
- const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
3822
- if (expectedToolCalls.length === 0) {
3823
- return {
3824
- score: 1,
3825
- verdict: "pass",
3826
- hits: ["No tool_calls specified in expected_messages"],
3827
- misses: [],
3828
- expectedAspectCount: 1
3829
- };
3830
- }
3831
- if (!candidateTrace || candidateTrace.length === 0) {
3832
- return {
3833
- score: 0,
3834
- verdict: "fail",
3835
- hits: [],
3836
- misses: ["No trace available to validate tool_calls"],
3837
- expectedAspectCount: expectedToolCalls.length
3838
- };
3839
- }
3840
- const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
3841
- return this.validateToolCalls(expectedToolCalls, actualToolCalls);
3842
- }
3843
- extractExpectedToolCalls(segments) {
3844
- if (!segments) {
3845
- return [];
3846
- }
3847
- const toolCalls = [];
3848
- for (const segment of segments) {
3849
- const role = segment.role;
3850
- const segmentToolCalls = segment.tool_calls;
3851
- if (role === "assistant" && Array.isArray(segmentToolCalls)) {
3852
- for (const tc of segmentToolCalls) {
3853
- if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
3854
- const toolCall = tc;
3855
- toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
3856
- }
3857
- }
3858
- }
3859
- }
3860
- return toolCalls;
3861
- }
3862
- validateToolCalls(expected, actual) {
3863
- const hits = [];
3864
- const misses = [];
3865
- for (let i = 0; i < expected.length; i++) {
3866
- const expectedCall = expected[i];
3867
- const actualCall = actual[i];
3868
- if (!actualCall) {
3869
- misses.push(
3870
- `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
3871
- );
3872
- continue;
3873
- }
3874
- if (actualCall.name !== expectedCall.tool) {
3875
- misses.push(
3876
- `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
3877
- );
3878
- continue;
3879
- }
3880
- if (expectedCall.input !== void 0) {
3881
- if (!this.deepEquals(expectedCall.input, actualCall.input)) {
3882
- misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
3883
- continue;
3884
- }
3885
- }
3886
- hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
3887
- }
3888
- const totalChecks = expected.length || 1;
3889
- const score = hits.length / totalChecks;
3890
- return {
3891
- score,
3892
- verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
3893
- hits,
3894
- misses,
3895
- expectedAspectCount: totalChecks
3896
- };
3897
- }
3898
- deepEquals(a, b) {
3899
- if (a === b) return true;
3900
- if (typeof a !== typeof b) return false;
3901
- if (typeof a !== "object" || a === null || b === null) return false;
3902
- if (Array.isArray(a) && Array.isArray(b)) {
3903
- if (a.length !== b.length) return false;
3904
- return a.every((val, i) => this.deepEquals(val, b[i]));
3905
- }
3906
- if (Array.isArray(a) || Array.isArray(b)) return false;
3907
- const aObj = a;
3908
- const bObj = b;
3909
- const aKeys = Object.keys(aObj);
3910
- const bKeys = Object.keys(bObj);
3911
- if (aKeys.length !== bKeys.length) return false;
3912
- return aKeys.every((key) => this.deepEquals(aObj[key], bObj[key]));
3913
- }
3914
- };
3915
3766
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
3916
3767
  {{EVALUATOR_RESULTS_JSON}}
3917
3768
 
@@ -4673,6 +4524,7 @@ async function runEvalCase(options) {
4673
4524
  judgeProvider,
4674
4525
  agentTimeoutMs,
4675
4526
  candidateTrace,
4527
+ candidateTraceRef: providerResponse.traceRef,
4676
4528
  candidateTraceSummary
4677
4529
  });
4678
4530
  } catch (error) {
@@ -4692,6 +4544,7 @@ async function evaluateCandidate(options) {
4692
4544
  judgeProvider,
4693
4545
  agentTimeoutMs,
4694
4546
  candidateTrace,
4547
+ candidateTraceRef,
4695
4548
  candidateTraceSummary
4696
4549
  } = options;
4697
4550
  const gradeTimestamp = nowFn();
@@ -4707,6 +4560,7 @@ async function evaluateCandidate(options) {
4707
4560
  judgeProvider,
4708
4561
  agentTimeoutMs,
4709
4562
  candidateTrace,
4563
+ candidateTraceRef,
4710
4564
  candidateTraceSummary
4711
4565
  });
4712
4566
  const completedAt = nowFn();
@@ -4761,6 +4615,7 @@ async function runEvaluatorsForCase(options) {
4761
4615
  judgeProvider,
4762
4616
  agentTimeoutMs,
4763
4617
  candidateTrace,
4618
+ candidateTraceRef,
4764
4619
  candidateTraceSummary
4765
4620
  } = options;
4766
4621
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
@@ -4777,6 +4632,7 @@ async function runEvaluatorsForCase(options) {
4777
4632
  judgeProvider,
4778
4633
  agentTimeoutMs,
4779
4634
  candidateTrace,
4635
+ candidateTraceRef,
4780
4636
  candidateTraceSummary
4781
4637
  });
4782
4638
  }
@@ -4795,6 +4651,7 @@ async function runEvaluatorsForCase(options) {
4795
4651
  now,
4796
4652
  judgeProvider,
4797
4653
  candidateTrace,
4654
+ candidateTraceRef,
4798
4655
  candidateTraceSummary
4799
4656
  });
4800
4657
  return { score };
@@ -4813,6 +4670,7 @@ async function runEvaluatorList(options) {
4813
4670
  judgeProvider,
4814
4671
  agentTimeoutMs,
4815
4672
  candidateTrace,
4673
+ candidateTraceRef,
4816
4674
  candidateTraceSummary
4817
4675
  } = options;
4818
4676
  const scored = [];
@@ -4859,7 +4717,9 @@ async function runEvaluatorList(options) {
4859
4717
  provider,
4860
4718
  attempt,
4861
4719
  promptInputs,
4862
- now
4720
+ now,
4721
+ candidateTraceRef,
4722
+ candidateTraceSummary
4863
4723
  });
4864
4724
  const weight = evaluator.weight ?? 1;
4865
4725
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -4897,8 +4757,6 @@ async function runEvaluatorList(options) {
4897
4757
  return new ToolTrajectoryEvaluator({
4898
4758
  config: memberConfig
4899
4759
  });
4900
- case "expected_messages":
4901
- return new ExpectedMessagesEvaluator();
4902
4760
  default: {
4903
4761
  const unknownConfig = memberConfig;
4904
4762
  throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -4948,32 +4806,7 @@ async function runEvaluatorList(options) {
4948
4806
  promptInputs,
4949
4807
  now,
4950
4808
  candidateTrace,
4951
- candidateTraceSummary
4952
- });
4953
- const weight = evaluator.weight ?? 1;
4954
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
4955
- evaluatorResults.push({
4956
- name: evaluator.name,
4957
- type: evaluator.type,
4958
- score: score2.score,
4959
- weight,
4960
- verdict: score2.verdict,
4961
- hits: score2.hits,
4962
- misses: score2.misses,
4963
- reasoning: score2.reasoning
4964
- });
4965
- }
4966
- if (evaluator.type === "expected_messages") {
4967
- const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
4968
- const score2 = expectedMessagesEvaluator.evaluate({
4969
- evalCase,
4970
- candidate,
4971
- target,
4972
- provider,
4973
- attempt,
4974
- promptInputs,
4975
- now,
4976
- candidateTrace,
4809
+ candidateTraceRef,
4977
4810
  candidateTraceSummary
4978
4811
  });
4979
4812
  const weight = evaluator.weight ?? 1;
@@ -5345,7 +5178,6 @@ function createAgentKernel() {
5345
5178
  export {
5346
5179
  CodeEvaluator,
5347
5180
  CompositeEvaluator,
5348
- ExpectedMessagesEvaluator,
5349
5181
  LlmJudgeEvaluator,
5350
5182
  TEST_MESSAGE_ROLES,
5351
5183
  ToolTrajectoryEvaluator,
@@ -5363,7 +5195,6 @@ export {
5363
5195
  generateRubrics,
5364
5196
  getHitCount,
5365
5197
  isEvaluatorKind,
5366
- isExpectedToolCall,
5367
5198
  isGuidelineFile,
5368
5199
  isJsonObject,
5369
5200
  isJsonValue,