@agentv/core 0.23.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5,10 +5,11 @@ import {
5
5
  findGitRoot,
6
6
  isAgentProvider,
7
7
  normalizeLineEndings,
8
+ readJsonFile,
8
9
  readTextFile,
9
10
  resolveFileReference,
10
11
  resolveTargetDefinition
11
- } from "./chunk-B2J23S7D.js";
12
+ } from "./chunk-OYTL3LNN.js";
12
13
 
13
14
  // src/evaluation/types.ts
14
15
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -51,7 +52,14 @@ function isTestMessage(value) {
51
52
  }
52
53
  return candidate.content.every(isJsonObject);
53
54
  }
54
- var EVALUATOR_KIND_VALUES = ["code_judge", "llm_judge", "rubric", "composite"];
55
+ var EVALUATOR_KIND_VALUES = [
56
+ "code_judge",
57
+ "llm_judge",
58
+ "rubric",
59
+ "composite",
60
+ "tool_trajectory",
61
+ "expected_messages"
62
+ ];
55
63
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
56
64
  function isEvaluatorKind(value) {
57
65
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
@@ -60,6 +68,44 @@ function getHitCount(result) {
60
68
  return result.hits.length;
61
69
  }
62
70
 
71
+ // src/evaluation/trace.ts
72
/**
 * Type guard for trace event type names.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` only when `value` is one of the strings
 *   "model_step", "tool_call", "tool_result", "message" or "error".
 */
function isTraceEventType(value) {
  if (typeof value !== "string") {
    return false;
  }
  switch (value) {
    case "model_step":
    case "tool_call":
    case "tool_result":
    case "message":
    case "error":
      return true;
    default:
      return false;
  }
}
75
/**
 * Type guard for a single trace event.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` when `value` is a non-null object whose `type`
 *   passes `isTraceEventType` and whose `timestamp` is a string.
 */
function isTraceEvent(value) {
  const isObject = value !== null && typeof value === "object";
  return isObject && isTraceEventType(value.type) && typeof value.timestamp === "string";
}
82
/**
 * Type guard for an expected tool call entry.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` when `value` is a non-null object with a string
 *   `tool` property.
 */
function isExpectedToolCall(value) {
  return value !== null && typeof value === "object" && typeof value.tool === "string";
}
89
/**
 * Summarize a trace: total events, per-tool call counts, and error count.
 *
 * Counting is done in a Map rather than a plain object so that tool names
 * which collide with `Object.prototype` members ("constructor", "toString",
 * "__proto__", ...) are counted like any other name instead of picking up the
 * inherited value.
 *
 * @param {Array<{type: string, name?: string}>} trace - Trace events.
 * @returns {{eventCount: number, toolNames: string[], toolCallsByName: Object<string, number>, errorCount: number}}
 *   `toolNames` is sorted with the default string sort; `toolCallsByName`
 *   holds one own property per tool name.
 */
function computeTraceSummary(trace) {
  const counts = new Map();
  let errorCount = 0;
  for (const event of trace) {
    // Only tool_call events with a truthy name are counted.
    if (event.type === "tool_call" && event.name) {
      // Keys are stringified to keep the property-key coercion the plain
      // object version had.
      const key = String(event.name);
      counts.set(key, (counts.get(key) ?? 0) + 1);
    }
    if (event.type === "error") {
      errorCount++;
    }
  }
  return {
    eventCount: trace.length,
    toolNames: [...counts.keys()].sort(),
    // Object.fromEntries defines own data properties, so even "__proto__"
    // ends up as a regular key.
    toolCallsByName: Object.fromEntries(counts),
    errorCount
  };
}
108
+
63
109
  // src/evaluation/yaml-parser.ts
64
110
  import { readFile as readFile5 } from "node:fs/promises";
65
111
  import path6 from "node:path";
@@ -532,6 +578,75 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
532
578
  });
533
579
  continue;
534
580
  }
581
+ if (typeValue === "expected_messages") {
582
+ evaluators.push({
583
+ name,
584
+ type: "expected_messages"
585
+ });
586
+ continue;
587
+ }
588
+ if (typeValue === "tool_trajectory") {
589
+ const mode = asString2(rawEvaluator.mode);
590
+ if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
591
+ logWarning2(
592
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
593
+ );
594
+ continue;
595
+ }
596
+ const rawMinimums = rawEvaluator.minimums;
597
+ let minimums;
598
+ if (rawMinimums !== void 0) {
599
+ if (!isJsonObject2(rawMinimums)) {
600
+ logWarning2(
601
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
602
+ );
603
+ continue;
604
+ }
605
+ minimums = {};
606
+ for (const [toolName, count] of Object.entries(rawMinimums)) {
607
+ if (typeof count === "number" && count >= 0) {
608
+ minimums[toolName] = count;
609
+ }
610
+ }
611
+ }
612
+ const rawExpected = rawEvaluator.expected;
613
+ let expected;
614
+ if (rawExpected !== void 0) {
615
+ if (!Array.isArray(rawExpected)) {
616
+ logWarning2(
617
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
618
+ );
619
+ continue;
620
+ }
621
+ expected = [];
622
+ for (const item of rawExpected) {
623
+ if (isJsonObject2(item) && typeof item.tool === "string") {
624
+ expected.push({ tool: item.tool });
625
+ }
626
+ }
627
+ }
628
+ if (mode === "any_order" && !minimums) {
629
+ logWarning2(
630
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
631
+ );
632
+ continue;
633
+ }
634
+ if ((mode === "in_order" || mode === "exact") && !expected) {
635
+ logWarning2(
636
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
637
+ );
638
+ continue;
639
+ }
640
+ const config = {
641
+ name,
642
+ type: "tool_trajectory",
643
+ mode,
644
+ ...minimums ? { minimums } : {},
645
+ ...expected ? { expected } : {}
646
+ };
647
+ evaluators.push(config);
648
+ continue;
649
+ }
535
650
  const prompt = asString2(rawEvaluator.prompt);
536
651
  let promptPath;
537
652
  if (prompt) {
@@ -785,6 +900,67 @@ ${detailBlock}${ANSI_RESET4}`);
785
900
  console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
786
901
  }
787
902
  }
903
/**
 * Convert expected (reference) messages into output segments.
 *
 * Each message yields one segment carrying its `role`. `tool_calls` is copied
 * only for assistant messages that define it. String content is kept as-is.
 * Array content is processed entry by entry: non-object entries are dropped,
 * `type: "file"` entries are resolved against `searchRoots` and replaced by
 * the file text (CRLF normalized to LF), and every other object entry is
 * cloned. Content of any other shape leaves the segment without `content`.
 *
 * Missing or unreadable files are reported through `logWarning3` and skipped.
 * `options.repoRootPath` is accepted but not read here.
 *
 * @param {{messages: Array<object>, searchRoots: unknown, repoRootPath?: unknown, verbose?: boolean}} options
 * @returns {Promise<Array<object>>} One segment per input message, in order.
 */
async function processExpectedMessages(options) {
  const { messages, searchRoots, verbose } = options;

  // Resolve one file reference and, when it can be read, append the loaded
  // file segment to `target`. Failures only log a warning.
  const appendFileSegment = async (reference, target) => {
    const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
      reference,
      searchRoots
    );
    if (!resolvedPath) {
      const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
      logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
      return;
    }
    try {
      const raw = await readFile3(resolvedPath, "utf8");
      target.push({
        type: "file",
        path: displayPath,
        text: raw.replace(/\r\n/g, "\n"),
        resolvedPath: path4.resolve(resolvedPath)
      });
      if (verbose) {
        console.log(` [Expected Output File] Found: ${displayPath}`);
        console.log(` Resolved to: ${resolvedPath}`);
      }
    } catch (error) {
      logWarning3(
        `Could not read expected output file ${resolvedPath}: ${error.message}`
      );
    }
  };

  const result = [];
  for (const message of messages) {
    const entry = { role: message.role };
    if (message.role === "assistant" && message.tool_calls !== void 0) {
      entry.tool_calls = message.tool_calls;
    }

    const { content } = message;
    if (typeof content === "string") {
      entry.content = content;
    } else if (Array.isArray(content)) {
      const parts = [];
      for (const part of content) {
        if (!isJsonObject(part)) {
          continue;
        }
        if (asString3(part.type) !== "file") {
          parts.push(cloneJsonObject(part));
          continue;
        }
        const reference = asString3(part.value);
        if (reference) {
          await appendFileSegment(reference, parts);
        }
      }
      entry.content = parts;
    }

    result.push(entry);
  }
  return result;
}
788
964
 
789
965
  // src/evaluation/formatting/prompt-builder.ts
790
966
  import { readFile as readFile4 } from "node:fs/promises";
@@ -1089,12 +1265,10 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1089
1265
  messageType: "input",
1090
1266
  verbose
1091
1267
  });
1092
- const outputSegments = hasExpectedMessages ? await processMessages({
1268
+ const outputSegments = hasExpectedMessages ? await processExpectedMessages({
1093
1269
  messages: expectedMessages,
1094
1270
  searchRoots,
1095
1271
  repoRootPath,
1096
- guidelinePatterns,
1097
- messageType: "output",
1098
1272
  verbose
1099
1273
  }) : [];
1100
1274
  const codeSnippets = extractCodeBlocks(inputSegments);
@@ -1618,9 +1792,11 @@ var CliProvider = class {
1618
1792
  const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
1619
1793
  throw new Error(message);
1620
1794
  }
1621
- const responseText = await this.readAndCleanupOutputFile(outputFilePath);
1795
+ const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
1796
+ const parsed = this.parseOutputContent(responseContent);
1622
1797
  return {
1623
- text: responseText,
1798
+ text: parsed.text,
1799
+ trace: parsed.trace,
1624
1800
  raw: {
1625
1801
  command: renderedCommand,
1626
1802
  stderr: result.stderr,
@@ -1630,6 +1806,31 @@ var CliProvider = class {
1630
1806
  }
1631
1807
  };
1632
1808
  }
1809
+ /**
1810
+ * Parse output content from CLI.
1811
+ * If the content is valid JSON with a 'text' field, extract text and optional trace.
1812
+ * Otherwise, treat the entire content as plain text.
1813
+ */
1814
+ parseOutputContent(content) {
1815
+ try {
1816
+ const parsed = JSON.parse(content);
1817
+ if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
1818
+ const obj = parsed;
1819
+ const text = typeof obj.text === "string" ? obj.text : String(obj.text);
1820
+ const trace = this.parseTrace(obj.trace);
1821
+ return { text, trace };
1822
+ }
1823
+ } catch {
1824
+ }
1825
+ return { text: content };
1826
+ }
1827
+ parseTrace(trace) {
1828
+ if (!Array.isArray(trace)) {
1829
+ return void 0;
1830
+ }
1831
+ const validEvents = trace.filter(isTraceEvent);
1832
+ return validEvents.length > 0 ? validEvents : void 0;
1833
+ }
1633
1834
  async readAndCleanupOutputFile(filePath) {
1634
1835
  try {
1635
1836
  const content = await readTextFile(filePath);
@@ -2616,6 +2817,7 @@ var MockProvider = class {
2616
2817
  delayMs;
2617
2818
  delayMinMs;
2618
2819
  delayMaxMs;
2820
+ trace;
2619
2821
  constructor(targetName, config) {
2620
2822
  this.id = `mock:${targetName}`;
2621
2823
  this.targetName = targetName;
@@ -2623,6 +2825,7 @@ var MockProvider = class {
2623
2825
  this.delayMs = config.delayMs ?? 0;
2624
2826
  this.delayMinMs = config.delayMinMs ?? 0;
2625
2827
  this.delayMaxMs = config.delayMaxMs ?? 0;
2828
+ this.trace = config.trace;
2626
2829
  }
2627
2830
  async invoke(request) {
2628
2831
  const delay = this.calculateDelay();
@@ -2634,7 +2837,8 @@ var MockProvider = class {
2634
2837
  raw: {
2635
2838
  question: request.question,
2636
2839
  guidelines: request.guidelines
2637
- }
2840
+ },
2841
+ trace: this.trace
2638
2842
  };
2639
2843
  }
2640
2844
  calculateDelay() {
@@ -3428,6 +3632,251 @@ function substituteVariables(template, variables) {
3428
3632
  return variables[varName] ?? match;
3429
3633
  });
3430
3634
  }
3635
/**
 * Scores a candidate's tool usage against a configured trajectory.
 *
 * Modes (from `config.mode`):
 * - "any_order": each tool in `config.minimums` must be called at least the
 *   given number of times, according to the trace summary.
 * - "in_order": the tools in `config.expected` must appear in that order among
 *   the trace's tool calls; other calls may occur in between.
 * - "exact": the trace's tool calls must match `config.expected` position by
 *   position, with no missing and no extra calls.
 *
 * Every result has the shape
 * `{ score, verdict, hits, misses, expectedAspectCount }`.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;

  /**
   * @param {{config: {mode: string, minimums?: Object<string, number>, expected?: Array<{tool: string}>}}} options
   */
  constructor(options) {
    this.config = options.config;
  }

  /**
   * Evaluate the candidate trace according to the configured mode.
   *
   * @param {{candidateTrace?: Array<object>, candidateTraceSummary?: object}} context
   * @returns {object} A failing result when either the trace or its summary
   *   is missing, or when the mode is not recognized.
   */
  evaluate(context) {
    const { candidateTrace, candidateTraceSummary } = context;
    if (!candidateTrace || !candidateTraceSummary) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available for evaluation"],
        expectedAspectCount: 1
      };
    }
    switch (this.config.mode) {
      case "any_order":
        return this.evaluateAnyOrder(candidateTraceSummary);
      case "in_order":
        return this.evaluateInOrder(candidateTrace);
      case "exact":
        return this.evaluateExact(candidateTrace);
      default:
        return {
          score: 0,
          verdict: "fail",
          hits: [],
          misses: [`Unknown mode: ${this.config.mode}`],
          expectedAspectCount: 1
        };
    }
  }

  /**
   * Check per-tool minimum call counts, ignoring call order.
   *
   * @param {{toolCallsByName: Object<string, number>}} summary
   * @returns {object} Score is the fraction of tools meeting their minimum.
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const counts = summary.toolCallsByName;
    const hits = [];
    const misses = [];
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      // Own-property check: a tool named like an Object.prototype member
      // (e.g. "toString") must read as 0 calls, not as the inherited function.
      const recorded = Object.prototype.hasOwnProperty.call(counts, toolName) ? counts[toolName] : void 0;
      const actual = recorded ?? 0;
      if (actual >= required) {
        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      } else {
        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      }
    }
    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }

  /**
   * Check that the expected tools occur in order (as a subsequence) among the
   * trace's named tool calls.
   *
   * @param {Array<{type: string, name?: string}>} trace
   * @returns {object} Score is the fraction of expected tools found.
   */
  evaluateInOrder(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    // Cursor into actualToolCalls; it only moves forward, which enforces order.
    let actualIndex = 0;
    for (let i = 0; i < expected.length; i++) {
      const expectedTool = expected[i].tool;
      let found = false;
      while (actualIndex < actualToolCalls.length) {
        if (actualToolCalls[actualIndex].name === expectedTool) {
          hits.push(`Found ${expectedTool} at position ${actualIndex}`);
          actualIndex++;
          found = true;
          break;
        }
        actualIndex++;
      }
      if (!found) {
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
      }
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }

  /**
   * Check that the trace's named tool calls match the expected sequence
   * position by position.
   *
   * The score is the number of matching positions divided by the longer of
   * the two sequences, so unexpected extra tool calls lower the score instead
   * of leaving it at 1 alongside a reported length mismatch.
   *
   * @param {Array<{type: string, name?: string}>} trace
   * @returns {object}
   */
  evaluateExact(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    if (actualToolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
    }
    const checkLength = Math.min(expected.length, actualToolCalls.length);
    for (let i = 0; i < checkLength; i++) {
      const expectedTool = expected[i].tool;
      const actualTool = actualToolCalls[i].name;
      if (actualTool === expectedTool) {
        hits.push(`Position ${i}: ${expectedTool} \u2713`);
      } else {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      }
    }
    for (let i = checkLength; i < expected.length; i++) {
      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
    }
    // Extra actual calls count against the score; when the trace is not longer
    // than the expectation this is the same denominator as before.
    const score = hits.length / Math.max(expected.length, actualToolCalls.length);
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
3781
/**
 * Validates the candidate's tool calls against the `tool_calls` declared on
 * assistant messages in the eval case's expected segments.
 *
 * Expected and actual tool calls are compared position by position: the tool
 * name must match, and when the expected call declares an `input`, it must be
 * deeply equal to the actual call's `input`.
 */
var ExpectedMessagesEvaluator = class {
  kind = "expected_messages";

  /**
   * @param {{candidateTrace?: Array<object>, evalCase: {expected_segments?: Array<object>}}} context
   * @returns {{score: number, verdict: string, hits: string[], misses: string[], expectedAspectCount: number}}
   *   Passes outright when no tool calls are expected; fails outright when
   *   tool calls are expected but the trace is missing or empty.
   */
  evaluate(context) {
    const { candidateTrace, evalCase } = context;
    const expectedSegments = evalCase.expected_segments;
    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
    if (expectedToolCalls.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool_calls specified in expected_messages"],
        misses: [],
        expectedAspectCount: 1
      };
    }
    if (!candidateTrace || candidateTrace.length === 0) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available to validate tool_calls"],
        expectedAspectCount: expectedToolCalls.length
      };
    }
    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
  }

  /**
   * Collect `{ tool, input }` pairs from the `tool_calls` arrays of assistant
   * segments, in order. Entries without a string `tool` are ignored.
   *
   * @param {Array<object>|undefined} segments
   * @returns {Array<{tool: string, input: unknown}>}
   */
  extractExpectedToolCalls(segments) {
    if (!segments) {
      return [];
    }
    const toolCalls = [];
    for (const segment of segments) {
      const role = segment.role;
      const segmentToolCalls = segment.tool_calls;
      if (role === "assistant" && Array.isArray(segmentToolCalls)) {
        for (const tc of segmentToolCalls) {
          if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
            toolCalls.push({ tool: tc.tool, input: tc.input });
          }
        }
      }
    }
    return toolCalls;
  }

  /**
   * Compare expected and actual tool calls position by position.
   *
   * @param {Array<{tool: string, input?: unknown}>} expected
   * @param {Array<{name?: string, input?: unknown}>} actual
   * @returns {{score: number, verdict: string, hits: string[], misses: string[], expectedAspectCount: number}}
   *   Score is matched calls over expected calls; actual calls beyond the
   *   expected count are not inspected.
   */
  validateToolCalls(expected, actual) {
    const hits = [];
    const misses = [];
    for (let i = 0; i < expected.length; i++) {
      const expectedCall = expected[i];
      const actualCall = actual[i];
      if (!actualCall) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
        );
        continue;
      }
      if (actualCall.name !== expectedCall.tool) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
        );
        continue;
      }
      // Input is only checked when the expectation declares one.
      if (expectedCall.input !== void 0) {
        if (!this.deepEquals(expectedCall.input, actualCall.input)) {
          misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
          continue;
        }
      }
      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
    }
    // Guard against division by zero when called with an empty expectation.
    const totalChecks = expected.length || 1;
    const score = hits.length / totalChecks;
    return {
      score,
      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
      hits,
      misses,
      expectedAspectCount: totalChecks
    };
  }

  /**
   * Structural equality for JSON-like values (primitives, arrays, plain
   * objects).
   *
   * Objects are equal only when they have the same own keys with deeply equal
   * values. The own-key check matters when values are `undefined`: without
   * it, `{x: undefined}` and `{y: undefined}` would compare equal.
   *
   * @param {unknown} a
   * @param {unknown} b
   * @returns {boolean}
   */
  deepEquals(a, b) {
    if (a === b) return true;
    if (typeof a !== typeof b) return false;
    if (typeof a !== "object" || a === null || b === null) return false;
    if (Array.isArray(a) || Array.isArray(b)) {
      // An array never equals a non-array object.
      if (!Array.isArray(a) || !Array.isArray(b)) return false;
      if (a.length !== b.length) return false;
      return a.every((val, i) => this.deepEquals(val, b[i]));
    }
    const aKeys = Object.keys(a);
    const bKeys = Object.keys(b);
    if (aKeys.length !== bKeys.length) return false;
    return aKeys.every(
      (key) => Object.prototype.hasOwnProperty.call(b, key) && this.deepEquals(a[key], b[key])
    );
  }
};
3431
3880
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
3432
3881
  {{EVALUATOR_RESULTS_JSON}}
3433
3882
 
@@ -3851,7 +4300,7 @@ async function runEvaluation(options) {
3851
4300
  if (!definition) {
3852
4301
  return void 0;
3853
4302
  }
3854
- const resolved = resolveTargetDefinition(definition, envLookup);
4303
+ const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
3855
4304
  resolvedTargetsByName.set(name, resolved);
3856
4305
  return resolved;
3857
4306
  };
@@ -4165,6 +4614,17 @@ async function runEvalCase(options) {
4165
4614
  if (cacheKey && cache && !cachedResponse) {
4166
4615
  await cache.set(cacheKey, providerResponse);
4167
4616
  }
4617
+ let candidateTrace = providerResponse.trace;
4618
+ if (!candidateTrace && providerResponse.traceRef) {
4619
+ try {
4620
+ const rawTrace = await readJsonFile(providerResponse.traceRef);
4621
+ if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
4622
+ candidateTrace = rawTrace;
4623
+ }
4624
+ } catch {
4625
+ }
4626
+ }
4627
+ const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
4168
4628
  try {
4169
4629
  return await evaluateCandidate({
4170
4630
  evalCase,
@@ -4176,7 +4636,9 @@ async function runEvalCase(options) {
4176
4636
  nowFn,
4177
4637
  attempt,
4178
4638
  judgeProvider,
4179
- agentTimeoutMs
4639
+ agentTimeoutMs,
4640
+ candidateTrace,
4641
+ candidateTraceSummary
4180
4642
  });
4181
4643
  } catch (error) {
4182
4644
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
@@ -4193,7 +4655,9 @@ async function evaluateCandidate(options) {
4193
4655
  nowFn,
4194
4656
  attempt,
4195
4657
  judgeProvider,
4196
- agentTimeoutMs
4658
+ agentTimeoutMs,
4659
+ candidateTrace,
4660
+ candidateTraceSummary
4197
4661
  } = options;
4198
4662
  const gradeTimestamp = nowFn();
4199
4663
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -4206,7 +4670,9 @@ async function evaluateCandidate(options) {
4206
4670
  promptInputs,
4207
4671
  now: gradeTimestamp,
4208
4672
  judgeProvider,
4209
- agentTimeoutMs
4673
+ agentTimeoutMs,
4674
+ candidateTrace,
4675
+ candidateTraceSummary
4210
4676
  });
4211
4677
  const completedAt = nowFn();
4212
4678
  let agentProviderRequest;
@@ -4245,7 +4711,8 @@ async function evaluateCandidate(options) {
4245
4711
  agent_provider_request: agentProviderRequest,
4246
4712
  lm_provider_request: lmProviderRequest,
4247
4713
  evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
4248
- evaluator_results: evaluatorResults
4714
+ evaluator_results: evaluatorResults,
4715
+ trace_summary: candidateTraceSummary
4249
4716
  };
4250
4717
  }
4251
4718
  async function runEvaluatorsForCase(options) {
@@ -4259,7 +4726,9 @@ async function runEvaluatorsForCase(options) {
4259
4726
  promptInputs,
4260
4727
  now,
4261
4728
  judgeProvider,
4262
- agentTimeoutMs
4729
+ agentTimeoutMs,
4730
+ candidateTrace,
4731
+ candidateTraceSummary
4263
4732
  } = options;
4264
4733
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
4265
4734
  return runEvaluatorList({
@@ -4273,7 +4742,9 @@ async function runEvaluatorsForCase(options) {
4273
4742
  promptInputs,
4274
4743
  now,
4275
4744
  judgeProvider,
4276
- agentTimeoutMs
4745
+ agentTimeoutMs,
4746
+ candidateTrace,
4747
+ candidateTraceSummary
4277
4748
  });
4278
4749
  }
4279
4750
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -4289,7 +4760,9 @@ async function runEvaluatorsForCase(options) {
4289
4760
  attempt,
4290
4761
  promptInputs,
4291
4762
  now,
4292
- judgeProvider
4763
+ judgeProvider,
4764
+ candidateTrace,
4765
+ candidateTraceSummary
4293
4766
  });
4294
4767
  return { score };
4295
4768
  }
@@ -4305,7 +4778,9 @@ async function runEvaluatorList(options) {
4305
4778
  promptInputs,
4306
4779
  now,
4307
4780
  judgeProvider,
4308
- agentTimeoutMs
4781
+ agentTimeoutMs,
4782
+ candidateTrace,
4783
+ candidateTraceSummary
4309
4784
  } = options;
4310
4785
  const scored = [];
4311
4786
  const evaluatorResults = [];
@@ -4381,6 +4856,12 @@ async function runEvaluatorList(options) {
4381
4856
  cwd: evalFileDir,
4382
4857
  evaluatorFactory: { create: createEvaluator }
4383
4858
  });
4859
+ case "tool_trajectory":
4860
+ return new ToolTrajectoryEvaluator({
4861
+ config: memberConfig
4862
+ });
4863
+ case "expected_messages":
4864
+ return new ExpectedMessagesEvaluator();
4384
4865
  default: {
4385
4866
  const unknownConfig = memberConfig;
4386
4867
  throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -4415,6 +4896,56 @@ async function runEvaluatorList(options) {
4415
4896
  evaluator_results: mapChildResults(score2.evaluatorResults)
4416
4897
  });
4417
4898
  }
4899
+ if (evaluator.type === "tool_trajectory") {
4900
+ const trajectoryEvaluator = new ToolTrajectoryEvaluator({
4901
+ config: evaluator
4902
+ });
4903
+ const score2 = trajectoryEvaluator.evaluate({
4904
+ evalCase,
4905
+ candidate,
4906
+ target,
4907
+ provider,
4908
+ attempt,
4909
+ promptInputs,
4910
+ now,
4911
+ candidateTrace,
4912
+ candidateTraceSummary
4913
+ });
4914
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4915
+ evaluatorResults.push({
4916
+ name: evaluator.name,
4917
+ type: evaluator.type,
4918
+ score: score2.score,
4919
+ verdict: score2.verdict,
4920
+ hits: score2.hits,
4921
+ misses: score2.misses,
4922
+ reasoning: score2.reasoning
4923
+ });
4924
+ }
4925
+ if (evaluator.type === "expected_messages") {
4926
+ const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
4927
+ const score2 = expectedMessagesEvaluator.evaluate({
4928
+ evalCase,
4929
+ candidate,
4930
+ target,
4931
+ provider,
4932
+ attempt,
4933
+ promptInputs,
4934
+ now,
4935
+ candidateTrace,
4936
+ candidateTraceSummary
4937
+ });
4938
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4939
+ evaluatorResults.push({
4940
+ name: evaluator.name,
4941
+ type: evaluator.type,
4942
+ score: score2.score,
4943
+ verdict: score2.verdict,
4944
+ hits: score2.hits,
4945
+ misses: score2.misses,
4946
+ reasoning: score2.reasoning
4947
+ });
4948
+ }
4418
4949
  } catch (error) {
4419
4950
  const message = error instanceof Error ? error.message : String(error);
4420
4951
  const fallbackScore = {
@@ -4756,11 +5287,14 @@ function createAgentKernel() {
4756
5287
  export {
4757
5288
  CodeEvaluator,
4758
5289
  CompositeEvaluator,
5290
+ ExpectedMessagesEvaluator,
4759
5291
  LlmJudgeEvaluator,
4760
5292
  TEST_MESSAGE_ROLES,
5293
+ ToolTrajectoryEvaluator,
4761
5294
  buildDirectoryChain,
4762
5295
  buildPromptInputs,
4763
5296
  buildSearchRoots,
5297
+ computeTraceSummary,
4764
5298
  consumeCodexLogEntries,
4765
5299
  createAgentKernel,
4766
5300
  createProvider,
@@ -4771,14 +5305,18 @@ export {
4771
5305
  generateRubrics,
4772
5306
  getHitCount,
4773
5307
  isEvaluatorKind,
5308
+ isExpectedToolCall,
4774
5309
  isGuidelineFile,
4775
5310
  isJsonObject,
4776
5311
  isJsonValue,
4777
5312
  isTestMessage,
4778
5313
  isTestMessageRole,
5314
+ isTraceEvent,
5315
+ isTraceEventType,
4779
5316
  listTargetNames,
4780
5317
  loadEvalCases,
4781
5318
  normalizeLineEndings,
5319
+ readJsonFile,
4782
5320
  readTargetDefinitions,
4783
5321
  readTestSuiteMetadata,
4784
5322
  readTextFile,