@agentv/core 0.23.0 → 0.26.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5,10 +5,11 @@ import {
5
5
  findGitRoot,
6
6
  isAgentProvider,
7
7
  normalizeLineEndings,
8
+ readJsonFile,
8
9
  readTextFile,
9
10
  resolveFileReference,
10
11
  resolveTargetDefinition
11
- } from "./chunk-B2J23S7D.js";
12
+ } from "./chunk-NDEN3H2B.js";
12
13
 
13
14
  // src/evaluation/types.ts
14
15
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -51,7 +52,14 @@ function isTestMessage(value) {
51
52
  }
52
53
  return candidate.content.every(isJsonObject);
53
54
  }
54
- var EVALUATOR_KIND_VALUES = ["code_judge", "llm_judge", "rubric", "composite"];
55
+ var EVALUATOR_KIND_VALUES = [
56
+ "code_judge",
57
+ "llm_judge",
58
+ "rubric",
59
+ "composite",
60
+ "tool_trajectory",
61
+ "expected_messages"
62
+ ];
55
63
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
56
64
  function isEvaluatorKind(value) {
57
65
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
@@ -60,6 +68,44 @@ function getHitCount(result) {
60
68
  return result.hits.length;
61
69
  }
62
70
 
71
+ // src/evaluation/trace.ts
72
/**
 * Type guard for trace event type names.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` only when `value` is exactly one of the strings
 *   "model_step", "tool_call", "tool_result", "message" or "error".
 */
function isTraceEventType(value) {
  // `switch` compares with strict equality, so non-string values never match.
  switch (value) {
    case "model_step":
    case "tool_call":
    case "tool_result":
    case "message":
    case "error":
      return true;
    default:
      return false;
  }
}
75
/**
 * Type guard for a single trace event.
 *
 * A value qualifies when it is a non-null object whose `type` passes
 * `isTraceEventType` and whose `timestamp` is a string. No other fields are
 * inspected here.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} Whether `value` has the minimal trace-event shape.
 */
function isTraceEvent(value) {
  const isNonNullObject = value !== null && typeof value === "object";
  return (
    isNonNullObject &&
    isTraceEventType(value.type) &&
    typeof value.timestamp === "string"
  );
}
82
/**
 * Type guard for an expected tool call entry.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` when `value` is a non-null object with a string
 *   `tool` property; nothing else about the object is checked.
 */
function isExpectedToolCall(value) {
  return value !== null && typeof value === "object" && typeof value.tool === "string";
}
89
/**
 * Builds aggregate statistics for a trace.
 *
 * @param {Array<{type: string, name?: string}>} trace - Trace events. Only
 *   `type` and `name` are read; `trace.length` is reported as the event count.
 * @returns {{eventCount: number, toolNames: string[], toolCallsByName: Record<string, number>, errorCount: number}}
 *   `toolCallsByName` maps each tool name to the number of "tool_call" events
 *   carrying that (truthy) name, `toolNames` is the sorted list of those
 *   names, and `errorCount` is the number of "error" events.
 */
function computeTraceSummary(trace) {
  // Tally in a prototype-less object: with a plain `{}` a tool named like an
  // Object.prototype member ("toString", "constructor", ...) would start from
  // the inherited function instead of 0, and "__proto__" would be dropped.
  const tally = Object.create(null);
  let errorCount = 0;
  for (const event of trace) {
    if (event.type === "tool_call" && event.name) {
      tally[event.name] = (tally[event.name] ?? 0) + 1;
    }
    if (event.type === "error") {
      errorCount++;
    }
  }
  // Copy into an ordinary object so callers keep receiving a regular record.
  const toolCallsByName = { ...tally };
  const toolNames = Object.keys(toolCallsByName).sort();
  return {
    eventCount: trace.length,
    toolNames,
    toolCallsByName,
    errorCount
  };
}
108
+
63
109
  // src/evaluation/yaml-parser.ts
64
110
  import { readFile as readFile5 } from "node:fs/promises";
65
111
  import path6 from "node:path";
@@ -409,6 +455,7 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
409
455
  logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
410
456
  continue;
411
457
  }
458
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
412
459
  const cwd = asString2(rawEvaluator.cwd);
413
460
  let resolvedCwd;
414
461
  if (cwd) {
@@ -429,7 +476,8 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
429
476
  type: "code",
430
477
  script,
431
478
  cwd,
432
- resolvedCwd
479
+ resolvedCwd,
480
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
433
481
  });
434
482
  continue;
435
483
  }
@@ -524,14 +572,89 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
524
572
  ...promptPath2 ? { promptPath: promptPath2 } : {}
525
573
  };
526
574
  }
575
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
527
576
  evaluators.push({
528
577
  name,
529
578
  type: "composite",
530
579
  evaluators: memberEvaluators,
531
- aggregator
580
+ aggregator,
581
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
582
+ });
583
+ continue;
584
+ }
585
+ if (typeValue === "expected_messages") {
586
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
587
+ evaluators.push({
588
+ name,
589
+ type: "expected_messages",
590
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
532
591
  });
533
592
  continue;
534
593
  }
594
+ if (typeValue === "tool_trajectory") {
595
+ const mode = asString2(rawEvaluator.mode);
596
+ if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
597
+ logWarning2(
598
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
599
+ );
600
+ continue;
601
+ }
602
+ const rawMinimums = rawEvaluator.minimums;
603
+ let minimums;
604
+ if (rawMinimums !== void 0) {
605
+ if (!isJsonObject2(rawMinimums)) {
606
+ logWarning2(
607
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
608
+ );
609
+ continue;
610
+ }
611
+ minimums = {};
612
+ for (const [toolName, count] of Object.entries(rawMinimums)) {
613
+ if (typeof count === "number" && count >= 0) {
614
+ minimums[toolName] = count;
615
+ }
616
+ }
617
+ }
618
+ const rawExpected = rawEvaluator.expected;
619
+ let expected;
620
+ if (rawExpected !== void 0) {
621
+ if (!Array.isArray(rawExpected)) {
622
+ logWarning2(
623
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
624
+ );
625
+ continue;
626
+ }
627
+ expected = [];
628
+ for (const item of rawExpected) {
629
+ if (isJsonObject2(item) && typeof item.tool === "string") {
630
+ expected.push({ tool: item.tool });
631
+ }
632
+ }
633
+ }
634
+ if (mode === "any_order" && !minimums) {
635
+ logWarning2(
636
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
637
+ );
638
+ continue;
639
+ }
640
+ if ((mode === "in_order" || mode === "exact") && !expected) {
641
+ logWarning2(
642
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
643
+ );
644
+ continue;
645
+ }
646
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
647
+ const config = {
648
+ name,
649
+ type: "tool_trajectory",
650
+ mode,
651
+ ...minimums ? { minimums } : {},
652
+ ...expected ? { expected } : {},
653
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
654
+ };
655
+ evaluators.push(config);
656
+ continue;
657
+ }
535
658
  const prompt = asString2(rawEvaluator.prompt);
536
659
  let promptPath;
537
660
  if (prompt) {
@@ -568,19 +691,23 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
568
691
  logWarning2(`Skipping rubric evaluator '${name}' in '${evalId}': no valid rubrics found`);
569
692
  continue;
570
693
  }
694
+ const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
571
695
  evaluators.push({
572
696
  name,
573
697
  type: "llm_judge",
574
- rubrics: parsedRubrics
698
+ rubrics: parsedRubrics,
699
+ ...weight2 !== void 0 ? { weight: weight2 } : {}
575
700
  });
576
701
  continue;
577
702
  }
703
+ const weight = validateWeight(rawEvaluator.weight, name, evalId);
578
704
  evaluators.push({
579
705
  name,
580
706
  type: "llm_judge",
581
707
  prompt,
582
708
  promptPath,
583
- ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {}
709
+ ...parsedRubrics && parsedRubrics.length > 0 ? { rubrics: parsedRubrics } : {},
710
+ ...weight !== void 0 ? { weight } : {}
584
711
  });
585
712
  }
586
713
  return evaluators.length > 0 ? evaluators : void 0;
@@ -610,6 +737,27 @@ ${detailBlock}${ANSI_RESET3}`);
610
737
  console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
611
738
  }
612
739
  }
740
/**
 * Validates an evaluator's optional `weight` setting.
 *
 * @param {unknown} rawWeight - Value taken from the raw evaluator definition.
 * @param {string} evaluatorName - Evaluator name, used in error messages.
 * @param {string} evalId - Eval case id, used in error messages.
 * @returns {number | undefined} `undefined` when no weight was supplied,
 *   otherwise the weight itself.
 * @throws {Error} When the weight is not a number, is not finite, or is negative.
 */
function validateWeight(rawWeight, evaluatorName, evalId) {
  if (rawWeight === undefined) {
    return undefined;
  }
  let problem;
  if (typeof rawWeight !== "number") {
    problem = `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be a number`;
  } else if (!Number.isFinite(rawWeight)) {
    problem = `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be finite (got ${rawWeight})`;
  } else if (rawWeight < 0) {
    problem = `Invalid weight for evaluator '${evaluatorName}' in '${evalId}': must be non-negative (got ${rawWeight})`;
  }
  if (problem !== undefined) {
    throw new Error(problem);
  }
  return rawWeight;
}
613
761
 
614
762
  // src/evaluation/loaders/message-processor.ts
615
763
  import { readFile as readFile3 } from "node:fs/promises";
@@ -785,6 +933,67 @@ ${detailBlock}${ANSI_RESET4}`);
785
933
  console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
786
934
  }
787
935
  }
936
/**
 * Converts raw `expected_messages` entries into output segments.
 *
 * Each message yields one segment carrying its `role`. For assistant messages
 * a defined `tool_calls` value is copied through as-is. String content is kept
 * verbatim. Array content is processed item by item:
 *   - items that are not JSON objects are dropped;
 *   - `type: "file"` items are resolved via `resolveFileReference2` against
 *     `searchRoots` and replaced by a `{ type, path, text, resolvedPath }`
 *     entry holding the file text (CRLF normalized to LF); unresolvable or
 *     unreadable files are skipped with a warning;
 *   - every other object item is cloned with `cloneJsonObject`.
 * Content of any other kind leaves the segment without a `content` field.
 *
 * @param {{messages: Array<object>, searchRoots: unknown, verbose?: boolean}} options
 *   Processing options. Any additional properties on `options` are ignored here.
 * @returns {Promise<Array<object>>} One segment per input message, in order.
 */
async function processExpectedMessages(options) {
  const { messages, searchRoots, verbose } = options;
  const segments = [];
  for (const message of messages) {
    const segment = {
      role: message.role
    };
    if (message.role === "assistant" && message.tool_calls !== void 0) {
      segment.tool_calls = message.tool_calls;
    }
    const content = message.content;
    if (typeof content === "string") {
      segment.content = content;
    } else if (Array.isArray(content)) {
      const processedContent = [];
      for (const rawSegment of content) {
        if (!isJsonObject(rawSegment)) {
          continue;
        }
        const segmentType = asString3(rawSegment.type);
        if (segmentType !== "file") {
          processedContent.push(cloneJsonObject(rawSegment));
          continue;
        }
        const rawValue = asString3(rawSegment.value);
        if (!rawValue) {
          continue;
        }
        const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
          rawValue,
          searchRoots
        );
        if (!resolvedPath) {
          const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
          logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
          continue;
        }
        try {
          const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
          processedContent.push({
            type: "file",
            path: displayPath,
            text: fileContent,
            resolvedPath: path4.resolve(resolvedPath)
          });
          if (verbose) {
            console.log(`  [Expected Output File] Found: ${displayPath}`);
            console.log(`    Resolved to: ${resolvedPath}`);
          }
        } catch (error) {
          // A thrown value is not guaranteed to be an Error; avoid logging "undefined".
          const reason = error instanceof Error ? error.message : String(error);
          logWarning3(`Could not read expected output file ${resolvedPath}: ${reason}`);
        }
      }
      segment.content = processedContent;
    }
    segments.push(segment);
  }
  return segments;
}
788
997
 
789
998
  // src/evaluation/formatting/prompt-builder.ts
790
999
  import { readFile as readFile4 } from "node:fs/promises";
@@ -1089,12 +1298,10 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1089
1298
  messageType: "input",
1090
1299
  verbose
1091
1300
  });
1092
- const outputSegments = hasExpectedMessages ? await processMessages({
1301
+ const outputSegments = hasExpectedMessages ? await processExpectedMessages({
1093
1302
  messages: expectedMessages,
1094
1303
  searchRoots,
1095
1304
  repoRootPath,
1096
- guidelinePatterns,
1097
- messageType: "output",
1098
1305
  verbose
1099
1306
  }) : [];
1100
1307
  const codeSnippets = extractCodeBlocks(inputSegments);
@@ -1618,9 +1825,11 @@ var CliProvider = class {
1618
1825
  const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
1619
1826
  throw new Error(message);
1620
1827
  }
1621
- const responseText = await this.readAndCleanupOutputFile(outputFilePath);
1828
+ const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
1829
+ const parsed = this.parseOutputContent(responseContent);
1622
1830
  return {
1623
- text: responseText,
1831
+ text: parsed.text,
1832
+ trace: parsed.trace,
1624
1833
  raw: {
1625
1834
  command: renderedCommand,
1626
1835
  stderr: result.stderr,
@@ -1630,6 +1839,31 @@ var CliProvider = class {
1630
1839
  }
1631
1840
  };
1632
1841
  }
1842
+ /**
1843
+ * Parse output content from CLI.
1844
+ * If the content is valid JSON with a 'text' field, extract text and optional trace.
1845
+ * Otherwise, treat the entire content as plain text.
1846
+ */
1847
+ parseOutputContent(content) {
1848
+ try {
1849
+ const parsed = JSON.parse(content);
1850
+ if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
1851
+ const obj = parsed;
1852
+ const text = typeof obj.text === "string" ? obj.text : String(obj.text);
1853
+ const trace = this.parseTrace(obj.trace);
1854
+ return { text, trace };
1855
+ }
1856
+ } catch {
1857
+ }
1858
+ return { text: content };
1859
+ }
1860
+ parseTrace(trace) {
1861
+ if (!Array.isArray(trace)) {
1862
+ return void 0;
1863
+ }
1864
+ const validEvents = trace.filter(isTraceEvent);
1865
+ return validEvents.length > 0 ? validEvents : void 0;
1866
+ }
1633
1867
  async readAndCleanupOutputFile(filePath) {
1634
1868
  try {
1635
1869
  const content = await readTextFile(filePath);
@@ -2616,6 +2850,7 @@ var MockProvider = class {
2616
2850
  delayMs;
2617
2851
  delayMinMs;
2618
2852
  delayMaxMs;
2853
+ trace;
2619
2854
  constructor(targetName, config) {
2620
2855
  this.id = `mock:${targetName}`;
2621
2856
  this.targetName = targetName;
@@ -2623,6 +2858,7 @@ var MockProvider = class {
2623
2858
  this.delayMs = config.delayMs ?? 0;
2624
2859
  this.delayMinMs = config.delayMinMs ?? 0;
2625
2860
  this.delayMaxMs = config.delayMaxMs ?? 0;
2861
+ this.trace = config.trace;
2626
2862
  }
2627
2863
  async invoke(request) {
2628
2864
  const delay = this.calculateDelay();
@@ -2634,7 +2870,8 @@ var MockProvider = class {
2634
2870
  raw: {
2635
2871
  question: request.question,
2636
2872
  guidelines: request.guidelines
2637
- }
2873
+ },
2874
+ trace: this.trace
2638
2875
  };
2639
2876
  }
2640
2877
  calculateDelay() {
@@ -3306,9 +3543,11 @@ var CodeEvaluator = class {
3306
3543
  expected_outcome: context.evalCase.expected_outcome,
3307
3544
  reference_answer: context.evalCase.reference_answer,
3308
3545
  candidate_answer: context.candidate,
3309
- guideline_paths: context.evalCase.guideline_paths,
3310
- input_files: context.evalCase.file_paths,
3311
- input_segments: context.evalCase.input_segments
3546
+ guideline_files: context.evalCase.guideline_paths,
3547
+ input_files: context.evalCase.file_paths.filter(
3548
+ (path13) => !context.evalCase.guideline_paths.includes(path13)
3549
+ ),
3550
+ input_messages: context.evalCase.input_messages
3312
3551
  },
3313
3552
  null,
3314
3553
  2
@@ -3428,6 +3667,251 @@ function substituteVariables(template, variables) {
3428
3667
  return variables[varName] ?? match;
3429
3668
  });
3430
3669
  }
3670
/**
 * Scores the tool calls in a candidate's trace against a `tool_trajectory`
 * evaluator config.
 *
 * Supported `config.mode` values:
 *   - "any_order": every tool in `config.minimums` must have been called at
 *     least the given number of times (order is ignored).
 *   - "in_order": the tools in `config.expected` must appear in the trace in
 *     that relative order; other calls may be interleaved.
 *   - "exact": the trace's tool calls must match `config.expected`
 *     position by position, with the same number of calls.
 *
 * Every result has the shape
 * `{ score, verdict, hits, misses, expectedAspectCount }`.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;
  /**
   * @param {{config: object}} options - `options.config` is the evaluator
   *   config (`mode`, and optionally `minimums` / `expected`).
   */
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Dispatches to the scorer for the configured mode.
   *
   * @param {{candidateTrace?: Array<object>, candidateTraceSummary?: object}} context
   *   Evaluation context; only the trace and its summary are read.
   * @returns {object} A failing result when either the trace or the summary is
   *   missing or the mode is unknown; otherwise the mode-specific result.
   */
  evaluate(context) {
    const { candidateTrace, candidateTraceSummary } = context;
    if (!candidateTrace || !candidateTraceSummary) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available for evaluation"],
        expectedAspectCount: 1
      };
    }
    switch (this.config.mode) {
      case "any_order":
        return this.evaluateAnyOrder(candidateTraceSummary);
      case "in_order":
        return this.evaluateInOrder(candidateTrace);
      case "exact":
        return this.evaluateExact(candidateTrace);
      default:
        return {
          score: 0,
          verdict: "fail",
          hits: [],
          misses: [`Unknown mode: ${this.config.mode}`],
          expectedAspectCount: 1
        };
    }
  }
  /**
   * Checks per-tool minimum call counts against `summary.toolCallsByName`.
   * The score is the fraction of configured tools that met their minimum.
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const hits = [];
    const misses = [];
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      // Own-property check so a tool named like an Object.prototype member
      // (e.g. "toString") is not read as an inherited function.
      const recorded = Object.prototype.hasOwnProperty.call(summary.toolCallsByName, toolName) ? summary.toolCallsByName[toolName] : void 0;
      const actual = recorded ?? 0;
      if (actual >= required) {
        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      } else {
        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      }
    }
    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }
  /**
   * Checks that the expected tools occur in the trace in the given relative
   * order. The trace is scanned once, left to right; each expected tool
   * consumes trace entries up to and including its first match.
   */
  evaluateInOrder(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    let actualIndex = 0;
    for (let i = 0; i < expected.length; i++) {
      const expectedTool = expected[i].tool;
      let found = false;
      while (actualIndex < actualToolCalls.length) {
        if (actualToolCalls[actualIndex].name === expectedTool) {
          hits.push(`Found ${expectedTool} at position ${actualIndex}`);
          actualIndex++;
          found = true;
          break;
        }
        actualIndex++;
      }
      if (!found) {
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
      }
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
  /**
   * Compares the trace's tool calls with the expected sequence position by
   * position, and also requires the two sequences to have the same length.
   */
  evaluateExact(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    if (actualToolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
    }
    const checkLength = Math.min(expected.length, actualToolCalls.length);
    for (let i = 0; i < checkLength; i++) {
      const expectedTool = expected[i].tool;
      const actualTool = actualToolCalls[i].name;
      if (actualTool === expectedTool) {
        hits.push(`Position ${i}: ${expectedTool} \u2713`);
      } else {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      }
    }
    for (let i = checkLength; i < expected.length; i++) {
      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
    }
    // Divide by the longer sequence so that surplus tool calls lower the score;
    // dividing by expected.length alone gave a perfect score to a trace that
    // matched the expected prefix and then made extra calls.
    const comparedPositions = Math.max(expected.length, actualToolCalls.length);
    const score = hits.length / comparedPositions;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
3816
/**
 * Validates the tool calls in a candidate's trace against the `tool_calls`
 * declared on assistant entries of the eval case's `expected_segments`.
 *
 * Expected and actual calls are compared position by position: the tool name
 * must match and, when the expected call declares an `input`, the actual
 * call's `input` must be deeply equal to it. Trace calls beyond the last
 * expected position are not examined.
 */
var ExpectedMessagesEvaluator = class {
  kind = "expected_messages";
  /**
   * @param {{candidateTrace?: Array<object>, evalCase: {expected_segments?: Array<object>}}} context
   *   Evaluation context; only the trace and the expected segments are read.
   * @returns {object} `{ score, verdict, hits, misses, expectedAspectCount }`.
   *   Passes trivially when no tool calls are expected and fails outright when
   *   tool calls are expected but the trace is missing or empty.
   */
  evaluate(context) {
    const { candidateTrace, evalCase } = context;
    const expectedSegments = evalCase.expected_segments;
    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
    if (expectedToolCalls.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool_calls specified in expected_messages"],
        misses: [],
        expectedAspectCount: 1
      };
    }
    if (!candidateTrace || candidateTrace.length === 0) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available to validate tool_calls"],
        expectedAspectCount: expectedToolCalls.length
      };
    }
    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
  }
  /**
   * Collects `{ tool, input }` pairs from the `tool_calls` arrays of assistant
   * segments, in order. Entries without a string `tool` are ignored.
   */
  extractExpectedToolCalls(segments) {
    if (!segments) {
      return [];
    }
    const toolCalls = [];
    for (const segment of segments) {
      const role = segment.role;
      const segmentToolCalls = segment.tool_calls;
      if (role === "assistant" && Array.isArray(segmentToolCalls)) {
        for (const tc of segmentToolCalls) {
          if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
            const toolCall = tc;
            toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
          }
        }
      }
    }
    return toolCalls;
  }
  /**
   * Compares expected and actual tool calls index by index and scores the
   * fraction of expected calls that matched.
   */
  validateToolCalls(expected, actual) {
    const hits = [];
    const misses = [];
    for (let i = 0; i < expected.length; i++) {
      const expectedCall = expected[i];
      const actualCall = actual[i];
      if (!actualCall) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
        );
        continue;
      }
      if (actualCall.name !== expectedCall.tool) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
        );
        continue;
      }
      // Input is only compared when the expectation declares one.
      if (expectedCall.input !== void 0) {
        if (!this.deepEquals(expectedCall.input, actualCall.input)) {
          misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
          continue;
        }
      }
      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
    }
    // Guard against dividing by zero when called with no expectations.
    const totalChecks = expected.length || 1;
    const score = hits.length / totalChecks;
    return {
      score,
      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
      hits,
      misses,
      expectedAspectCount: totalChecks
    };
  }
  /**
   * Structural equality for JSON-like values: primitives by `===`, arrays
   * element-wise, objects by own enumerable keys and their values.
   */
  deepEquals(a, b) {
    if (a === b) {
      return true;
    }
    if (typeof a !== typeof b) {
      return false;
    }
    if (typeof a !== "object" || a === null || b === null) {
      return false;
    }
    const aIsArray = Array.isArray(a);
    if (aIsArray !== Array.isArray(b)) {
      return false;
    }
    if (aIsArray) {
      return a.length === b.length && a.every((val, i) => this.deepEquals(val, b[i]));
    }
    const aKeys = Object.keys(a);
    if (aKeys.length !== Object.keys(b).length) {
      return false;
    }
    // Each key must exist on `b` too: equal key counts plus a lookup that
    // yields `undefined` on both sides is not enough to prove the same key set.
    return aKeys.every(
      (key) => Object.prototype.hasOwnProperty.call(b, key) && this.deepEquals(a[key], b[key])
    );
  }
};
3431
3915
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
3432
3916
  {{EVALUATOR_RESULTS_JSON}}
3433
3917
 
@@ -3851,7 +4335,7 @@ async function runEvaluation(options) {
3851
4335
  if (!definition) {
3852
4336
  return void 0;
3853
4337
  }
3854
- const resolved = resolveTargetDefinition(definition, envLookup);
4338
+ const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
3855
4339
  resolvedTargetsByName.set(name, resolved);
3856
4340
  return resolved;
3857
4341
  };
@@ -4165,6 +4649,17 @@ async function runEvalCase(options) {
4165
4649
  if (cacheKey && cache && !cachedResponse) {
4166
4650
  await cache.set(cacheKey, providerResponse);
4167
4651
  }
4652
+ let candidateTrace = providerResponse.trace;
4653
+ if (!candidateTrace && providerResponse.traceRef) {
4654
+ try {
4655
+ const rawTrace = await readJsonFile(providerResponse.traceRef);
4656
+ if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
4657
+ candidateTrace = rawTrace;
4658
+ }
4659
+ } catch {
4660
+ }
4661
+ }
4662
+ const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
4168
4663
  try {
4169
4664
  return await evaluateCandidate({
4170
4665
  evalCase,
@@ -4176,7 +4671,9 @@ async function runEvalCase(options) {
4176
4671
  nowFn,
4177
4672
  attempt,
4178
4673
  judgeProvider,
4179
- agentTimeoutMs
4674
+ agentTimeoutMs,
4675
+ candidateTrace,
4676
+ candidateTraceSummary
4180
4677
  });
4181
4678
  } catch (error) {
4182
4679
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
@@ -4193,7 +4690,9 @@ async function evaluateCandidate(options) {
4193
4690
  nowFn,
4194
4691
  attempt,
4195
4692
  judgeProvider,
4196
- agentTimeoutMs
4693
+ agentTimeoutMs,
4694
+ candidateTrace,
4695
+ candidateTraceSummary
4197
4696
  } = options;
4198
4697
  const gradeTimestamp = nowFn();
4199
4698
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -4206,7 +4705,9 @@ async function evaluateCandidate(options) {
4206
4705
  promptInputs,
4207
4706
  now: gradeTimestamp,
4208
4707
  judgeProvider,
4209
- agentTimeoutMs
4708
+ agentTimeoutMs,
4709
+ candidateTrace,
4710
+ candidateTraceSummary
4210
4711
  });
4211
4712
  const completedAt = nowFn();
4212
4713
  let agentProviderRequest;
@@ -4219,14 +4720,12 @@ async function evaluateCandidate(options) {
4219
4720
  } else {
4220
4721
  if (promptInputs.chatPrompt) {
4221
4722
  lmProviderRequest = {
4222
- chat_prompt: promptInputs.chatPrompt,
4223
- guideline_paths: evalCase.guideline_paths
4723
+ chat_prompt: promptInputs.chatPrompt
4224
4724
  };
4225
4725
  } else {
4226
4726
  lmProviderRequest = {
4227
4727
  question: promptInputs.question,
4228
- guidelines: promptInputs.guidelines,
4229
- guideline_paths: evalCase.guideline_paths
4728
+ guidelines: promptInputs.guidelines
4230
4729
  };
4231
4730
  }
4232
4731
  }
@@ -4245,7 +4744,8 @@ async function evaluateCandidate(options) {
4245
4744
  agent_provider_request: agentProviderRequest,
4246
4745
  lm_provider_request: lmProviderRequest,
4247
4746
  evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
4248
- evaluator_results: evaluatorResults
4747
+ evaluator_results: evaluatorResults,
4748
+ trace_summary: candidateTraceSummary
4249
4749
  };
4250
4750
  }
4251
4751
  async function runEvaluatorsForCase(options) {
@@ -4259,7 +4759,9 @@ async function runEvaluatorsForCase(options) {
4259
4759
  promptInputs,
4260
4760
  now,
4261
4761
  judgeProvider,
4262
- agentTimeoutMs
4762
+ agentTimeoutMs,
4763
+ candidateTrace,
4764
+ candidateTraceSummary
4263
4765
  } = options;
4264
4766
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
4265
4767
  return runEvaluatorList({
@@ -4273,7 +4775,9 @@ async function runEvaluatorsForCase(options) {
4273
4775
  promptInputs,
4274
4776
  now,
4275
4777
  judgeProvider,
4276
- agentTimeoutMs
4778
+ agentTimeoutMs,
4779
+ candidateTrace,
4780
+ candidateTraceSummary
4277
4781
  });
4278
4782
  }
4279
4783
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -4289,7 +4793,9 @@ async function runEvaluatorsForCase(options) {
4289
4793
  attempt,
4290
4794
  promptInputs,
4291
4795
  now,
4292
- judgeProvider
4796
+ judgeProvider,
4797
+ candidateTrace,
4798
+ candidateTraceSummary
4293
4799
  });
4294
4800
  return { score };
4295
4801
  }
@@ -4305,7 +4811,9 @@ async function runEvaluatorList(options) {
4305
4811
  promptInputs,
4306
4812
  now,
4307
4813
  judgeProvider,
4308
- agentTimeoutMs
4814
+ agentTimeoutMs,
4815
+ candidateTrace,
4816
+ candidateTraceSummary
4309
4817
  } = options;
4310
4818
  const scored = [];
4311
4819
  const evaluatorResults = [];
@@ -4324,11 +4832,13 @@ async function runEvaluatorList(options) {
4324
4832
  now,
4325
4833
  judgeProvider
4326
4834
  });
4327
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4835
+ const weight = evaluator.weight ?? 1;
4836
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
4328
4837
  evaluatorResults.push({
4329
4838
  name: evaluator.name,
4330
4839
  type: evaluator.type,
4331
4840
  score: score2.score,
4841
+ weight,
4332
4842
  verdict: score2.verdict,
4333
4843
  hits: score2.hits,
4334
4844
  misses: score2.misses,
@@ -4351,11 +4861,13 @@ async function runEvaluatorList(options) {
4351
4861
  promptInputs,
4352
4862
  now
4353
4863
  });
4354
- scored.push({ score: score2, name: evaluator.name, type: "code_judge" });
4864
+ const weight = evaluator.weight ?? 1;
4865
+ scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
4355
4866
  evaluatorResults.push({
4356
4867
  name: evaluator.name,
4357
4868
  type: "code_judge",
4358
4869
  score: score2.score,
4870
+ weight,
4359
4871
  verdict: score2.verdict,
4360
4872
  hits: score2.hits,
4361
4873
  misses: score2.misses,
@@ -4381,6 +4893,12 @@ async function runEvaluatorList(options) {
4381
4893
  cwd: evalFileDir,
4382
4894
  evaluatorFactory: { create: createEvaluator }
4383
4895
  });
4896
+ case "tool_trajectory":
4897
+ return new ToolTrajectoryEvaluator({
4898
+ config: memberConfig
4899
+ });
4900
+ case "expected_messages":
4901
+ return new ExpectedMessagesEvaluator();
4384
4902
  default: {
4385
4903
  const unknownConfig = memberConfig;
4386
4904
  throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -4402,11 +4920,13 @@ async function runEvaluatorList(options) {
4402
4920
  now,
4403
4921
  judgeProvider
4404
4922
  });
4405
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
4923
+ const weight = evaluator.weight ?? 1;
4924
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
4406
4925
  evaluatorResults.push({
4407
4926
  name: evaluator.name,
4408
4927
  type: evaluator.type,
4409
4928
  score: score2.score,
4929
+ weight,
4410
4930
  verdict: score2.verdict,
4411
4931
  hits: score2.hits,
4412
4932
  misses: score2.misses,
@@ -4415,6 +4935,60 @@ async function runEvaluatorList(options) {
4415
4935
  evaluator_results: mapChildResults(score2.evaluatorResults)
4416
4936
  });
4417
4937
  }
4938
+ if (evaluator.type === "tool_trajectory") {
4939
+ const trajectoryEvaluator = new ToolTrajectoryEvaluator({
4940
+ config: evaluator
4941
+ });
4942
+ const score2 = trajectoryEvaluator.evaluate({
4943
+ evalCase,
4944
+ candidate,
4945
+ target,
4946
+ provider,
4947
+ attempt,
4948
+ promptInputs,
4949
+ now,
4950
+ candidateTrace,
4951
+ candidateTraceSummary
4952
+ });
4953
+ const weight = evaluator.weight ?? 1;
4954
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
4955
+ evaluatorResults.push({
4956
+ name: evaluator.name,
4957
+ type: evaluator.type,
4958
+ score: score2.score,
4959
+ weight,
4960
+ verdict: score2.verdict,
4961
+ hits: score2.hits,
4962
+ misses: score2.misses,
4963
+ reasoning: score2.reasoning
4964
+ });
4965
+ }
4966
+ if (evaluator.type === "expected_messages") {
4967
+ const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
4968
+ const score2 = expectedMessagesEvaluator.evaluate({
4969
+ evalCase,
4970
+ candidate,
4971
+ target,
4972
+ provider,
4973
+ attempt,
4974
+ promptInputs,
4975
+ now,
4976
+ candidateTrace,
4977
+ candidateTraceSummary
4978
+ });
4979
+ const weight = evaluator.weight ?? 1;
4980
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
4981
+ evaluatorResults.push({
4982
+ name: evaluator.name,
4983
+ type: evaluator.type,
4984
+ score: score2.score,
4985
+ weight,
4986
+ verdict: score2.verdict,
4987
+ hits: score2.hits,
4988
+ misses: score2.misses,
4989
+ reasoning: score2.reasoning
4990
+ });
4991
+ }
4418
4992
  } catch (error) {
4419
4993
  const message = error instanceof Error ? error.message : String(error);
4420
4994
  const fallbackScore = {
@@ -4426,15 +5000,18 @@ async function runEvaluatorList(options) {
4426
5000
  reasoning: message
4427
5001
  };
4428
5002
  const resultType = evaluator.type === "code" ? "code_judge" : evaluator.type;
5003
+ const weight = evaluator.weight ?? 1;
4429
5004
  scored.push({
4430
5005
  score: fallbackScore,
4431
5006
  name: evaluator.name ?? "unknown",
4432
- type: resultType ?? "llm_judge"
5007
+ type: resultType ?? "llm_judge",
5008
+ weight
4433
5009
  });
4434
5010
  evaluatorResults.push({
4435
5011
  name: evaluator.name ?? "unknown",
4436
5012
  type: resultType ?? "llm_judge",
4437
5013
  score: 0,
5014
+ weight,
4438
5015
  verdict: "fail",
4439
5016
  hits: [],
4440
5017
  misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
@@ -4442,7 +5019,9 @@ async function runEvaluatorList(options) {
4442
5019
  });
4443
5020
  }
4444
5021
  }
4445
- const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
5022
+ const aggregateScore = scored.length > 0 ? computeWeightedMean(
5023
+ scored.map((entry) => ({ score: entry.score.score, weight: entry.weight }))
5024
+ ) : 0;
4446
5025
  const hits = scored.flatMap((entry) => entry.score.hits);
4447
5026
  const misses = scored.flatMap((entry) => entry.score.misses);
4448
5027
  const expectedAspectCount = scored.reduce(
@@ -4668,6 +5247,16 @@ function mapChildResults(children) {
4668
5247
  evaluator_results: mapChildResults(child.evaluatorResults)
4669
5248
  }));
4670
5249
  }
5250
/**
 * Weighted arithmetic mean of evaluator scores.
 *
 * @param {Iterable<{score: number, weight?: number | null}>} entries - Scores
 *   with optional weights; a nullish weight counts as 1.
 * @returns {number} Sum of `score * weight` divided by the sum of weights, or
 *   0 when the total weight is not positive (including when there are no entries).
 */
function computeWeightedMean(entries) {
  let numerator = 0;
  let denominator = 0;
  for (const { score, weight } of entries) {
    const effectiveWeight = weight ?? 1;
    denominator += effectiveWeight;
    numerator += score * effectiveWeight;
  }
  if (denominator > 0) {
    return numerator / denominator;
  }
  return 0;
}
4671
5260
 
4672
5261
  // src/evaluation/generators/rubric-generator.ts
4673
5262
  import { generateText as generateText3 } from "ai";
@@ -4756,11 +5345,14 @@ function createAgentKernel() {
4756
5345
  export {
4757
5346
  CodeEvaluator,
4758
5347
  CompositeEvaluator,
5348
+ ExpectedMessagesEvaluator,
4759
5349
  LlmJudgeEvaluator,
4760
5350
  TEST_MESSAGE_ROLES,
5351
+ ToolTrajectoryEvaluator,
4761
5352
  buildDirectoryChain,
4762
5353
  buildPromptInputs,
4763
5354
  buildSearchRoots,
5355
+ computeTraceSummary,
4764
5356
  consumeCodexLogEntries,
4765
5357
  createAgentKernel,
4766
5358
  createProvider,
@@ -4771,14 +5363,18 @@ export {
4771
5363
  generateRubrics,
4772
5364
  getHitCount,
4773
5365
  isEvaluatorKind,
5366
+ isExpectedToolCall,
4774
5367
  isGuidelineFile,
4775
5368
  isJsonObject,
4776
5369
  isJsonValue,
4777
5370
  isTestMessage,
4778
5371
  isTestMessageRole,
5372
+ isTraceEvent,
5373
+ isTraceEventType,
4779
5374
  listTargetNames,
4780
5375
  loadEvalCases,
4781
5376
  normalizeLineEndings,
5377
+ readJsonFile,
4782
5378
  readTargetDefinitions,
4783
5379
  readTestSuiteMetadata,
4784
5380
  readTextFile,