@agentv/core 0.23.0 → 0.25.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -32,11 +32,14 @@ var index_exports = {};
32
32
  __export(index_exports, {
33
33
  CodeEvaluator: () => CodeEvaluator,
34
34
  CompositeEvaluator: () => CompositeEvaluator,
35
+ ExpectedMessagesEvaluator: () => ExpectedMessagesEvaluator,
35
36
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
36
37
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
38
+ ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
37
39
  buildDirectoryChain: () => buildDirectoryChain2,
38
40
  buildPromptInputs: () => buildPromptInputs,
39
41
  buildSearchRoots: () => buildSearchRoots2,
42
+ computeTraceSummary: () => computeTraceSummary,
40
43
  consumeCodexLogEntries: () => consumeCodexLogEntries,
41
44
  createAgentKernel: () => createAgentKernel,
42
45
  createProvider: () => createProvider,
@@ -47,14 +50,18 @@ __export(index_exports, {
47
50
  generateRubrics: () => generateRubrics,
48
51
  getHitCount: () => getHitCount,
49
52
  isEvaluatorKind: () => isEvaluatorKind,
53
+ isExpectedToolCall: () => isExpectedToolCall,
50
54
  isGuidelineFile: () => isGuidelineFile,
51
55
  isJsonObject: () => isJsonObject,
52
56
  isJsonValue: () => isJsonValue,
53
57
  isTestMessage: () => isTestMessage,
54
58
  isTestMessageRole: () => isTestMessageRole,
59
+ isTraceEvent: () => isTraceEvent,
60
+ isTraceEventType: () => isTraceEventType,
55
61
  listTargetNames: () => listTargetNames,
56
62
  loadEvalCases: () => loadEvalCases,
57
63
  normalizeLineEndings: () => normalizeLineEndings,
64
+ readJsonFile: () => readJsonFile,
58
65
  readTargetDefinitions: () => readTargetDefinitions,
59
66
  readTestSuiteMetadata: () => readTestSuiteMetadata,
60
67
  readTextFile: () => readTextFile,
@@ -108,7 +115,14 @@ function isTestMessage(value) {
108
115
  }
109
116
  return candidate.content.every(isJsonObject);
110
117
  }
111
- var EVALUATOR_KIND_VALUES = ["code_judge", "llm_judge", "rubric", "composite"];
118
+ var EVALUATOR_KIND_VALUES = [
119
+ "code_judge",
120
+ "llm_judge",
121
+ "rubric",
122
+ "composite",
123
+ "tool_trajectory",
124
+ "expected_messages"
125
+ ];
112
126
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
113
127
  function isEvaluatorKind(value) {
114
128
  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
@@ -117,6 +131,44 @@ function getHitCount(result) {
117
131
  return result.hits.length;
118
132
  }
119
133
 
134
+ // src/evaluation/trace.ts
135
/**
 * Type guard for trace event type names.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when `value` is one of the recognised event type
 *   strings: "model_step", "tool_call", "tool_result", "message" or "error".
 */
function isTraceEventType(value) {
  if (typeof value !== "string") {
    return false;
  }
  switch (value) {
    case "model_step":
    case "tool_call":
    case "tool_result":
    case "message":
    case "error":
      return true;
    default:
      return false;
  }
}
138
/**
 * Type guard for a single trace event.
 *
 * Only two fields are inspected: `type` must be a recognised trace event type
 * and `timestamp` must be a string. No other field is validated here.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when `value` is a non-null object passing both checks.
 */
function isTraceEvent(value) {
  if (value === null || typeof value !== "object") {
    return false;
  }
  if (!isTraceEventType(value.type)) {
    return false;
  }
  return typeof value.timestamp === "string";
}
145
/**
 * Type guard for an expected tool call entry.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when `value` is a non-null object whose `tool`
 *   property is a string.
 */
function isExpectedToolCall(value) {
  const isObjectLike = value !== null && typeof value === "object";
  return isObjectLike ? typeof value.tool === "string" : false;
}
152
/**
 * Build a compact summary of a trace.
 *
 * @param {Array<object>} trace - Trace events; only `type` and `name` are read.
 * @returns {{eventCount: number, toolNames: string[], toolCallsByName: Object<string, number>, errorCount: number}}
 *   `eventCount` is the trace length, `toolCallsByName` maps each tool name to
 *   the number of "tool_call" events carrying that name, `toolNames` is the
 *   sorted list of those names, and `errorCount` counts "error" events.
 */
function computeTraceSummary(trace) {
  // Count in a Map rather than a plain object: with `{}` a tool named
  // "constructor" or "toString" would read the inherited member instead of
  // starting from zero, and "__proto__" would be dropped entirely.
  const counts = new Map();
  let errorCount = 0;
  for (const event of trace) {
    if (event.type === "tool_call" && event.name) {
      const key = String(event.name);
      counts.set(key, (counts.get(key) ?? 0) + 1);
    }
    if (event.type === "error") {
      errorCount++;
    }
  }
  return {
    eventCount: trace.length,
    toolNames: [...counts.keys()].sort(),
    // Object.fromEntries defines own data properties, so every name
    // (including "__proto__") becomes a regular key on a plain object.
    toolCallsByName: Object.fromEntries(counts),
    errorCount
  };
}
171
+
120
172
  // src/evaluation/yaml-parser.ts
121
173
  var import_promises6 = require("fs/promises");
122
174
  var import_node_path6 = __toESM(require("path"), 1);
@@ -589,6 +641,75 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
589
641
  });
590
642
  continue;
591
643
  }
644
+ if (typeValue === "expected_messages") {
645
+ evaluators.push({
646
+ name,
647
+ type: "expected_messages"
648
+ });
649
+ continue;
650
+ }
651
+ if (typeValue === "tool_trajectory") {
652
+ const mode = asString2(rawEvaluator.mode);
653
+ if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
654
+ logWarning2(
655
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': invalid mode '${mode}' (must be any_order, in_order, or exact)`
656
+ );
657
+ continue;
658
+ }
659
+ const rawMinimums = rawEvaluator.minimums;
660
+ let minimums;
661
+ if (rawMinimums !== void 0) {
662
+ if (!isJsonObject2(rawMinimums)) {
663
+ logWarning2(
664
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': minimums must be an object`
665
+ );
666
+ continue;
667
+ }
668
+ minimums = {};
669
+ for (const [toolName, count] of Object.entries(rawMinimums)) {
670
+ if (typeof count === "number" && count >= 0) {
671
+ minimums[toolName] = count;
672
+ }
673
+ }
674
+ }
675
+ const rawExpected = rawEvaluator.expected;
676
+ let expected;
677
+ if (rawExpected !== void 0) {
678
+ if (!Array.isArray(rawExpected)) {
679
+ logWarning2(
680
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': expected must be an array`
681
+ );
682
+ continue;
683
+ }
684
+ expected = [];
685
+ for (const item of rawExpected) {
686
+ if (isJsonObject2(item) && typeof item.tool === "string") {
687
+ expected.push({ tool: item.tool });
688
+ }
689
+ }
690
+ }
691
+ if (mode === "any_order" && !minimums) {
692
+ logWarning2(
693
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': any_order mode requires minimums`
694
+ );
695
+ continue;
696
+ }
697
+ if ((mode === "in_order" || mode === "exact") && !expected) {
698
+ logWarning2(
699
+ `Skipping tool_trajectory evaluator '${name}' in '${evalId}': ${mode} mode requires expected`
700
+ );
701
+ continue;
702
+ }
703
+ const config = {
704
+ name,
705
+ type: "tool_trajectory",
706
+ mode,
707
+ ...minimums ? { minimums } : {},
708
+ ...expected ? { expected } : {}
709
+ };
710
+ evaluators.push(config);
711
+ continue;
712
+ }
592
713
  const prompt = asString2(rawEvaluator.prompt);
593
714
  let promptPath;
594
715
  if (prompt) {
@@ -842,6 +963,67 @@ ${detailBlock}${ANSI_RESET4}`);
842
963
  console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
843
964
  }
844
965
  }
966
/**
 * Convert raw `expected_messages` entries into output segments.
 *
 * For each message a segment `{ role }` is produced. `tool_calls` is copied
 * over only for assistant messages that define it. String content is kept
 * as-is; array content is processed item by item: non-object items are
 * dropped, `type: "file"` items are resolved against `searchRoots` and
 * replaced by a `{ type, path, text, resolvedPath }` record (CRLF normalised
 * to LF), and every other object item is cloned. Files that cannot be found
 * or read are skipped with a warning.
 *
 * @param {object} options - `messages`, `searchRoots`, `verbose`
 *   (`repoRootPath` is accepted but not used by this function).
 * @returns {Promise<Array<object>>} The processed segments, in message order.
 */
async function processExpectedMessages(options) {
  const { messages, searchRoots, verbose } = options;
  const result = [];
  for (const msg of messages) {
    const entry = { role: msg.role };
    if (msg.role === "assistant" && msg.tool_calls !== void 0) {
      entry.tool_calls = msg.tool_calls;
    }
    const body = msg.content;
    if (typeof body === "string") {
      entry.content = body;
    } else if (Array.isArray(body)) {
      const parts = [];
      for (const part of body) {
        if (!isJsonObject(part)) {
          continue;
        }
        // Anything that is not a file reference is passed through as a copy.
        if (asString3(part.type) !== "file") {
          parts.push(cloneJsonObject(part));
          continue;
        }
        const reference = asString3(part.value);
        if (!reference) {
          continue;
        }
        const { displayPath, resolvedPath, attempted } = await resolveFileReference(
          reference,
          searchRoots
        );
        if (!resolvedPath) {
          let attempts = void 0;
          if (attempted.length) {
            attempts = [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)];
          }
          logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
          continue;
        }
        try {
          const rawText = await (0, import_promises4.readFile)(resolvedPath, "utf8");
          const fileContent = rawText.replace(/\r\n/g, "\n");
          parts.push({
            type: "file",
            path: displayPath,
            text: fileContent,
            resolvedPath: import_node_path4.default.resolve(resolvedPath)
          });
          if (verbose) {
            console.log(` [Expected Output File] Found: ${displayPath}`);
            console.log(` Resolved to: ${resolvedPath}`);
          }
        } catch (error) {
          logWarning3(
            `Could not read expected output file ${resolvedPath}: ${error.message}`
          );
        }
      }
      entry.content = parts;
    }
    result.push(entry);
  }
  return result;
}
845
1027
 
846
1028
  // src/evaluation/formatting/prompt-builder.ts
847
1029
  var import_promises5 = require("fs/promises");
@@ -1146,12 +1328,10 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1146
1328
  messageType: "input",
1147
1329
  verbose
1148
1330
  });
1149
- const outputSegments = hasExpectedMessages ? await processMessages({
1331
+ const outputSegments = hasExpectedMessages ? await processExpectedMessages({
1150
1332
  messages: expectedMessages,
1151
1333
  searchRoots,
1152
1334
  repoRootPath,
1153
- guidelinePatterns,
1154
- messageType: "output",
1155
1335
  verbose
1156
1336
  }) : [];
1157
1337
  const codeSnippets = extractCodeBlocks(inputSegments);
@@ -1278,6 +1458,10 @@ async function readTextFile(filePath) {
1278
1458
  const content = await (0, import_promises7.readFile)(filePath, "utf8");
1279
1459
  return normalizeLineEndings(content);
1280
1460
  }
1461
/**
 * Read a UTF-8 file and parse it as JSON.
 *
 * @param {string} filePath - Path of the JSON file to read.
 * @returns {Promise<unknown>} The parsed JSON value.
 * @throws {SyntaxError} When the content is not valid JSON; the message names
 *   the file and the original error is attached as `cause`.
 *   Errors from reading the file propagate unchanged.
 */
async function readJsonFile(filePath) {
  const content = await (0, import_promises7.readFile)(filePath, "utf8");
  // A leading byte-order mark is kept by readFile and rejected by JSON.parse,
  // so drop it before parsing.
  const text = content.charCodeAt(0) === 0xfeff ? content.slice(1) : content;
  try {
    return JSON.parse(text);
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new SyntaxError(`Failed to parse JSON file '${filePath}': ${reason}`, { cause: error });
  }
}
1281
1465
  async function findGitRoot(startPath) {
1282
1466
  let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
1283
1467
  const root = import_node_path7.default.parse(currentDir).root;
@@ -1786,9 +1970,11 @@ var CliProvider = class {
1786
1970
  const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
1787
1971
  throw new Error(message);
1788
1972
  }
1789
- const responseText = await this.readAndCleanupOutputFile(outputFilePath);
1973
+ const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
1974
+ const parsed = this.parseOutputContent(responseContent);
1790
1975
  return {
1791
- text: responseText,
1976
+ text: parsed.text,
1977
+ trace: parsed.trace,
1792
1978
  raw: {
1793
1979
  command: renderedCommand,
1794
1980
  stderr: result.stderr,
@@ -1798,6 +1984,31 @@ var CliProvider = class {
1798
1984
  }
1799
1985
  };
1800
1986
  }
1987
+ /**
1988
+ * Parse output content from CLI.
1989
+ * If the content is valid JSON with a 'text' field, extract text and optional trace.
1990
+ * Otherwise, treat the entire content as plain text.
1991
+ */
1992
+ parseOutputContent(content) {
1993
+ try {
1994
+ const parsed = JSON.parse(content);
1995
+ if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
1996
+ const obj = parsed;
1997
+ const text = typeof obj.text === "string" ? obj.text : String(obj.text);
1998
+ const trace = this.parseTrace(obj.trace);
1999
+ return { text, trace };
2000
+ }
2001
+ } catch {
2002
+ }
2003
+ return { text: content };
2004
+ }
2005
+ parseTrace(trace) {
2006
+ if (!Array.isArray(trace)) {
2007
+ return void 0;
2008
+ }
2009
+ const validEvents = trace.filter(isTraceEvent);
2010
+ return validEvents.length > 0 ? validEvents : void 0;
2011
+ }
1801
2012
  async readAndCleanupOutputFile(filePath) {
1802
2013
  try {
1803
2014
  const content = await readTextFile(filePath);
@@ -2784,6 +2995,7 @@ var MockProvider = class {
2784
2995
  delayMs;
2785
2996
  delayMinMs;
2786
2997
  delayMaxMs;
2998
+ trace;
2787
2999
  constructor(targetName, config) {
2788
3000
  this.id = `mock:${targetName}`;
2789
3001
  this.targetName = targetName;
@@ -2791,6 +3003,7 @@ var MockProvider = class {
2791
3003
  this.delayMs = config.delayMs ?? 0;
2792
3004
  this.delayMinMs = config.delayMinMs ?? 0;
2793
3005
  this.delayMaxMs = config.delayMaxMs ?? 0;
3006
+ this.trace = config.trace;
2794
3007
  }
2795
3008
  async invoke(request) {
2796
3009
  const delay = this.calculateDelay();
@@ -2802,7 +3015,8 @@ var MockProvider = class {
2802
3015
  raw: {
2803
3016
  question: request.question,
2804
3017
  guidelines: request.guidelines
2805
- }
3018
+ },
3019
+ trace: this.trace
2806
3020
  };
2807
3021
  }
2808
3022
  calculateDelay() {
@@ -2816,6 +3030,7 @@ var MockProvider = class {
2816
3030
  };
2817
3031
 
2818
3032
  // src/evaluation/providers/targets.ts
3033
+ var import_node_path11 = __toESM(require("path"), 1);
2819
3034
  var import_zod = require("zod");
2820
3035
  var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
2821
3036
  "PROMPT",
@@ -2831,7 +3046,7 @@ var BASE_TARGET_SCHEMA = import_zod.z.object({
2831
3046
  judge_target: import_zod.z.string().optional(),
2832
3047
  workers: import_zod.z.number().int().min(1).optional()
2833
3048
  }).passthrough();
2834
- var DEFAULT_AZURE_API_VERSION = "2024-10-01-preview";
3049
+ var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
2835
3050
  function normalizeAzureApiVersion(value) {
2836
3051
  if (!value) {
2837
3052
  return DEFAULT_AZURE_API_VERSION;
@@ -2875,7 +3090,7 @@ function resolveRetryConfig(target) {
2875
3090
  retryableStatusCodes
2876
3091
  };
2877
3092
  }
2878
- function resolveTargetDefinition(definition, env = process.env) {
3093
+ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
2879
3094
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
2880
3095
  const provider = parsed.provider.toLowerCase();
2881
3096
  const providerBatching = resolveOptionalBoolean(
@@ -2948,7 +3163,7 @@ function resolveTargetDefinition(definition, env = process.env) {
2948
3163
  judgeTarget: parsed.judge_target,
2949
3164
  workers: parsed.workers,
2950
3165
  providerBatching,
2951
- config: resolveCliConfig(parsed, env)
3166
+ config: resolveCliConfig(parsed, env, evalFilePath)
2952
3167
  };
2953
3168
  default:
2954
3169
  throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
@@ -3066,7 +3281,8 @@ function normalizeCodexLogFormat(value) {
3066
3281
  }
3067
3282
  function resolveMockConfig(target) {
3068
3283
  const response = typeof target.response === "string" ? target.response : void 0;
3069
- return { response };
3284
+ const trace = Array.isArray(target.trace) ? target.trace : void 0;
3285
+ return { response, trace };
3070
3286
  }
3071
3287
  function resolveVSCodeConfig(target, env, insiders) {
3072
3288
  const workspaceTemplateEnvVar = resolveOptionalLiteralString(
@@ -3098,15 +3314,18 @@ function resolveVSCodeConfig(target, env, insiders) {
3098
3314
  workspaceTemplate
3099
3315
  };
3100
3316
  }
3101
- function resolveCliConfig(target, env) {
3317
+ function resolveCliConfig(target, env, evalFilePath) {
3102
3318
  const commandTemplateSource = target.command_template ?? target.commandTemplate;
3103
3319
  const filesFormat = resolveOptionalLiteralString(
3104
3320
  target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
3105
3321
  );
3106
- const cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
3322
+ let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
3107
3323
  allowLiteral: true,
3108
3324
  optionalEnv: true
3109
3325
  });
3326
+ if (!cwd && evalFilePath) {
3327
+ cwd = import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath));
3328
+ }
3110
3329
  const timeoutMs = resolveTimeoutMs(
3111
3330
  target.timeout_seconds ?? target.timeoutSeconds,
3112
3331
  `${target.name} timeout`
@@ -3224,17 +3443,15 @@ function resolveOptionalString(source, env, description, options) {
3224
3443
  if (envVarMatch) {
3225
3444
  const varName = envVarMatch[1];
3226
3445
  const envValue = env[varName];
3227
- if (envValue !== void 0) {
3228
- if (envValue.trim().length === 0) {
3229
- throw new Error(`Environment variable '${varName}' for ${description} is empty`);
3230
- }
3231
- return envValue;
3232
- }
3233
3446
  const optionalEnv = options?.optionalEnv ?? false;
3234
- if (optionalEnv) {
3235
- return void 0;
3447
+ if (envValue === void 0 || envValue.trim().length === 0) {
3448
+ if (optionalEnv) {
3449
+ return void 0;
3450
+ }
3451
+ const status = envValue === void 0 ? "is not set" : "is empty";
3452
+ throw new Error(`Environment variable '${varName}' required for ${description} ${status}`);
3236
3453
  }
3237
- throw new Error(`Environment variable '${varName}' required for ${description} is not set`);
3454
+ return envValue;
3238
3455
  }
3239
3456
  const allowLiteral = options?.allowLiteral ?? false;
3240
3457
  if (!allowLiteral) {
@@ -3346,7 +3563,7 @@ function resolveOptionalNumberArray(source, description) {
3346
3563
  }
3347
3564
 
3348
3565
  // src/evaluation/providers/vscode.ts
3349
- var import_node_path11 = __toESM(require("path"), 1);
3566
+ var import_node_path12 = __toESM(require("path"), 1);
3350
3567
  var import_subagent = require("subagent");
3351
3568
 
3352
3569
  // src/evaluation/providers/vscode-templates.ts
@@ -3516,7 +3733,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
3516
3733
  return "";
3517
3734
  }
3518
3735
  const buildList = (files) => files.map((absolutePath) => {
3519
- const fileName = import_node_path11.default.basename(absolutePath);
3736
+ const fileName = import_node_path12.default.basename(absolutePath);
3520
3737
  const fileUri = pathToFileUri2(absolutePath);
3521
3738
  return `* [${fileName}](${fileUri})`;
3522
3739
  });
@@ -3541,8 +3758,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
3541
3758
  }
3542
3759
  const unique = /* @__PURE__ */ new Map();
3543
3760
  for (const attachment of attachments) {
3544
- const absolutePath = import_node_path11.default.resolve(attachment);
3545
- const normalized = absolutePath.split(import_node_path11.default.sep).join("/");
3761
+ const absolutePath = import_node_path12.default.resolve(attachment);
3762
+ const normalized = absolutePath.split(import_node_path12.default.sep).join("/");
3546
3763
  if (isGuidelineFile(normalized, guidelinePatterns)) {
3547
3764
  if (!unique.has(absolutePath)) {
3548
3765
  unique.set(absolutePath, absolutePath);
@@ -3557,7 +3774,7 @@ function collectAttachmentFiles(attachments) {
3557
3774
  }
3558
3775
  const unique = /* @__PURE__ */ new Map();
3559
3776
  for (const attachment of attachments) {
3560
- const absolutePath = import_node_path11.default.resolve(attachment);
3777
+ const absolutePath = import_node_path12.default.resolve(attachment);
3561
3778
  if (!unique.has(absolutePath)) {
3562
3779
  unique.set(absolutePath, absolutePath);
3563
3780
  }
@@ -3565,7 +3782,7 @@ function collectAttachmentFiles(attachments) {
3565
3782
  return Array.from(unique.values());
3566
3783
  }
3567
3784
  function pathToFileUri2(filePath) {
3568
- const absolutePath = import_node_path11.default.isAbsolute(filePath) ? filePath : import_node_path11.default.resolve(filePath);
3785
+ const absolutePath = import_node_path12.default.isAbsolute(filePath) ? filePath : import_node_path12.default.resolve(filePath);
3569
3786
  const normalizedPath = absolutePath.replace(/\\/g, "/");
3570
3787
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
3571
3788
  return `file:///${normalizedPath}`;
@@ -3578,7 +3795,7 @@ function normalizeAttachments(attachments) {
3578
3795
  }
3579
3796
  const deduped = /* @__PURE__ */ new Set();
3580
3797
  for (const attachment of attachments) {
3581
- deduped.add(import_node_path11.default.resolve(attachment));
3798
+ deduped.add(import_node_path12.default.resolve(attachment));
3582
3799
  }
3583
3800
  return Array.from(deduped);
3584
3801
  }
@@ -3587,7 +3804,7 @@ function mergeAttachments(all) {
3587
3804
  for (const list of all) {
3588
3805
  if (!list) continue;
3589
3806
  for (const inputFile of list) {
3590
- deduped.add(import_node_path11.default.resolve(inputFile));
3807
+ deduped.add(import_node_path12.default.resolve(inputFile));
3591
3808
  }
3592
3809
  }
3593
3810
  return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -3636,7 +3853,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
3636
3853
  // src/evaluation/providers/targets-file.ts
3637
3854
  var import_node_fs4 = require("fs");
3638
3855
  var import_promises10 = require("fs/promises");
3639
- var import_node_path12 = __toESM(require("path"), 1);
3856
+ var import_node_path13 = __toESM(require("path"), 1);
3640
3857
  var import_yaml3 = require("yaml");
3641
3858
  function isRecord(value) {
3642
3859
  return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -3673,7 +3890,7 @@ async function fileExists3(filePath) {
3673
3890
  }
3674
3891
  }
3675
3892
  async function readTargetDefinitions(filePath) {
3676
- const absolutePath = import_node_path12.default.resolve(filePath);
3893
+ const absolutePath = import_node_path13.default.resolve(filePath);
3677
3894
  if (!await fileExists3(absolutePath)) {
3678
3895
  throw new Error(`targets.yaml not found at ${absolutePath}`);
3679
3896
  }
@@ -4121,6 +4338,251 @@ function substituteVariables(template, variables) {
4121
4338
  return variables[varName] ?? match;
4122
4339
  });
4123
4340
  }
4341
/**
 * Evaluator that scores the candidate's tool-call trace against the
 * configured expectations. `config.mode` selects the strategy:
 *   - "any_order": every tool in `config.minimums` must be called at least
 *     the given number of times (counts come from the trace summary).
 *   - "in_order":  the tools in `config.expected` must appear in the trace in
 *     that order; other calls may occur in between.
 *   - "exact":     the named tool calls must match `config.expected`
 *     position by position, with no missing or extra calls.
 */
var ToolTrajectoryEvaluator = class {
  kind = "tool_trajectory";
  config;
  constructor(options) {
    this.config = options.config;
  }
  /**
   * Run the evaluation for one candidate.
   * Fails with a single miss when the context has no trace or no trace summary.
   */
  evaluate(context) {
    const { candidateTrace, candidateTraceSummary } = context;
    if (!candidateTrace || !candidateTraceSummary) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available for evaluation"],
        expectedAspectCount: 1
      };
    }
    switch (this.config.mode) {
      case "any_order":
        return this.evaluateAnyOrder(candidateTraceSummary);
      case "in_order":
        return this.evaluateInOrder(candidateTrace);
      case "exact":
        return this.evaluateExact(candidateTrace);
      default:
        return {
          score: 0,
          verdict: "fail",
          hits: [],
          misses: [`Unknown mode: ${this.config.mode}`],
          expectedAspectCount: 1
        };
    }
  }
  /**
   * Check per-tool minimum call counts; score is the fraction of tools that
   * reached their minimum.
   */
  evaluateAnyOrder(summary) {
    const minimums = this.config.minimums ?? {};
    const toolNames = Object.keys(minimums);
    if (toolNames.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool requirements specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const counts = summary.toolCallsByName;
    const hits = [];
    const misses = [];
    for (const toolName of toolNames) {
      const required = minimums[toolName];
      // Only own entries count: a tool named "constructor" or "toString" must
      // not pick up the member inherited from Object.prototype.
      const recorded = Object.prototype.hasOwnProperty.call(counts, toolName) ? counts[toolName] : void 0;
      const actual = recorded ?? 0;
      if (actual >= required) {
        hits.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      } else {
        misses.push(`${toolName}: called ${actual} times (required \u2265${required})`);
      }
    }
    const score = hits.length / toolNames.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: toolNames.length
    };
  }
  /**
   * Check that the expected tools occur in order (as a subsequence of the
   * named tool calls); score is the fraction of expected tools found.
   */
  evaluateInOrder(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    // The cursor only moves forward, so each expected tool must be found
    // after the previous match.
    let actualIndex = 0;
    for (let i = 0; i < expected.length; i++) {
      const expectedTool = expected[i].tool;
      let found = false;
      while (actualIndex < actualToolCalls.length) {
        if (actualToolCalls[actualIndex].name === expectedTool) {
          hits.push(`Found ${expectedTool} at position ${actualIndex}`);
          actualIndex++;
          found = true;
          break;
        }
        actualIndex++;
      }
      if (!found) {
        misses.push(`Expected ${expectedTool} at position ${i}, not found in remaining trace`);
      }
    }
    const score = hits.length / expected.length;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
  /**
   * Check that the named tool calls equal the expected sequence exactly.
   * Missing and extra calls both lower the score.
   */
  evaluateExact(trace) {
    const expected = this.config.expected ?? [];
    if (expected.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool sequence specified"],
        misses: [],
        expectedAspectCount: 0
      };
    }
    const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
    const hits = [];
    const misses = [];
    if (actualToolCalls.length !== expected.length) {
      misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
    }
    const checkLength = Math.min(expected.length, actualToolCalls.length);
    for (let i = 0; i < checkLength; i++) {
      const expectedTool = expected[i].tool;
      const actualTool = actualToolCalls[i].name;
      if (actualTool === expectedTool) {
        hits.push(`Position ${i}: ${expectedTool} \u2713`);
      } else {
        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
      }
    }
    for (let i = checkLength; i < expected.length; i++) {
      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
    }
    // Divide by the longer of the two sequences so that surplus tool calls
    // cannot yield a perfect score in "exact" mode.
    const comparedLength = Math.max(expected.length, actualToolCalls.length);
    const score = hits.length / comparedLength;
    return {
      score,
      verdict: scoreToVerdict(score),
      hits,
      misses,
      expectedAspectCount: expected.length
    };
  }
};
4487
/**
 * Evaluator that checks the candidate's tool calls against the `tool_calls`
 * declared on assistant entries of the eval case's expected segments.
 * Expected and actual calls are compared position by position.
 */
var ExpectedMessagesEvaluator = class {
  kind = "expected_messages";
  /**
   * Run the evaluation for one candidate.
   * Passes trivially when no tool calls are expected; fails when tool calls
   * are expected but the trace is missing or empty.
   */
  evaluate(context) {
    const { candidateTrace, evalCase } = context;
    const expectedSegments = evalCase.expected_segments;
    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
    if (expectedToolCalls.length === 0) {
      return {
        score: 1,
        verdict: "pass",
        hits: ["No tool_calls specified in expected_messages"],
        misses: [],
        expectedAspectCount: 1
      };
    }
    if (!candidateTrace || candidateTrace.length === 0) {
      return {
        score: 0,
        verdict: "fail",
        hits: [],
        misses: ["No trace available to validate tool_calls"],
        expectedAspectCount: expectedToolCalls.length
      };
    }
    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
  }
  /**
   * Collect `{ tool, input }` pairs from the `tool_calls` arrays of assistant
   * segments. Entries without a string `tool` are ignored.
   */
  extractExpectedToolCalls(segments) {
    if (!segments) {
      return [];
    }
    const toolCalls = [];
    for (const segment of segments) {
      const role = segment.role;
      const segmentToolCalls = segment.tool_calls;
      if (role === "assistant" && Array.isArray(segmentToolCalls)) {
        for (const tc of segmentToolCalls) {
          if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
            const toolCall = tc;
            toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
          }
        }
      }
    }
    return toolCalls;
  }
  /**
   * Compare expected and actual tool calls index by index. The tool name must
   * match, and the input must be deeply equal when the expectation defines one.
   * Score is the fraction of expected calls that matched.
   */
  validateToolCalls(expected, actual) {
    const hits = [];
    const misses = [];
    for (let i = 0; i < expected.length; i++) {
      const expectedCall = expected[i];
      const actualCall = actual[i];
      if (!actualCall) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
        );
        continue;
      }
      if (actualCall.name !== expectedCall.tool) {
        misses.push(
          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
        );
        continue;
      }
      if (expectedCall.input !== void 0) {
        if (!this.deepEquals(expectedCall.input, actualCall.input)) {
          misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
          continue;
        }
      }
      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
    }
    // Guard against division by zero when called with no expectations.
    const totalChecks = expected.length || 1;
    const score = hits.length / totalChecks;
    return {
      score,
      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
      hits,
      misses,
      expectedAspectCount: totalChecks
    };
  }
  /**
   * Structural equality for JSON-like values (primitives, arrays, plain objects).
   */
  deepEquals(a, b) {
    if (a === b) return true;
    if (typeof a !== typeof b) return false;
    if (typeof a !== "object" || a === null || b === null) return false;
    if (Array.isArray(a) && Array.isArray(b)) {
      if (a.length !== b.length) return false;
      return a.every((val, i) => this.deepEquals(val, b[i]));
    }
    if (Array.isArray(a) || Array.isArray(b)) return false;
    const aObj = a;
    const bObj = b;
    const aKeys = Object.keys(aObj);
    const bKeys = Object.keys(bObj);
    if (aKeys.length !== bKeys.length) return false;
    // Require the key to exist on `b`: comparing values alone would treat
    // { x: undefined } and { y: undefined } as equal.
    return aKeys.every(
      (key) => Object.prototype.hasOwnProperty.call(bObj, key) && this.deepEquals(aObj[key], bObj[key])
    );
  }
};
4124
4586
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
4125
4587
  {{EVALUATOR_RESULTS_JSON}}
4126
4588
 
@@ -4347,7 +4809,7 @@ var CompositeEvaluator = class {
4347
4809
  // src/evaluation/orchestrator.ts
4348
4810
  var import_node_crypto2 = require("crypto");
4349
4811
  var import_promises11 = require("fs/promises");
4350
- var import_node_path13 = __toESM(require("path"), 1);
4812
+ var import_node_path14 = __toESM(require("path"), 1);
4351
4813
 
4352
4814
  // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
4353
4815
  var Node = class {
@@ -4554,7 +5016,7 @@ async function runEvaluation(options) {
4554
5016
  if (!definition) {
4555
5017
  return void 0;
4556
5018
  }
4557
- const resolved = resolveTargetDefinition(definition, envLookup);
5019
+ const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
4558
5020
  resolvedTargetsByName.set(name, resolved);
4559
5021
  return resolved;
4560
5022
  };
@@ -4868,6 +5330,17 @@ async function runEvalCase(options) {
4868
5330
  if (cacheKey && cache && !cachedResponse) {
4869
5331
  await cache.set(cacheKey, providerResponse);
4870
5332
  }
5333
+ let candidateTrace = providerResponse.trace;
5334
+ if (!candidateTrace && providerResponse.traceRef) {
5335
+ try {
5336
+ const rawTrace = await readJsonFile(providerResponse.traceRef);
5337
+ if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
5338
+ candidateTrace = rawTrace;
5339
+ }
5340
+ } catch {
5341
+ }
5342
+ }
5343
+ const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
4871
5344
  try {
4872
5345
  return await evaluateCandidate({
4873
5346
  evalCase,
@@ -4879,7 +5352,9 @@ async function runEvalCase(options) {
4879
5352
  nowFn,
4880
5353
  attempt,
4881
5354
  judgeProvider,
4882
- agentTimeoutMs
5355
+ agentTimeoutMs,
5356
+ candidateTrace,
5357
+ candidateTraceSummary
4883
5358
  });
4884
5359
  } catch (error) {
4885
5360
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
@@ -4896,7 +5371,9 @@ async function evaluateCandidate(options) {
4896
5371
  nowFn,
4897
5372
  attempt,
4898
5373
  judgeProvider,
4899
- agentTimeoutMs
5374
+ agentTimeoutMs,
5375
+ candidateTrace,
5376
+ candidateTraceSummary
4900
5377
  } = options;
4901
5378
  const gradeTimestamp = nowFn();
4902
5379
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -4909,7 +5386,9 @@ async function evaluateCandidate(options) {
4909
5386
  promptInputs,
4910
5387
  now: gradeTimestamp,
4911
5388
  judgeProvider,
4912
- agentTimeoutMs
5389
+ agentTimeoutMs,
5390
+ candidateTrace,
5391
+ candidateTraceSummary
4913
5392
  });
4914
5393
  const completedAt = nowFn();
4915
5394
  let agentProviderRequest;
@@ -4948,7 +5427,8 @@ async function evaluateCandidate(options) {
4948
5427
  agent_provider_request: agentProviderRequest,
4949
5428
  lm_provider_request: lmProviderRequest,
4950
5429
  evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
4951
- evaluator_results: evaluatorResults
5430
+ evaluator_results: evaluatorResults,
5431
+ trace_summary: candidateTraceSummary
4952
5432
  };
4953
5433
  }
4954
5434
  async function runEvaluatorsForCase(options) {
@@ -4962,7 +5442,9 @@ async function runEvaluatorsForCase(options) {
4962
5442
  promptInputs,
4963
5443
  now,
4964
5444
  judgeProvider,
4965
- agentTimeoutMs
5445
+ agentTimeoutMs,
5446
+ candidateTrace,
5447
+ candidateTraceSummary
4966
5448
  } = options;
4967
5449
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
4968
5450
  return runEvaluatorList({
@@ -4976,7 +5458,9 @@ async function runEvaluatorsForCase(options) {
4976
5458
  promptInputs,
4977
5459
  now,
4978
5460
  judgeProvider,
4979
- agentTimeoutMs
5461
+ agentTimeoutMs,
5462
+ candidateTrace,
5463
+ candidateTraceSummary
4980
5464
  });
4981
5465
  }
4982
5466
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -4992,7 +5476,9 @@ async function runEvaluatorsForCase(options) {
4992
5476
  attempt,
4993
5477
  promptInputs,
4994
5478
  now,
4995
- judgeProvider
5479
+ judgeProvider,
5480
+ candidateTrace,
5481
+ candidateTraceSummary
4996
5482
  });
4997
5483
  return { score };
4998
5484
  }
@@ -5008,7 +5494,9 @@ async function runEvaluatorList(options) {
5008
5494
  promptInputs,
5009
5495
  now,
5010
5496
  judgeProvider,
5011
- agentTimeoutMs
5497
+ agentTimeoutMs,
5498
+ candidateTrace,
5499
+ candidateTraceSummary
5012
5500
  } = options;
5013
5501
  const scored = [];
5014
5502
  const evaluatorResults = [];
@@ -5067,7 +5555,7 @@ async function runEvaluatorList(options) {
5067
5555
  });
5068
5556
  }
5069
5557
  if (evaluator.type === "composite") {
5070
- const evalFileDir = evalCase.guideline_paths[0] ? import_node_path13.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
5558
+ const evalFileDir = evalCase.guideline_paths[0] ? import_node_path14.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
5071
5559
  const createEvaluator = (memberConfig) => {
5072
5560
  switch (memberConfig.type) {
5073
5561
  case "llm_judge":
@@ -5084,6 +5572,12 @@ async function runEvaluatorList(options) {
5084
5572
  cwd: evalFileDir,
5085
5573
  evaluatorFactory: { create: createEvaluator }
5086
5574
  });
5575
+ case "tool_trajectory":
5576
+ return new ToolTrajectoryEvaluator({
5577
+ config: memberConfig
5578
+ });
5579
+ case "expected_messages":
5580
+ return new ExpectedMessagesEvaluator();
5087
5581
  default: {
5088
5582
  const unknownConfig = memberConfig;
5089
5583
  throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -5118,6 +5612,56 @@ async function runEvaluatorList(options) {
5118
5612
  evaluator_results: mapChildResults(score2.evaluatorResults)
5119
5613
  });
5120
5614
  }
5615
+ if (evaluator.type === "tool_trajectory") {
5616
+ const trajectoryEvaluator = new ToolTrajectoryEvaluator({
5617
+ config: evaluator
5618
+ });
5619
+ const score2 = trajectoryEvaluator.evaluate({
5620
+ evalCase,
5621
+ candidate,
5622
+ target,
5623
+ provider,
5624
+ attempt,
5625
+ promptInputs,
5626
+ now,
5627
+ candidateTrace,
5628
+ candidateTraceSummary
5629
+ });
5630
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
5631
+ evaluatorResults.push({
5632
+ name: evaluator.name,
5633
+ type: evaluator.type,
5634
+ score: score2.score,
5635
+ verdict: score2.verdict,
5636
+ hits: score2.hits,
5637
+ misses: score2.misses,
5638
+ reasoning: score2.reasoning
5639
+ });
5640
+ }
5641
+ if (evaluator.type === "expected_messages") {
5642
+ const expectedMessagesEvaluator = new ExpectedMessagesEvaluator();
5643
+ const score2 = expectedMessagesEvaluator.evaluate({
5644
+ evalCase,
5645
+ candidate,
5646
+ target,
5647
+ provider,
5648
+ attempt,
5649
+ promptInputs,
5650
+ now,
5651
+ candidateTrace,
5652
+ candidateTraceSummary
5653
+ });
5654
+ scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
5655
+ evaluatorResults.push({
5656
+ name: evaluator.name,
5657
+ type: evaluator.type,
5658
+ score: score2.score,
5659
+ verdict: score2.verdict,
5660
+ hits: score2.hits,
5661
+ misses: score2.misses,
5662
+ reasoning: score2.reasoning
5663
+ });
5664
+ }
5121
5665
  } catch (error) {
5122
5666
  const message = error instanceof Error ? error.message : String(error);
5123
5667
  const fallbackScore = {
@@ -5240,8 +5784,8 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
5240
5784
  async function dumpPrompt(directory, evalCase, promptInputs) {
5241
5785
  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
5242
5786
  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
5243
- const filePath = import_node_path13.default.resolve(directory, filename);
5244
- await (0, import_promises11.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
5787
+ const filePath = import_node_path14.default.resolve(directory, filename);
5788
+ await (0, import_promises11.mkdir)(import_node_path14.default.dirname(filePath), { recursive: true });
5245
5789
  const payload = {
5246
5790
  eval_id: evalCase.id,
5247
5791
  question: promptInputs.question,
@@ -5460,11 +6004,14 @@ function createAgentKernel() {
5460
6004
  0 && (module.exports = {
5461
6005
  CodeEvaluator,
5462
6006
  CompositeEvaluator,
6007
+ ExpectedMessagesEvaluator,
5463
6008
  LlmJudgeEvaluator,
5464
6009
  TEST_MESSAGE_ROLES,
6010
+ ToolTrajectoryEvaluator,
5465
6011
  buildDirectoryChain,
5466
6012
  buildPromptInputs,
5467
6013
  buildSearchRoots,
6014
+ computeTraceSummary,
5468
6015
  consumeCodexLogEntries,
5469
6016
  createAgentKernel,
5470
6017
  createProvider,
@@ -5475,14 +6022,18 @@ function createAgentKernel() {
5475
6022
  generateRubrics,
5476
6023
  getHitCount,
5477
6024
  isEvaluatorKind,
6025
+ isExpectedToolCall,
5478
6026
  isGuidelineFile,
5479
6027
  isJsonObject,
5480
6028
  isJsonValue,
5481
6029
  isTestMessage,
5482
6030
  isTestMessageRole,
6031
+ isTraceEvent,
6032
+ isTraceEventType,
5483
6033
  listTargetNames,
5484
6034
  loadEvalCases,
5485
6035
  normalizeLineEndings,
6036
+ readJsonFile,
5486
6037
  readTargetDefinitions,
5487
6038
  readTestSuiteMetadata,
5488
6039
  readTextFile,