@agentv/core 1.3.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -54,8 +54,6 @@ __export(index_exports, {
54
54
  isJsonValue: () => isJsonValue,
55
55
  isTestMessage: () => isTestMessage,
56
56
  isTestMessageRole: () => isTestMessageRole,
57
- isTraceEvent: () => isTraceEvent,
58
- isTraceEventType: () => isTraceEventType,
59
57
  listTargetNames: () => listTargetNames,
60
58
  loadEvalCases: () => loadEvalCases,
61
59
  normalizeLineEndings: () => normalizeLineEndings,
@@ -135,33 +133,22 @@ function getHitCount(result) {
135
133
  }
136
134
 
137
135
  // src/evaluation/trace.ts
138
- function isTraceEventType(value) {
139
- return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
140
- }
141
- function isTraceEvent(value) {
142
- if (typeof value !== "object" || value === null) {
143
- return false;
144
- }
145
- const candidate = value;
146
- return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
147
- }
148
- function computeTraceSummary(trace) {
136
+ function computeTraceSummary(messages) {
149
137
  const toolCallCounts = {};
150
- let errorCount = 0;
151
- for (const event of trace) {
152
- if (event.type === "tool_call" && event.name) {
153
- toolCallCounts[event.name] = (toolCallCounts[event.name] ?? 0) + 1;
154
- }
155
- if (event.type === "error") {
156
- errorCount++;
138
+ let totalToolCalls = 0;
139
+ for (const message of messages) {
140
+ if (!message.toolCalls) continue;
141
+ for (const toolCall of message.toolCalls) {
142
+ toolCallCounts[toolCall.tool] = (toolCallCounts[toolCall.tool] ?? 0) + 1;
143
+ totalToolCalls++;
157
144
  }
158
145
  }
159
146
  const toolNames = Object.keys(toolCallCounts).sort();
160
147
  return {
161
- eventCount: trace.length,
148
+ eventCount: totalToolCalls,
162
149
  toolNames,
163
150
  toolCallsByName: toolCallCounts,
164
- errorCount
151
+ errorCount: 0
165
152
  };
166
153
  }
167
154
 
@@ -437,7 +424,8 @@ var TEMPLATE_VARIABLES = {
437
424
  QUESTION: "question",
438
425
  EXPECTED_OUTCOME: "expected_outcome",
439
426
  REFERENCE_ANSWER: "reference_answer",
440
- INPUT_MESSAGES: "input_messages"
427
+ INPUT_MESSAGES: "input_messages",
428
+ OUTPUT_MESSAGES: "output_messages"
441
429
  };
442
430
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
443
431
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
@@ -1320,16 +1308,16 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1320
1308
  }) : [];
1321
1309
  const codeSnippets = extractCodeBlocks(inputSegments);
1322
1310
  let referenceAnswer = "";
1323
- if (outputSegments.length > 1) {
1324
- referenceAnswer = JSON.stringify(outputSegments, null, 2);
1325
- } else if (outputSegments.length === 1) {
1326
- const singleMessage = outputSegments[0];
1327
- if (typeof singleMessage.content === "string") {
1328
- referenceAnswer = singleMessage.content;
1329
- } else if (singleMessage.content) {
1330
- referenceAnswer = JSON.stringify(singleMessage, null, 2);
1331
- } else if (singleMessage.tool_calls) {
1332
- referenceAnswer = JSON.stringify(singleMessage, null, 2);
1311
+ if (outputSegments.length > 0) {
1312
+ const lastMessage = outputSegments[outputSegments.length - 1];
1313
+ const content = lastMessage.content;
1314
+ const toolCalls = lastMessage.tool_calls;
1315
+ if (typeof content === "string") {
1316
+ referenceAnswer = content;
1317
+ } else if (content !== void 0 && content !== null) {
1318
+ referenceAnswer = JSON.stringify(content, null, 2);
1319
+ } else if (toolCalls !== void 0 && toolCalls !== null) {
1320
+ referenceAnswer = JSON.stringify(toolCalls, null, 2);
1333
1321
  }
1334
1322
  }
1335
1323
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
@@ -1772,11 +1760,11 @@ async function invokeModel(options) {
1772
1760
  return mapResponse(result);
1773
1761
  }
1774
1762
  function mapResponse(result) {
1763
+ const content = result.text ?? "";
1775
1764
  return {
1776
- text: result.text ?? "",
1777
- reasoning: result.reasoningText ?? void 0,
1778
1765
  raw: result,
1779
- usage: toJsonObject(result.totalUsage ?? result.usage)
1766
+ usage: toJsonObject(result.totalUsage ?? result.usage),
1767
+ outputMessages: [{ role: "assistant", content }]
1780
1768
  };
1781
1769
  }
1782
1770
  function toJsonObject(value) {
@@ -1929,6 +1917,7 @@ var CliProvider = class {
1929
1917
  config;
1930
1918
  runCommand;
1931
1919
  verbose;
1920
+ keepTempFiles;
1932
1921
  healthcheckPromise;
1933
1922
  constructor(targetName, config, runner = defaultCommandRunner) {
1934
1923
  this.targetName = targetName;
@@ -1936,6 +1925,7 @@ var CliProvider = class {
1936
1925
  this.config = config;
1937
1926
  this.runCommand = runner;
1938
1927
  this.verbose = config.verbose ?? false;
1928
+ this.keepTempFiles = config.keepTempFiles ?? false;
1939
1929
  }
1940
1930
  async invoke(request) {
1941
1931
  if (request.signal?.aborted) {
@@ -1973,8 +1963,7 @@ var CliProvider = class {
1973
1963
  const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
1974
1964
  const parsed = this.parseOutputContent(responseContent);
1975
1965
  return {
1976
- text: parsed.text,
1977
- trace: parsed.trace,
1966
+ outputMessages: parsed.outputMessages,
1978
1967
  raw: {
1979
1968
  command: renderedCommand,
1980
1969
  stderr: result.stderr,
@@ -2053,7 +2042,7 @@ var CliProvider = class {
2053
2042
  const evalCaseId = request.evalCaseId;
2054
2043
  if (!evalCaseId) {
2055
2044
  return {
2056
- text: "",
2045
+ outputMessages: [],
2057
2046
  raw: {
2058
2047
  command: renderedCommand,
2059
2048
  stderr: result.stderr,
@@ -2066,7 +2055,7 @@ var CliProvider = class {
2066
2055
  const parsed = recordsById.get(evalCaseId);
2067
2056
  if (!parsed) {
2068
2057
  return {
2069
- text: "",
2058
+ outputMessages: [],
2070
2059
  raw: {
2071
2060
  command: renderedCommand,
2072
2061
  stderr: result.stderr,
@@ -2077,9 +2066,7 @@ var CliProvider = class {
2077
2066
  };
2078
2067
  }
2079
2068
  return {
2080
- text: parsed.text,
2081
- trace: parsed.trace,
2082
- traceRef: parsed.traceRef,
2069
+ outputMessages: parsed.outputMessages,
2083
2070
  raw: {
2084
2071
  command: renderedCommand,
2085
2072
  stderr: result.stderr,
@@ -2094,28 +2081,81 @@ var CliProvider = class {
2094
2081
  }
2095
2082
  /**
2096
2083
  * Parse output content from CLI.
2097
- * If the content is valid JSON with a 'text' field, extract text and optional trace.
2098
- * Otherwise, treat the entire content as plain text.
2084
+ * If the content is valid JSON with 'output_messages' or 'text' field, extract them.
2085
+ * If only 'text' is provided, wrap it in outputMessages.
2086
+ * Otherwise, treat the entire content as plain text wrapped in outputMessages.
2099
2087
  */
2100
2088
  parseOutputContent(content) {
2101
2089
  try {
2102
2090
  const parsed = JSON.parse(content);
2103
- if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
2091
+ if (typeof parsed === "object" && parsed !== null) {
2104
2092
  const obj = parsed;
2105
- const text = typeof obj.text === "string" ? obj.text : String(obj.text);
2106
- const trace = this.parseTrace(obj.trace);
2107
- return { text, trace };
2093
+ const outputMessages = this.parseOutputMessages(obj.output_messages);
2094
+ if (outputMessages && outputMessages.length > 0) {
2095
+ return { outputMessages };
2096
+ }
2097
+ if ("text" in obj) {
2098
+ const text = typeof obj.text === "string" ? obj.text : String(obj.text);
2099
+ return { outputMessages: [{ role: "assistant", content: text }] };
2100
+ }
2108
2101
  }
2109
2102
  } catch {
2110
2103
  }
2111
- return { text: content };
2104
+ return { outputMessages: [{ role: "assistant", content }] };
2112
2105
  }
2113
- parseTrace(trace) {
2114
- if (!Array.isArray(trace)) {
2106
+ /**
2107
+ * Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
2108
+ */
2109
+ parseOutputMessages(outputMessages) {
2110
+ if (!Array.isArray(outputMessages)) {
2115
2111
  return void 0;
2116
2112
  }
2117
- const validEvents = trace.filter(isTraceEvent);
2118
- return validEvents.length > 0 ? validEvents : void 0;
2113
+ const messages = [];
2114
+ for (const msg of outputMessages) {
2115
+ if (typeof msg !== "object" || msg === null) {
2116
+ continue;
2117
+ }
2118
+ const rawMsg = msg;
2119
+ if (typeof rawMsg.role !== "string") {
2120
+ continue;
2121
+ }
2122
+ const message = {
2123
+ role: rawMsg.role,
2124
+ name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
2125
+ content: rawMsg.content,
2126
+ toolCalls: this.parseToolCalls(rawMsg.tool_calls),
2127
+ timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
2128
+ metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
2129
+ };
2130
+ messages.push(message);
2131
+ }
2132
+ return messages.length > 0 ? messages : void 0;
2133
+ }
2134
+ /**
2135
+ * Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
2136
+ */
2137
+ parseToolCalls(toolCalls) {
2138
+ if (!Array.isArray(toolCalls)) {
2139
+ return void 0;
2140
+ }
2141
+ const calls = [];
2142
+ for (const call of toolCalls) {
2143
+ if (typeof call !== "object" || call === null) {
2144
+ continue;
2145
+ }
2146
+ const rawCall = call;
2147
+ if (typeof rawCall.tool !== "string") {
2148
+ continue;
2149
+ }
2150
+ calls.push({
2151
+ tool: rawCall.tool,
2152
+ input: rawCall.input,
2153
+ output: rawCall.output,
2154
+ id: typeof rawCall.id === "string" ? rawCall.id : void 0,
2155
+ timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
2156
+ });
2157
+ }
2158
+ return calls.length > 0 ? calls : void 0;
2119
2159
  }
2120
2160
  parseJsonlBatchOutput(content) {
2121
2161
  const records = /* @__PURE__ */ new Map();
@@ -2139,12 +2179,16 @@ var CliProvider = class {
2139
2179
  if (records.has(id)) {
2140
2180
  throw new Error(`CLI batch output contains duplicate id: ${id}`);
2141
2181
  }
2142
- const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
2143
- const traceRef = typeof obj.traceRef === "string" ? obj.traceRef : typeof obj.trace_ref === "string" ? obj.trace_ref : void 0;
2182
+ const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
2183
+ let outputMessages;
2184
+ if (parsedOutputMessages && parsedOutputMessages.length > 0) {
2185
+ outputMessages = parsedOutputMessages;
2186
+ } else {
2187
+ const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
2188
+ outputMessages = text ? [{ role: "assistant", content: text }] : [];
2189
+ }
2144
2190
  records.set(id, {
2145
- text,
2146
- trace: this.parseTrace(obj.trace),
2147
- traceRef
2191
+ outputMessages
2148
2192
  });
2149
2193
  }
2150
2194
  return records;
@@ -2157,8 +2201,10 @@ var CliProvider = class {
2157
2201
  const errorMsg = error instanceof Error ? error.message : String(error);
2158
2202
  throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
2159
2203
  } finally {
2160
- await import_promises8.default.unlink(filePath).catch(() => {
2161
- });
2204
+ if (!this.keepTempFiles) {
2205
+ await import_promises8.default.unlink(filePath).catch(() => {
2206
+ });
2207
+ }
2162
2208
  }
2163
2209
  }
2164
2210
  async ensureHealthy(signal) {
@@ -2501,7 +2547,6 @@ var CodexProvider = class {
2501
2547
  const parsed = parseCodexJson(result.stdout);
2502
2548
  const assistantText = extractAssistantText(parsed);
2503
2549
  return {
2504
- text: assistantText,
2505
2550
  raw: {
2506
2551
  response: parsed,
2507
2552
  stdout: result.stdout,
@@ -2513,7 +2558,8 @@ var CodexProvider = class {
2513
2558
  workspace: workspaceRoot,
2514
2559
  inputFiles,
2515
2560
  logFile: logger?.filePath
2516
- }
2561
+ },
2562
+ outputMessages: [{ role: "assistant", content: assistantText }]
2517
2563
  };
2518
2564
  } finally {
2519
2565
  await logger?.close();
@@ -3135,7 +3181,6 @@ var MockProvider = class {
3135
3181
  delayMs;
3136
3182
  delayMinMs;
3137
3183
  delayMaxMs;
3138
- trace;
3139
3184
  constructor(targetName, config) {
3140
3185
  this.id = `mock:${targetName}`;
3141
3186
  this.targetName = targetName;
@@ -3143,7 +3188,6 @@ var MockProvider = class {
3143
3188
  this.delayMs = config.delayMs ?? 0;
3144
3189
  this.delayMinMs = config.delayMinMs ?? 0;
3145
3190
  this.delayMaxMs = config.delayMaxMs ?? 0;
3146
- this.trace = config.trace;
3147
3191
  }
3148
3192
  async invoke(request) {
3149
3193
  const delay = this.calculateDelay();
@@ -3151,12 +3195,11 @@ var MockProvider = class {
3151
3195
  await new Promise((resolve) => setTimeout(resolve, delay));
3152
3196
  }
3153
3197
  return {
3154
- text: this.cannedResponse,
3198
+ outputMessages: [{ role: "assistant", content: this.cannedResponse }],
3155
3199
  raw: {
3156
3200
  question: request.question,
3157
3201
  guidelines: request.guidelines
3158
- },
3159
- trace: this.trace
3202
+ }
3160
3203
  };
3161
3204
  }
3162
3205
  calculateDelay() {
@@ -3424,8 +3467,7 @@ function normalizeCodexLogFormat(value) {
3424
3467
  }
3425
3468
  function resolveMockConfig(target) {
3426
3469
  const response = typeof target.response === "string" ? target.response : void 0;
3427
- const trace = Array.isArray(target.trace) ? target.trace : void 0;
3428
- return { response, trace };
3470
+ return { response };
3429
3471
  }
3430
3472
  function resolveVSCodeConfig(target, env, insiders) {
3431
3473
  const workspaceTemplateEnvVar = resolveOptionalLiteralString(
@@ -3463,6 +3505,9 @@ function resolveCliConfig(target, env, evalFilePath) {
3463
3505
  target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
3464
3506
  );
3465
3507
  const verbose = resolveOptionalBoolean(target.verbose ?? target.cli_verbose ?? target.cliVerbose);
3508
+ const keepTempFiles = resolveOptionalBoolean(
3509
+ target.keep_temp_files ?? target.keepTempFiles ?? target.keep_output_files ?? target.keepOutputFiles
3510
+ );
3466
3511
  let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
3467
3512
  allowLiteral: true,
3468
3513
  optionalEnv: true
@@ -3491,7 +3536,8 @@ function resolveCliConfig(target, env, evalFilePath) {
3491
3536
  cwd,
3492
3537
  timeoutMs,
3493
3538
  healthcheck,
3494
- verbose
3539
+ verbose,
3540
+ keepTempFiles
3495
3541
  };
3496
3542
  }
3497
3543
  function resolveTimeoutMs(source, description) {
@@ -3786,7 +3832,7 @@ var VSCodeProvider = class {
3786
3832
  }
3787
3833
  if (this.config.dryRun) {
3788
3834
  return {
3789
- text: "",
3835
+ outputMessages: [],
3790
3836
  raw: {
3791
3837
  session,
3792
3838
  inputFiles
@@ -3795,7 +3841,7 @@ var VSCodeProvider = class {
3795
3841
  }
3796
3842
  const responseText = await readTextFile(session.responseFile);
3797
3843
  return {
3798
- text: responseText,
3844
+ outputMessages: [{ role: "assistant", content: responseText }],
3799
3845
  raw: {
3800
3846
  session,
3801
3847
  inputFiles
@@ -3833,7 +3879,7 @@ var VSCodeProvider = class {
3833
3879
  }
3834
3880
  if (this.config.dryRun) {
3835
3881
  return normalizedRequests.map(({ inputFiles }) => ({
3836
- text: "",
3882
+ outputMessages: [],
3837
3883
  raw: {
3838
3884
  session,
3839
3885
  inputFiles,
@@ -3850,7 +3896,7 @@ var VSCodeProvider = class {
3850
3896
  for (const [index, responseFile] of session.responseFiles.entries()) {
3851
3897
  const responseText = await readTextFile(responseFile);
3852
3898
  responses.push({
3853
- text: responseText,
3899
+ outputMessages: [{ role: "assistant", content: responseText }],
3854
3900
  raw: {
3855
3901
  session,
3856
3902
  inputFiles: normalizedRequests[index]?.inputFiles,
@@ -4090,6 +4136,33 @@ function resolveAndCreateProvider(definition, env = process.env) {
4090
4136
  // src/evaluation/evaluators.ts
4091
4137
  var import_ai2 = require("ai");
4092
4138
  var import_zod2 = require("zod");
4139
+
4140
+ // src/evaluation/providers/types.ts
4141
+ var AGENT_PROVIDER_KINDS = [
4142
+ "codex",
4143
+ "vscode",
4144
+ "vscode-insiders"
4145
+ ];
4146
+ function extractLastAssistantContent(messages) {
4147
+ if (!messages || messages.length === 0) {
4148
+ return "";
4149
+ }
4150
+ for (let i = messages.length - 1; i >= 0; i--) {
4151
+ const msg = messages[i];
4152
+ if (msg.role === "assistant" && msg.content !== void 0) {
4153
+ if (typeof msg.content === "string") {
4154
+ return msg.content;
4155
+ }
4156
+ return JSON.stringify(msg.content);
4157
+ }
4158
+ }
4159
+ return "";
4160
+ }
4161
+ function isAgentProvider(provider) {
4162
+ return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
4163
+ }
4164
+
4165
+ // src/evaluation/evaluators.ts
4093
4166
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
4094
4167
 
4095
4168
  Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -4154,6 +4227,7 @@ var LlmJudgeEvaluator = class {
4154
4227
  null,
4155
4228
  2
4156
4229
  ),
4230
+ [TEMPLATE_VARIABLES.OUTPUT_MESSAGES]: JSON.stringify(context.outputMessages ?? [], null, 2),
4157
4231
  [TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
4158
4232
  [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
4159
4233
  [TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
@@ -4178,7 +4252,7 @@ var LlmJudgeEvaluator = class {
4178
4252
  const score = clampScore(data.score);
4179
4253
  const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
4180
4254
  const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
4181
- const reasoning = data.reasoning ?? providerResponse?.reasoning;
4255
+ const reasoning = data.reasoning;
4182
4256
  const expectedAspectCount = Math.max(hits.length + misses.length, 1);
4183
4257
  return {
4184
4258
  score,
@@ -4280,7 +4354,9 @@ var LlmJudgeEvaluator = class {
4280
4354
  maxOutputTokens: this.maxOutputTokens,
4281
4355
  temperature: this.temperature
4282
4356
  });
4283
- const data = schema.parse(parseJsonFromText(response.text ?? ""));
4357
+ const data = schema.parse(
4358
+ parseJsonFromText(extractLastAssistantContent(response.outputMessages))
4359
+ );
4284
4360
  return { data, providerResponse: response };
4285
4361
  } catch (e) {
4286
4362
  lastError = e instanceof Error ? e : new Error(String(e));
@@ -4366,13 +4442,13 @@ var CodeEvaluator = class {
4366
4442
  expected_messages: context.evalCase.expected_messages,
4367
4443
  reference_answer: context.evalCase.reference_answer,
4368
4444
  candidate_answer: context.candidate,
4445
+ output_messages: context.outputMessages ?? null,
4369
4446
  guideline_files: context.evalCase.guideline_paths,
4370
4447
  input_files: context.evalCase.file_paths.filter(
4371
4448
  (path15) => !context.evalCase.guideline_paths.includes(path15)
4372
4449
  ),
4373
4450
  input_messages: context.evalCase.input_messages,
4374
- candidate_trace_file: context.candidateTraceRef ?? null,
4375
- candidate_trace_summary: context.candidateTraceSummary ?? null
4451
+ candidate_trace_summary: context.traceSummary ?? null
4376
4452
  },
4377
4453
  null,
4378
4454
  2
@@ -4499,8 +4575,19 @@ var ToolTrajectoryEvaluator = class {
4499
4575
  this.config = options.config;
4500
4576
  }
4501
4577
  evaluate(context) {
4502
- const { candidateTrace, candidateTraceSummary } = context;
4503
- if (!candidateTrace || !candidateTraceSummary) {
4578
+ const { outputMessages, traceSummary } = context;
4579
+ const toolCalls = this.extractToolCallsFromMessages(outputMessages);
4580
+ if (toolCalls.length === 0 && !traceSummary) {
4581
+ return {
4582
+ score: 0,
4583
+ verdict: "fail",
4584
+ hits: [],
4585
+ misses: ["No trace available for evaluation"],
4586
+ expectedAspectCount: 1
4587
+ };
4588
+ }
4589
+ const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
4590
+ if (!summary) {
4504
4591
  return {
4505
4592
  score: 0,
4506
4593
  verdict: "fail",
@@ -4511,11 +4598,11 @@ var ToolTrajectoryEvaluator = class {
4511
4598
  }
4512
4599
  switch (this.config.mode) {
4513
4600
  case "any_order":
4514
- return this.evaluateAnyOrder(candidateTraceSummary);
4601
+ return this.evaluateAnyOrder(summary);
4515
4602
  case "in_order":
4516
- return this.evaluateInOrder(candidateTrace);
4603
+ return this.evaluateInOrder(toolCalls);
4517
4604
  case "exact":
4518
- return this.evaluateExact(candidateTrace);
4605
+ return this.evaluateExact(toolCalls);
4519
4606
  default:
4520
4607
  return {
4521
4608
  score: 0,
@@ -4526,6 +4613,39 @@ var ToolTrajectoryEvaluator = class {
4526
4613
  };
4527
4614
  }
4528
4615
  }
4616
+ /**
4617
+ * Extract tool calls from output messages.
4618
+ */
4619
+ extractToolCallsFromMessages(messages) {
4620
+ if (!messages) {
4621
+ return [];
4622
+ }
4623
+ const toolCalls = [];
4624
+ for (const message of messages) {
4625
+ if (message.toolCalls) {
4626
+ for (const call of message.toolCalls) {
4627
+ toolCalls.push({ name: call.tool });
4628
+ }
4629
+ }
4630
+ }
4631
+ return toolCalls;
4632
+ }
4633
+ /**
4634
+ * Build a summary from extracted tool calls.
4635
+ */
4636
+ buildSummary(toolCalls) {
4637
+ const toolCallsByName = {};
4638
+ for (const call of toolCalls) {
4639
+ toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
4640
+ }
4641
+ const toolNames = Object.keys(toolCallsByName).sort();
4642
+ return {
4643
+ eventCount: toolCalls.length,
4644
+ toolNames,
4645
+ toolCallsByName,
4646
+ errorCount: 0
4647
+ };
4648
+ }
4529
4649
  evaluateAnyOrder(summary) {
4530
4650
  const minimums = this.config.minimums ?? {};
4531
4651
  const toolNames = Object.keys(minimums);
@@ -4558,7 +4678,7 @@ var ToolTrajectoryEvaluator = class {
4558
4678
  expectedAspectCount: toolNames.length
4559
4679
  };
4560
4680
  }
4561
- evaluateInOrder(trace) {
4681
+ evaluateInOrder(toolCalls) {
4562
4682
  const expected = this.config.expected ?? [];
4563
4683
  if (expected.length === 0) {
4564
4684
  return {
@@ -4569,15 +4689,14 @@ var ToolTrajectoryEvaluator = class {
4569
4689
  expectedAspectCount: 0
4570
4690
  };
4571
4691
  }
4572
- const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
4573
4692
  const hits = [];
4574
4693
  const misses = [];
4575
4694
  let actualIndex = 0;
4576
4695
  for (let i = 0; i < expected.length; i++) {
4577
4696
  const expectedTool = expected[i].tool;
4578
4697
  let found = false;
4579
- while (actualIndex < actualToolCalls.length) {
4580
- if (actualToolCalls[actualIndex].name === expectedTool) {
4698
+ while (actualIndex < toolCalls.length) {
4699
+ if (toolCalls[actualIndex].name === expectedTool) {
4581
4700
  hits.push(`Found ${expectedTool} at position ${actualIndex}`);
4582
4701
  actualIndex++;
4583
4702
  found = true;
@@ -4598,7 +4717,7 @@ var ToolTrajectoryEvaluator = class {
4598
4717
  expectedAspectCount: expected.length
4599
4718
  };
4600
4719
  }
4601
- evaluateExact(trace) {
4720
+ evaluateExact(toolCalls) {
4602
4721
  const expected = this.config.expected ?? [];
4603
4722
  if (expected.length === 0) {
4604
4723
  return {
@@ -4609,16 +4728,15 @@ var ToolTrajectoryEvaluator = class {
4609
4728
  expectedAspectCount: 0
4610
4729
  };
4611
4730
  }
4612
- const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
4613
4731
  const hits = [];
4614
4732
  const misses = [];
4615
- if (actualToolCalls.length !== expected.length) {
4616
- misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
4733
+ if (toolCalls.length !== expected.length) {
4734
+ misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
4617
4735
  }
4618
- const checkLength = Math.min(expected.length, actualToolCalls.length);
4736
+ const checkLength = Math.min(expected.length, toolCalls.length);
4619
4737
  for (let i = 0; i < checkLength; i++) {
4620
4738
  const expectedTool = expected[i].tool;
4621
- const actualTool = actualToolCalls[i].name;
4739
+ const actualTool = toolCalls[i].name;
4622
4740
  if (actualTool === expectedTool) {
4623
4741
  hits.push(`Position ${i}: ${expectedTool} \u2713`);
4624
4742
  } else {
@@ -4832,11 +4950,13 @@ var CompositeEvaluator = class {
4832
4950
  evalCaseId: context.evalCase.id,
4833
4951
  attempt: context.attempt
4834
4952
  });
4835
- const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
4953
+ const data = freeformEvaluationSchema.parse(
4954
+ parseJsonFromText(extractLastAssistantContent(response.outputMessages))
4955
+ );
4836
4956
  const score = clampScore(data.score);
4837
4957
  const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
4838
4958
  const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
4839
- const reasoning = data.reasoning ?? response.reasoning;
4959
+ const reasoning = data.reasoning;
4840
4960
  return {
4841
4961
  score,
4842
4962
  verdict: scoreToVerdict(score),
@@ -5005,16 +5125,6 @@ function validateConcurrency(concurrency) {
5005
5125
  }
5006
5126
  }
5007
5127
 
5008
- // src/evaluation/providers/types.ts
5009
- var AGENT_PROVIDER_KINDS = [
5010
- "codex",
5011
- "vscode",
5012
- "vscode-insiders"
5013
- ];
5014
- function isAgentProvider(provider) {
5015
- return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
5016
- }
5017
-
5018
5128
  // src/evaluation/orchestrator.ts
5019
5129
  async function runEvaluation(options) {
5020
5130
  const {
@@ -5269,11 +5379,14 @@ async function runBatchEvaluation(options) {
5269
5379
  const evalCase = evalCases[i];
5270
5380
  const promptInputs = promptInputsList[i];
5271
5381
  const providerResponse = batchResponse[i];
5382
+ const outputMessages = providerResponse.outputMessages;
5383
+ const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
5384
+ const candidate = extractLastAssistantContent(outputMessages);
5272
5385
  let result;
5273
5386
  try {
5274
5387
  result = await evaluateCandidate({
5275
5388
  evalCase,
5276
- candidate: providerResponse.text ?? "",
5389
+ candidate,
5277
5390
  target,
5278
5391
  provider,
5279
5392
  evaluators: evaluatorRegistry,
@@ -5281,7 +5394,9 @@ async function runBatchEvaluation(options) {
5281
5394
  nowFn,
5282
5395
  attempt: 0,
5283
5396
  judgeProvider: await resolveJudgeProvider(target),
5284
- agentTimeoutMs
5397
+ agentTimeoutMs,
5398
+ outputMessages,
5399
+ traceSummary
5285
5400
  });
5286
5401
  } catch (error) {
5287
5402
  const errorResult = buildErrorResult(
@@ -5385,21 +5500,13 @@ async function runEvalCase(options) {
5385
5500
  if (cacheKey && cache && !cachedResponse) {
5386
5501
  await cache.set(cacheKey, providerResponse);
5387
5502
  }
5388
- let candidateTrace = providerResponse.trace;
5389
- if (!candidateTrace && providerResponse.traceRef) {
5390
- try {
5391
- const rawTrace = await readJsonFile(providerResponse.traceRef);
5392
- if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
5393
- candidateTrace = rawTrace;
5394
- }
5395
- } catch {
5396
- }
5397
- }
5398
- const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
5503
+ const outputMessages = providerResponse.outputMessages;
5504
+ const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
5505
+ const candidate = extractLastAssistantContent(outputMessages);
5399
5506
  try {
5400
5507
  return await evaluateCandidate({
5401
5508
  evalCase,
5402
- candidate: providerResponse.text ?? "",
5509
+ candidate,
5403
5510
  target,
5404
5511
  provider,
5405
5512
  evaluators,
@@ -5408,9 +5515,8 @@ async function runEvalCase(options) {
5408
5515
  attempt,
5409
5516
  judgeProvider,
5410
5517
  agentTimeoutMs,
5411
- candidateTrace,
5412
- candidateTraceRef: providerResponse.traceRef,
5413
- candidateTraceSummary
5518
+ outputMessages,
5519
+ traceSummary
5414
5520
  });
5415
5521
  } catch (error) {
5416
5522
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
@@ -5428,9 +5534,8 @@ async function evaluateCandidate(options) {
5428
5534
  attempt,
5429
5535
  judgeProvider,
5430
5536
  agentTimeoutMs,
5431
- candidateTrace,
5432
- candidateTraceRef,
5433
- candidateTraceSummary
5537
+ outputMessages,
5538
+ traceSummary
5434
5539
  } = options;
5435
5540
  const gradeTimestamp = nowFn();
5436
5541
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -5444,9 +5549,8 @@ async function evaluateCandidate(options) {
5444
5549
  now: gradeTimestamp,
5445
5550
  judgeProvider,
5446
5551
  agentTimeoutMs,
5447
- candidateTrace,
5448
- candidateTraceRef,
5449
- candidateTraceSummary
5552
+ outputMessages,
5553
+ traceSummary
5450
5554
  });
5451
5555
  const completedAt = nowFn();
5452
5556
  let agentProviderRequest;
@@ -5484,7 +5588,7 @@ async function evaluateCandidate(options) {
5484
5588
  lm_provider_request: lmProviderRequest,
5485
5589
  evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
5486
5590
  evaluator_results: evaluatorResults,
5487
- trace_summary: candidateTraceSummary
5591
+ trace_summary: traceSummary
5488
5592
  };
5489
5593
  }
5490
5594
  async function runEvaluatorsForCase(options) {
@@ -5499,9 +5603,8 @@ async function runEvaluatorsForCase(options) {
5499
5603
  now,
5500
5604
  judgeProvider,
5501
5605
  agentTimeoutMs,
5502
- candidateTrace,
5503
- candidateTraceRef,
5504
- candidateTraceSummary
5606
+ outputMessages,
5607
+ traceSummary
5505
5608
  } = options;
5506
5609
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
5507
5610
  return runEvaluatorList({
@@ -5516,9 +5619,8 @@ async function runEvaluatorsForCase(options) {
5516
5619
  now,
5517
5620
  judgeProvider,
5518
5621
  agentTimeoutMs,
5519
- candidateTrace,
5520
- candidateTraceRef,
5521
- candidateTraceSummary
5622
+ outputMessages,
5623
+ traceSummary
5522
5624
  });
5523
5625
  }
5524
5626
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -5535,9 +5637,8 @@ async function runEvaluatorsForCase(options) {
5535
5637
  promptInputs,
5536
5638
  now,
5537
5639
  judgeProvider,
5538
- candidateTrace,
5539
- candidateTraceRef,
5540
- candidateTraceSummary
5640
+ outputMessages,
5641
+ traceSummary
5541
5642
  });
5542
5643
  return { score };
5543
5644
  }
@@ -5554,9 +5655,8 @@ async function runEvaluatorList(options) {
5554
5655
  now,
5555
5656
  judgeProvider,
5556
5657
  agentTimeoutMs,
5557
- candidateTrace,
5558
- candidateTraceRef,
5559
- candidateTraceSummary
5658
+ outputMessages,
5659
+ traceSummary
5560
5660
  } = options;
5561
5661
  const scored = [];
5562
5662
  const evaluatorResults = [];
@@ -5603,8 +5703,8 @@ async function runEvaluatorList(options) {
5603
5703
  attempt,
5604
5704
  promptInputs,
5605
5705
  now,
5606
- candidateTraceRef,
5607
- candidateTraceSummary
5706
+ outputMessages,
5707
+ traceSummary
5608
5708
  });
5609
5709
  const weight = evaluator.weight ?? 1;
5610
5710
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -5690,9 +5790,8 @@ async function runEvaluatorList(options) {
5690
5790
  attempt,
5691
5791
  promptInputs,
5692
5792
  now,
5693
- candidateTrace,
5694
- candidateTraceRef,
5695
- candidateTraceSummary
5793
+ outputMessages,
5794
+ traceSummary
5696
5795
  });
5697
5796
  const weight = evaluator.weight ?? 1;
5698
5797
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -6086,8 +6185,6 @@ function createAgentKernel() {
6086
6185
  isJsonValue,
6087
6186
  isTestMessage,
6088
6187
  isTestMessageRole,
6089
- isTraceEvent,
6090
- isTraceEventType,
6091
6188
  listTargetNames,
6092
6189
  loadEvalCases,
6093
6190
  normalizeLineEndings,