@agentv/core 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -54,8 +54,6 @@ __export(index_exports, {
54
54
  isJsonValue: () => isJsonValue,
55
55
  isTestMessage: () => isTestMessage,
56
56
  isTestMessageRole: () => isTestMessageRole,
57
- isTraceEvent: () => isTraceEvent,
58
- isTraceEventType: () => isTraceEventType,
59
57
  listTargetNames: () => listTargetNames,
60
58
  loadEvalCases: () => loadEvalCases,
61
59
  normalizeLineEndings: () => normalizeLineEndings,
@@ -135,33 +133,22 @@ function getHitCount(result) {
135
133
  }
136
134
 
137
135
  // src/evaluation/trace.ts
138
- function isTraceEventType(value) {
139
- return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
140
- }
141
- function isTraceEvent(value) {
142
- if (typeof value !== "object" || value === null) {
143
- return false;
144
- }
145
- const candidate = value;
146
- return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
147
- }
148
- function computeTraceSummary(trace) {
136
+ function computeTraceSummary(messages) {
149
137
  const toolCallCounts = {};
150
- let errorCount = 0;
151
- for (const event of trace) {
152
- if (event.type === "tool_call" && event.name) {
153
- toolCallCounts[event.name] = (toolCallCounts[event.name] ?? 0) + 1;
154
- }
155
- if (event.type === "error") {
156
- errorCount++;
138
+ let totalToolCalls = 0;
139
+ for (const message of messages) {
140
+ if (!message.toolCalls) continue;
141
+ for (const toolCall of message.toolCalls) {
142
+ toolCallCounts[toolCall.tool] = (toolCallCounts[toolCall.tool] ?? 0) + 1;
143
+ totalToolCalls++;
157
144
  }
158
145
  }
159
146
  const toolNames = Object.keys(toolCallCounts).sort();
160
147
  return {
161
- eventCount: trace.length,
148
+ eventCount: totalToolCalls,
162
149
  toolNames,
163
150
  toolCallsByName: toolCallCounts,
164
- errorCount
151
+ errorCount: 0
165
152
  };
166
153
  }
167
154
 
@@ -437,7 +424,8 @@ var TEMPLATE_VARIABLES = {
437
424
  QUESTION: "question",
438
425
  EXPECTED_OUTCOME: "expected_outcome",
439
426
  REFERENCE_ANSWER: "reference_answer",
440
- INPUT_MESSAGES: "input_messages"
427
+ INPUT_MESSAGES: "input_messages",
428
+ OUTPUT_MESSAGES: "output_messages"
441
429
  };
442
430
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
443
431
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
@@ -835,6 +823,17 @@ async function processMessages(options) {
835
823
  }
836
824
  continue;
837
825
  }
826
+ if (isJsonObject(content)) {
827
+ const rendered = JSON.stringify(content, null, 2);
828
+ segments.push({ type: "text", value: rendered });
829
+ if (textParts) {
830
+ textParts.push(rendered);
831
+ }
832
+ continue;
833
+ }
834
+ if (!Array.isArray(content)) {
835
+ continue;
836
+ }
838
837
  for (const rawSegment of content) {
839
838
  if (!isJsonObject(rawSegment)) {
840
839
  continue;
@@ -1061,6 +1060,11 @@ async function buildPromptInputs(testCase, mode = "lm") {
1061
1060
  }
1062
1061
  }
1063
1062
  }
1063
+ } else if (isJsonObject(message.content)) {
1064
+ const rendered = JSON.stringify(message.content, null, 2);
1065
+ if (rendered.trim().length > 0) {
1066
+ messageSegments.push({ type: "text", value: rendered });
1067
+ }
1064
1068
  }
1065
1069
  segmentsByMessage.push(messageSegments);
1066
1070
  }
@@ -1304,16 +1308,16 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1304
1308
  }) : [];
1305
1309
  const codeSnippets = extractCodeBlocks(inputSegments);
1306
1310
  let referenceAnswer = "";
1307
- if (outputSegments.length > 1) {
1308
- referenceAnswer = JSON.stringify(outputSegments, null, 2);
1309
- } else if (outputSegments.length === 1) {
1310
- const singleMessage = outputSegments[0];
1311
- if (typeof singleMessage.content === "string") {
1312
- referenceAnswer = singleMessage.content;
1313
- } else if (singleMessage.content) {
1314
- referenceAnswer = JSON.stringify(singleMessage, null, 2);
1315
- } else if (singleMessage.tool_calls) {
1316
- referenceAnswer = JSON.stringify(singleMessage, null, 2);
1311
+ if (outputSegments.length > 0) {
1312
+ const lastMessage = outputSegments[outputSegments.length - 1];
1313
+ const content = lastMessage.content;
1314
+ const toolCalls = lastMessage.tool_calls;
1315
+ if (typeof content === "string") {
1316
+ referenceAnswer = content;
1317
+ } else if (content !== void 0 && content !== null) {
1318
+ referenceAnswer = JSON.stringify(content, null, 2);
1319
+ } else if (toolCalls !== void 0 && toolCalls !== null) {
1320
+ referenceAnswer = JSON.stringify(toolCalls, null, 2);
1317
1321
  }
1318
1322
  }
1319
1323
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
@@ -1756,11 +1760,11 @@ async function invokeModel(options) {
1756
1760
  return mapResponse(result);
1757
1761
  }
1758
1762
  function mapResponse(result) {
1763
+ const content = result.text ?? "";
1759
1764
  return {
1760
- text: result.text ?? "",
1761
- reasoning: result.reasoningText ?? void 0,
1762
1765
  raw: result,
1763
- usage: toJsonObject(result.totalUsage ?? result.usage)
1766
+ usage: toJsonObject(result.totalUsage ?? result.usage),
1767
+ outputMessages: [{ role: "assistant", content }]
1764
1768
  };
1765
1769
  }
1766
1770
  function toJsonObject(value) {
@@ -1909,10 +1913,11 @@ var CliProvider = class {
1909
1913
  id;
1910
1914
  kind = "cli";
1911
1915
  targetName;
1912
- supportsBatch = false;
1916
+ supportsBatch = true;
1913
1917
  config;
1914
1918
  runCommand;
1915
1919
  verbose;
1920
+ keepTempFiles;
1916
1921
  healthcheckPromise;
1917
1922
  constructor(targetName, config, runner = defaultCommandRunner) {
1918
1923
  this.targetName = targetName;
@@ -1920,6 +1925,7 @@ var CliProvider = class {
1920
1925
  this.config = config;
1921
1926
  this.runCommand = runner;
1922
1927
  this.verbose = config.verbose ?? false;
1928
+ this.keepTempFiles = config.keepTempFiles ?? false;
1923
1929
  }
1924
1930
  async invoke(request) {
1925
1931
  if (request.signal?.aborted) {
@@ -1929,6 +1935,11 @@ var CliProvider = class {
1929
1935
  const outputFilePath = generateOutputFilePath(request.evalCaseId);
1930
1936
  const templateValues = buildTemplateValues(request, this.config, outputFilePath);
1931
1937
  const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
1938
+ if (this.verbose) {
1939
+ console.log(
1940
+ `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
1941
+ );
1942
+ }
1932
1943
  const result = await this.runCommand(renderedCommand, {
1933
1944
  cwd: this.config.cwd,
1934
1945
  env: process.env,
@@ -1952,8 +1963,7 @@ var CliProvider = class {
1952
1963
  const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
1953
1964
  const parsed = this.parseOutputContent(responseContent);
1954
1965
  return {
1955
- text: parsed.text,
1956
- trace: parsed.trace,
1966
+ outputMessages: parsed.outputMessages,
1957
1967
  raw: {
1958
1968
  command: renderedCommand,
1959
1969
  stderr: result.stderr,
@@ -1963,30 +1973,225 @@ var CliProvider = class {
1963
1973
  }
1964
1974
  };
1965
1975
  }
1976
+ async invokeBatch(requests) {
1977
+ if (requests.length === 0) {
1978
+ return [];
1979
+ }
1980
+ for (const request of requests) {
1981
+ if (request.signal?.aborted) {
1982
+ throw new Error("CLI provider batch request was aborted before execution");
1983
+ }
1984
+ }
1985
+ const controller = new AbortController();
1986
+ for (const request of requests) {
1987
+ request.signal?.addEventListener("abort", () => controller.abort(), { once: true });
1988
+ }
1989
+ await this.ensureHealthy(controller.signal);
1990
+ const outputFilePath = generateOutputFilePath("batch", ".jsonl");
1991
+ const batchInputFiles = [];
1992
+ for (const request of requests) {
1993
+ if (request.inputFiles && request.inputFiles.length > 0) {
1994
+ batchInputFiles.push(...request.inputFiles);
1995
+ }
1996
+ }
1997
+ const templateValues = buildTemplateValues(
1998
+ {
1999
+ question: "",
2000
+ guidelines: "",
2001
+ inputFiles: batchInputFiles,
2002
+ evalCaseId: "batch",
2003
+ attempt: 0
2004
+ },
2005
+ this.config,
2006
+ outputFilePath
2007
+ );
2008
+ const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
2009
+ if (this.verbose) {
2010
+ console.log(
2011
+ `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
2012
+ );
2013
+ }
2014
+ const result = await this.runCommand(renderedCommand, {
2015
+ cwd: this.config.cwd,
2016
+ env: process.env,
2017
+ timeoutMs: this.config.timeoutMs,
2018
+ signal: controller.signal
2019
+ });
2020
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
2021
+ if (controller.signal.aborted) {
2022
+ throw new Error("CLI provider request was aborted");
2023
+ }
2024
+ if (result.timedOut) {
2025
+ throw new Error(
2026
+ `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
2027
+ );
2028
+ }
2029
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
2030
+ const detail = result.stderr.trim() || result.stdout.trim();
2031
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
2032
+ throw new Error(message);
2033
+ }
2034
+ const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
2035
+ const recordsById = this.parseJsonlBatchOutput(responseContent);
2036
+ const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
2037
+ const missingIds = requestedIds.filter((id) => !recordsById.has(id));
2038
+ if (missingIds.length > 0) {
2039
+ throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
2040
+ }
2041
+ const responses = requests.map((request) => {
2042
+ const evalCaseId = request.evalCaseId;
2043
+ if (!evalCaseId) {
2044
+ return {
2045
+ outputMessages: [],
2046
+ raw: {
2047
+ command: renderedCommand,
2048
+ stderr: result.stderr,
2049
+ exitCode: result.exitCode ?? 0,
2050
+ cwd: this.config.cwd,
2051
+ outputFile: outputFilePath
2052
+ }
2053
+ };
2054
+ }
2055
+ const parsed = recordsById.get(evalCaseId);
2056
+ if (!parsed) {
2057
+ return {
2058
+ outputMessages: [],
2059
+ raw: {
2060
+ command: renderedCommand,
2061
+ stderr: result.stderr,
2062
+ exitCode: result.exitCode ?? 0,
2063
+ cwd: this.config.cwd,
2064
+ outputFile: outputFilePath
2065
+ }
2066
+ };
2067
+ }
2068
+ return {
2069
+ outputMessages: parsed.outputMessages,
2070
+ raw: {
2071
+ command: renderedCommand,
2072
+ stderr: result.stderr,
2073
+ exitCode: result.exitCode ?? 0,
2074
+ cwd: this.config.cwd,
2075
+ outputFile: outputFilePath,
2076
+ recordId: evalCaseId
2077
+ }
2078
+ };
2079
+ });
2080
+ return responses;
2081
+ }
1966
2082
  /**
1967
2083
  * Parse output content from CLI.
1968
- * If the content is valid JSON with a 'text' field, extract text and optional trace.
1969
- * Otherwise, treat the entire content as plain text.
2084
+ * If the content is valid JSON with 'output_messages' or 'text' field, extract them.
2085
+ * If only 'text' is provided, wrap it in outputMessages.
2086
+ * Otherwise, treat the entire content as plain text wrapped in outputMessages.
1970
2087
  */
1971
2088
  parseOutputContent(content) {
1972
2089
  try {
1973
2090
  const parsed = JSON.parse(content);
1974
- if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
2091
+ if (typeof parsed === "object" && parsed !== null) {
1975
2092
  const obj = parsed;
1976
- const text = typeof obj.text === "string" ? obj.text : String(obj.text);
1977
- const trace = this.parseTrace(obj.trace);
1978
- return { text, trace };
2093
+ const outputMessages = this.parseOutputMessages(obj.output_messages);
2094
+ if (outputMessages && outputMessages.length > 0) {
2095
+ return { outputMessages };
2096
+ }
2097
+ if ("text" in obj) {
2098
+ const text = typeof obj.text === "string" ? obj.text : String(obj.text);
2099
+ return { outputMessages: [{ role: "assistant", content: text }] };
2100
+ }
1979
2101
  }
1980
2102
  } catch {
1981
2103
  }
1982
- return { text: content };
2104
+ return { outputMessages: [{ role: "assistant", content }] };
2105
+ }
2106
+ /**
2107
+ * Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
2108
+ */
2109
+ parseOutputMessages(outputMessages) {
2110
+ if (!Array.isArray(outputMessages)) {
2111
+ return void 0;
2112
+ }
2113
+ const messages = [];
2114
+ for (const msg of outputMessages) {
2115
+ if (typeof msg !== "object" || msg === null) {
2116
+ continue;
2117
+ }
2118
+ const rawMsg = msg;
2119
+ if (typeof rawMsg.role !== "string") {
2120
+ continue;
2121
+ }
2122
+ const message = {
2123
+ role: rawMsg.role,
2124
+ name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
2125
+ content: rawMsg.content,
2126
+ toolCalls: this.parseToolCalls(rawMsg.tool_calls),
2127
+ timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
2128
+ metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
2129
+ };
2130
+ messages.push(message);
2131
+ }
2132
+ return messages.length > 0 ? messages : void 0;
1983
2133
  }
1984
- parseTrace(trace) {
1985
- if (!Array.isArray(trace)) {
2134
+ /**
2135
+ * Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
2136
+ */
2137
+ parseToolCalls(toolCalls) {
2138
+ if (!Array.isArray(toolCalls)) {
1986
2139
  return void 0;
1987
2140
  }
1988
- const validEvents = trace.filter(isTraceEvent);
1989
- return validEvents.length > 0 ? validEvents : void 0;
2141
+ const calls = [];
2142
+ for (const call of toolCalls) {
2143
+ if (typeof call !== "object" || call === null) {
2144
+ continue;
2145
+ }
2146
+ const rawCall = call;
2147
+ if (typeof rawCall.tool !== "string") {
2148
+ continue;
2149
+ }
2150
+ calls.push({
2151
+ tool: rawCall.tool,
2152
+ input: rawCall.input,
2153
+ output: rawCall.output,
2154
+ id: typeof rawCall.id === "string" ? rawCall.id : void 0,
2155
+ timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
2156
+ });
2157
+ }
2158
+ return calls.length > 0 ? calls : void 0;
2159
+ }
2160
+ parseJsonlBatchOutput(content) {
2161
+ const records = /* @__PURE__ */ new Map();
2162
+ const lines = content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
2163
+ for (const line of lines) {
2164
+ let parsed;
2165
+ try {
2166
+ parsed = JSON.parse(line);
2167
+ } catch (error) {
2168
+ const reason = error instanceof Error ? error.message : String(error);
2169
+ throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
2170
+ }
2171
+ if (typeof parsed !== "object" || parsed === null) {
2172
+ throw new Error("CLI batch output JSONL line must be an object");
2173
+ }
2174
+ const obj = parsed;
2175
+ const id = typeof obj.id === "string" ? obj.id : void 0;
2176
+ if (!id || id.trim().length === 0) {
2177
+ throw new Error("CLI batch output JSONL line missing required string field: id");
2178
+ }
2179
+ if (records.has(id)) {
2180
+ throw new Error(`CLI batch output contains duplicate id: ${id}`);
2181
+ }
2182
+ const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
2183
+ let outputMessages;
2184
+ if (parsedOutputMessages && parsedOutputMessages.length > 0) {
2185
+ outputMessages = parsedOutputMessages;
2186
+ } else {
2187
+ const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
2188
+ outputMessages = text ? [{ role: "assistant", content: text }] : [];
2189
+ }
2190
+ records.set(id, {
2191
+ outputMessages
2192
+ });
2193
+ }
2194
+ return records;
1990
2195
  }
1991
2196
  async readAndCleanupOutputFile(filePath) {
1992
2197
  try {
@@ -1996,8 +2201,10 @@ var CliProvider = class {
1996
2201
  const errorMsg = error instanceof Error ? error.message : String(error);
1997
2202
  throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
1998
2203
  } finally {
1999
- await import_promises8.default.unlink(filePath).catch(() => {
2000
- });
2204
+ if (!this.keepTempFiles) {
2205
+ await import_promises8.default.unlink(filePath).catch(() => {
2206
+ });
2207
+ }
2001
2208
  }
2002
2209
  }
2003
2210
  async ensureHealthy(signal) {
@@ -2049,7 +2256,7 @@ var CliProvider = class {
2049
2256
  );
2050
2257
  if (this.verbose) {
2051
2258
  console.log(
2052
- `[cli-provider:${this.targetName}] (healthcheck) CLI_EVALS_DIR=${process.env.CLI_EVALS_DIR ?? ""} cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
2259
+ `[cli-provider:${this.targetName}] (healthcheck) cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
2053
2260
  );
2054
2261
  }
2055
2262
  const result = await this.runCommand(renderedCommand, {
@@ -2117,11 +2324,11 @@ function shellEscape(value) {
2117
2324
  }
2118
2325
  return `'${value.replace(/'/g, `'"'"'`)}'`;
2119
2326
  }
2120
- function generateOutputFilePath(evalCaseId) {
2327
+ function generateOutputFilePath(evalCaseId, extension = ".json") {
2121
2328
  const safeEvalId = evalCaseId || "unknown";
2122
2329
  const timestamp = Date.now();
2123
2330
  const random = Math.random().toString(36).substring(2, 9);
2124
- return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
2331
+ return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
2125
2332
  }
2126
2333
  function formatTimeoutSuffix(timeoutMs) {
2127
2334
  if (!timeoutMs || timeoutMs <= 0) {
@@ -2340,7 +2547,6 @@ var CodexProvider = class {
2340
2547
  const parsed = parseCodexJson(result.stdout);
2341
2548
  const assistantText = extractAssistantText(parsed);
2342
2549
  return {
2343
- text: assistantText,
2344
2550
  raw: {
2345
2551
  response: parsed,
2346
2552
  stdout: result.stdout,
@@ -2352,7 +2558,8 @@ var CodexProvider = class {
2352
2558
  workspace: workspaceRoot,
2353
2559
  inputFiles,
2354
2560
  logFile: logger?.filePath
2355
- }
2561
+ },
2562
+ outputMessages: [{ role: "assistant", content: assistantText }]
2356
2563
  };
2357
2564
  } finally {
2358
2565
  await logger?.close();
@@ -2974,7 +3181,6 @@ var MockProvider = class {
2974
3181
  delayMs;
2975
3182
  delayMinMs;
2976
3183
  delayMaxMs;
2977
- trace;
2978
3184
  constructor(targetName, config) {
2979
3185
  this.id = `mock:${targetName}`;
2980
3186
  this.targetName = targetName;
@@ -2982,7 +3188,6 @@ var MockProvider = class {
2982
3188
  this.delayMs = config.delayMs ?? 0;
2983
3189
  this.delayMinMs = config.delayMinMs ?? 0;
2984
3190
  this.delayMaxMs = config.delayMaxMs ?? 0;
2985
- this.trace = config.trace;
2986
3191
  }
2987
3192
  async invoke(request) {
2988
3193
  const delay = this.calculateDelay();
@@ -2990,12 +3195,11 @@ var MockProvider = class {
2990
3195
  await new Promise((resolve) => setTimeout(resolve, delay));
2991
3196
  }
2992
3197
  return {
2993
- text: this.cannedResponse,
3198
+ outputMessages: [{ role: "assistant", content: this.cannedResponse }],
2994
3199
  raw: {
2995
3200
  question: request.question,
2996
3201
  guidelines: request.guidelines
2997
- },
2998
- trace: this.trace
3202
+ }
2999
3203
  };
3000
3204
  }
3001
3205
  calculateDelay() {
@@ -3263,8 +3467,7 @@ function normalizeCodexLogFormat(value) {
3263
3467
  }
3264
3468
  function resolveMockConfig(target) {
3265
3469
  const response = typeof target.response === "string" ? target.response : void 0;
3266
- const trace = Array.isArray(target.trace) ? target.trace : void 0;
3267
- return { response, trace };
3470
+ return { response };
3268
3471
  }
3269
3472
  function resolveVSCodeConfig(target, env, insiders) {
3270
3473
  const workspaceTemplateEnvVar = resolveOptionalLiteralString(
@@ -3301,10 +3504,17 @@ function resolveCliConfig(target, env, evalFilePath) {
3301
3504
  const filesFormat = resolveOptionalLiteralString(
3302
3505
  target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
3303
3506
  );
3507
+ const verbose = resolveOptionalBoolean(target.verbose ?? target.cli_verbose ?? target.cliVerbose);
3508
+ const keepTempFiles = resolveOptionalBoolean(
3509
+ target.keep_temp_files ?? target.keepTempFiles ?? target.keep_output_files ?? target.keepOutputFiles
3510
+ );
3304
3511
  let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
3305
3512
  allowLiteral: true,
3306
3513
  optionalEnv: true
3307
3514
  });
3515
+ if (cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd)) {
3516
+ cwd = import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd);
3517
+ }
3308
3518
  if (!cwd && evalFilePath) {
3309
3519
  cwd = import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath));
3310
3520
  }
@@ -3312,7 +3522,7 @@ function resolveCliConfig(target, env, evalFilePath) {
3312
3522
  target.timeout_seconds ?? target.timeoutSeconds,
3313
3523
  `${target.name} timeout`
3314
3524
  );
3315
- const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name);
3525
+ const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name, evalFilePath);
3316
3526
  const commandTemplate = resolveString(
3317
3527
  commandTemplateSource,
3318
3528
  env,
@@ -3325,7 +3535,9 @@ function resolveCliConfig(target, env, evalFilePath) {
3325
3535
  filesFormat,
3326
3536
  cwd,
3327
3537
  timeoutMs,
3328
- healthcheck
3538
+ healthcheck,
3539
+ verbose,
3540
+ keepTempFiles
3329
3541
  };
3330
3542
  }
3331
3543
  function resolveTimeoutMs(source, description) {
@@ -3338,7 +3550,7 @@ function resolveTimeoutMs(source, description) {
3338
3550
  }
3339
3551
  return Math.floor(seconds * 1e3);
3340
3552
  }
3341
- function resolveCliHealthcheck(source, env, targetName) {
3553
+ function resolveCliHealthcheck(source, env, targetName, evalFilePath) {
3342
3554
  if (source === void 0 || source === null) {
3343
3555
  return void 0;
3344
3556
  }
@@ -3371,11 +3583,12 @@ function resolveCliHealthcheck(source, env, targetName) {
3371
3583
  allowLiteral: true,
3372
3584
  optionalEnv: true
3373
3585
  });
3586
+ const resolvedCwd = cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd) ? import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd) : cwd;
3374
3587
  return {
3375
3588
  type: "command",
3376
3589
  commandTemplate,
3377
3590
  timeoutMs,
3378
- cwd
3591
+ cwd: resolvedCwd
3379
3592
  };
3380
3593
  }
3381
3594
  throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
@@ -3619,7 +3832,7 @@ var VSCodeProvider = class {
3619
3832
  }
3620
3833
  if (this.config.dryRun) {
3621
3834
  return {
3622
- text: "",
3835
+ outputMessages: [],
3623
3836
  raw: {
3624
3837
  session,
3625
3838
  inputFiles
@@ -3628,7 +3841,7 @@ var VSCodeProvider = class {
3628
3841
  }
3629
3842
  const responseText = await readTextFile(session.responseFile);
3630
3843
  return {
3631
- text: responseText,
3844
+ outputMessages: [{ role: "assistant", content: responseText }],
3632
3845
  raw: {
3633
3846
  session,
3634
3847
  inputFiles
@@ -3666,7 +3879,7 @@ var VSCodeProvider = class {
3666
3879
  }
3667
3880
  if (this.config.dryRun) {
3668
3881
  return normalizedRequests.map(({ inputFiles }) => ({
3669
- text: "",
3882
+ outputMessages: [],
3670
3883
  raw: {
3671
3884
  session,
3672
3885
  inputFiles,
@@ -3683,7 +3896,7 @@ var VSCodeProvider = class {
3683
3896
  for (const [index, responseFile] of session.responseFiles.entries()) {
3684
3897
  const responseText = await readTextFile(responseFile);
3685
3898
  responses.push({
3686
- text: responseText,
3899
+ outputMessages: [{ role: "assistant", content: responseText }],
3687
3900
  raw: {
3688
3901
  session,
3689
3902
  inputFiles: normalizedRequests[index]?.inputFiles,
@@ -3923,6 +4136,33 @@ function resolveAndCreateProvider(definition, env = process.env) {
3923
4136
  // src/evaluation/evaluators.ts
3924
4137
  var import_ai2 = require("ai");
3925
4138
  var import_zod2 = require("zod");
4139
+
4140
+ // src/evaluation/providers/types.ts
4141
+ var AGENT_PROVIDER_KINDS = [
4142
+ "codex",
4143
+ "vscode",
4144
+ "vscode-insiders"
4145
+ ];
4146
+ function extractLastAssistantContent(messages) {
4147
+ if (!messages || messages.length === 0) {
4148
+ return "";
4149
+ }
4150
+ for (let i = messages.length - 1; i >= 0; i--) {
4151
+ const msg = messages[i];
4152
+ if (msg.role === "assistant" && msg.content !== void 0) {
4153
+ if (typeof msg.content === "string") {
4154
+ return msg.content;
4155
+ }
4156
+ return JSON.stringify(msg.content);
4157
+ }
4158
+ }
4159
+ return "";
4160
+ }
4161
+ function isAgentProvider(provider) {
4162
+ return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
4163
+ }
4164
+
4165
+ // src/evaluation/evaluators.ts
3926
4166
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
3927
4167
 
3928
4168
  Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
@@ -3987,6 +4227,7 @@ var LlmJudgeEvaluator = class {
3987
4227
  null,
3988
4228
  2
3989
4229
  ),
4230
+ [TEMPLATE_VARIABLES.OUTPUT_MESSAGES]: JSON.stringify(context.outputMessages ?? [], null, 2),
3990
4231
  [TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
3991
4232
  [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
3992
4233
  [TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
@@ -4011,7 +4252,7 @@ var LlmJudgeEvaluator = class {
4011
4252
  const score = clampScore(data.score);
4012
4253
  const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
4013
4254
  const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
4014
- const reasoning = data.reasoning ?? providerResponse?.reasoning;
4255
+ const reasoning = data.reasoning;
4015
4256
  const expectedAspectCount = Math.max(hits.length + misses.length, 1);
4016
4257
  return {
4017
4258
  score,
@@ -4113,7 +4354,9 @@ var LlmJudgeEvaluator = class {
4113
4354
  maxOutputTokens: this.maxOutputTokens,
4114
4355
  temperature: this.temperature
4115
4356
  });
4116
- const data = schema.parse(parseJsonFromText(response.text ?? ""));
4357
+ const data = schema.parse(
4358
+ parseJsonFromText(extractLastAssistantContent(response.outputMessages))
4359
+ );
4117
4360
  return { data, providerResponse: response };
4118
4361
  } catch (e) {
4119
4362
  lastError = e instanceof Error ? e : new Error(String(e));
@@ -4196,15 +4439,16 @@ var CodeEvaluator = class {
4196
4439
  {
4197
4440
  question: context.evalCase.question,
4198
4441
  expected_outcome: context.evalCase.expected_outcome,
4442
+ expected_messages: context.evalCase.expected_messages,
4199
4443
  reference_answer: context.evalCase.reference_answer,
4200
4444
  candidate_answer: context.candidate,
4445
+ output_messages: context.outputMessages ?? null,
4201
4446
  guideline_files: context.evalCase.guideline_paths,
4202
4447
  input_files: context.evalCase.file_paths.filter(
4203
4448
  (path15) => !context.evalCase.guideline_paths.includes(path15)
4204
4449
  ),
4205
4450
  input_messages: context.evalCase.input_messages,
4206
- candidate_trace_file: context.candidateTraceRef ?? null,
4207
- candidate_trace_summary: context.candidateTraceSummary ?? null
4451
+ candidate_trace_summary: context.traceSummary ?? null
4208
4452
  },
4209
4453
  null,
4210
4454
  2
@@ -4331,8 +4575,19 @@ var ToolTrajectoryEvaluator = class {
4331
4575
  this.config = options.config;
4332
4576
  }
4333
4577
  evaluate(context) {
4334
- const { candidateTrace, candidateTraceSummary } = context;
4335
- if (!candidateTrace || !candidateTraceSummary) {
4578
+ const { outputMessages, traceSummary } = context;
4579
+ const toolCalls = this.extractToolCallsFromMessages(outputMessages);
4580
+ if (toolCalls.length === 0 && !traceSummary) {
4581
+ return {
4582
+ score: 0,
4583
+ verdict: "fail",
4584
+ hits: [],
4585
+ misses: ["No trace available for evaluation"],
4586
+ expectedAspectCount: 1
4587
+ };
4588
+ }
4589
+ const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
4590
+ if (!summary) {
4336
4591
  return {
4337
4592
  score: 0,
4338
4593
  verdict: "fail",
@@ -4343,11 +4598,11 @@ var ToolTrajectoryEvaluator = class {
4343
4598
  }
4344
4599
  switch (this.config.mode) {
4345
4600
  case "any_order":
4346
- return this.evaluateAnyOrder(candidateTraceSummary);
4601
+ return this.evaluateAnyOrder(summary);
4347
4602
  case "in_order":
4348
- return this.evaluateInOrder(candidateTrace);
4603
+ return this.evaluateInOrder(toolCalls);
4349
4604
  case "exact":
4350
- return this.evaluateExact(candidateTrace);
4605
+ return this.evaluateExact(toolCalls);
4351
4606
  default:
4352
4607
  return {
4353
4608
  score: 0,
@@ -4358,6 +4613,39 @@ var ToolTrajectoryEvaluator = class {
4358
4613
  };
4359
4614
  }
4360
4615
  }
4616
+ /**
4617
+ * Extract tool calls from output messages.
4618
+ */
4619
+ extractToolCallsFromMessages(messages) {
4620
+ if (!messages) {
4621
+ return [];
4622
+ }
4623
+ const toolCalls = [];
4624
+ for (const message of messages) {
4625
+ if (message.toolCalls) {
4626
+ for (const call of message.toolCalls) {
4627
+ toolCalls.push({ name: call.tool });
4628
+ }
4629
+ }
4630
+ }
4631
+ return toolCalls;
4632
+ }
4633
+ /**
4634
+ * Build a summary from extracted tool calls.
4635
+ */
4636
+ buildSummary(toolCalls) {
4637
+ const toolCallsByName = {};
4638
+ for (const call of toolCalls) {
4639
+ toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
4640
+ }
4641
+ const toolNames = Object.keys(toolCallsByName).sort();
4642
+ return {
4643
+ eventCount: toolCalls.length,
4644
+ toolNames,
4645
+ toolCallsByName,
4646
+ errorCount: 0
4647
+ };
4648
+ }
4361
4649
  evaluateAnyOrder(summary) {
4362
4650
  const minimums = this.config.minimums ?? {};
4363
4651
  const toolNames = Object.keys(minimums);
@@ -4390,7 +4678,7 @@ var ToolTrajectoryEvaluator = class {
4390
4678
  expectedAspectCount: toolNames.length
4391
4679
  };
4392
4680
  }
4393
- evaluateInOrder(trace) {
4681
+ evaluateInOrder(toolCalls) {
4394
4682
  const expected = this.config.expected ?? [];
4395
4683
  if (expected.length === 0) {
4396
4684
  return {
@@ -4401,15 +4689,14 @@ var ToolTrajectoryEvaluator = class {
4401
4689
  expectedAspectCount: 0
4402
4690
  };
4403
4691
  }
4404
- const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
4405
4692
  const hits = [];
4406
4693
  const misses = [];
4407
4694
  let actualIndex = 0;
4408
4695
  for (let i = 0; i < expected.length; i++) {
4409
4696
  const expectedTool = expected[i].tool;
4410
4697
  let found = false;
4411
- while (actualIndex < actualToolCalls.length) {
4412
- if (actualToolCalls[actualIndex].name === expectedTool) {
4698
+ while (actualIndex < toolCalls.length) {
4699
+ if (toolCalls[actualIndex].name === expectedTool) {
4413
4700
  hits.push(`Found ${expectedTool} at position ${actualIndex}`);
4414
4701
  actualIndex++;
4415
4702
  found = true;
@@ -4430,7 +4717,7 @@ var ToolTrajectoryEvaluator = class {
4430
4717
  expectedAspectCount: expected.length
4431
4718
  };
4432
4719
  }
4433
- evaluateExact(trace) {
4720
+ evaluateExact(toolCalls) {
4434
4721
  const expected = this.config.expected ?? [];
4435
4722
  if (expected.length === 0) {
4436
4723
  return {
@@ -4441,16 +4728,15 @@ var ToolTrajectoryEvaluator = class {
4441
4728
  expectedAspectCount: 0
4442
4729
  };
4443
4730
  }
4444
- const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
4445
4731
  const hits = [];
4446
4732
  const misses = [];
4447
- if (actualToolCalls.length !== expected.length) {
4448
- misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
4733
+ if (toolCalls.length !== expected.length) {
4734
+ misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
4449
4735
  }
4450
- const checkLength = Math.min(expected.length, actualToolCalls.length);
4736
+ const checkLength = Math.min(expected.length, toolCalls.length);
4451
4737
  for (let i = 0; i < checkLength; i++) {
4452
4738
  const expectedTool = expected[i].tool;
4453
- const actualTool = actualToolCalls[i].name;
4739
+ const actualTool = toolCalls[i].name;
4454
4740
  if (actualTool === expectedTool) {
4455
4741
  hits.push(`Position ${i}: ${expectedTool} \u2713`);
4456
4742
  } else {
@@ -4664,11 +4950,13 @@ var CompositeEvaluator = class {
4664
4950
  evalCaseId: context.evalCase.id,
4665
4951
  attempt: context.attempt
4666
4952
  });
4667
- const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
4953
+ const data = freeformEvaluationSchema.parse(
4954
+ parseJsonFromText(extractLastAssistantContent(response.outputMessages))
4955
+ );
4668
4956
  const score = clampScore(data.score);
4669
4957
  const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
4670
4958
  const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
4671
- const reasoning = data.reasoning ?? response.reasoning;
4959
+ const reasoning = data.reasoning;
4672
4960
  return {
4673
4961
  score,
4674
4962
  verdict: scoreToVerdict(score),
@@ -4837,16 +5125,6 @@ function validateConcurrency(concurrency) {
4837
5125
  }
4838
5126
  }
4839
5127
 
4840
- // src/evaluation/providers/types.ts
4841
- var AGENT_PROVIDER_KINDS = [
4842
- "codex",
4843
- "vscode",
4844
- "vscode-insiders"
4845
- ];
4846
- function isAgentProvider(provider) {
4847
- return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
4848
- }
4849
-
4850
5128
  // src/evaluation/orchestrator.ts
4851
5129
  async function runEvaluation(options) {
4852
5130
  const {
@@ -5101,11 +5379,14 @@ async function runBatchEvaluation(options) {
5101
5379
  const evalCase = evalCases[i];
5102
5380
  const promptInputs = promptInputsList[i];
5103
5381
  const providerResponse = batchResponse[i];
5382
+ const outputMessages = providerResponse.outputMessages;
5383
+ const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
5384
+ const candidate = extractLastAssistantContent(outputMessages);
5104
5385
  let result;
5105
5386
  try {
5106
5387
  result = await evaluateCandidate({
5107
5388
  evalCase,
5108
- candidate: providerResponse.text ?? "",
5389
+ candidate,
5109
5390
  target,
5110
5391
  provider,
5111
5392
  evaluators: evaluatorRegistry,
@@ -5113,7 +5394,9 @@ async function runBatchEvaluation(options) {
5113
5394
  nowFn,
5114
5395
  attempt: 0,
5115
5396
  judgeProvider: await resolveJudgeProvider(target),
5116
- agentTimeoutMs
5397
+ agentTimeoutMs,
5398
+ outputMessages,
5399
+ traceSummary
5117
5400
  });
5118
5401
  } catch (error) {
5119
5402
  const errorResult = buildErrorResult(
@@ -5217,21 +5500,13 @@ async function runEvalCase(options) {
5217
5500
  if (cacheKey && cache && !cachedResponse) {
5218
5501
  await cache.set(cacheKey, providerResponse);
5219
5502
  }
5220
- let candidateTrace = providerResponse.trace;
5221
- if (!candidateTrace && providerResponse.traceRef) {
5222
- try {
5223
- const rawTrace = await readJsonFile(providerResponse.traceRef);
5224
- if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
5225
- candidateTrace = rawTrace;
5226
- }
5227
- } catch {
5228
- }
5229
- }
5230
- const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
5503
+ const outputMessages = providerResponse.outputMessages;
5504
+ const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
5505
+ const candidate = extractLastAssistantContent(outputMessages);
5231
5506
  try {
5232
5507
  return await evaluateCandidate({
5233
5508
  evalCase,
5234
- candidate: providerResponse.text ?? "",
5509
+ candidate,
5235
5510
  target,
5236
5511
  provider,
5237
5512
  evaluators,
@@ -5240,9 +5515,8 @@ async function runEvalCase(options) {
5240
5515
  attempt,
5241
5516
  judgeProvider,
5242
5517
  agentTimeoutMs,
5243
- candidateTrace,
5244
- candidateTraceRef: providerResponse.traceRef,
5245
- candidateTraceSummary
5518
+ outputMessages,
5519
+ traceSummary
5246
5520
  });
5247
5521
  } catch (error) {
5248
5522
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
@@ -5260,9 +5534,8 @@ async function evaluateCandidate(options) {
5260
5534
  attempt,
5261
5535
  judgeProvider,
5262
5536
  agentTimeoutMs,
5263
- candidateTrace,
5264
- candidateTraceRef,
5265
- candidateTraceSummary
5537
+ outputMessages,
5538
+ traceSummary
5266
5539
  } = options;
5267
5540
  const gradeTimestamp = nowFn();
5268
5541
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -5276,9 +5549,8 @@ async function evaluateCandidate(options) {
5276
5549
  now: gradeTimestamp,
5277
5550
  judgeProvider,
5278
5551
  agentTimeoutMs,
5279
- candidateTrace,
5280
- candidateTraceRef,
5281
- candidateTraceSummary
5552
+ outputMessages,
5553
+ traceSummary
5282
5554
  });
5283
5555
  const completedAt = nowFn();
5284
5556
  let agentProviderRequest;
@@ -5316,7 +5588,7 @@ async function evaluateCandidate(options) {
5316
5588
  lm_provider_request: lmProviderRequest,
5317
5589
  evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
5318
5590
  evaluator_results: evaluatorResults,
5319
- trace_summary: candidateTraceSummary
5591
+ trace_summary: traceSummary
5320
5592
  };
5321
5593
  }
5322
5594
  async function runEvaluatorsForCase(options) {
@@ -5331,9 +5603,8 @@ async function runEvaluatorsForCase(options) {
5331
5603
  now,
5332
5604
  judgeProvider,
5333
5605
  agentTimeoutMs,
5334
- candidateTrace,
5335
- candidateTraceRef,
5336
- candidateTraceSummary
5606
+ outputMessages,
5607
+ traceSummary
5337
5608
  } = options;
5338
5609
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
5339
5610
  return runEvaluatorList({
@@ -5348,9 +5619,8 @@ async function runEvaluatorsForCase(options) {
5348
5619
  now,
5349
5620
  judgeProvider,
5350
5621
  agentTimeoutMs,
5351
- candidateTrace,
5352
- candidateTraceRef,
5353
- candidateTraceSummary
5622
+ outputMessages,
5623
+ traceSummary
5354
5624
  });
5355
5625
  }
5356
5626
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -5367,9 +5637,8 @@ async function runEvaluatorsForCase(options) {
5367
5637
  promptInputs,
5368
5638
  now,
5369
5639
  judgeProvider,
5370
- candidateTrace,
5371
- candidateTraceRef,
5372
- candidateTraceSummary
5640
+ outputMessages,
5641
+ traceSummary
5373
5642
  });
5374
5643
  return { score };
5375
5644
  }
@@ -5386,9 +5655,8 @@ async function runEvaluatorList(options) {
5386
5655
  now,
5387
5656
  judgeProvider,
5388
5657
  agentTimeoutMs,
5389
- candidateTrace,
5390
- candidateTraceRef,
5391
- candidateTraceSummary
5658
+ outputMessages,
5659
+ traceSummary
5392
5660
  } = options;
5393
5661
  const scored = [];
5394
5662
  const evaluatorResults = [];
@@ -5435,8 +5703,8 @@ async function runEvaluatorList(options) {
5435
5703
  attempt,
5436
5704
  promptInputs,
5437
5705
  now,
5438
- candidateTraceRef,
5439
- candidateTraceSummary
5706
+ outputMessages,
5707
+ traceSummary
5440
5708
  });
5441
5709
  const weight = evaluator.weight ?? 1;
5442
5710
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -5522,9 +5790,8 @@ async function runEvaluatorList(options) {
5522
5790
  attempt,
5523
5791
  promptInputs,
5524
5792
  now,
5525
- candidateTrace,
5526
- candidateTraceRef,
5527
- candidateTraceSummary
5793
+ outputMessages,
5794
+ traceSummary
5528
5795
  });
5529
5796
  const weight = evaluator.weight ?? 1;
5530
5797
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -5918,8 +6185,6 @@ function createAgentKernel() {
5918
6185
  isJsonValue,
5919
6186
  isTestMessage,
5920
6187
  isTestMessageRole,
5921
- isTraceEvent,
5922
- isTraceEventType,
5923
6188
  listTargetNames,
5924
6189
  loadEvalCases,
5925
6190
  normalizeLineEndings,