@agentv/core 1.2.0 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,6 +1,7 @@
1
1
  import {
2
2
  buildDirectoryChain,
3
3
  buildSearchRoots,
4
+ extractLastAssistantContent,
4
5
  fileExists,
5
6
  findGitRoot,
6
7
  isAgentProvider,
@@ -9,7 +10,7 @@ import {
9
10
  readTextFile,
10
11
  resolveFileReference,
11
12
  resolveTargetDefinition
12
- } from "./chunk-V3JCB3HI.js";
13
+ } from "./chunk-KPHTMTZ3.js";
13
14
 
14
15
  // src/evaluation/types.ts
15
16
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -74,33 +75,22 @@ function getHitCount(result) {
74
75
  }
75
76
 
76
77
  // src/evaluation/trace.ts
77
- function isTraceEventType(value) {
78
- return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
79
- }
80
- function isTraceEvent(value) {
81
- if (typeof value !== "object" || value === null) {
82
- return false;
83
- }
84
- const candidate = value;
85
- return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
86
- }
87
- function computeTraceSummary(trace) {
78
+ function computeTraceSummary(messages) {
88
79
  const toolCallCounts = {};
89
- let errorCount = 0;
90
- for (const event of trace) {
91
- if (event.type === "tool_call" && event.name) {
92
- toolCallCounts[event.name] = (toolCallCounts[event.name] ?? 0) + 1;
93
- }
94
- if (event.type === "error") {
95
- errorCount++;
80
+ let totalToolCalls = 0;
81
+ for (const message of messages) {
82
+ if (!message.toolCalls) continue;
83
+ for (const toolCall of message.toolCalls) {
84
+ toolCallCounts[toolCall.tool] = (toolCallCounts[toolCall.tool] ?? 0) + 1;
85
+ totalToolCalls++;
96
86
  }
97
87
  }
98
88
  const toolNames = Object.keys(toolCallCounts).sort();
99
89
  return {
100
- eventCount: trace.length,
90
+ eventCount: totalToolCalls,
101
91
  toolNames,
102
92
  toolCallsByName: toolCallCounts,
103
- errorCount
93
+ errorCount: 0
104
94
  };
105
95
  }
106
96
 
@@ -376,7 +366,8 @@ var TEMPLATE_VARIABLES = {
376
366
  QUESTION: "question",
377
367
  EXPECTED_OUTCOME: "expected_outcome",
378
368
  REFERENCE_ANSWER: "reference_answer",
379
- INPUT_MESSAGES: "input_messages"
369
+ INPUT_MESSAGES: "input_messages",
370
+ OUTPUT_MESSAGES: "output_messages"
380
371
  };
381
372
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
382
373
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
@@ -774,6 +765,17 @@ async function processMessages(options) {
774
765
  }
775
766
  continue;
776
767
  }
768
+ if (isJsonObject(content)) {
769
+ const rendered = JSON.stringify(content, null, 2);
770
+ segments.push({ type: "text", value: rendered });
771
+ if (textParts) {
772
+ textParts.push(rendered);
773
+ }
774
+ continue;
775
+ }
776
+ if (!Array.isArray(content)) {
777
+ continue;
778
+ }
777
779
  for (const rawSegment of content) {
778
780
  if (!isJsonObject(rawSegment)) {
779
781
  continue;
@@ -1000,6 +1002,11 @@ async function buildPromptInputs(testCase, mode = "lm") {
1000
1002
  }
1001
1003
  }
1002
1004
  }
1005
+ } else if (isJsonObject(message.content)) {
1006
+ const rendered = JSON.stringify(message.content, null, 2);
1007
+ if (rendered.trim().length > 0) {
1008
+ messageSegments.push({ type: "text", value: rendered });
1009
+ }
1003
1010
  }
1004
1011
  segmentsByMessage.push(messageSegments);
1005
1012
  }
@@ -1243,16 +1250,16 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1243
1250
  }) : [];
1244
1251
  const codeSnippets = extractCodeBlocks(inputSegments);
1245
1252
  let referenceAnswer = "";
1246
- if (outputSegments.length > 1) {
1247
- referenceAnswer = JSON.stringify(outputSegments, null, 2);
1248
- } else if (outputSegments.length === 1) {
1249
- const singleMessage = outputSegments[0];
1250
- if (typeof singleMessage.content === "string") {
1251
- referenceAnswer = singleMessage.content;
1252
- } else if (singleMessage.content) {
1253
- referenceAnswer = JSON.stringify(singleMessage, null, 2);
1254
- } else if (singleMessage.tool_calls) {
1255
- referenceAnswer = JSON.stringify(singleMessage, null, 2);
1253
+ if (outputSegments.length > 0) {
1254
+ const lastMessage = outputSegments[outputSegments.length - 1];
1255
+ const content = lastMessage.content;
1256
+ const toolCalls = lastMessage.tool_calls;
1257
+ if (typeof content === "string") {
1258
+ referenceAnswer = content;
1259
+ } else if (content !== void 0 && content !== null) {
1260
+ referenceAnswer = JSON.stringify(content, null, 2);
1261
+ } else if (toolCalls !== void 0 && toolCalls !== null) {
1262
+ referenceAnswer = JSON.stringify(toolCalls, null, 2);
1256
1263
  }
1257
1264
  }
1258
1265
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
@@ -1580,11 +1587,11 @@ async function invokeModel(options) {
1580
1587
  return mapResponse(result);
1581
1588
  }
1582
1589
  function mapResponse(result) {
1590
+ const content = result.text ?? "";
1583
1591
  return {
1584
- text: result.text ?? "",
1585
- reasoning: result.reasoningText ?? void 0,
1586
1592
  raw: result,
1587
- usage: toJsonObject(result.totalUsage ?? result.usage)
1593
+ usage: toJsonObject(result.totalUsage ?? result.usage),
1594
+ outputMessages: [{ role: "assistant", content }]
1588
1595
  };
1589
1596
  }
1590
1597
  function toJsonObject(value) {
@@ -1733,10 +1740,11 @@ var CliProvider = class {
1733
1740
  id;
1734
1741
  kind = "cli";
1735
1742
  targetName;
1736
- supportsBatch = false;
1743
+ supportsBatch = true;
1737
1744
  config;
1738
1745
  runCommand;
1739
1746
  verbose;
1747
+ keepTempFiles;
1740
1748
  healthcheckPromise;
1741
1749
  constructor(targetName, config, runner = defaultCommandRunner) {
1742
1750
  this.targetName = targetName;
@@ -1744,6 +1752,7 @@ var CliProvider = class {
1744
1752
  this.config = config;
1745
1753
  this.runCommand = runner;
1746
1754
  this.verbose = config.verbose ?? false;
1755
+ this.keepTempFiles = config.keepTempFiles ?? false;
1747
1756
  }
1748
1757
  async invoke(request) {
1749
1758
  if (request.signal?.aborted) {
@@ -1753,6 +1762,11 @@ var CliProvider = class {
1753
1762
  const outputFilePath = generateOutputFilePath(request.evalCaseId);
1754
1763
  const templateValues = buildTemplateValues(request, this.config, outputFilePath);
1755
1764
  const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
1765
+ if (this.verbose) {
1766
+ console.log(
1767
+ `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
1768
+ );
1769
+ }
1756
1770
  const result = await this.runCommand(renderedCommand, {
1757
1771
  cwd: this.config.cwd,
1758
1772
  env: process.env,
@@ -1776,8 +1790,7 @@ var CliProvider = class {
1776
1790
  const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
1777
1791
  const parsed = this.parseOutputContent(responseContent);
1778
1792
  return {
1779
- text: parsed.text,
1780
- trace: parsed.trace,
1793
+ outputMessages: parsed.outputMessages,
1781
1794
  raw: {
1782
1795
  command: renderedCommand,
1783
1796
  stderr: result.stderr,
@@ -1787,30 +1800,225 @@ var CliProvider = class {
1787
1800
  }
1788
1801
  };
1789
1802
  }
1803
+ async invokeBatch(requests) {
1804
+ if (requests.length === 0) {
1805
+ return [];
1806
+ }
1807
+ for (const request of requests) {
1808
+ if (request.signal?.aborted) {
1809
+ throw new Error("CLI provider batch request was aborted before execution");
1810
+ }
1811
+ }
1812
+ const controller = new AbortController();
1813
+ for (const request of requests) {
1814
+ request.signal?.addEventListener("abort", () => controller.abort(), { once: true });
1815
+ }
1816
+ await this.ensureHealthy(controller.signal);
1817
+ const outputFilePath = generateOutputFilePath("batch", ".jsonl");
1818
+ const batchInputFiles = [];
1819
+ for (const request of requests) {
1820
+ if (request.inputFiles && request.inputFiles.length > 0) {
1821
+ batchInputFiles.push(...request.inputFiles);
1822
+ }
1823
+ }
1824
+ const templateValues = buildTemplateValues(
1825
+ {
1826
+ question: "",
1827
+ guidelines: "",
1828
+ inputFiles: batchInputFiles,
1829
+ evalCaseId: "batch",
1830
+ attempt: 0
1831
+ },
1832
+ this.config,
1833
+ outputFilePath
1834
+ );
1835
+ const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
1836
+ if (this.verbose) {
1837
+ console.log(
1838
+ `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
1839
+ );
1840
+ }
1841
+ const result = await this.runCommand(renderedCommand, {
1842
+ cwd: this.config.cwd,
1843
+ env: process.env,
1844
+ timeoutMs: this.config.timeoutMs,
1845
+ signal: controller.signal
1846
+ });
1847
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
1848
+ if (controller.signal.aborted) {
1849
+ throw new Error("CLI provider request was aborted");
1850
+ }
1851
+ if (result.timedOut) {
1852
+ throw new Error(
1853
+ `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
1854
+ );
1855
+ }
1856
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
1857
+ const detail = result.stderr.trim() || result.stdout.trim();
1858
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
1859
+ throw new Error(message);
1860
+ }
1861
+ const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
1862
+ const recordsById = this.parseJsonlBatchOutput(responseContent);
1863
+ const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
1864
+ const missingIds = requestedIds.filter((id) => !recordsById.has(id));
1865
+ if (missingIds.length > 0) {
1866
+ throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
1867
+ }
1868
+ const responses = requests.map((request) => {
1869
+ const evalCaseId = request.evalCaseId;
1870
+ if (!evalCaseId) {
1871
+ return {
1872
+ outputMessages: [],
1873
+ raw: {
1874
+ command: renderedCommand,
1875
+ stderr: result.stderr,
1876
+ exitCode: result.exitCode ?? 0,
1877
+ cwd: this.config.cwd,
1878
+ outputFile: outputFilePath
1879
+ }
1880
+ };
1881
+ }
1882
+ const parsed = recordsById.get(evalCaseId);
1883
+ if (!parsed) {
1884
+ return {
1885
+ outputMessages: [],
1886
+ raw: {
1887
+ command: renderedCommand,
1888
+ stderr: result.stderr,
1889
+ exitCode: result.exitCode ?? 0,
1890
+ cwd: this.config.cwd,
1891
+ outputFile: outputFilePath
1892
+ }
1893
+ };
1894
+ }
1895
+ return {
1896
+ outputMessages: parsed.outputMessages,
1897
+ raw: {
1898
+ command: renderedCommand,
1899
+ stderr: result.stderr,
1900
+ exitCode: result.exitCode ?? 0,
1901
+ cwd: this.config.cwd,
1902
+ outputFile: outputFilePath,
1903
+ recordId: evalCaseId
1904
+ }
1905
+ };
1906
+ });
1907
+ return responses;
1908
+ }
1790
1909
  /**
1791
1910
  * Parse output content from CLI.
1792
- * If the content is valid JSON with a 'text' field, extract text and optional trace.
1793
- * Otherwise, treat the entire content as plain text.
1911
+ * If the content is valid JSON with 'output_messages' or 'text' field, extract them.
1912
+ * If only 'text' is provided, wrap it in outputMessages.
1913
+ * Otherwise, treat the entire content as plain text wrapped in outputMessages.
1794
1914
  */
1795
1915
  parseOutputContent(content) {
1796
1916
  try {
1797
1917
  const parsed = JSON.parse(content);
1798
- if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
1918
+ if (typeof parsed === "object" && parsed !== null) {
1799
1919
  const obj = parsed;
1800
- const text = typeof obj.text === "string" ? obj.text : String(obj.text);
1801
- const trace = this.parseTrace(obj.trace);
1802
- return { text, trace };
1920
+ const outputMessages = this.parseOutputMessages(obj.output_messages);
1921
+ if (outputMessages && outputMessages.length > 0) {
1922
+ return { outputMessages };
1923
+ }
1924
+ if ("text" in obj) {
1925
+ const text = typeof obj.text === "string" ? obj.text : String(obj.text);
1926
+ return { outputMessages: [{ role: "assistant", content: text }] };
1927
+ }
1803
1928
  }
1804
1929
  } catch {
1805
1930
  }
1806
- return { text: content };
1931
+ return { outputMessages: [{ role: "assistant", content }] };
1807
1932
  }
1808
- parseTrace(trace) {
1809
- if (!Array.isArray(trace)) {
1933
+ /**
1934
+ * Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
1935
+ */
1936
+ parseOutputMessages(outputMessages) {
1937
+ if (!Array.isArray(outputMessages)) {
1810
1938
  return void 0;
1811
1939
  }
1812
- const validEvents = trace.filter(isTraceEvent);
1813
- return validEvents.length > 0 ? validEvents : void 0;
1940
+ const messages = [];
1941
+ for (const msg of outputMessages) {
1942
+ if (typeof msg !== "object" || msg === null) {
1943
+ continue;
1944
+ }
1945
+ const rawMsg = msg;
1946
+ if (typeof rawMsg.role !== "string") {
1947
+ continue;
1948
+ }
1949
+ const message = {
1950
+ role: rawMsg.role,
1951
+ name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
1952
+ content: rawMsg.content,
1953
+ toolCalls: this.parseToolCalls(rawMsg.tool_calls),
1954
+ timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
1955
+ metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
1956
+ };
1957
+ messages.push(message);
1958
+ }
1959
+ return messages.length > 0 ? messages : void 0;
1960
+ }
1961
+ /**
1962
+ * Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
1963
+ */
1964
+ parseToolCalls(toolCalls) {
1965
+ if (!Array.isArray(toolCalls)) {
1966
+ return void 0;
1967
+ }
1968
+ const calls = [];
1969
+ for (const call of toolCalls) {
1970
+ if (typeof call !== "object" || call === null) {
1971
+ continue;
1972
+ }
1973
+ const rawCall = call;
1974
+ if (typeof rawCall.tool !== "string") {
1975
+ continue;
1976
+ }
1977
+ calls.push({
1978
+ tool: rawCall.tool,
1979
+ input: rawCall.input,
1980
+ output: rawCall.output,
1981
+ id: typeof rawCall.id === "string" ? rawCall.id : void 0,
1982
+ timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
1983
+ });
1984
+ }
1985
+ return calls.length > 0 ? calls : void 0;
1986
+ }
1987
+ parseJsonlBatchOutput(content) {
1988
+ const records = /* @__PURE__ */ new Map();
1989
+ const lines = content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
1990
+ for (const line of lines) {
1991
+ let parsed;
1992
+ try {
1993
+ parsed = JSON.parse(line);
1994
+ } catch (error) {
1995
+ const reason = error instanceof Error ? error.message : String(error);
1996
+ throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
1997
+ }
1998
+ if (typeof parsed !== "object" || parsed === null) {
1999
+ throw new Error("CLI batch output JSONL line must be an object");
2000
+ }
2001
+ const obj = parsed;
2002
+ const id = typeof obj.id === "string" ? obj.id : void 0;
2003
+ if (!id || id.trim().length === 0) {
2004
+ throw new Error("CLI batch output JSONL line missing required string field: id");
2005
+ }
2006
+ if (records.has(id)) {
2007
+ throw new Error(`CLI batch output contains duplicate id: ${id}`);
2008
+ }
2009
+ const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
2010
+ let outputMessages;
2011
+ if (parsedOutputMessages && parsedOutputMessages.length > 0) {
2012
+ outputMessages = parsedOutputMessages;
2013
+ } else {
2014
+ const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
2015
+ outputMessages = text ? [{ role: "assistant", content: text }] : [];
2016
+ }
2017
+ records.set(id, {
2018
+ outputMessages
2019
+ });
2020
+ }
2021
+ return records;
1814
2022
  }
1815
2023
  async readAndCleanupOutputFile(filePath) {
1816
2024
  try {
@@ -1820,8 +2028,10 @@ var CliProvider = class {
1820
2028
  const errorMsg = error instanceof Error ? error.message : String(error);
1821
2029
  throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
1822
2030
  } finally {
1823
- await fs.unlink(filePath).catch(() => {
1824
- });
2031
+ if (!this.keepTempFiles) {
2032
+ await fs.unlink(filePath).catch(() => {
2033
+ });
2034
+ }
1825
2035
  }
1826
2036
  }
1827
2037
  async ensureHealthy(signal) {
@@ -1873,7 +2083,7 @@ var CliProvider = class {
1873
2083
  );
1874
2084
  if (this.verbose) {
1875
2085
  console.log(
1876
- `[cli-provider:${this.targetName}] (healthcheck) CLI_EVALS_DIR=${process.env.CLI_EVALS_DIR ?? ""} cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
2086
+ `[cli-provider:${this.targetName}] (healthcheck) cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
1877
2087
  );
1878
2088
  }
1879
2089
  const result = await this.runCommand(renderedCommand, {
@@ -1941,11 +2151,11 @@ function shellEscape(value) {
1941
2151
  }
1942
2152
  return `'${value.replace(/'/g, `'"'"'`)}'`;
1943
2153
  }
1944
- function generateOutputFilePath(evalCaseId) {
2154
+ function generateOutputFilePath(evalCaseId, extension = ".json") {
1945
2155
  const safeEvalId = evalCaseId || "unknown";
1946
2156
  const timestamp = Date.now();
1947
2157
  const random = Math.random().toString(36).substring(2, 9);
1948
- return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
2158
+ return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
1949
2159
  }
1950
2160
  function formatTimeoutSuffix(timeoutMs) {
1951
2161
  if (!timeoutMs || timeoutMs <= 0) {
@@ -2164,7 +2374,6 @@ var CodexProvider = class {
2164
2374
  const parsed = parseCodexJson(result.stdout);
2165
2375
  const assistantText = extractAssistantText(parsed);
2166
2376
  return {
2167
- text: assistantText,
2168
2377
  raw: {
2169
2378
  response: parsed,
2170
2379
  stdout: result.stdout,
@@ -2176,7 +2385,8 @@ var CodexProvider = class {
2176
2385
  workspace: workspaceRoot,
2177
2386
  inputFiles,
2178
2387
  logFile: logger?.filePath
2179
- }
2388
+ },
2389
+ outputMessages: [{ role: "assistant", content: assistantText }]
2180
2390
  };
2181
2391
  } finally {
2182
2392
  await logger?.close();
@@ -2798,7 +3008,6 @@ var MockProvider = class {
2798
3008
  delayMs;
2799
3009
  delayMinMs;
2800
3010
  delayMaxMs;
2801
- trace;
2802
3011
  constructor(targetName, config) {
2803
3012
  this.id = `mock:${targetName}`;
2804
3013
  this.targetName = targetName;
@@ -2806,7 +3015,6 @@ var MockProvider = class {
2806
3015
  this.delayMs = config.delayMs ?? 0;
2807
3016
  this.delayMinMs = config.delayMinMs ?? 0;
2808
3017
  this.delayMaxMs = config.delayMaxMs ?? 0;
2809
- this.trace = config.trace;
2810
3018
  }
2811
3019
  async invoke(request) {
2812
3020
  const delay = this.calculateDelay();
@@ -2814,12 +3022,11 @@ var MockProvider = class {
2814
3022
  await new Promise((resolve) => setTimeout(resolve, delay));
2815
3023
  }
2816
3024
  return {
2817
- text: this.cannedResponse,
3025
+ outputMessages: [{ role: "assistant", content: this.cannedResponse }],
2818
3026
  raw: {
2819
3027
  question: request.question,
2820
3028
  guidelines: request.guidelines
2821
- },
2822
- trace: this.trace
3029
+ }
2823
3030
  };
2824
3031
  }
2825
3032
  calculateDelay() {
@@ -2912,7 +3119,7 @@ var VSCodeProvider = class {
2912
3119
  }
2913
3120
  if (this.config.dryRun) {
2914
3121
  return {
2915
- text: "",
3122
+ outputMessages: [],
2916
3123
  raw: {
2917
3124
  session,
2918
3125
  inputFiles
@@ -2921,7 +3128,7 @@ var VSCodeProvider = class {
2921
3128
  }
2922
3129
  const responseText = await readTextFile(session.responseFile);
2923
3130
  return {
2924
- text: responseText,
3131
+ outputMessages: [{ role: "assistant", content: responseText }],
2925
3132
  raw: {
2926
3133
  session,
2927
3134
  inputFiles
@@ -2959,7 +3166,7 @@ var VSCodeProvider = class {
2959
3166
  }
2960
3167
  if (this.config.dryRun) {
2961
3168
  return normalizedRequests.map(({ inputFiles }) => ({
2962
- text: "",
3169
+ outputMessages: [],
2963
3170
  raw: {
2964
3171
  session,
2965
3172
  inputFiles,
@@ -2976,7 +3183,7 @@ var VSCodeProvider = class {
2976
3183
  for (const [index, responseFile] of session.responseFiles.entries()) {
2977
3184
  const responseText = await readTextFile(responseFile);
2978
3185
  responses.push({
2979
- text: responseText,
3186
+ outputMessages: [{ role: "assistant", content: responseText }],
2980
3187
  raw: {
2981
3188
  session,
2982
3189
  inputFiles: normalizedRequests[index]?.inputFiles,
@@ -3280,6 +3487,7 @@ var LlmJudgeEvaluator = class {
3280
3487
  null,
3281
3488
  2
3282
3489
  ),
3490
+ [TEMPLATE_VARIABLES.OUTPUT_MESSAGES]: JSON.stringify(context.outputMessages ?? [], null, 2),
3283
3491
  [TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
3284
3492
  [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
3285
3493
  [TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
@@ -3304,7 +3512,7 @@ var LlmJudgeEvaluator = class {
3304
3512
  const score = clampScore(data.score);
3305
3513
  const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
3306
3514
  const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
3307
- const reasoning = data.reasoning ?? providerResponse?.reasoning;
3515
+ const reasoning = data.reasoning;
3308
3516
  const expectedAspectCount = Math.max(hits.length + misses.length, 1);
3309
3517
  return {
3310
3518
  score,
@@ -3406,7 +3614,9 @@ var LlmJudgeEvaluator = class {
3406
3614
  maxOutputTokens: this.maxOutputTokens,
3407
3615
  temperature: this.temperature
3408
3616
  });
3409
- const data = schema.parse(parseJsonFromText(response.text ?? ""));
3617
+ const data = schema.parse(
3618
+ parseJsonFromText(extractLastAssistantContent(response.outputMessages))
3619
+ );
3410
3620
  return { data, providerResponse: response };
3411
3621
  } catch (e) {
3412
3622
  lastError = e instanceof Error ? e : new Error(String(e));
@@ -3489,15 +3699,16 @@ var CodeEvaluator = class {
3489
3699
  {
3490
3700
  question: context.evalCase.question,
3491
3701
  expected_outcome: context.evalCase.expected_outcome,
3702
+ expected_messages: context.evalCase.expected_messages,
3492
3703
  reference_answer: context.evalCase.reference_answer,
3493
3704
  candidate_answer: context.candidate,
3705
+ output_messages: context.outputMessages ?? null,
3494
3706
  guideline_files: context.evalCase.guideline_paths,
3495
3707
  input_files: context.evalCase.file_paths.filter(
3496
3708
  (path13) => !context.evalCase.guideline_paths.includes(path13)
3497
3709
  ),
3498
3710
  input_messages: context.evalCase.input_messages,
3499
- candidate_trace_file: context.candidateTraceRef ?? null,
3500
- candidate_trace_summary: context.candidateTraceSummary ?? null
3711
+ candidate_trace_summary: context.traceSummary ?? null
3501
3712
  },
3502
3713
  null,
3503
3714
  2
@@ -3624,8 +3835,19 @@ var ToolTrajectoryEvaluator = class {
3624
3835
  this.config = options.config;
3625
3836
  }
3626
3837
  evaluate(context) {
3627
- const { candidateTrace, candidateTraceSummary } = context;
3628
- if (!candidateTrace || !candidateTraceSummary) {
3838
+ const { outputMessages, traceSummary } = context;
3839
+ const toolCalls = this.extractToolCallsFromMessages(outputMessages);
3840
+ if (toolCalls.length === 0 && !traceSummary) {
3841
+ return {
3842
+ score: 0,
3843
+ verdict: "fail",
3844
+ hits: [],
3845
+ misses: ["No trace available for evaluation"],
3846
+ expectedAspectCount: 1
3847
+ };
3848
+ }
3849
+ const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
3850
+ if (!summary) {
3629
3851
  return {
3630
3852
  score: 0,
3631
3853
  verdict: "fail",
@@ -3636,11 +3858,11 @@ var ToolTrajectoryEvaluator = class {
3636
3858
  }
3637
3859
  switch (this.config.mode) {
3638
3860
  case "any_order":
3639
- return this.evaluateAnyOrder(candidateTraceSummary);
3861
+ return this.evaluateAnyOrder(summary);
3640
3862
  case "in_order":
3641
- return this.evaluateInOrder(candidateTrace);
3863
+ return this.evaluateInOrder(toolCalls);
3642
3864
  case "exact":
3643
- return this.evaluateExact(candidateTrace);
3865
+ return this.evaluateExact(toolCalls);
3644
3866
  default:
3645
3867
  return {
3646
3868
  score: 0,
@@ -3651,6 +3873,39 @@ var ToolTrajectoryEvaluator = class {
3651
3873
  };
3652
3874
  }
3653
3875
  }
3876
+ /**
3877
+ * Extract tool calls from output messages.
3878
+ */
3879
+ extractToolCallsFromMessages(messages) {
3880
+ if (!messages) {
3881
+ return [];
3882
+ }
3883
+ const toolCalls = [];
3884
+ for (const message of messages) {
3885
+ if (message.toolCalls) {
3886
+ for (const call of message.toolCalls) {
3887
+ toolCalls.push({ name: call.tool });
3888
+ }
3889
+ }
3890
+ }
3891
+ return toolCalls;
3892
+ }
3893
+ /**
3894
+ * Build a summary from extracted tool calls.
3895
+ */
3896
+ buildSummary(toolCalls) {
3897
+ const toolCallsByName = {};
3898
+ for (const call of toolCalls) {
3899
+ toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
3900
+ }
3901
+ const toolNames = Object.keys(toolCallsByName).sort();
3902
+ return {
3903
+ eventCount: toolCalls.length,
3904
+ toolNames,
3905
+ toolCallsByName,
3906
+ errorCount: 0
3907
+ };
3908
+ }
3654
3909
  evaluateAnyOrder(summary) {
3655
3910
  const minimums = this.config.minimums ?? {};
3656
3911
  const toolNames = Object.keys(minimums);
@@ -3683,7 +3938,7 @@ var ToolTrajectoryEvaluator = class {
3683
3938
  expectedAspectCount: toolNames.length
3684
3939
  };
3685
3940
  }
3686
- evaluateInOrder(trace) {
3941
+ evaluateInOrder(toolCalls) {
3687
3942
  const expected = this.config.expected ?? [];
3688
3943
  if (expected.length === 0) {
3689
3944
  return {
@@ -3694,15 +3949,14 @@ var ToolTrajectoryEvaluator = class {
3694
3949
  expectedAspectCount: 0
3695
3950
  };
3696
3951
  }
3697
- const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
3698
3952
  const hits = [];
3699
3953
  const misses = [];
3700
3954
  let actualIndex = 0;
3701
3955
  for (let i = 0; i < expected.length; i++) {
3702
3956
  const expectedTool = expected[i].tool;
3703
3957
  let found = false;
3704
- while (actualIndex < actualToolCalls.length) {
3705
- if (actualToolCalls[actualIndex].name === expectedTool) {
3958
+ while (actualIndex < toolCalls.length) {
3959
+ if (toolCalls[actualIndex].name === expectedTool) {
3706
3960
  hits.push(`Found ${expectedTool} at position ${actualIndex}`);
3707
3961
  actualIndex++;
3708
3962
  found = true;
@@ -3723,7 +3977,7 @@ var ToolTrajectoryEvaluator = class {
3723
3977
  expectedAspectCount: expected.length
3724
3978
  };
3725
3979
  }
3726
- evaluateExact(trace) {
3980
+ evaluateExact(toolCalls) {
3727
3981
  const expected = this.config.expected ?? [];
3728
3982
  if (expected.length === 0) {
3729
3983
  return {
@@ -3734,16 +3988,15 @@ var ToolTrajectoryEvaluator = class {
3734
3988
  expectedAspectCount: 0
3735
3989
  };
3736
3990
  }
3737
- const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
3738
3991
  const hits = [];
3739
3992
  const misses = [];
3740
- if (actualToolCalls.length !== expected.length) {
3741
- misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
3993
+ if (toolCalls.length !== expected.length) {
3994
+ misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
3742
3995
  }
3743
- const checkLength = Math.min(expected.length, actualToolCalls.length);
3996
+ const checkLength = Math.min(expected.length, toolCalls.length);
3744
3997
  for (let i = 0; i < checkLength; i++) {
3745
3998
  const expectedTool = expected[i].tool;
3746
- const actualTool = actualToolCalls[i].name;
3999
+ const actualTool = toolCalls[i].name;
3747
4000
  if (actualTool === expectedTool) {
3748
4001
  hits.push(`Position ${i}: ${expectedTool} \u2713`);
3749
4002
  } else {
@@ -3957,11 +4210,13 @@ var CompositeEvaluator = class {
3957
4210
  evalCaseId: context.evalCase.id,
3958
4211
  attempt: context.attempt
3959
4212
  });
3960
- const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
4213
+ const data = freeformEvaluationSchema.parse(
4214
+ parseJsonFromText(extractLastAssistantContent(response.outputMessages))
4215
+ );
3961
4216
  const score = clampScore(data.score);
3962
4217
  const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
3963
4218
  const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
3964
- const reasoning = data.reasoning ?? response.reasoning;
4219
+ const reasoning = data.reasoning;
3965
4220
  return {
3966
4221
  score,
3967
4222
  verdict: scoreToVerdict(score),
@@ -4384,11 +4639,14 @@ async function runBatchEvaluation(options) {
4384
4639
  const evalCase = evalCases[i];
4385
4640
  const promptInputs = promptInputsList[i];
4386
4641
  const providerResponse = batchResponse[i];
4642
+ const outputMessages = providerResponse.outputMessages;
4643
+ const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
4644
+ const candidate = extractLastAssistantContent(outputMessages);
4387
4645
  let result;
4388
4646
  try {
4389
4647
  result = await evaluateCandidate({
4390
4648
  evalCase,
4391
- candidate: providerResponse.text ?? "",
4649
+ candidate,
4392
4650
  target,
4393
4651
  provider,
4394
4652
  evaluators: evaluatorRegistry,
@@ -4396,7 +4654,9 @@ async function runBatchEvaluation(options) {
4396
4654
  nowFn,
4397
4655
  attempt: 0,
4398
4656
  judgeProvider: await resolveJudgeProvider(target),
4399
- agentTimeoutMs
4657
+ agentTimeoutMs,
4658
+ outputMessages,
4659
+ traceSummary
4400
4660
  });
4401
4661
  } catch (error) {
4402
4662
  const errorResult = buildErrorResult(
@@ -4500,21 +4760,13 @@ async function runEvalCase(options) {
4500
4760
  if (cacheKey && cache && !cachedResponse) {
4501
4761
  await cache.set(cacheKey, providerResponse);
4502
4762
  }
4503
- let candidateTrace = providerResponse.trace;
4504
- if (!candidateTrace && providerResponse.traceRef) {
4505
- try {
4506
- const rawTrace = await readJsonFile(providerResponse.traceRef);
4507
- if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
4508
- candidateTrace = rawTrace;
4509
- }
4510
- } catch {
4511
- }
4512
- }
4513
- const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
4763
+ const outputMessages = providerResponse.outputMessages;
4764
+ const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
4765
+ const candidate = extractLastAssistantContent(outputMessages);
4514
4766
  try {
4515
4767
  return await evaluateCandidate({
4516
4768
  evalCase,
4517
- candidate: providerResponse.text ?? "",
4769
+ candidate,
4518
4770
  target,
4519
4771
  provider,
4520
4772
  evaluators,
@@ -4523,9 +4775,8 @@ async function runEvalCase(options) {
4523
4775
  attempt,
4524
4776
  judgeProvider,
4525
4777
  agentTimeoutMs,
4526
- candidateTrace,
4527
- candidateTraceRef: providerResponse.traceRef,
4528
- candidateTraceSummary
4778
+ outputMessages,
4779
+ traceSummary
4529
4780
  });
4530
4781
  } catch (error) {
4531
4782
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
@@ -4543,9 +4794,8 @@ async function evaluateCandidate(options) {
4543
4794
  attempt,
4544
4795
  judgeProvider,
4545
4796
  agentTimeoutMs,
4546
- candidateTrace,
4547
- candidateTraceRef,
4548
- candidateTraceSummary
4797
+ outputMessages,
4798
+ traceSummary
4549
4799
  } = options;
4550
4800
  const gradeTimestamp = nowFn();
4551
4801
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -4559,9 +4809,8 @@ async function evaluateCandidate(options) {
4559
4809
  now: gradeTimestamp,
4560
4810
  judgeProvider,
4561
4811
  agentTimeoutMs,
4562
- candidateTrace,
4563
- candidateTraceRef,
4564
- candidateTraceSummary
4812
+ outputMessages,
4813
+ traceSummary
4565
4814
  });
4566
4815
  const completedAt = nowFn();
4567
4816
  let agentProviderRequest;
@@ -4599,7 +4848,7 @@ async function evaluateCandidate(options) {
4599
4848
  lm_provider_request: lmProviderRequest,
4600
4849
  evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
4601
4850
  evaluator_results: evaluatorResults,
4602
- trace_summary: candidateTraceSummary
4851
+ trace_summary: traceSummary
4603
4852
  };
4604
4853
  }
4605
4854
  async function runEvaluatorsForCase(options) {
@@ -4614,9 +4863,8 @@ async function runEvaluatorsForCase(options) {
4614
4863
  now,
4615
4864
  judgeProvider,
4616
4865
  agentTimeoutMs,
4617
- candidateTrace,
4618
- candidateTraceRef,
4619
- candidateTraceSummary
4866
+ outputMessages,
4867
+ traceSummary
4620
4868
  } = options;
4621
4869
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
4622
4870
  return runEvaluatorList({
@@ -4631,9 +4879,8 @@ async function runEvaluatorsForCase(options) {
4631
4879
  now,
4632
4880
  judgeProvider,
4633
4881
  agentTimeoutMs,
4634
- candidateTrace,
4635
- candidateTraceRef,
4636
- candidateTraceSummary
4882
+ outputMessages,
4883
+ traceSummary
4637
4884
  });
4638
4885
  }
4639
4886
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -4650,9 +4897,8 @@ async function runEvaluatorsForCase(options) {
4650
4897
  promptInputs,
4651
4898
  now,
4652
4899
  judgeProvider,
4653
- candidateTrace,
4654
- candidateTraceRef,
4655
- candidateTraceSummary
4900
+ outputMessages,
4901
+ traceSummary
4656
4902
  });
4657
4903
  return { score };
4658
4904
  }
@@ -4669,9 +4915,8 @@ async function runEvaluatorList(options) {
4669
4915
  now,
4670
4916
  judgeProvider,
4671
4917
  agentTimeoutMs,
4672
- candidateTrace,
4673
- candidateTraceRef,
4674
- candidateTraceSummary
4918
+ outputMessages,
4919
+ traceSummary
4675
4920
  } = options;
4676
4921
  const scored = [];
4677
4922
  const evaluatorResults = [];
@@ -4718,8 +4963,8 @@ async function runEvaluatorList(options) {
4718
4963
  attempt,
4719
4964
  promptInputs,
4720
4965
  now,
4721
- candidateTraceRef,
4722
- candidateTraceSummary
4966
+ outputMessages,
4967
+ traceSummary
4723
4968
  });
4724
4969
  const weight = evaluator.weight ?? 1;
4725
4970
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -4805,9 +5050,8 @@ async function runEvaluatorList(options) {
4805
5050
  attempt,
4806
5051
  promptInputs,
4807
5052
  now,
4808
- candidateTrace,
4809
- candidateTraceRef,
4810
- candidateTraceSummary
5053
+ outputMessages,
5054
+ traceSummary
4811
5055
  });
4812
5056
  const weight = evaluator.weight ?? 1;
4813
5057
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -5200,8 +5444,6 @@ export {
5200
5444
  isJsonValue,
5201
5445
  isTestMessage,
5202
5446
  isTestMessageRole,
5203
- isTraceEvent,
5204
- isTraceEventType,
5205
5447
  listTargetNames,
5206
5448
  loadEvalCases,
5207
5449
  normalizeLineEndings,