@agentv/core 1.3.1 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,6 +1,7 @@
1
1
  import {
2
2
  buildDirectoryChain,
3
3
  buildSearchRoots,
4
+ extractLastAssistantContent,
4
5
  fileExists,
5
6
  findGitRoot,
6
7
  isAgentProvider,
@@ -9,7 +10,7 @@ import {
9
10
  readTextFile,
10
11
  resolveFileReference,
11
12
  resolveTargetDefinition
12
- } from "./chunk-4A6L2F6L.js";
13
+ } from "./chunk-KPHTMTZ3.js";
13
14
 
14
15
  // src/evaluation/types.ts
15
16
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -74,33 +75,22 @@ function getHitCount(result) {
74
75
  }
75
76
 
76
77
  // src/evaluation/trace.ts
77
- function isTraceEventType(value) {
78
- return typeof value === "string" && ["model_step", "tool_call", "tool_result", "message", "error"].includes(value);
79
- }
80
- function isTraceEvent(value) {
81
- if (typeof value !== "object" || value === null) {
82
- return false;
83
- }
84
- const candidate = value;
85
- return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
86
- }
87
- function computeTraceSummary(trace) {
78
+ function computeTraceSummary(messages) {
88
79
  const toolCallCounts = {};
89
- let errorCount = 0;
90
- for (const event of trace) {
91
- if (event.type === "tool_call" && event.name) {
92
- toolCallCounts[event.name] = (toolCallCounts[event.name] ?? 0) + 1;
93
- }
94
- if (event.type === "error") {
95
- errorCount++;
80
+ let totalToolCalls = 0;
81
+ for (const message of messages) {
82
+ if (!message.toolCalls) continue;
83
+ for (const toolCall of message.toolCalls) {
84
+ toolCallCounts[toolCall.tool] = (toolCallCounts[toolCall.tool] ?? 0) + 1;
85
+ totalToolCalls++;
96
86
  }
97
87
  }
98
88
  const toolNames = Object.keys(toolCallCounts).sort();
99
89
  return {
100
- eventCount: trace.length,
90
+ eventCount: totalToolCalls,
101
91
  toolNames,
102
92
  toolCallsByName: toolCallCounts,
103
- errorCount
93
+ errorCount: 0
104
94
  };
105
95
  }
106
96
 
@@ -376,7 +366,8 @@ var TEMPLATE_VARIABLES = {
376
366
  QUESTION: "question",
377
367
  EXPECTED_OUTCOME: "expected_outcome",
378
368
  REFERENCE_ANSWER: "reference_answer",
379
- INPUT_MESSAGES: "input_messages"
369
+ INPUT_MESSAGES: "input_messages",
370
+ OUTPUT_MESSAGES: "output_messages"
380
371
  };
381
372
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
382
373
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
@@ -1259,16 +1250,16 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1259
1250
  }) : [];
1260
1251
  const codeSnippets = extractCodeBlocks(inputSegments);
1261
1252
  let referenceAnswer = "";
1262
- if (outputSegments.length > 1) {
1263
- referenceAnswer = JSON.stringify(outputSegments, null, 2);
1264
- } else if (outputSegments.length === 1) {
1265
- const singleMessage = outputSegments[0];
1266
- if (typeof singleMessage.content === "string") {
1267
- referenceAnswer = singleMessage.content;
1268
- } else if (singleMessage.content) {
1269
- referenceAnswer = JSON.stringify(singleMessage, null, 2);
1270
- } else if (singleMessage.tool_calls) {
1271
- referenceAnswer = JSON.stringify(singleMessage, null, 2);
1253
+ if (outputSegments.length > 0) {
1254
+ const lastMessage = outputSegments[outputSegments.length - 1];
1255
+ const content = lastMessage.content;
1256
+ const toolCalls = lastMessage.tool_calls;
1257
+ if (typeof content === "string") {
1258
+ referenceAnswer = content;
1259
+ } else if (content !== void 0 && content !== null) {
1260
+ referenceAnswer = JSON.stringify(content, null, 2);
1261
+ } else if (toolCalls !== void 0 && toolCalls !== null) {
1262
+ referenceAnswer = JSON.stringify(toolCalls, null, 2);
1272
1263
  }
1273
1264
  }
1274
1265
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
@@ -1596,11 +1587,11 @@ async function invokeModel(options) {
1596
1587
  return mapResponse(result);
1597
1588
  }
1598
1589
  function mapResponse(result) {
1590
+ const content = result.text ?? "";
1599
1591
  return {
1600
- text: result.text ?? "",
1601
- reasoning: result.reasoningText ?? void 0,
1602
1592
  raw: result,
1603
- usage: toJsonObject(result.totalUsage ?? result.usage)
1593
+ usage: toJsonObject(result.totalUsage ?? result.usage),
1594
+ outputMessages: [{ role: "assistant", content }]
1604
1595
  };
1605
1596
  }
1606
1597
  function toJsonObject(value) {
@@ -1753,6 +1744,7 @@ var CliProvider = class {
1753
1744
  config;
1754
1745
  runCommand;
1755
1746
  verbose;
1747
+ keepTempFiles;
1756
1748
  healthcheckPromise;
1757
1749
  constructor(targetName, config, runner = defaultCommandRunner) {
1758
1750
  this.targetName = targetName;
@@ -1760,6 +1752,7 @@ var CliProvider = class {
1760
1752
  this.config = config;
1761
1753
  this.runCommand = runner;
1762
1754
  this.verbose = config.verbose ?? false;
1755
+ this.keepTempFiles = config.keepTempFiles ?? false;
1763
1756
  }
1764
1757
  async invoke(request) {
1765
1758
  if (request.signal?.aborted) {
@@ -1797,8 +1790,7 @@ var CliProvider = class {
1797
1790
  const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
1798
1791
  const parsed = this.parseOutputContent(responseContent);
1799
1792
  return {
1800
- text: parsed.text,
1801
- trace: parsed.trace,
1793
+ outputMessages: parsed.outputMessages,
1802
1794
  raw: {
1803
1795
  command: renderedCommand,
1804
1796
  stderr: result.stderr,
@@ -1877,7 +1869,7 @@ var CliProvider = class {
1877
1869
  const evalCaseId = request.evalCaseId;
1878
1870
  if (!evalCaseId) {
1879
1871
  return {
1880
- text: "",
1872
+ outputMessages: [],
1881
1873
  raw: {
1882
1874
  command: renderedCommand,
1883
1875
  stderr: result.stderr,
@@ -1890,7 +1882,7 @@ var CliProvider = class {
1890
1882
  const parsed = recordsById.get(evalCaseId);
1891
1883
  if (!parsed) {
1892
1884
  return {
1893
- text: "",
1885
+ outputMessages: [],
1894
1886
  raw: {
1895
1887
  command: renderedCommand,
1896
1888
  stderr: result.stderr,
@@ -1901,9 +1893,7 @@ var CliProvider = class {
1901
1893
  };
1902
1894
  }
1903
1895
  return {
1904
- text: parsed.text,
1905
- trace: parsed.trace,
1906
- traceRef: parsed.traceRef,
1896
+ outputMessages: parsed.outputMessages,
1907
1897
  raw: {
1908
1898
  command: renderedCommand,
1909
1899
  stderr: result.stderr,
@@ -1918,28 +1908,81 @@ var CliProvider = class {
1918
1908
  }
1919
1909
  /**
1920
1910
  * Parse output content from CLI.
1921
- * If the content is valid JSON with a 'text' field, extract text and optional trace.
1922
- * Otherwise, treat the entire content as plain text.
1911
+ * If the content is valid JSON with 'output_messages' or 'text' field, extract them.
1912
+ * If only 'text' is provided, wrap it in outputMessages.
1913
+ * Otherwise, treat the entire content as plain text wrapped in outputMessages.
1923
1914
  */
1924
1915
  parseOutputContent(content) {
1925
1916
  try {
1926
1917
  const parsed = JSON.parse(content);
1927
- if (typeof parsed === "object" && parsed !== null && "text" in parsed) {
1918
+ if (typeof parsed === "object" && parsed !== null) {
1928
1919
  const obj = parsed;
1929
- const text = typeof obj.text === "string" ? obj.text : String(obj.text);
1930
- const trace = this.parseTrace(obj.trace);
1931
- return { text, trace };
1920
+ const outputMessages = this.parseOutputMessages(obj.output_messages);
1921
+ if (outputMessages && outputMessages.length > 0) {
1922
+ return { outputMessages };
1923
+ }
1924
+ if ("text" in obj) {
1925
+ const text = typeof obj.text === "string" ? obj.text : String(obj.text);
1926
+ return { outputMessages: [{ role: "assistant", content: text }] };
1927
+ }
1932
1928
  }
1933
1929
  } catch {
1934
1930
  }
1935
- return { text: content };
1931
+ return { outputMessages: [{ role: "assistant", content }] };
1936
1932
  }
1937
- parseTrace(trace) {
1938
- if (!Array.isArray(trace)) {
1933
+ /**
1934
+ * Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
1935
+ */
1936
+ parseOutputMessages(outputMessages) {
1937
+ if (!Array.isArray(outputMessages)) {
1939
1938
  return void 0;
1940
1939
  }
1941
- const validEvents = trace.filter(isTraceEvent);
1942
- return validEvents.length > 0 ? validEvents : void 0;
1940
+ const messages = [];
1941
+ for (const msg of outputMessages) {
1942
+ if (typeof msg !== "object" || msg === null) {
1943
+ continue;
1944
+ }
1945
+ const rawMsg = msg;
1946
+ if (typeof rawMsg.role !== "string") {
1947
+ continue;
1948
+ }
1949
+ const message = {
1950
+ role: rawMsg.role,
1951
+ name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
1952
+ content: rawMsg.content,
1953
+ toolCalls: this.parseToolCalls(rawMsg.tool_calls),
1954
+ timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
1955
+ metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
1956
+ };
1957
+ messages.push(message);
1958
+ }
1959
+ return messages.length > 0 ? messages : void 0;
1960
+ }
1961
+ /**
1962
+ * Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
1963
+ */
1964
+ parseToolCalls(toolCalls) {
1965
+ if (!Array.isArray(toolCalls)) {
1966
+ return void 0;
1967
+ }
1968
+ const calls = [];
1969
+ for (const call of toolCalls) {
1970
+ if (typeof call !== "object" || call === null) {
1971
+ continue;
1972
+ }
1973
+ const rawCall = call;
1974
+ if (typeof rawCall.tool !== "string") {
1975
+ continue;
1976
+ }
1977
+ calls.push({
1978
+ tool: rawCall.tool,
1979
+ input: rawCall.input,
1980
+ output: rawCall.output,
1981
+ id: typeof rawCall.id === "string" ? rawCall.id : void 0,
1982
+ timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
1983
+ });
1984
+ }
1985
+ return calls.length > 0 ? calls : void 0;
1943
1986
  }
1944
1987
  parseJsonlBatchOutput(content) {
1945
1988
  const records = /* @__PURE__ */ new Map();
@@ -1963,12 +2006,16 @@ var CliProvider = class {
1963
2006
  if (records.has(id)) {
1964
2007
  throw new Error(`CLI batch output contains duplicate id: ${id}`);
1965
2008
  }
1966
- const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
1967
- const traceRef = typeof obj.traceRef === "string" ? obj.traceRef : typeof obj.trace_ref === "string" ? obj.trace_ref : void 0;
2009
+ const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
2010
+ let outputMessages;
2011
+ if (parsedOutputMessages && parsedOutputMessages.length > 0) {
2012
+ outputMessages = parsedOutputMessages;
2013
+ } else {
2014
+ const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
2015
+ outputMessages = text ? [{ role: "assistant", content: text }] : [];
2016
+ }
1968
2017
  records.set(id, {
1969
- text,
1970
- trace: this.parseTrace(obj.trace),
1971
- traceRef
2018
+ outputMessages
1972
2019
  });
1973
2020
  }
1974
2021
  return records;
@@ -1981,8 +2028,10 @@ var CliProvider = class {
1981
2028
  const errorMsg = error instanceof Error ? error.message : String(error);
1982
2029
  throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
1983
2030
  } finally {
1984
- await fs.unlink(filePath).catch(() => {
1985
- });
2031
+ if (!this.keepTempFiles) {
2032
+ await fs.unlink(filePath).catch(() => {
2033
+ });
2034
+ }
1986
2035
  }
1987
2036
  }
1988
2037
  async ensureHealthy(signal) {
@@ -2325,7 +2374,6 @@ var CodexProvider = class {
2325
2374
  const parsed = parseCodexJson(result.stdout);
2326
2375
  const assistantText = extractAssistantText(parsed);
2327
2376
  return {
2328
- text: assistantText,
2329
2377
  raw: {
2330
2378
  response: parsed,
2331
2379
  stdout: result.stdout,
@@ -2337,7 +2385,8 @@ var CodexProvider = class {
2337
2385
  workspace: workspaceRoot,
2338
2386
  inputFiles,
2339
2387
  logFile: logger?.filePath
2340
- }
2388
+ },
2389
+ outputMessages: [{ role: "assistant", content: assistantText }]
2341
2390
  };
2342
2391
  } finally {
2343
2392
  await logger?.close();
@@ -2959,7 +3008,6 @@ var MockProvider = class {
2959
3008
  delayMs;
2960
3009
  delayMinMs;
2961
3010
  delayMaxMs;
2962
- trace;
2963
3011
  constructor(targetName, config) {
2964
3012
  this.id = `mock:${targetName}`;
2965
3013
  this.targetName = targetName;
@@ -2967,7 +3015,6 @@ var MockProvider = class {
2967
3015
  this.delayMs = config.delayMs ?? 0;
2968
3016
  this.delayMinMs = config.delayMinMs ?? 0;
2969
3017
  this.delayMaxMs = config.delayMaxMs ?? 0;
2970
- this.trace = config.trace;
2971
3018
  }
2972
3019
  async invoke(request) {
2973
3020
  const delay = this.calculateDelay();
@@ -2975,12 +3022,11 @@ var MockProvider = class {
2975
3022
  await new Promise((resolve) => setTimeout(resolve, delay));
2976
3023
  }
2977
3024
  return {
2978
- text: this.cannedResponse,
3025
+ outputMessages: [{ role: "assistant", content: this.cannedResponse }],
2979
3026
  raw: {
2980
3027
  question: request.question,
2981
3028
  guidelines: request.guidelines
2982
- },
2983
- trace: this.trace
3029
+ }
2984
3030
  };
2985
3031
  }
2986
3032
  calculateDelay() {
@@ -3073,7 +3119,7 @@ var VSCodeProvider = class {
3073
3119
  }
3074
3120
  if (this.config.dryRun) {
3075
3121
  return {
3076
- text: "",
3122
+ outputMessages: [],
3077
3123
  raw: {
3078
3124
  session,
3079
3125
  inputFiles
@@ -3082,7 +3128,7 @@ var VSCodeProvider = class {
3082
3128
  }
3083
3129
  const responseText = await readTextFile(session.responseFile);
3084
3130
  return {
3085
- text: responseText,
3131
+ outputMessages: [{ role: "assistant", content: responseText }],
3086
3132
  raw: {
3087
3133
  session,
3088
3134
  inputFiles
@@ -3120,7 +3166,7 @@ var VSCodeProvider = class {
3120
3166
  }
3121
3167
  if (this.config.dryRun) {
3122
3168
  return normalizedRequests.map(({ inputFiles }) => ({
3123
- text: "",
3169
+ outputMessages: [],
3124
3170
  raw: {
3125
3171
  session,
3126
3172
  inputFiles,
@@ -3137,7 +3183,7 @@ var VSCodeProvider = class {
3137
3183
  for (const [index, responseFile] of session.responseFiles.entries()) {
3138
3184
  const responseText = await readTextFile(responseFile);
3139
3185
  responses.push({
3140
- text: responseText,
3186
+ outputMessages: [{ role: "assistant", content: responseText }],
3141
3187
  raw: {
3142
3188
  session,
3143
3189
  inputFiles: normalizedRequests[index]?.inputFiles,
@@ -3441,6 +3487,7 @@ var LlmJudgeEvaluator = class {
3441
3487
  null,
3442
3488
  2
3443
3489
  ),
3490
+ [TEMPLATE_VARIABLES.OUTPUT_MESSAGES]: JSON.stringify(context.outputMessages ?? [], null, 2),
3444
3491
  [TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
3445
3492
  [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
3446
3493
  [TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
@@ -3465,7 +3512,7 @@ var LlmJudgeEvaluator = class {
3465
3512
  const score = clampScore(data.score);
3466
3513
  const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
3467
3514
  const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
3468
- const reasoning = data.reasoning ?? providerResponse?.reasoning;
3515
+ const reasoning = data.reasoning;
3469
3516
  const expectedAspectCount = Math.max(hits.length + misses.length, 1);
3470
3517
  return {
3471
3518
  score,
@@ -3567,7 +3614,9 @@ var LlmJudgeEvaluator = class {
3567
3614
  maxOutputTokens: this.maxOutputTokens,
3568
3615
  temperature: this.temperature
3569
3616
  });
3570
- const data = schema.parse(parseJsonFromText(response.text ?? ""));
3617
+ const data = schema.parse(
3618
+ parseJsonFromText(extractLastAssistantContent(response.outputMessages))
3619
+ );
3571
3620
  return { data, providerResponse: response };
3572
3621
  } catch (e) {
3573
3622
  lastError = e instanceof Error ? e : new Error(String(e));
@@ -3653,13 +3702,13 @@ var CodeEvaluator = class {
3653
3702
  expected_messages: context.evalCase.expected_messages,
3654
3703
  reference_answer: context.evalCase.reference_answer,
3655
3704
  candidate_answer: context.candidate,
3705
+ output_messages: context.outputMessages ?? null,
3656
3706
  guideline_files: context.evalCase.guideline_paths,
3657
3707
  input_files: context.evalCase.file_paths.filter(
3658
3708
  (path13) => !context.evalCase.guideline_paths.includes(path13)
3659
3709
  ),
3660
3710
  input_messages: context.evalCase.input_messages,
3661
- candidate_trace_file: context.candidateTraceRef ?? null,
3662
- candidate_trace_summary: context.candidateTraceSummary ?? null
3711
+ candidate_trace_summary: context.traceSummary ?? null
3663
3712
  },
3664
3713
  null,
3665
3714
  2
@@ -3786,8 +3835,19 @@ var ToolTrajectoryEvaluator = class {
3786
3835
  this.config = options.config;
3787
3836
  }
3788
3837
  evaluate(context) {
3789
- const { candidateTrace, candidateTraceSummary } = context;
3790
- if (!candidateTrace || !candidateTraceSummary) {
3838
+ const { outputMessages, traceSummary } = context;
3839
+ const toolCalls = this.extractToolCallsFromMessages(outputMessages);
3840
+ if (toolCalls.length === 0 && !traceSummary) {
3841
+ return {
3842
+ score: 0,
3843
+ verdict: "fail",
3844
+ hits: [],
3845
+ misses: ["No trace available for evaluation"],
3846
+ expectedAspectCount: 1
3847
+ };
3848
+ }
3849
+ const summary = toolCalls.length > 0 ? this.buildSummary(toolCalls) : traceSummary;
3850
+ if (!summary) {
3791
3851
  return {
3792
3852
  score: 0,
3793
3853
  verdict: "fail",
@@ -3798,11 +3858,11 @@ var ToolTrajectoryEvaluator = class {
3798
3858
  }
3799
3859
  switch (this.config.mode) {
3800
3860
  case "any_order":
3801
- return this.evaluateAnyOrder(candidateTraceSummary);
3861
+ return this.evaluateAnyOrder(summary);
3802
3862
  case "in_order":
3803
- return this.evaluateInOrder(candidateTrace);
3863
+ return this.evaluateInOrder(toolCalls);
3804
3864
  case "exact":
3805
- return this.evaluateExact(candidateTrace);
3865
+ return this.evaluateExact(toolCalls);
3806
3866
  default:
3807
3867
  return {
3808
3868
  score: 0,
@@ -3813,6 +3873,39 @@ var ToolTrajectoryEvaluator = class {
3813
3873
  };
3814
3874
  }
3815
3875
  }
3876
+ /**
3877
+ * Extract tool calls from output messages.
3878
+ */
3879
+ extractToolCallsFromMessages(messages) {
3880
+ if (!messages) {
3881
+ return [];
3882
+ }
3883
+ const toolCalls = [];
3884
+ for (const message of messages) {
3885
+ if (message.toolCalls) {
3886
+ for (const call of message.toolCalls) {
3887
+ toolCalls.push({ name: call.tool });
3888
+ }
3889
+ }
3890
+ }
3891
+ return toolCalls;
3892
+ }
3893
+ /**
3894
+ * Build a summary from extracted tool calls.
3895
+ */
3896
+ buildSummary(toolCalls) {
3897
+ const toolCallsByName = {};
3898
+ for (const call of toolCalls) {
3899
+ toolCallsByName[call.name] = (toolCallsByName[call.name] ?? 0) + 1;
3900
+ }
3901
+ const toolNames = Object.keys(toolCallsByName).sort();
3902
+ return {
3903
+ eventCount: toolCalls.length,
3904
+ toolNames,
3905
+ toolCallsByName,
3906
+ errorCount: 0
3907
+ };
3908
+ }
3816
3909
  evaluateAnyOrder(summary) {
3817
3910
  const minimums = this.config.minimums ?? {};
3818
3911
  const toolNames = Object.keys(minimums);
@@ -3845,7 +3938,7 @@ var ToolTrajectoryEvaluator = class {
3845
3938
  expectedAspectCount: toolNames.length
3846
3939
  };
3847
3940
  }
3848
- evaluateInOrder(trace) {
3941
+ evaluateInOrder(toolCalls) {
3849
3942
  const expected = this.config.expected ?? [];
3850
3943
  if (expected.length === 0) {
3851
3944
  return {
@@ -3856,15 +3949,14 @@ var ToolTrajectoryEvaluator = class {
3856
3949
  expectedAspectCount: 0
3857
3950
  };
3858
3951
  }
3859
- const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
3860
3952
  const hits = [];
3861
3953
  const misses = [];
3862
3954
  let actualIndex = 0;
3863
3955
  for (let i = 0; i < expected.length; i++) {
3864
3956
  const expectedTool = expected[i].tool;
3865
3957
  let found = false;
3866
- while (actualIndex < actualToolCalls.length) {
3867
- if (actualToolCalls[actualIndex].name === expectedTool) {
3958
+ while (actualIndex < toolCalls.length) {
3959
+ if (toolCalls[actualIndex].name === expectedTool) {
3868
3960
  hits.push(`Found ${expectedTool} at position ${actualIndex}`);
3869
3961
  actualIndex++;
3870
3962
  found = true;
@@ -3885,7 +3977,7 @@ var ToolTrajectoryEvaluator = class {
3885
3977
  expectedAspectCount: expected.length
3886
3978
  };
3887
3979
  }
3888
- evaluateExact(trace) {
3980
+ evaluateExact(toolCalls) {
3889
3981
  const expected = this.config.expected ?? [];
3890
3982
  if (expected.length === 0) {
3891
3983
  return {
@@ -3896,16 +3988,15 @@ var ToolTrajectoryEvaluator = class {
3896
3988
  expectedAspectCount: 0
3897
3989
  };
3898
3990
  }
3899
- const actualToolCalls = trace.filter((e) => e.type === "tool_call" && e.name);
3900
3991
  const hits = [];
3901
3992
  const misses = [];
3902
- if (actualToolCalls.length !== expected.length) {
3903
- misses.push(`Expected ${expected.length} tool calls, got ${actualToolCalls.length}`);
3993
+ if (toolCalls.length !== expected.length) {
3994
+ misses.push(`Expected ${expected.length} tool calls, got ${toolCalls.length}`);
3904
3995
  }
3905
- const checkLength = Math.min(expected.length, actualToolCalls.length);
3996
+ const checkLength = Math.min(expected.length, toolCalls.length);
3906
3997
  for (let i = 0; i < checkLength; i++) {
3907
3998
  const expectedTool = expected[i].tool;
3908
- const actualTool = actualToolCalls[i].name;
3999
+ const actualTool = toolCalls[i].name;
3909
4000
  if (actualTool === expectedTool) {
3910
4001
  hits.push(`Position ${i}: ${expectedTool} \u2713`);
3911
4002
  } else {
@@ -4119,11 +4210,13 @@ var CompositeEvaluator = class {
4119
4210
  evalCaseId: context.evalCase.id,
4120
4211
  attempt: context.attempt
4121
4212
  });
4122
- const data = freeformEvaluationSchema.parse(parseJsonFromText(response.text ?? ""));
4213
+ const data = freeformEvaluationSchema.parse(
4214
+ parseJsonFromText(extractLastAssistantContent(response.outputMessages))
4215
+ );
4123
4216
  const score = clampScore(data.score);
4124
4217
  const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : [];
4125
4218
  const misses = Array.isArray(data.misses) ? data.misses.filter(isNonEmptyString).slice(0, 4) : [];
4126
- const reasoning = data.reasoning ?? response.reasoning;
4219
+ const reasoning = data.reasoning;
4127
4220
  return {
4128
4221
  score,
4129
4222
  verdict: scoreToVerdict(score),
@@ -4546,11 +4639,14 @@ async function runBatchEvaluation(options) {
4546
4639
  const evalCase = evalCases[i];
4547
4640
  const promptInputs = promptInputsList[i];
4548
4641
  const providerResponse = batchResponse[i];
4642
+ const outputMessages = providerResponse.outputMessages;
4643
+ const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
4644
+ const candidate = extractLastAssistantContent(outputMessages);
4549
4645
  let result;
4550
4646
  try {
4551
4647
  result = await evaluateCandidate({
4552
4648
  evalCase,
4553
- candidate: providerResponse.text ?? "",
4649
+ candidate,
4554
4650
  target,
4555
4651
  provider,
4556
4652
  evaluators: evaluatorRegistry,
@@ -4558,7 +4654,9 @@ async function runBatchEvaluation(options) {
4558
4654
  nowFn,
4559
4655
  attempt: 0,
4560
4656
  judgeProvider: await resolveJudgeProvider(target),
4561
- agentTimeoutMs
4657
+ agentTimeoutMs,
4658
+ outputMessages,
4659
+ traceSummary
4562
4660
  });
4563
4661
  } catch (error) {
4564
4662
  const errorResult = buildErrorResult(
@@ -4662,21 +4760,13 @@ async function runEvalCase(options) {
4662
4760
  if (cacheKey && cache && !cachedResponse) {
4663
4761
  await cache.set(cacheKey, providerResponse);
4664
4762
  }
4665
- let candidateTrace = providerResponse.trace;
4666
- if (!candidateTrace && providerResponse.traceRef) {
4667
- try {
4668
- const rawTrace = await readJsonFile(providerResponse.traceRef);
4669
- if (Array.isArray(rawTrace) && rawTrace.every(isTraceEvent)) {
4670
- candidateTrace = rawTrace;
4671
- }
4672
- } catch {
4673
- }
4674
- }
4675
- const candidateTraceSummary = candidateTrace ? computeTraceSummary(candidateTrace) : void 0;
4763
+ const outputMessages = providerResponse.outputMessages;
4764
+ const traceSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
4765
+ const candidate = extractLastAssistantContent(outputMessages);
4676
4766
  try {
4677
4767
  return await evaluateCandidate({
4678
4768
  evalCase,
4679
- candidate: providerResponse.text ?? "",
4769
+ candidate,
4680
4770
  target,
4681
4771
  provider,
4682
4772
  evaluators,
@@ -4685,9 +4775,8 @@ async function runEvalCase(options) {
4685
4775
  attempt,
4686
4776
  judgeProvider,
4687
4777
  agentTimeoutMs,
4688
- candidateTrace,
4689
- candidateTraceRef: providerResponse.traceRef,
4690
- candidateTraceSummary
4778
+ outputMessages,
4779
+ traceSummary
4691
4780
  });
4692
4781
  } catch (error) {
4693
4782
  return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
@@ -4705,9 +4794,8 @@ async function evaluateCandidate(options) {
4705
4794
  attempt,
4706
4795
  judgeProvider,
4707
4796
  agentTimeoutMs,
4708
- candidateTrace,
4709
- candidateTraceRef,
4710
- candidateTraceSummary
4797
+ outputMessages,
4798
+ traceSummary
4711
4799
  } = options;
4712
4800
  const gradeTimestamp = nowFn();
4713
4801
  const { score, evaluatorResults } = await runEvaluatorsForCase({
@@ -4721,9 +4809,8 @@ async function evaluateCandidate(options) {
4721
4809
  now: gradeTimestamp,
4722
4810
  judgeProvider,
4723
4811
  agentTimeoutMs,
4724
- candidateTrace,
4725
- candidateTraceRef,
4726
- candidateTraceSummary
4812
+ outputMessages,
4813
+ traceSummary
4727
4814
  });
4728
4815
  const completedAt = nowFn();
4729
4816
  let agentProviderRequest;
@@ -4761,7 +4848,7 @@ async function evaluateCandidate(options) {
4761
4848
  lm_provider_request: lmProviderRequest,
4762
4849
  evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
4763
4850
  evaluator_results: evaluatorResults,
4764
- trace_summary: candidateTraceSummary
4851
+ trace_summary: traceSummary
4765
4852
  };
4766
4853
  }
4767
4854
  async function runEvaluatorsForCase(options) {
@@ -4776,9 +4863,8 @@ async function runEvaluatorsForCase(options) {
4776
4863
  now,
4777
4864
  judgeProvider,
4778
4865
  agentTimeoutMs,
4779
- candidateTrace,
4780
- candidateTraceRef,
4781
- candidateTraceSummary
4866
+ outputMessages,
4867
+ traceSummary
4782
4868
  } = options;
4783
4869
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
4784
4870
  return runEvaluatorList({
@@ -4793,9 +4879,8 @@ async function runEvaluatorsForCase(options) {
4793
4879
  now,
4794
4880
  judgeProvider,
4795
4881
  agentTimeoutMs,
4796
- candidateTrace,
4797
- candidateTraceRef,
4798
- candidateTraceSummary
4882
+ outputMessages,
4883
+ traceSummary
4799
4884
  });
4800
4885
  }
4801
4886
  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
@@ -4812,9 +4897,8 @@ async function runEvaluatorsForCase(options) {
4812
4897
  promptInputs,
4813
4898
  now,
4814
4899
  judgeProvider,
4815
- candidateTrace,
4816
- candidateTraceRef,
4817
- candidateTraceSummary
4900
+ outputMessages,
4901
+ traceSummary
4818
4902
  });
4819
4903
  return { score };
4820
4904
  }
@@ -4831,9 +4915,8 @@ async function runEvaluatorList(options) {
4831
4915
  now,
4832
4916
  judgeProvider,
4833
4917
  agentTimeoutMs,
4834
- candidateTrace,
4835
- candidateTraceRef,
4836
- candidateTraceSummary
4918
+ outputMessages,
4919
+ traceSummary
4837
4920
  } = options;
4838
4921
  const scored = [];
4839
4922
  const evaluatorResults = [];
@@ -4880,8 +4963,8 @@ async function runEvaluatorList(options) {
4880
4963
  attempt,
4881
4964
  promptInputs,
4882
4965
  now,
4883
- candidateTraceRef,
4884
- candidateTraceSummary
4966
+ outputMessages,
4967
+ traceSummary
4885
4968
  });
4886
4969
  const weight = evaluator.weight ?? 1;
4887
4970
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -4967,9 +5050,8 @@ async function runEvaluatorList(options) {
4967
5050
  attempt,
4968
5051
  promptInputs,
4969
5052
  now,
4970
- candidateTrace,
4971
- candidateTraceRef,
4972
- candidateTraceSummary
5053
+ outputMessages,
5054
+ traceSummary
4973
5055
  });
4974
5056
  const weight = evaluator.weight ?? 1;
4975
5057
  scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -5362,8 +5444,6 @@ export {
5362
5444
  isJsonValue,
5363
5445
  isTestMessage,
5364
5446
  isTestMessageRole,
5365
- isTraceEvent,
5366
- isTraceEventType,
5367
5447
  listTargetNames,
5368
5448
  loadEvalCases,
5369
5449
  normalizeLineEndings,