@agentv/core 1.0.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -32,7 +32,6 @@ var index_exports = {};
32
32
  __export(index_exports, {
33
33
  CodeEvaluator: () => CodeEvaluator,
34
34
  CompositeEvaluator: () => CompositeEvaluator,
35
- ExpectedToolCallsEvaluator: () => ExpectedToolCallsEvaluator,
36
35
  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
37
36
  TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
38
37
  ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
@@ -50,7 +49,6 @@ __export(index_exports, {
50
49
  generateRubrics: () => generateRubrics,
51
50
  getHitCount: () => getHitCount,
52
51
  isEvaluatorKind: () => isEvaluatorKind,
53
- isExpectedToolCall: () => isExpectedToolCall,
54
52
  isGuidelineFile: () => isGuidelineFile,
55
53
  isJsonObject: () => isJsonObject,
56
54
  isJsonValue: () => isJsonValue,
@@ -110,18 +108,23 @@ function isTestMessage(value) {
110
108
  if (typeof candidate.content === "string") {
111
109
  return true;
112
110
  }
113
- if (!Array.isArray(candidate.content)) {
114
- return false;
111
+ if (Array.isArray(candidate.content) && candidate.content.every(isJsonObject)) {
112
+ return true;
113
+ }
114
+ if (Array.isArray(candidate.tool_calls) && candidate.tool_calls.length > 0) {
115
+ return true;
115
116
  }
116
- return candidate.content.every(isJsonObject);
117
+ if (isJsonObject(candidate.content)) {
118
+ return true;
119
+ }
120
+ return false;
117
121
  }
118
122
  var EVALUATOR_KIND_VALUES = [
119
123
  "code_judge",
120
124
  "llm_judge",
121
125
  "rubric",
122
126
  "composite",
123
- "tool_trajectory",
124
- "expected_tool_calls"
127
+ "tool_trajectory"
125
128
  ];
126
129
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
127
130
  function isEvaluatorKind(value) {
@@ -142,13 +145,6 @@ function isTraceEvent(value) {
142
145
  const candidate = value;
143
146
  return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
144
147
  }
145
- function isExpectedToolCall(value) {
146
- if (typeof value !== "object" || value === null) {
147
- return false;
148
- }
149
- const candidate = value;
150
- return typeof candidate.tool === "string";
151
- }
152
148
  function computeTraceSummary(trace) {
153
149
  const toolCallCounts = {};
154
150
  let errorCount = 0;
@@ -645,15 +641,6 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
645
641
  });
646
642
  continue;
647
643
  }
648
- if (typeValue === "expected_tool_calls") {
649
- const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
650
- evaluators.push({
651
- name,
652
- type: "expected_tool_calls",
653
- ...weight2 !== void 0 ? { weight: weight2 } : {}
654
- });
655
- continue;
656
- }
657
644
  if (typeValue === "tool_trajectory") {
658
645
  const mode = asString2(rawEvaluator.mode);
659
646
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
@@ -848,6 +835,17 @@ async function processMessages(options) {
848
835
  }
849
836
  continue;
850
837
  }
838
+ if (isJsonObject(content)) {
839
+ const rendered = JSON.stringify(content, null, 2);
840
+ segments.push({ type: "text", value: rendered });
841
+ if (textParts) {
842
+ textParts.push(rendered);
843
+ }
844
+ continue;
845
+ }
846
+ if (!Array.isArray(content)) {
847
+ continue;
848
+ }
851
849
  for (const rawSegment of content) {
852
850
  if (!isJsonObject(rawSegment)) {
853
851
  continue;
@@ -908,63 +906,6 @@ async function processMessages(options) {
908
906
  }
909
907
  return segments;
910
908
  }
911
- async function resolveAssistantContent(content, searchRoots, verbose) {
912
- if (typeof content === "string") {
913
- return content;
914
- }
915
- if (!content) {
916
- return "";
917
- }
918
- const parts = [];
919
- for (const entry of content) {
920
- if (typeof entry === "string") {
921
- parts.push({ content: entry, isFile: false });
922
- continue;
923
- }
924
- if (!isJsonObject(entry)) {
925
- continue;
926
- }
927
- const segmentType = asString3(entry.type);
928
- if (segmentType === "file") {
929
- const rawValue = asString3(entry.value);
930
- if (!rawValue) {
931
- continue;
932
- }
933
- const { displayPath, resolvedPath, attempted } = await resolveFileReference(
934
- rawValue,
935
- searchRoots
936
- );
937
- if (!resolvedPath) {
938
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
939
- logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
940
- continue;
941
- }
942
- try {
943
- const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
944
- parts.push({ content: fileContent, isFile: true, displayPath });
945
- if (verbose) {
946
- console.log(` [Expected Assistant File] Found: ${displayPath}`);
947
- console.log(` Resolved to: ${resolvedPath}`);
948
- }
949
- } catch (error) {
950
- logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
951
- }
952
- continue;
953
- }
954
- const textValue = asString3(entry.text);
955
- if (typeof textValue === "string") {
956
- parts.push({ content: textValue, isFile: false });
957
- continue;
958
- }
959
- const valueValue = asString3(entry.value);
960
- if (typeof valueValue === "string") {
961
- parts.push({ content: valueValue, isFile: false });
962
- continue;
963
- }
964
- parts.push({ content: JSON.stringify(entry), isFile: false });
965
- }
966
- return formatFileContents(parts);
967
- }
968
909
  function asString3(value) {
969
910
  return typeof value === "string" ? value : void 0;
970
911
  }
@@ -997,14 +938,15 @@ ${detailBlock}${ANSI_RESET4}`);
997
938
  }
998
939
  }
999
940
  async function processExpectedMessages(options) {
1000
- const { messages, searchRoots, repoRootPath, verbose } = options;
941
+ const { messages, searchRoots, verbose } = options;
1001
942
  const segments = [];
1002
943
  for (const message of messages) {
944
+ const extendedMessage = message;
1003
945
  const segment = {
1004
946
  role: message.role
1005
947
  };
1006
- if (message.role === "assistant" && message.tool_calls !== void 0) {
1007
- segment.tool_calls = message.tool_calls;
948
+ if (extendedMessage.name) {
949
+ segment.name = extendedMessage.name;
1008
950
  }
1009
951
  const content = message.content;
1010
952
  if (typeof content === "string") {
@@ -1052,6 +994,13 @@ async function processExpectedMessages(options) {
1052
994
  processedContent.push(cloneJsonObject(rawSegment));
1053
995
  }
1054
996
  segment.content = processedContent;
997
+ } else if (isJsonObject(content)) {
998
+ segment.content = cloneJsonObject(content);
999
+ }
1000
+ if (extendedMessage.tool_calls && Array.isArray(extendedMessage.tool_calls)) {
1001
+ segment.tool_calls = extendedMessage.tool_calls.map(
1002
+ (tc) => isJsonObject(tc) ? cloneJsonObject(tc) : tc
1003
+ );
1055
1004
  }
1056
1005
  segments.push(segment);
1057
1006
  }
@@ -1123,6 +1072,11 @@ async function buildPromptInputs(testCase, mode = "lm") {
1123
1072
  }
1124
1073
  }
1125
1074
  }
1075
+ } else if (isJsonObject(message.content)) {
1076
+ const rendered = JSON.stringify(message.content, null, 2);
1077
+ if (rendered.trim().length > 0) {
1078
+ messageSegments.push({ type: "text", value: rendered });
1079
+ }
1126
1080
  }
1127
1081
  segmentsByMessage.push(messageSegments);
1128
1082
  }
@@ -1346,9 +1300,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1346
1300
  logError(`No valid expected message found for eval case: ${id}`);
1347
1301
  continue;
1348
1302
  }
1349
- if (expectedMessages.length > 1) {
1350
- logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
1351
- }
1352
1303
  const guidelinePaths = [];
1353
1304
  const inputTextParts = [];
1354
1305
  const inputSegments = await processMessages({
@@ -1368,8 +1319,19 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1368
1319
  verbose
1369
1320
  }) : [];
1370
1321
  const codeSnippets = extractCodeBlocks(inputSegments);
1371
- const expectedContent = expectedMessages[0]?.content;
1372
- const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
1322
+ let referenceAnswer = "";
1323
+ if (outputSegments.length > 1) {
1324
+ referenceAnswer = JSON.stringify(outputSegments, null, 2);
1325
+ } else if (outputSegments.length === 1) {
1326
+ const singleMessage = outputSegments[0];
1327
+ if (typeof singleMessage.content === "string") {
1328
+ referenceAnswer = singleMessage.content;
1329
+ } else if (singleMessage.content) {
1330
+ referenceAnswer = JSON.stringify(singleMessage, null, 2);
1331
+ } else if (singleMessage.tool_calls) {
1332
+ referenceAnswer = JSON.stringify(singleMessage, null, 2);
1333
+ }
1334
+ }
1373
1335
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
1374
1336
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
1375
1337
  let evaluators;
@@ -1424,7 +1386,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1424
1386
  question,
1425
1387
  input_messages: inputMessages,
1426
1388
  input_segments: inputSegments,
1427
- expected_segments: outputSegments,
1389
+ expected_messages: outputSegments,
1428
1390
  reference_answer: referenceAnswer,
1429
1391
  guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
1430
1392
  guideline_patterns: guidelinePatterns,
@@ -1963,7 +1925,7 @@ var CliProvider = class {
1963
1925
  id;
1964
1926
  kind = "cli";
1965
1927
  targetName;
1966
- supportsBatch = false;
1928
+ supportsBatch = true;
1967
1929
  config;
1968
1930
  runCommand;
1969
1931
  verbose;
@@ -1983,6 +1945,11 @@ var CliProvider = class {
1983
1945
  const outputFilePath = generateOutputFilePath(request.evalCaseId);
1984
1946
  const templateValues = buildTemplateValues(request, this.config, outputFilePath);
1985
1947
  const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
1948
+ if (this.verbose) {
1949
+ console.log(
1950
+ `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
1951
+ );
1952
+ }
1986
1953
  const result = await this.runCommand(renderedCommand, {
1987
1954
  cwd: this.config.cwd,
1988
1955
  env: process.env,
@@ -2017,6 +1984,114 @@ var CliProvider = class {
2017
1984
  }
2018
1985
  };
2019
1986
  }
1987
+ async invokeBatch(requests) {
1988
+ if (requests.length === 0) {
1989
+ return [];
1990
+ }
1991
+ for (const request of requests) {
1992
+ if (request.signal?.aborted) {
1993
+ throw new Error("CLI provider batch request was aborted before execution");
1994
+ }
1995
+ }
1996
+ const controller = new AbortController();
1997
+ for (const request of requests) {
1998
+ request.signal?.addEventListener("abort", () => controller.abort(), { once: true });
1999
+ }
2000
+ await this.ensureHealthy(controller.signal);
2001
+ const outputFilePath = generateOutputFilePath("batch", ".jsonl");
2002
+ const batchInputFiles = [];
2003
+ for (const request of requests) {
2004
+ if (request.inputFiles && request.inputFiles.length > 0) {
2005
+ batchInputFiles.push(...request.inputFiles);
2006
+ }
2007
+ }
2008
+ const templateValues = buildTemplateValues(
2009
+ {
2010
+ question: "",
2011
+ guidelines: "",
2012
+ inputFiles: batchInputFiles,
2013
+ evalCaseId: "batch",
2014
+ attempt: 0
2015
+ },
2016
+ this.config,
2017
+ outputFilePath
2018
+ );
2019
+ const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
2020
+ if (this.verbose) {
2021
+ console.log(
2022
+ `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
2023
+ );
2024
+ }
2025
+ const result = await this.runCommand(renderedCommand, {
2026
+ cwd: this.config.cwd,
2027
+ env: process.env,
2028
+ timeoutMs: this.config.timeoutMs,
2029
+ signal: controller.signal
2030
+ });
2031
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
2032
+ if (controller.signal.aborted) {
2033
+ throw new Error("CLI provider request was aborted");
2034
+ }
2035
+ if (result.timedOut) {
2036
+ throw new Error(
2037
+ `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
2038
+ );
2039
+ }
2040
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
2041
+ const detail = result.stderr.trim() || result.stdout.trim();
2042
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
2043
+ throw new Error(message);
2044
+ }
2045
+ const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
2046
+ const recordsById = this.parseJsonlBatchOutput(responseContent);
2047
+ const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
2048
+ const missingIds = requestedIds.filter((id) => !recordsById.has(id));
2049
+ if (missingIds.length > 0) {
2050
+ throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
2051
+ }
2052
+ const responses = requests.map((request) => {
2053
+ const evalCaseId = request.evalCaseId;
2054
+ if (!evalCaseId) {
2055
+ return {
2056
+ text: "",
2057
+ raw: {
2058
+ command: renderedCommand,
2059
+ stderr: result.stderr,
2060
+ exitCode: result.exitCode ?? 0,
2061
+ cwd: this.config.cwd,
2062
+ outputFile: outputFilePath
2063
+ }
2064
+ };
2065
+ }
2066
+ const parsed = recordsById.get(evalCaseId);
2067
+ if (!parsed) {
2068
+ return {
2069
+ text: "",
2070
+ raw: {
2071
+ command: renderedCommand,
2072
+ stderr: result.stderr,
2073
+ exitCode: result.exitCode ?? 0,
2074
+ cwd: this.config.cwd,
2075
+ outputFile: outputFilePath
2076
+ }
2077
+ };
2078
+ }
2079
+ return {
2080
+ text: parsed.text,
2081
+ trace: parsed.trace,
2082
+ traceRef: parsed.traceRef,
2083
+ raw: {
2084
+ command: renderedCommand,
2085
+ stderr: result.stderr,
2086
+ exitCode: result.exitCode ?? 0,
2087
+ cwd: this.config.cwd,
2088
+ outputFile: outputFilePath,
2089
+ recordId: evalCaseId
2090
+ }
2091
+ };
2092
+ });
2093
+ return responses;
2094
+ }
2020
2095
  /**
2021
2096
  * Parse output content from CLI.
2022
2097
  * If the content is valid JSON with a 'text' field, extract text and optional trace.
@@ -2042,6 +2117,38 @@ var CliProvider = class {
2042
2117
  const validEvents = trace.filter(isTraceEvent);
2043
2118
  return validEvents.length > 0 ? validEvents : void 0;
2044
2119
  }
2120
+ parseJsonlBatchOutput(content) {
2121
+ const records = /* @__PURE__ */ new Map();
2122
+ const lines = content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
2123
+ for (const line of lines) {
2124
+ let parsed;
2125
+ try {
2126
+ parsed = JSON.parse(line);
2127
+ } catch (error) {
2128
+ const reason = error instanceof Error ? error.message : String(error);
2129
+ throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
2130
+ }
2131
+ if (typeof parsed !== "object" || parsed === null) {
2132
+ throw new Error("CLI batch output JSONL line must be an object");
2133
+ }
2134
+ const obj = parsed;
2135
+ const id = typeof obj.id === "string" ? obj.id : void 0;
2136
+ if (!id || id.trim().length === 0) {
2137
+ throw new Error("CLI batch output JSONL line missing required string field: id");
2138
+ }
2139
+ if (records.has(id)) {
2140
+ throw new Error(`CLI batch output contains duplicate id: ${id}`);
2141
+ }
2142
+ const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
2143
+ const traceRef = typeof obj.traceRef === "string" ? obj.traceRef : typeof obj.trace_ref === "string" ? obj.trace_ref : void 0;
2144
+ records.set(id, {
2145
+ text,
2146
+ trace: this.parseTrace(obj.trace),
2147
+ traceRef
2148
+ });
2149
+ }
2150
+ return records;
2151
+ }
2045
2152
  async readAndCleanupOutputFile(filePath) {
2046
2153
  try {
2047
2154
  const content = await readTextFile(filePath);
@@ -2103,7 +2210,7 @@ var CliProvider = class {
2103
2210
  );
2104
2211
  if (this.verbose) {
2105
2212
  console.log(
2106
- `[cli-provider:${this.targetName}] (healthcheck) CLI_EVALS_DIR=${process.env.CLI_EVALS_DIR ?? ""} cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
2213
+ `[cli-provider:${this.targetName}] (healthcheck) cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
2107
2214
  );
2108
2215
  }
2109
2216
  const result = await this.runCommand(renderedCommand, {
@@ -2171,11 +2278,11 @@ function shellEscape(value) {
2171
2278
  }
2172
2279
  return `'${value.replace(/'/g, `'"'"'`)}'`;
2173
2280
  }
2174
- function generateOutputFilePath(evalCaseId) {
2281
+ function generateOutputFilePath(evalCaseId, extension = ".json") {
2175
2282
  const safeEvalId = evalCaseId || "unknown";
2176
2283
  const timestamp = Date.now();
2177
2284
  const random = Math.random().toString(36).substring(2, 9);
2178
- return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
2285
+ return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
2179
2286
  }
2180
2287
  function formatTimeoutSuffix(timeoutMs) {
2181
2288
  if (!timeoutMs || timeoutMs <= 0) {
@@ -3355,10 +3462,14 @@ function resolveCliConfig(target, env, evalFilePath) {
3355
3462
  const filesFormat = resolveOptionalLiteralString(
3356
3463
  target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
3357
3464
  );
3465
+ const verbose = resolveOptionalBoolean(target.verbose ?? target.cli_verbose ?? target.cliVerbose);
3358
3466
  let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
3359
3467
  allowLiteral: true,
3360
3468
  optionalEnv: true
3361
3469
  });
3470
+ if (cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd)) {
3471
+ cwd = import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd);
3472
+ }
3362
3473
  if (!cwd && evalFilePath) {
3363
3474
  cwd = import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath));
3364
3475
  }
@@ -3366,7 +3477,7 @@ function resolveCliConfig(target, env, evalFilePath) {
3366
3477
  target.timeout_seconds ?? target.timeoutSeconds,
3367
3478
  `${target.name} timeout`
3368
3479
  );
3369
- const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name);
3480
+ const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name, evalFilePath);
3370
3481
  const commandTemplate = resolveString(
3371
3482
  commandTemplateSource,
3372
3483
  env,
@@ -3379,7 +3490,8 @@ function resolveCliConfig(target, env, evalFilePath) {
3379
3490
  filesFormat,
3380
3491
  cwd,
3381
3492
  timeoutMs,
3382
- healthcheck
3493
+ healthcheck,
3494
+ verbose
3383
3495
  };
3384
3496
  }
3385
3497
  function resolveTimeoutMs(source, description) {
@@ -3392,7 +3504,7 @@ function resolveTimeoutMs(source, description) {
3392
3504
  }
3393
3505
  return Math.floor(seconds * 1e3);
3394
3506
  }
3395
- function resolveCliHealthcheck(source, env, targetName) {
3507
+ function resolveCliHealthcheck(source, env, targetName, evalFilePath) {
3396
3508
  if (source === void 0 || source === null) {
3397
3509
  return void 0;
3398
3510
  }
@@ -3425,11 +3537,12 @@ function resolveCliHealthcheck(source, env, targetName) {
3425
3537
  allowLiteral: true,
3426
3538
  optionalEnv: true
3427
3539
  });
3540
+ const resolvedCwd = cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd) ? import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd) : cwd;
3428
3541
  return {
3429
3542
  type: "command",
3430
3543
  commandTemplate,
3431
3544
  timeoutMs,
3432
- cwd
3545
+ cwd: resolvedCwd
3433
3546
  };
3434
3547
  }
3435
3548
  throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
@@ -3979,7 +4092,7 @@ var import_ai2 = require("ai");
3979
4092
  var import_zod2 = require("zod");
3980
4093
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
3981
4094
 
3982
- Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
4095
+ Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
3983
4096
 
3984
4097
  Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
3985
4098
 
@@ -4037,7 +4150,7 @@ var LlmJudgeEvaluator = class {
4037
4150
  const variables = {
4038
4151
  [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
4039
4152
  [TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
4040
- context.evalCase.expected_segments,
4153
+ context.evalCase.expected_messages,
4041
4154
  null,
4042
4155
  2
4043
4156
  ),
@@ -4250,13 +4363,16 @@ var CodeEvaluator = class {
4250
4363
  {
4251
4364
  question: context.evalCase.question,
4252
4365
  expected_outcome: context.evalCase.expected_outcome,
4366
+ expected_messages: context.evalCase.expected_messages,
4253
4367
  reference_answer: context.evalCase.reference_answer,
4254
4368
  candidate_answer: context.candidate,
4255
4369
  guideline_files: context.evalCase.guideline_paths,
4256
4370
  input_files: context.evalCase.file_paths.filter(
4257
4371
  (path15) => !context.evalCase.guideline_paths.includes(path15)
4258
4372
  ),
4259
- input_messages: context.evalCase.input_messages
4373
+ input_messages: context.evalCase.input_messages,
4374
+ candidate_trace_file: context.candidateTraceRef ?? null,
4375
+ candidate_trace_summary: context.candidateTraceSummary ?? null
4260
4376
  },
4261
4377
  null,
4262
4378
  2
@@ -4522,105 +4638,6 @@ var ToolTrajectoryEvaluator = class {
4522
4638
  };
4523
4639
  }
4524
4640
  };
4525
- var ExpectedToolCallsEvaluator = class {
4526
- kind = "expected_tool_calls";
4527
- evaluate(context) {
4528
- const { candidateTrace, evalCase } = context;
4529
- const expectedSegments = evalCase.expected_segments;
4530
- const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
4531
- if (expectedToolCalls.length === 0) {
4532
- return {
4533
- score: 1,
4534
- verdict: "pass",
4535
- hits: ["No tool_calls specified in expected_messages"],
4536
- misses: [],
4537
- expectedAspectCount: 1
4538
- };
4539
- }
4540
- if (!candidateTrace || candidateTrace.length === 0) {
4541
- return {
4542
- score: 0,
4543
- verdict: "fail",
4544
- hits: [],
4545
- misses: ["No trace available to validate tool_calls"],
4546
- expectedAspectCount: expectedToolCalls.length
4547
- };
4548
- }
4549
- const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
4550
- return this.validateToolCalls(expectedToolCalls, actualToolCalls);
4551
- }
4552
- extractExpectedToolCalls(segments) {
4553
- if (!segments) {
4554
- return [];
4555
- }
4556
- const toolCalls = [];
4557
- for (const segment of segments) {
4558
- const role = segment.role;
4559
- const segmentToolCalls = segment.tool_calls;
4560
- if (role === "assistant" && Array.isArray(segmentToolCalls)) {
4561
- for (const tc of segmentToolCalls) {
4562
- if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
4563
- const toolCall = tc;
4564
- toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
4565
- }
4566
- }
4567
- }
4568
- }
4569
- return toolCalls;
4570
- }
4571
- validateToolCalls(expected, actual) {
4572
- const hits = [];
4573
- const misses = [];
4574
- for (let i = 0; i < expected.length; i++) {
4575
- const expectedCall = expected[i];
4576
- const actualCall = actual[i];
4577
- if (!actualCall) {
4578
- misses.push(
4579
- `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
4580
- );
4581
- continue;
4582
- }
4583
- if (actualCall.name !== expectedCall.tool) {
4584
- misses.push(
4585
- `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
4586
- );
4587
- continue;
4588
- }
4589
- if (expectedCall.input !== void 0) {
4590
- if (!this.deepEquals(expectedCall.input, actualCall.input)) {
4591
- misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
4592
- continue;
4593
- }
4594
- }
4595
- hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
4596
- }
4597
- const totalChecks = expected.length || 1;
4598
- const score = hits.length / totalChecks;
4599
- return {
4600
- score,
4601
- verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
4602
- hits,
4603
- misses,
4604
- expectedAspectCount: totalChecks
4605
- };
4606
- }
4607
- deepEquals(a, b) {
4608
- if (a === b) return true;
4609
- if (typeof a !== typeof b) return false;
4610
- if (typeof a !== "object" || a === null || b === null) return false;
4611
- if (Array.isArray(a) && Array.isArray(b)) {
4612
- if (a.length !== b.length) return false;
4613
- return a.every((val, i) => this.deepEquals(val, b[i]));
4614
- }
4615
- if (Array.isArray(a) || Array.isArray(b)) return false;
4616
- const aObj = a;
4617
- const bObj = b;
4618
- const aKeys = Object.keys(aObj);
4619
- const bKeys = Object.keys(bObj);
4620
- if (aKeys.length !== bKeys.length) return false;
4621
- return aKeys.every((key) => this.deepEquals(aObj[key], bObj[key]));
4622
- }
4623
- };
4624
4641
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
4625
4642
  {{EVALUATOR_RESULTS_JSON}}
4626
4643
 
@@ -5392,6 +5409,7 @@ async function runEvalCase(options) {
5392
5409
  judgeProvider,
5393
5410
  agentTimeoutMs,
5394
5411
  candidateTrace,
5412
+ candidateTraceRef: providerResponse.traceRef,
5395
5413
  candidateTraceSummary
5396
5414
  });
5397
5415
  } catch (error) {
@@ -5411,6 +5429,7 @@ async function evaluateCandidate(options) {
5411
5429
  judgeProvider,
5412
5430
  agentTimeoutMs,
5413
5431
  candidateTrace,
5432
+ candidateTraceRef,
5414
5433
  candidateTraceSummary
5415
5434
  } = options;
5416
5435
  const gradeTimestamp = nowFn();
@@ -5426,6 +5445,7 @@ async function evaluateCandidate(options) {
5426
5445
  judgeProvider,
5427
5446
  agentTimeoutMs,
5428
5447
  candidateTrace,
5448
+ candidateTraceRef,
5429
5449
  candidateTraceSummary
5430
5450
  });
5431
5451
  const completedAt = nowFn();
@@ -5480,6 +5500,7 @@ async function runEvaluatorsForCase(options) {
5480
5500
  judgeProvider,
5481
5501
  agentTimeoutMs,
5482
5502
  candidateTrace,
5503
+ candidateTraceRef,
5483
5504
  candidateTraceSummary
5484
5505
  } = options;
5485
5506
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
@@ -5496,6 +5517,7 @@ async function runEvaluatorsForCase(options) {
5496
5517
  judgeProvider,
5497
5518
  agentTimeoutMs,
5498
5519
  candidateTrace,
5520
+ candidateTraceRef,
5499
5521
  candidateTraceSummary
5500
5522
  });
5501
5523
  }
@@ -5514,6 +5536,7 @@ async function runEvaluatorsForCase(options) {
5514
5536
  now,
5515
5537
  judgeProvider,
5516
5538
  candidateTrace,
5539
+ candidateTraceRef,
5517
5540
  candidateTraceSummary
5518
5541
  });
5519
5542
  return { score };
@@ -5532,6 +5555,7 @@ async function runEvaluatorList(options) {
5532
5555
  judgeProvider,
5533
5556
  agentTimeoutMs,
5534
5557
  candidateTrace,
5558
+ candidateTraceRef,
5535
5559
  candidateTraceSummary
5536
5560
  } = options;
5537
5561
  const scored = [];
@@ -5578,7 +5602,9 @@ async function runEvaluatorList(options) {
5578
5602
  provider,
5579
5603
  attempt,
5580
5604
  promptInputs,
5581
- now
5605
+ now,
5606
+ candidateTraceRef,
5607
+ candidateTraceSummary
5582
5608
  });
5583
5609
  const weight = evaluator.weight ?? 1;
5584
5610
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -5616,8 +5642,6 @@ async function runEvaluatorList(options) {
5616
5642
  return new ToolTrajectoryEvaluator({
5617
5643
  config: memberConfig
5618
5644
  });
5619
- case "expected_tool_calls":
5620
- return new ExpectedToolCallsEvaluator();
5621
5645
  default: {
5622
5646
  const unknownConfig = memberConfig;
5623
5647
  throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -5667,32 +5691,7 @@ async function runEvaluatorList(options) {
5667
5691
  promptInputs,
5668
5692
  now,
5669
5693
  candidateTrace,
5670
- candidateTraceSummary
5671
- });
5672
- const weight = evaluator.weight ?? 1;
5673
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
5674
- evaluatorResults.push({
5675
- name: evaluator.name,
5676
- type: evaluator.type,
5677
- score: score2.score,
5678
- weight,
5679
- verdict: score2.verdict,
5680
- hits: score2.hits,
5681
- misses: score2.misses,
5682
- reasoning: score2.reasoning
5683
- });
5684
- }
5685
- if (evaluator.type === "expected_tool_calls") {
5686
- const expectedToolCallsEvaluator = new ExpectedToolCallsEvaluator();
5687
- const score2 = expectedToolCallsEvaluator.evaluate({
5688
- evalCase,
5689
- candidate,
5690
- target,
5691
- provider,
5692
- attempt,
5693
- promptInputs,
5694
- now,
5695
- candidateTrace,
5694
+ candidateTraceRef,
5696
5695
  candidateTraceSummary
5697
5696
  });
5698
5697
  const weight = evaluator.weight ?? 1;
@@ -6065,7 +6064,6 @@ function createAgentKernel() {
6065
6064
  0 && (module.exports = {
6066
6065
  CodeEvaluator,
6067
6066
  CompositeEvaluator,
6068
- ExpectedToolCallsEvaluator,
6069
6067
  LlmJudgeEvaluator,
6070
6068
  TEST_MESSAGE_ROLES,
6071
6069
  ToolTrajectoryEvaluator,
@@ -6083,7 +6081,6 @@ function createAgentKernel() {
6083
6081
  generateRubrics,
6084
6082
  getHitCount,
6085
6083
  isEvaluatorKind,
6086
- isExpectedToolCall,
6087
6084
  isGuidelineFile,
6088
6085
  isJsonObject,
6089
6086
  isJsonValue,