@agentv/core 1.0.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -9,7 +9,7 @@ import {
9
9
  readTextFile,
10
10
  resolveFileReference,
11
11
  resolveTargetDefinition
12
- } from "./chunk-V3JCB3HI.js";
12
+ } from "./chunk-4A6L2F6L.js";
13
13
 
14
14
  // src/evaluation/types.ts
15
15
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -47,18 +47,23 @@ function isTestMessage(value) {
47
47
  if (typeof candidate.content === "string") {
48
48
  return true;
49
49
  }
50
- if (!Array.isArray(candidate.content)) {
51
- return false;
50
+ if (Array.isArray(candidate.content) && candidate.content.every(isJsonObject)) {
51
+ return true;
52
+ }
53
+ if (Array.isArray(candidate.tool_calls) && candidate.tool_calls.length > 0) {
54
+ return true;
52
55
  }
53
- return candidate.content.every(isJsonObject);
56
+ if (isJsonObject(candidate.content)) {
57
+ return true;
58
+ }
59
+ return false;
54
60
  }
55
61
  var EVALUATOR_KIND_VALUES = [
56
62
  "code_judge",
57
63
  "llm_judge",
58
64
  "rubric",
59
65
  "composite",
60
- "tool_trajectory",
61
- "expected_tool_calls"
66
+ "tool_trajectory"
62
67
  ];
63
68
  var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
64
69
  function isEvaluatorKind(value) {
@@ -79,13 +84,6 @@ function isTraceEvent(value) {
79
84
  const candidate = value;
80
85
  return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
81
86
  }
82
- function isExpectedToolCall(value) {
83
- if (typeof value !== "object" || value === null) {
84
- return false;
85
- }
86
- const candidate = value;
87
- return typeof candidate.tool === "string";
88
- }
89
87
  function computeTraceSummary(trace) {
90
88
  const toolCallCounts = {};
91
89
  let errorCount = 0;
@@ -582,15 +580,6 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
582
580
  });
583
581
  continue;
584
582
  }
585
- if (typeValue === "expected_tool_calls") {
586
- const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
587
- evaluators.push({
588
- name,
589
- type: "expected_tool_calls",
590
- ...weight2 !== void 0 ? { weight: weight2 } : {}
591
- });
592
- continue;
593
- }
594
583
  if (typeValue === "tool_trajectory") {
595
584
  const mode = asString2(rawEvaluator.mode);
596
585
  if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
@@ -785,6 +774,17 @@ async function processMessages(options) {
785
774
  }
786
775
  continue;
787
776
  }
777
+ if (isJsonObject(content)) {
778
+ const rendered = JSON.stringify(content, null, 2);
779
+ segments.push({ type: "text", value: rendered });
780
+ if (textParts) {
781
+ textParts.push(rendered);
782
+ }
783
+ continue;
784
+ }
785
+ if (!Array.isArray(content)) {
786
+ continue;
787
+ }
788
788
  for (const rawSegment of content) {
789
789
  if (!isJsonObject(rawSegment)) {
790
790
  continue;
@@ -845,63 +845,6 @@ async function processMessages(options) {
845
845
  }
846
846
  return segments;
847
847
  }
848
- async function resolveAssistantContent(content, searchRoots, verbose) {
849
- if (typeof content === "string") {
850
- return content;
851
- }
852
- if (!content) {
853
- return "";
854
- }
855
- const parts = [];
856
- for (const entry of content) {
857
- if (typeof entry === "string") {
858
- parts.push({ content: entry, isFile: false });
859
- continue;
860
- }
861
- if (!isJsonObject(entry)) {
862
- continue;
863
- }
864
- const segmentType = asString3(entry.type);
865
- if (segmentType === "file") {
866
- const rawValue = asString3(entry.value);
867
- if (!rawValue) {
868
- continue;
869
- }
870
- const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
871
- rawValue,
872
- searchRoots
873
- );
874
- if (!resolvedPath) {
875
- const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
876
- logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
877
- continue;
878
- }
879
- try {
880
- const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
881
- parts.push({ content: fileContent, isFile: true, displayPath });
882
- if (verbose) {
883
- console.log(` [Expected Assistant File] Found: ${displayPath}`);
884
- console.log(` Resolved to: ${resolvedPath}`);
885
- }
886
- } catch (error) {
887
- logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
888
- }
889
- continue;
890
- }
891
- const textValue = asString3(entry.text);
892
- if (typeof textValue === "string") {
893
- parts.push({ content: textValue, isFile: false });
894
- continue;
895
- }
896
- const valueValue = asString3(entry.value);
897
- if (typeof valueValue === "string") {
898
- parts.push({ content: valueValue, isFile: false });
899
- continue;
900
- }
901
- parts.push({ content: JSON.stringify(entry), isFile: false });
902
- }
903
- return formatFileContents(parts);
904
- }
905
848
  function asString3(value) {
906
849
  return typeof value === "string" ? value : void 0;
907
850
  }
@@ -934,14 +877,15 @@ ${detailBlock}${ANSI_RESET4}`);
934
877
  }
935
878
  }
936
879
  async function processExpectedMessages(options) {
937
- const { messages, searchRoots, repoRootPath, verbose } = options;
880
+ const { messages, searchRoots, verbose } = options;
938
881
  const segments = [];
939
882
  for (const message of messages) {
883
+ const extendedMessage = message;
940
884
  const segment = {
941
885
  role: message.role
942
886
  };
943
- if (message.role === "assistant" && message.tool_calls !== void 0) {
944
- segment.tool_calls = message.tool_calls;
887
+ if (extendedMessage.name) {
888
+ segment.name = extendedMessage.name;
945
889
  }
946
890
  const content = message.content;
947
891
  if (typeof content === "string") {
@@ -989,6 +933,13 @@ async function processExpectedMessages(options) {
989
933
  processedContent.push(cloneJsonObject(rawSegment));
990
934
  }
991
935
  segment.content = processedContent;
936
+ } else if (isJsonObject(content)) {
937
+ segment.content = cloneJsonObject(content);
938
+ }
939
+ if (extendedMessage.tool_calls && Array.isArray(extendedMessage.tool_calls)) {
940
+ segment.tool_calls = extendedMessage.tool_calls.map(
941
+ (tc) => isJsonObject(tc) ? cloneJsonObject(tc) : tc
942
+ );
992
943
  }
993
944
  segments.push(segment);
994
945
  }
@@ -1060,6 +1011,11 @@ async function buildPromptInputs(testCase, mode = "lm") {
1060
1011
  }
1061
1012
  }
1062
1013
  }
1014
+ } else if (isJsonObject(message.content)) {
1015
+ const rendered = JSON.stringify(message.content, null, 2);
1016
+ if (rendered.trim().length > 0) {
1017
+ messageSegments.push({ type: "text", value: rendered });
1018
+ }
1063
1019
  }
1064
1020
  segmentsByMessage.push(messageSegments);
1065
1021
  }
@@ -1283,9 +1239,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1283
1239
  logError(`No valid expected message found for eval case: ${id}`);
1284
1240
  continue;
1285
1241
  }
1286
- if (expectedMessages.length > 1) {
1287
- logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
1288
- }
1289
1242
  const guidelinePaths = [];
1290
1243
  const inputTextParts = [];
1291
1244
  const inputSegments = await processMessages({
@@ -1305,8 +1258,19 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1305
1258
  verbose
1306
1259
  }) : [];
1307
1260
  const codeSnippets = extractCodeBlocks(inputSegments);
1308
- const expectedContent = expectedMessages[0]?.content;
1309
- const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
1261
+ let referenceAnswer = "";
1262
+ if (outputSegments.length > 1) {
1263
+ referenceAnswer = JSON.stringify(outputSegments, null, 2);
1264
+ } else if (outputSegments.length === 1) {
1265
+ const singleMessage = outputSegments[0];
1266
+ if (typeof singleMessage.content === "string") {
1267
+ referenceAnswer = singleMessage.content;
1268
+ } else if (singleMessage.content) {
1269
+ referenceAnswer = JSON.stringify(singleMessage, null, 2);
1270
+ } else if (singleMessage.tool_calls) {
1271
+ referenceAnswer = JSON.stringify(singleMessage, null, 2);
1272
+ }
1273
+ }
1310
1274
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
1311
1275
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
1312
1276
  let evaluators;
@@ -1361,7 +1325,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
1361
1325
  question,
1362
1326
  input_messages: inputMessages,
1363
1327
  input_segments: inputSegments,
1364
- expected_segments: outputSegments,
1328
+ expected_messages: outputSegments,
1365
1329
  reference_answer: referenceAnswer,
1366
1330
  guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
1367
1331
  guideline_patterns: guidelinePatterns,
@@ -1785,7 +1749,7 @@ var CliProvider = class {
1785
1749
  id;
1786
1750
  kind = "cli";
1787
1751
  targetName;
1788
- supportsBatch = false;
1752
+ supportsBatch = true;
1789
1753
  config;
1790
1754
  runCommand;
1791
1755
  verbose;
@@ -1805,6 +1769,11 @@ var CliProvider = class {
1805
1769
  const outputFilePath = generateOutputFilePath(request.evalCaseId);
1806
1770
  const templateValues = buildTemplateValues(request, this.config, outputFilePath);
1807
1771
  const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
1772
+ if (this.verbose) {
1773
+ console.log(
1774
+ `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
1775
+ );
1776
+ }
1808
1777
  const result = await this.runCommand(renderedCommand, {
1809
1778
  cwd: this.config.cwd,
1810
1779
  env: process.env,
@@ -1839,6 +1808,114 @@ var CliProvider = class {
1839
1808
  }
1840
1809
  };
1841
1810
  }
1811
+ async invokeBatch(requests) {
1812
+ if (requests.length === 0) {
1813
+ return [];
1814
+ }
1815
+ for (const request of requests) {
1816
+ if (request.signal?.aborted) {
1817
+ throw new Error("CLI provider batch request was aborted before execution");
1818
+ }
1819
+ }
1820
+ const controller = new AbortController();
1821
+ for (const request of requests) {
1822
+ request.signal?.addEventListener("abort", () => controller.abort(), { once: true });
1823
+ }
1824
+ await this.ensureHealthy(controller.signal);
1825
+ const outputFilePath = generateOutputFilePath("batch", ".jsonl");
1826
+ const batchInputFiles = [];
1827
+ for (const request of requests) {
1828
+ if (request.inputFiles && request.inputFiles.length > 0) {
1829
+ batchInputFiles.push(...request.inputFiles);
1830
+ }
1831
+ }
1832
+ const templateValues = buildTemplateValues(
1833
+ {
1834
+ question: "",
1835
+ guidelines: "",
1836
+ inputFiles: batchInputFiles,
1837
+ evalCaseId: "batch",
1838
+ attempt: 0
1839
+ },
1840
+ this.config,
1841
+ outputFilePath
1842
+ );
1843
+ const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
1844
+ if (this.verbose) {
1845
+ console.log(
1846
+ `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
1847
+ );
1848
+ }
1849
+ const result = await this.runCommand(renderedCommand, {
1850
+ cwd: this.config.cwd,
1851
+ env: process.env,
1852
+ timeoutMs: this.config.timeoutMs,
1853
+ signal: controller.signal
1854
+ });
1855
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
1856
+ if (controller.signal.aborted) {
1857
+ throw new Error("CLI provider request was aborted");
1858
+ }
1859
+ if (result.timedOut) {
1860
+ throw new Error(
1861
+ `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
1862
+ );
1863
+ }
1864
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
1865
+ const detail = result.stderr.trim() || result.stdout.trim();
1866
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
1867
+ throw new Error(message);
1868
+ }
1869
+ const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
1870
+ const recordsById = this.parseJsonlBatchOutput(responseContent);
1871
+ const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
1872
+ const missingIds = requestedIds.filter((id) => !recordsById.has(id));
1873
+ if (missingIds.length > 0) {
1874
+ throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
1875
+ }
1876
+ const responses = requests.map((request) => {
1877
+ const evalCaseId = request.evalCaseId;
1878
+ if (!evalCaseId) {
1879
+ return {
1880
+ text: "",
1881
+ raw: {
1882
+ command: renderedCommand,
1883
+ stderr: result.stderr,
1884
+ exitCode: result.exitCode ?? 0,
1885
+ cwd: this.config.cwd,
1886
+ outputFile: outputFilePath
1887
+ }
1888
+ };
1889
+ }
1890
+ const parsed = recordsById.get(evalCaseId);
1891
+ if (!parsed) {
1892
+ return {
1893
+ text: "",
1894
+ raw: {
1895
+ command: renderedCommand,
1896
+ stderr: result.stderr,
1897
+ exitCode: result.exitCode ?? 0,
1898
+ cwd: this.config.cwd,
1899
+ outputFile: outputFilePath
1900
+ }
1901
+ };
1902
+ }
1903
+ return {
1904
+ text: parsed.text,
1905
+ trace: parsed.trace,
1906
+ traceRef: parsed.traceRef,
1907
+ raw: {
1908
+ command: renderedCommand,
1909
+ stderr: result.stderr,
1910
+ exitCode: result.exitCode ?? 0,
1911
+ cwd: this.config.cwd,
1912
+ outputFile: outputFilePath,
1913
+ recordId: evalCaseId
1914
+ }
1915
+ };
1916
+ });
1917
+ return responses;
1918
+ }
1842
1919
  /**
1843
1920
  * Parse output content from CLI.
1844
1921
  * If the content is valid JSON with a 'text' field, extract text and optional trace.
@@ -1864,6 +1941,38 @@ var CliProvider = class {
1864
1941
  const validEvents = trace.filter(isTraceEvent);
1865
1942
  return validEvents.length > 0 ? validEvents : void 0;
1866
1943
  }
1944
+ parseJsonlBatchOutput(content) {
1945
+ const records = /* @__PURE__ */ new Map();
1946
+ const lines = content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
1947
+ for (const line of lines) {
1948
+ let parsed;
1949
+ try {
1950
+ parsed = JSON.parse(line);
1951
+ } catch (error) {
1952
+ const reason = error instanceof Error ? error.message : String(error);
1953
+ throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
1954
+ }
1955
+ if (typeof parsed !== "object" || parsed === null) {
1956
+ throw new Error("CLI batch output JSONL line must be an object");
1957
+ }
1958
+ const obj = parsed;
1959
+ const id = typeof obj.id === "string" ? obj.id : void 0;
1960
+ if (!id || id.trim().length === 0) {
1961
+ throw new Error("CLI batch output JSONL line missing required string field: id");
1962
+ }
1963
+ if (records.has(id)) {
1964
+ throw new Error(`CLI batch output contains duplicate id: ${id}`);
1965
+ }
1966
+ const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
1967
+ const traceRef = typeof obj.traceRef === "string" ? obj.traceRef : typeof obj.trace_ref === "string" ? obj.trace_ref : void 0;
1968
+ records.set(id, {
1969
+ text,
1970
+ trace: this.parseTrace(obj.trace),
1971
+ traceRef
1972
+ });
1973
+ }
1974
+ return records;
1975
+ }
1867
1976
  async readAndCleanupOutputFile(filePath) {
1868
1977
  try {
1869
1978
  const content = await readTextFile(filePath);
@@ -1925,7 +2034,7 @@ var CliProvider = class {
1925
2034
  );
1926
2035
  if (this.verbose) {
1927
2036
  console.log(
1928
- `[cli-provider:${this.targetName}] (healthcheck) CLI_EVALS_DIR=${process.env.CLI_EVALS_DIR ?? ""} cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
2037
+ `[cli-provider:${this.targetName}] (healthcheck) cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
1929
2038
  );
1930
2039
  }
1931
2040
  const result = await this.runCommand(renderedCommand, {
@@ -1993,11 +2102,11 @@ function shellEscape(value) {
1993
2102
  }
1994
2103
  return `'${value.replace(/'/g, `'"'"'`)}'`;
1995
2104
  }
1996
- function generateOutputFilePath(evalCaseId) {
2105
+ function generateOutputFilePath(evalCaseId, extension = ".json") {
1997
2106
  const safeEvalId = evalCaseId || "unknown";
1998
2107
  const timestamp = Date.now();
1999
2108
  const random = Math.random().toString(36).substring(2, 9);
2000
- return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
2109
+ return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
2001
2110
  }
2002
2111
  function formatTimeoutSuffix(timeoutMs) {
2003
2112
  if (!timeoutMs || timeoutMs <= 0) {
@@ -3270,7 +3379,7 @@ import { generateText as generateText2 } from "ai";
3270
3379
  import { z } from "zod";
3271
3380
  var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
3272
3381
 
3273
- Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
3382
+ Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
3274
3383
 
3275
3384
  Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
3276
3385
 
@@ -3328,7 +3437,7 @@ var LlmJudgeEvaluator = class {
3328
3437
  const variables = {
3329
3438
  [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
3330
3439
  [TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
3331
- context.evalCase.expected_segments,
3440
+ context.evalCase.expected_messages,
3332
3441
  null,
3333
3442
  2
3334
3443
  ),
@@ -3541,13 +3650,16 @@ var CodeEvaluator = class {
3541
3650
  {
3542
3651
  question: context.evalCase.question,
3543
3652
  expected_outcome: context.evalCase.expected_outcome,
3653
+ expected_messages: context.evalCase.expected_messages,
3544
3654
  reference_answer: context.evalCase.reference_answer,
3545
3655
  candidate_answer: context.candidate,
3546
3656
  guideline_files: context.evalCase.guideline_paths,
3547
3657
  input_files: context.evalCase.file_paths.filter(
3548
3658
  (path13) => !context.evalCase.guideline_paths.includes(path13)
3549
3659
  ),
3550
- input_messages: context.evalCase.input_messages
3660
+ input_messages: context.evalCase.input_messages,
3661
+ candidate_trace_file: context.candidateTraceRef ?? null,
3662
+ candidate_trace_summary: context.candidateTraceSummary ?? null
3551
3663
  },
3552
3664
  null,
3553
3665
  2
@@ -3813,105 +3925,6 @@ var ToolTrajectoryEvaluator = class {
3813
3925
  };
3814
3926
  }
3815
3927
  };
3816
- var ExpectedToolCallsEvaluator = class {
3817
- kind = "expected_tool_calls";
3818
- evaluate(context) {
3819
- const { candidateTrace, evalCase } = context;
3820
- const expectedSegments = evalCase.expected_segments;
3821
- const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
3822
- if (expectedToolCalls.length === 0) {
3823
- return {
3824
- score: 1,
3825
- verdict: "pass",
3826
- hits: ["No tool_calls specified in expected_messages"],
3827
- misses: [],
3828
- expectedAspectCount: 1
3829
- };
3830
- }
3831
- if (!candidateTrace || candidateTrace.length === 0) {
3832
- return {
3833
- score: 0,
3834
- verdict: "fail",
3835
- hits: [],
3836
- misses: ["No trace available to validate tool_calls"],
3837
- expectedAspectCount: expectedToolCalls.length
3838
- };
3839
- }
3840
- const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
3841
- return this.validateToolCalls(expectedToolCalls, actualToolCalls);
3842
- }
3843
- extractExpectedToolCalls(segments) {
3844
- if (!segments) {
3845
- return [];
3846
- }
3847
- const toolCalls = [];
3848
- for (const segment of segments) {
3849
- const role = segment.role;
3850
- const segmentToolCalls = segment.tool_calls;
3851
- if (role === "assistant" && Array.isArray(segmentToolCalls)) {
3852
- for (const tc of segmentToolCalls) {
3853
- if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
3854
- const toolCall = tc;
3855
- toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
3856
- }
3857
- }
3858
- }
3859
- }
3860
- return toolCalls;
3861
- }
3862
- validateToolCalls(expected, actual) {
3863
- const hits = [];
3864
- const misses = [];
3865
- for (let i = 0; i < expected.length; i++) {
3866
- const expectedCall = expected[i];
3867
- const actualCall = actual[i];
3868
- if (!actualCall) {
3869
- misses.push(
3870
- `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
3871
- );
3872
- continue;
3873
- }
3874
- if (actualCall.name !== expectedCall.tool) {
3875
- misses.push(
3876
- `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
3877
- );
3878
- continue;
3879
- }
3880
- if (expectedCall.input !== void 0) {
3881
- if (!this.deepEquals(expectedCall.input, actualCall.input)) {
3882
- misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
3883
- continue;
3884
- }
3885
- }
3886
- hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
3887
- }
3888
- const totalChecks = expected.length || 1;
3889
- const score = hits.length / totalChecks;
3890
- return {
3891
- score,
3892
- verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
3893
- hits,
3894
- misses,
3895
- expectedAspectCount: totalChecks
3896
- };
3897
- }
3898
- deepEquals(a, b) {
3899
- if (a === b) return true;
3900
- if (typeof a !== typeof b) return false;
3901
- if (typeof a !== "object" || a === null || b === null) return false;
3902
- if (Array.isArray(a) && Array.isArray(b)) {
3903
- if (a.length !== b.length) return false;
3904
- return a.every((val, i) => this.deepEquals(val, b[i]));
3905
- }
3906
- if (Array.isArray(a) || Array.isArray(b)) return false;
3907
- const aObj = a;
3908
- const bObj = b;
3909
- const aKeys = Object.keys(aObj);
3910
- const bKeys = Object.keys(bObj);
3911
- if (aKeys.length !== bKeys.length) return false;
3912
- return aKeys.every((key) => this.deepEquals(aObj[key], bObj[key]));
3913
- }
3914
- };
3915
3928
  var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
3916
3929
  {{EVALUATOR_RESULTS_JSON}}
3917
3930
 
@@ -4673,6 +4686,7 @@ async function runEvalCase(options) {
4673
4686
  judgeProvider,
4674
4687
  agentTimeoutMs,
4675
4688
  candidateTrace,
4689
+ candidateTraceRef: providerResponse.traceRef,
4676
4690
  candidateTraceSummary
4677
4691
  });
4678
4692
  } catch (error) {
@@ -4692,6 +4706,7 @@ async function evaluateCandidate(options) {
4692
4706
  judgeProvider,
4693
4707
  agentTimeoutMs,
4694
4708
  candidateTrace,
4709
+ candidateTraceRef,
4695
4710
  candidateTraceSummary
4696
4711
  } = options;
4697
4712
  const gradeTimestamp = nowFn();
@@ -4707,6 +4722,7 @@ async function evaluateCandidate(options) {
4707
4722
  judgeProvider,
4708
4723
  agentTimeoutMs,
4709
4724
  candidateTrace,
4725
+ candidateTraceRef,
4710
4726
  candidateTraceSummary
4711
4727
  });
4712
4728
  const completedAt = nowFn();
@@ -4761,6 +4777,7 @@ async function runEvaluatorsForCase(options) {
4761
4777
  judgeProvider,
4762
4778
  agentTimeoutMs,
4763
4779
  candidateTrace,
4780
+ candidateTraceRef,
4764
4781
  candidateTraceSummary
4765
4782
  } = options;
4766
4783
  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
@@ -4777,6 +4794,7 @@ async function runEvaluatorsForCase(options) {
4777
4794
  judgeProvider,
4778
4795
  agentTimeoutMs,
4779
4796
  candidateTrace,
4797
+ candidateTraceRef,
4780
4798
  candidateTraceSummary
4781
4799
  });
4782
4800
  }
@@ -4795,6 +4813,7 @@ async function runEvaluatorsForCase(options) {
4795
4813
  now,
4796
4814
  judgeProvider,
4797
4815
  candidateTrace,
4816
+ candidateTraceRef,
4798
4817
  candidateTraceSummary
4799
4818
  });
4800
4819
  return { score };
@@ -4813,6 +4832,7 @@ async function runEvaluatorList(options) {
4813
4832
  judgeProvider,
4814
4833
  agentTimeoutMs,
4815
4834
  candidateTrace,
4835
+ candidateTraceRef,
4816
4836
  candidateTraceSummary
4817
4837
  } = options;
4818
4838
  const scored = [];
@@ -4859,7 +4879,9 @@ async function runEvaluatorList(options) {
4859
4879
  provider,
4860
4880
  attempt,
4861
4881
  promptInputs,
4862
- now
4882
+ now,
4883
+ candidateTraceRef,
4884
+ candidateTraceSummary
4863
4885
  });
4864
4886
  const weight = evaluator.weight ?? 1;
4865
4887
  scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -4897,8 +4919,6 @@ async function runEvaluatorList(options) {
4897
4919
  return new ToolTrajectoryEvaluator({
4898
4920
  config: memberConfig
4899
4921
  });
4900
- case "expected_tool_calls":
4901
- return new ExpectedToolCallsEvaluator();
4902
4922
  default: {
4903
4923
  const unknownConfig = memberConfig;
4904
4924
  throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -4948,32 +4968,7 @@ async function runEvaluatorList(options) {
4948
4968
  promptInputs,
4949
4969
  now,
4950
4970
  candidateTrace,
4951
- candidateTraceSummary
4952
- });
4953
- const weight = evaluator.weight ?? 1;
4954
- scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
4955
- evaluatorResults.push({
4956
- name: evaluator.name,
4957
- type: evaluator.type,
4958
- score: score2.score,
4959
- weight,
4960
- verdict: score2.verdict,
4961
- hits: score2.hits,
4962
- misses: score2.misses,
4963
- reasoning: score2.reasoning
4964
- });
4965
- }
4966
- if (evaluator.type === "expected_tool_calls") {
4967
- const expectedToolCallsEvaluator = new ExpectedToolCallsEvaluator();
4968
- const score2 = expectedToolCallsEvaluator.evaluate({
4969
- evalCase,
4970
- candidate,
4971
- target,
4972
- provider,
4973
- attempt,
4974
- promptInputs,
4975
- now,
4976
- candidateTrace,
4971
+ candidateTraceRef,
4977
4972
  candidateTraceSummary
4978
4973
  });
4979
4974
  const weight = evaluator.weight ?? 1;
@@ -5345,7 +5340,6 @@ function createAgentKernel() {
5345
5340
  export {
5346
5341
  CodeEvaluator,
5347
5342
  CompositeEvaluator,
5348
- ExpectedToolCallsEvaluator,
5349
5343
  LlmJudgeEvaluator,
5350
5344
  TEST_MESSAGE_ROLES,
5351
5345
  ToolTrajectoryEvaluator,
@@ -5363,7 +5357,6 @@ export {
5363
5357
  generateRubrics,
5364
5358
  getHitCount,
5365
5359
  isEvaluatorKind,
5366
- isExpectedToolCall,
5367
5360
  isGuidelineFile,
5368
5361
  isJsonObject,
5369
5362
  isJsonValue,