npm - @agentv/core - Versions diffs - 1.0.0 → 1.3.1 - Mend

@agentv/core 1.0.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/dist/{chunk-V3JCB3HI.js → chunk-4A6L2F6L.js} +11 -5
package/dist/chunk-4A6L2F6L.js.map +1 -0
package/dist/evaluation/validation/index.cjs +12 -44
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +13 -45
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +227 -230
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +10 -46
package/dist/index.d.ts +10 -46
package/dist/index.js +218 -225
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-V3JCB3HI.js.map +0 -1

package/dist/index.cjs CHANGED

@@ -32,7 +32,6 @@ var index_exports = {};
 __export(index_exports, {
   CodeEvaluator: () => CodeEvaluator,
   CompositeEvaluator: () => CompositeEvaluator,
-  ExpectedToolCallsEvaluator: () => ExpectedToolCallsEvaluator,
   LlmJudgeEvaluator: () => LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
   ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
@@ -50,7 +49,6 @@ __export(index_exports, {
   generateRubrics: () => generateRubrics,
   getHitCount: () => getHitCount,
   isEvaluatorKind: () => isEvaluatorKind,
-  isExpectedToolCall: () => isExpectedToolCall,
   isGuidelineFile: () => isGuidelineFile,
   isJsonObject: () => isJsonObject,
   isJsonValue: () => isJsonValue,
@@ -110,18 +108,23 @@ function isTestMessage(value) {
   if (typeof candidate.content === "string") {
     return true;
   }
-  if (!Array.isArray(candidate.content)) {
-    return false;
+  if (Array.isArray(candidate.content) && candidate.content.every(isJsonObject)) {
+    return true;
+  }
+  if (Array.isArray(candidate.tool_calls) && candidate.tool_calls.length > 0) {
+    return true;
   }
-  return candidate.content.every(isJsonObject);
+  if (isJsonObject(candidate.content)) {
+    return true;
+  }
+  return false;
 }
 var EVALUATOR_KIND_VALUES = [
   "code_judge",
   "llm_judge",
   "rubric",
   "composite",
-  "tool_trajectory",
-  "expected_tool_calls"
+  "tool_trajectory"
 ];
 var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
 function isEvaluatorKind(value) {
@@ -142,13 +145,6 @@ function isTraceEvent(value) {
   const candidate = value;
   return isTraceEventType(candidate.type) && typeof candidate.timestamp === "string";
 }
-function isExpectedToolCall(value) {
-  if (typeof value !== "object" || value === null) {
-    return false;
-  }
-  const candidate = value;
-  return typeof candidate.tool === "string";
-}
 function computeTraceSummary(trace) {
   const toolCallCounts = {};
   let errorCount = 0;
@@ -645,15 +641,6 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       });
       continue;
     }
-    if (typeValue === "expected_tool_calls") {
-      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
-      evaluators.push({
-        name,
-        type: "expected_tool_calls",
-        ...weight2 !== void 0 ? { weight: weight2 } : {}
-      });
-      continue;
-    }
     if (typeValue === "tool_trajectory") {
       const mode = asString2(rawEvaluator.mode);
       if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
@@ -848,6 +835,17 @@ async function processMessages(options) {
       }
       continue;
     }
+    if (isJsonObject(content)) {
+      const rendered = JSON.stringify(content, null, 2);
+      segments.push({ type: "text", value: rendered });
+      if (textParts) {
+        textParts.push(rendered);
+      }
+      continue;
+    }
+    if (!Array.isArray(content)) {
+      continue;
+    }
     for (const rawSegment of content) {
       if (!isJsonObject(rawSegment)) {
         continue;
@@ -908,63 +906,6 @@ async function processMessages(options) {
   }
   return segments;
 }
-async function resolveAssistantContent(content, searchRoots, verbose) {
-  if (typeof content === "string") {
-    return content;
-  }
-  if (!content) {
-    return "";
-  }
-  const parts = [];
-  for (const entry of content) {
-    if (typeof entry === "string") {
-      parts.push({ content: entry, isFile: false });
-      continue;
-    }
-    if (!isJsonObject(entry)) {
-      continue;
-    }
-    const segmentType = asString3(entry.type);
-    if (segmentType === "file") {
-      const rawValue = asString3(entry.value);
-      if (!rawValue) {
-        continue;
-      }
-      const { displayPath, resolvedPath, attempted } = await resolveFileReference(
-        rawValue,
-        searchRoots
-      );
-      if (!resolvedPath) {
-        const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
-        logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
-        continue;
-      }
-      try {
-        const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
-        parts.push({ content: fileContent, isFile: true, displayPath });
-        if (verbose) {
-          console.log(`  [Expected Assistant File] Found: ${displayPath}`);
-          console.log(`    Resolved to: ${resolvedPath}`);
-        }
-      } catch (error) {
-        logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
-      }
-      continue;
-    }
-    const textValue = asString3(entry.text);
-    if (typeof textValue === "string") {
-      parts.push({ content: textValue, isFile: false });
-      continue;
-    }
-    const valueValue = asString3(entry.value);
-    if (typeof valueValue === "string") {
-      parts.push({ content: valueValue, isFile: false });
-      continue;
-    }
-    parts.push({ content: JSON.stringify(entry), isFile: false });
-  }
-  return formatFileContents(parts);
-}
 function asString3(value) {
   return typeof value === "string" ? value : void 0;
 }
@@ -997,14 +938,15 @@ ${detailBlock}${ANSI_RESET4}`);
   }
 }
 async function processExpectedMessages(options) {
-  const { messages, searchRoots, repoRootPath, verbose } = options;
+  const { messages, searchRoots, verbose } = options;
   const segments = [];
   for (const message of messages) {
+    const extendedMessage = message;
     const segment = {
       role: message.role
     };
-    if (message.role === "assistant" && message.tool_calls !== void 0) {
-      segment.tool_calls = message.tool_calls;
+    if (extendedMessage.name) {
+      segment.name = extendedMessage.name;
     }
     const content = message.content;
     if (typeof content === "string") {
@@ -1052,6 +994,13 @@ async function processExpectedMessages(options) {
         processedContent.push(cloneJsonObject(rawSegment));
       }
       segment.content = processedContent;
+    } else if (isJsonObject(content)) {
+      segment.content = cloneJsonObject(content);
+    }
+    if (extendedMessage.tool_calls && Array.isArray(extendedMessage.tool_calls)) {
+      segment.tool_calls = extendedMessage.tool_calls.map(
+        (tc) => isJsonObject(tc) ? cloneJsonObject(tc) : tc
+      );
     }
     segments.push(segment);
   }
@@ -1123,6 +1072,11 @@ async function buildPromptInputs(testCase, mode = "lm") {
           }
         }
       }
+    } else if (isJsonObject(message.content)) {
+      const rendered = JSON.stringify(message.content, null, 2);
+      if (rendered.trim().length > 0) {
+        messageSegments.push({ type: "text", value: rendered });
+      }
     }
     segmentsByMessage.push(messageSegments);
   }
@@ -1346,9 +1300,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       logError(`No valid expected message found for eval case: ${id}`);
       continue;
     }
-    if (expectedMessages.length > 1) {
-      logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
-    }
     const guidelinePaths = [];
     const inputTextParts = [];
     const inputSegments = await processMessages({
@@ -1368,8 +1319,19 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       verbose
     }) : [];
     const codeSnippets = extractCodeBlocks(inputSegments);
-    const expectedContent = expectedMessages[0]?.content;
-    const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
+    let referenceAnswer = "";
+    if (outputSegments.length > 1) {
+      referenceAnswer = JSON.stringify(outputSegments, null, 2);
+    } else if (outputSegments.length === 1) {
+      const singleMessage = outputSegments[0];
+      if (typeof singleMessage.content === "string") {
+        referenceAnswer = singleMessage.content;
+      } else if (singleMessage.content) {
+        referenceAnswer = JSON.stringify(singleMessage, null, 2);
+      } else if (singleMessage.tool_calls) {
+        referenceAnswer = JSON.stringify(singleMessage, null, 2);
+      }
+    }
     const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
     const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
     let evaluators;
@@ -1424,7 +1386,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       question,
       input_messages: inputMessages,
       input_segments: inputSegments,
-      expected_segments: outputSegments,
+      expected_messages: outputSegments,
       reference_answer: referenceAnswer,
       guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
       guideline_patterns: guidelinePatterns,
@@ -1963,7 +1925,7 @@ var CliProvider = class {
   id;
   kind = "cli";
   targetName;
-  supportsBatch = false;
+  supportsBatch = true;
   config;
   runCommand;
   verbose;
@@ -1983,6 +1945,11 @@ var CliProvider = class {
     const outputFilePath = generateOutputFilePath(request.evalCaseId);
     const templateValues = buildTemplateValues(request, this.config, outputFilePath);
     const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
+    if (this.verbose) {
+      console.log(
+        `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
+      );
+    }
     const result = await this.runCommand(renderedCommand, {
       cwd: this.config.cwd,
       env: process.env,
@@ -2017,6 +1984,114 @@ var CliProvider = class {
       }
     };
   }
+  async invokeBatch(requests) {
+    if (requests.length === 0) {
+      return [];
+    }
+    for (const request of requests) {
+      if (request.signal?.aborted) {
+        throw new Error("CLI provider batch request was aborted before execution");
+      }
+    }
+    const controller = new AbortController();
+    for (const request of requests) {
+      request.signal?.addEventListener("abort", () => controller.abort(), { once: true });
+    }
+    await this.ensureHealthy(controller.signal);
+    const outputFilePath = generateOutputFilePath("batch", ".jsonl");
+    const batchInputFiles = [];
+    for (const request of requests) {
+      if (request.inputFiles && request.inputFiles.length > 0) {
+        batchInputFiles.push(...request.inputFiles);
+      }
+    }
+    const templateValues = buildTemplateValues(
+      {
+        question: "",
+        guidelines: "",
+        inputFiles: batchInputFiles,
+        evalCaseId: "batch",
+        attempt: 0
+      },
+      this.config,
+      outputFilePath
+    );
+    const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
+    if (this.verbose) {
+      console.log(
+        `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
+      );
+    }
+    const result = await this.runCommand(renderedCommand, {
+      cwd: this.config.cwd,
+      env: process.env,
+      timeoutMs: this.config.timeoutMs,
+      signal: controller.signal
+    });
+    if (result.failed || (result.exitCode ?? 0) !== 0) {
+      if (controller.signal.aborted) {
+        throw new Error("CLI provider request was aborted");
+      }
+      if (result.timedOut) {
+        throw new Error(
+          `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
+        );
+      }
+      const codeText = result.exitCode !== null ? result.exitCode : "unknown";
+      const detail = result.stderr.trim() || result.stdout.trim();
+      const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
+      throw new Error(message);
+    }
+    const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
+    const recordsById = this.parseJsonlBatchOutput(responseContent);
+    const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
+    const missingIds = requestedIds.filter((id) => !recordsById.has(id));
+    if (missingIds.length > 0) {
+      throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
+    }
+    const responses = requests.map((request) => {
+      const evalCaseId = request.evalCaseId;
+      if (!evalCaseId) {
+        return {
+          text: "",
+          raw: {
+            command: renderedCommand,
+            stderr: result.stderr,
+            exitCode: result.exitCode ?? 0,
+            cwd: this.config.cwd,
+            outputFile: outputFilePath
+          }
+        };
+      }
+      const parsed = recordsById.get(evalCaseId);
+      if (!parsed) {
+        return {
+          text: "",
+          raw: {
+            command: renderedCommand,
+            stderr: result.stderr,
+            exitCode: result.exitCode ?? 0,
+            cwd: this.config.cwd,
+            outputFile: outputFilePath
+          }
+        };
+      }
+      return {
+        text: parsed.text,
+        trace: parsed.trace,
+        traceRef: parsed.traceRef,
+        raw: {
+          command: renderedCommand,
+          stderr: result.stderr,
+          exitCode: result.exitCode ?? 0,
+          cwd: this.config.cwd,
+          outputFile: outputFilePath,
+          recordId: evalCaseId
+        }
+      };
+    });
+    return responses;
+  }
   /**
    * Parse output content from CLI.
    * If the content is valid JSON with a 'text' field, extract text and optional trace.
@@ -2042,6 +2117,38 @@ var CliProvider = class {
     const validEvents = trace.filter(isTraceEvent);
     return validEvents.length > 0 ? validEvents : void 0;
   }
+  parseJsonlBatchOutput(content) {
+    const records = /* @__PURE__ */ new Map();
+    const lines = content.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
+    for (const line of lines) {
+      let parsed;
+      try {
+        parsed = JSON.parse(line);
+      } catch (error) {
+        const reason = error instanceof Error ? error.message : String(error);
+        throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
+      }
+      if (typeof parsed !== "object" || parsed === null) {
+        throw new Error("CLI batch output JSONL line must be an object");
+      }
+      const obj = parsed;
+      const id = typeof obj.id === "string" ? obj.id : void 0;
+      if (!id || id.trim().length === 0) {
+        throw new Error("CLI batch output JSONL line missing required string field: id");
+      }
+      if (records.has(id)) {
+        throw new Error(`CLI batch output contains duplicate id: ${id}`);
+      }
+      const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
+      const traceRef = typeof obj.traceRef === "string" ? obj.traceRef : typeof obj.trace_ref === "string" ? obj.trace_ref : void 0;
+      records.set(id, {
+        text,
+        trace: this.parseTrace(obj.trace),
+        traceRef
+      });
+    }
+    return records;
+  }
   async readAndCleanupOutputFile(filePath) {
     try {
       const content = await readTextFile(filePath);
@@ -2103,7 +2210,7 @@ var CliProvider = class {
     );
     if (this.verbose) {
       console.log(
-        `[cli-provider:${this.targetName}] (healthcheck) CLI_EVALS_DIR=${process.env.CLI_EVALS_DIR ?? ""} cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
+        `[cli-provider:${this.targetName}] (healthcheck) cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
       );
     }
     const result = await this.runCommand(renderedCommand, {
@@ -2171,11 +2278,11 @@ function shellEscape(value) {
   }
   return `'${value.replace(/'/g, `'"'"'`)}'`;
 }
-function generateOutputFilePath(evalCaseId) {
+function generateOutputFilePath(evalCaseId, extension = ".json") {
   const safeEvalId = evalCaseId || "unknown";
   const timestamp = Date.now();
   const random = Math.random().toString(36).substring(2, 9);
-  return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
+  return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
 }
 function formatTimeoutSuffix(timeoutMs) {
   if (!timeoutMs || timeoutMs <= 0) {
@@ -3355,10 +3462,14 @@ function resolveCliConfig(target, env, evalFilePath) {
   const filesFormat = resolveOptionalLiteralString(
     target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
   );
+  const verbose = resolveOptionalBoolean(target.verbose ?? target.cli_verbose ?? target.cliVerbose);
   let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
     allowLiteral: true,
     optionalEnv: true
   });
+  if (cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd)) {
+    cwd = import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd);
+  }
   if (!cwd && evalFilePath) {
     cwd = import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath));
   }
@@ -3366,7 +3477,7 @@ function resolveCliConfig(target, env, evalFilePath) {
     target.timeout_seconds ?? target.timeoutSeconds,
     `${target.name} timeout`
   );
-  const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name);
+  const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name, evalFilePath);
   const commandTemplate = resolveString(
     commandTemplateSource,
     env,
@@ -3379,7 +3490,8 @@ function resolveCliConfig(target, env, evalFilePath) {
     filesFormat,
     cwd,
     timeoutMs,
-    healthcheck
+    healthcheck,
+    verbose
   };
 }
 function resolveTimeoutMs(source, description) {
@@ -3392,7 +3504,7 @@ function resolveTimeoutMs(source, description) {
   }
   return Math.floor(seconds * 1e3);
 }
-function resolveCliHealthcheck(source, env, targetName) {
+function resolveCliHealthcheck(source, env, targetName, evalFilePath) {
   if (source === void 0 || source === null) {
     return void 0;
   }
@@ -3425,11 +3537,12 @@ function resolveCliHealthcheck(source, env, targetName) {
       allowLiteral: true,
       optionalEnv: true
     });
+    const resolvedCwd = cwd && evalFilePath && !import_node_path11.default.isAbsolute(cwd) ? import_node_path11.default.resolve(import_node_path11.default.dirname(import_node_path11.default.resolve(evalFilePath)), cwd) : cwd;
     return {
       type: "command",
       commandTemplate,
       timeoutMs,
-      cwd
+      cwd: resolvedCwd
     };
   }
   throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
@@ -3979,7 +4092,7 @@ var import_ai2 = require("ai");
 var import_zod2 = require("zod");
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
-Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
+Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
 Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
@@ -4037,7 +4150,7 @@ var LlmJudgeEvaluator = class {
     const variables = {
       [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
       [TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
-        context.evalCase.expected_segments,
+        context.evalCase.expected_messages,
         null,
         2
       ),
@@ -4250,13 +4363,16 @@ var CodeEvaluator = class {
       {
         question: context.evalCase.question,
         expected_outcome: context.evalCase.expected_outcome,
+        expected_messages: context.evalCase.expected_messages,
         reference_answer: context.evalCase.reference_answer,
         candidate_answer: context.candidate,
         guideline_files: context.evalCase.guideline_paths,
         input_files: context.evalCase.file_paths.filter(
           (path15) => !context.evalCase.guideline_paths.includes(path15)
         ),
-        input_messages: context.evalCase.input_messages
+        input_messages: context.evalCase.input_messages,
+        candidate_trace_file: context.candidateTraceRef ?? null,
+        candidate_trace_summary: context.candidateTraceSummary ?? null
       },
       null,
       2
@@ -4522,105 +4638,6 @@ var ToolTrajectoryEvaluator = class {
     };
   }
 };
-var ExpectedToolCallsEvaluator = class {
-  kind = "expected_tool_calls";
-  evaluate(context) {
-    const { candidateTrace, evalCase } = context;
-    const expectedSegments = evalCase.expected_segments;
-    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
-    if (expectedToolCalls.length === 0) {
-      return {
-        score: 1,
-        verdict: "pass",
-        hits: ["No tool_calls specified in expected_messages"],
-        misses: [],
-        expectedAspectCount: 1
-      };
-    }
-    if (!candidateTrace || candidateTrace.length === 0) {
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: ["No trace available to validate tool_calls"],
-        expectedAspectCount: expectedToolCalls.length
-      };
-    }
-    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
-    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
-  }
-  extractExpectedToolCalls(segments) {
-    if (!segments) {
-      return [];
-    }
-    const toolCalls = [];
-    for (const segment of segments) {
-      const role = segment.role;
-      const segmentToolCalls = segment.tool_calls;
-      if (role === "assistant" && Array.isArray(segmentToolCalls)) {
-        for (const tc of segmentToolCalls) {
-          if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
-            const toolCall = tc;
-            toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
-          }
-        }
-      }
-    }
-    return toolCalls;
-  }
-  validateToolCalls(expected, actual) {
-    const hits = [];
-    const misses = [];
-    for (let i = 0; i < expected.length; i++) {
-      const expectedCall = expected[i];
-      const actualCall = actual[i];
-      if (!actualCall) {
-        misses.push(
-          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
-        );
-        continue;
-      }
-      if (actualCall.name !== expectedCall.tool) {
-        misses.push(
-          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
-        );
-        continue;
-      }
-      if (expectedCall.input !== void 0) {
-        if (!this.deepEquals(expectedCall.input, actualCall.input)) {
-          misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
-          continue;
-        }
-      }
-      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
-    }
-    const totalChecks = expected.length || 1;
-    const score = hits.length / totalChecks;
-    return {
-      score,
-      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
-      hits,
-      misses,
-      expectedAspectCount: totalChecks
-    };
-  }
-  deepEquals(a, b) {
-    if (a === b) return true;
-    if (typeof a !== typeof b) return false;
-    if (typeof a !== "object" || a === null || b === null) return false;
-    if (Array.isArray(a) && Array.isArray(b)) {
-      if (a.length !== b.length) return false;
-      return a.every((val, i) => this.deepEquals(val, b[i]));
-    }
-    if (Array.isArray(a) || Array.isArray(b)) return false;
-    const aObj = a;
-    const bObj = b;
-    const aKeys = Object.keys(aObj);
-    const bKeys = Object.keys(bObj);
-    if (aKeys.length !== bKeys.length) return false;
-    return aKeys.every((key) => this.deepEquals(aObj[key], bObj[key]));
-  }
-};
 var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
 {{EVALUATOR_RESULTS_JSON}}
@@ -5392,6 +5409,7 @@ async function runEvalCase(options) {
       judgeProvider,
       agentTimeoutMs,
       candidateTrace,
+      candidateTraceRef: providerResponse.traceRef,
       candidateTraceSummary
     });
   } catch (error) {
@@ -5411,6 +5429,7 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   } = options;
   const gradeTimestamp = nowFn();
@@ -5426,6 +5445,7 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   });
   const completedAt = nowFn();
@@ -5480,6 +5500,7 @@ async function runEvaluatorsForCase(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   } = options;
   if (evalCase.evaluators && evalCase.evaluators.length > 0) {
@@ -5496,6 +5517,7 @@ async function runEvaluatorsForCase(options) {
       judgeProvider,
       agentTimeoutMs,
       candidateTrace,
+      candidateTraceRef,
       candidateTraceSummary
     });
   }
@@ -5514,6 +5536,7 @@ async function runEvaluatorsForCase(options) {
     now,
     judgeProvider,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   });
   return { score };
@@ -5532,6 +5555,7 @@ async function runEvaluatorList(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   } = options;
   const scored = [];
@@ -5578,7 +5602,9 @@ async function runEvaluatorList(options) {
           provider,
           attempt,
           promptInputs,
-          now
+          now,
+          candidateTraceRef,
+          candidateTraceSummary
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -5616,8 +5642,6 @@ async function runEvaluatorList(options) {
               return new ToolTrajectoryEvaluator({
                 config: memberConfig
               });
-            case "expected_tool_calls":
-              return new ExpectedToolCallsEvaluator();
             default: {
               const unknownConfig = memberConfig;
               throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -5667,32 +5691,7 @@ async function runEvaluatorList(options) {
           promptInputs,
           now,
           candidateTrace,
-          candidateTraceSummary
-        });
-        const weight = evaluator.weight ?? 1;
-        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
-        evaluatorResults.push({
-          name: evaluator.name,
-          type: evaluator.type,
-          score: score2.score,
-          weight,
-          verdict: score2.verdict,
-          hits: score2.hits,
-          misses: score2.misses,
-          reasoning: score2.reasoning
-        });
-      }
-      if (evaluator.type === "expected_tool_calls") {
-        const expectedToolCallsEvaluator = new ExpectedToolCallsEvaluator();
-        const score2 = expectedToolCallsEvaluator.evaluate({
-          evalCase,
-          candidate,
-          target,
-          provider,
-          attempt,
-          promptInputs,
-          now,
-          candidateTrace,
+          candidateTraceRef,
           candidateTraceSummary
         });
         const weight = evaluator.weight ?? 1;
@@ -6065,7 +6064,6 @@ function createAgentKernel() {
 0 && (module.exports = {
   CodeEvaluator,
   CompositeEvaluator,
-  ExpectedToolCallsEvaluator,
   LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES,
   ToolTrajectoryEvaluator,
@@ -6083,7 +6081,6 @@ function createAgentKernel() {
   generateRubrics,
   getHitCount,
   isEvaluatorKind,
-  isExpectedToolCall,
   isGuidelineFile,
   isJsonObject,
   isJsonValue,