npm - agentv - Versions diffs - 1.0.0 → 1.3.1 - Mend

agentv 1.0.0 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

package/dist/{chunk-RIJO5WBF.js → chunk-6R2YRXCQ.js} RENAMED

@@ -164,7 +164,7 @@ import { access as access6, mkdir as mkdir7 } from "node:fs/promises";
 import path19 from "node:path";
 import { pathToFileURL } from "node:url";
-// ../../packages/core/dist/chunk-V3JCB3HI.js
+// ../../packages/core/dist/chunk-4A6L2F6L.js
 import { constants } from "node:fs";
 import { access, readFile } from "node:fs/promises";
 import path from "node:path";
@@ -4211,7 +4211,7 @@ var coerce = {
 };
 var NEVER = INVALID;
-// ../../packages/core/dist/chunk-V3JCB3HI.js
+// ../../packages/core/dist/chunk-4A6L2F6L.js
 async function fileExists(filePath) {
   try {
     await access(filePath, constants.F_OK);
@@ -4612,10 +4612,14 @@ function resolveCliConfig(target, env, evalFilePath) {
   const filesFormat = resolveOptionalLiteralString(
     target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
   );
+  const verbose = resolveOptionalBoolean(target.verbose ?? target.cli_verbose ?? target.cliVerbose);
   let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
     allowLiteral: true,
     optionalEnv: true
   });
+  if (cwd && evalFilePath && !path2.isAbsolute(cwd)) {
+    cwd = path2.resolve(path2.dirname(path2.resolve(evalFilePath)), cwd);
+  }
   if (!cwd && evalFilePath) {
     cwd = path2.dirname(path2.resolve(evalFilePath));
   }
@@ -4623,7 +4627,7 @@ function resolveCliConfig(target, env, evalFilePath) {
     target.timeout_seconds ?? target.timeoutSeconds,
     `${target.name} timeout`
   );
-  const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name);
+  const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name, evalFilePath);
   const commandTemplate = resolveString(
     commandTemplateSource,
     env,
@@ -4636,7 +4640,8 @@ function resolveCliConfig(target, env, evalFilePath) {
     filesFormat,
     cwd,
     timeoutMs,
-    healthcheck
+    healthcheck,
+    verbose
   };
 }
 function resolveTimeoutMs(source2, description) {
@@ -4649,7 +4654,7 @@ function resolveTimeoutMs(source2, description) {
   }
   return Math.floor(seconds * 1e3);
 }
-function resolveCliHealthcheck(source2, env, targetName) {
+function resolveCliHealthcheck(source2, env, targetName, evalFilePath) {
   if (source2 === void 0 || source2 === null) {
     return void 0;
   }
@@ -4682,11 +4687,12 @@ function resolveCliHealthcheck(source2, env, targetName) {
       allowLiteral: true,
       optionalEnv: true
     });
+    const resolvedCwd = cwd && evalFilePath && !path2.isAbsolute(cwd) ? path2.resolve(path2.dirname(path2.resolve(evalFilePath)), cwd) : cwd;
     return {
       type: "command",
       commandTemplate,
       timeoutMs,
-      cwd
+      cwd: resolvedCwd
     };
   }
   throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
@@ -34567,18 +34573,23 @@ function isTestMessage(value) {
   if (typeof candidate.content === "string") {
     return true;
   }
-  if (!Array.isArray(candidate.content)) {
-    return false;
+  if (Array.isArray(candidate.content) && candidate.content.every(isJsonObject)) {
+    return true;
+  }
+  if (Array.isArray(candidate.tool_calls) && candidate.tool_calls.length > 0) {
+    return true;
+  }
+  if (isJsonObject(candidate.content)) {
+    return true;
   }
-  return candidate.content.every(isJsonObject);
+  return false;
 }
 var EVALUATOR_KIND_VALUES = [
   "code_judge",
   "llm_judge",
   "rubric",
   "composite",
-  "tool_trajectory",
-  "expected_tool_calls"
+  "tool_trajectory"
 ];
 var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
 function isEvaluatorKind(value) {
@@ -35058,15 +35069,6 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       });
       continue;
     }
-    if (typeValue === "expected_tool_calls") {
-      const weight2 = validateWeight(rawEvaluator.weight, name16, evalId);
-      evaluators.push({
-        name: name16,
-        type: "expected_tool_calls",
-        ...weight2 !== void 0 ? { weight: weight2 } : {}
-      });
-      continue;
-    }
     if (typeValue === "tool_trajectory") {
       const mode = asString2(rawEvaluator.mode);
       if (mode !== "any_order" && mode !== "in_order" && mode !== "exact") {
@@ -35257,6 +35259,17 @@ async function processMessages(options) {
       }
       continue;
     }
+    if (isJsonObject(content)) {
+      const rendered = JSON.stringify(content, null, 2);
+      segments.push({ type: "text", value: rendered });
+      if (textParts) {
+        textParts.push(rendered);
+      }
+      continue;
+    }
+    if (!Array.isArray(content)) {
+      continue;
+    }
     for (const rawSegment of content) {
       if (!isJsonObject(rawSegment)) {
         continue;
@@ -35317,63 +35330,6 @@ async function processMessages(options) {
   }
   return segments;
 }
-async function resolveAssistantContent(content, searchRoots, verbose) {
-  if (typeof content === "string") {
-    return content;
-  }
-  if (!content) {
-    return "";
-  }
-  const parts = [];
-  for (const entry of content) {
-    if (typeof entry === "string") {
-      parts.push({ content: entry, isFile: false });
-      continue;
-    }
-    if (!isJsonObject(entry)) {
-      continue;
-    }
-    const segmentType = asString3(entry.type);
-    if (segmentType === "file") {
-      const rawValue = asString3(entry.value);
-      if (!rawValue) {
-        continue;
-      }
-      const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
-        rawValue,
-        searchRoots
-      );
-      if (!resolvedPath) {
-        const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
-        logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
-        continue;
-      }
-      try {
-        const fileContent = (await readFile32(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
-        parts.push({ content: fileContent, isFile: true, displayPath });
-        if (verbose) {
-          console.log(`  [Expected Assistant File] Found: ${displayPath}`);
-          console.log(`    Resolved to: ${resolvedPath}`);
-        }
-      } catch (error40) {
-        logWarning3(`Could not read file ${resolvedPath}: ${error40.message}`);
-      }
-      continue;
-    }
-    const textValue = asString3(entry.text);
-    if (typeof textValue === "string") {
-      parts.push({ content: textValue, isFile: false });
-      continue;
-    }
-    const valueValue = asString3(entry.value);
-    if (typeof valueValue === "string") {
-      parts.push({ content: valueValue, isFile: false });
-      continue;
-    }
-    parts.push({ content: JSON.stringify(entry), isFile: false });
-  }
-  return formatFileContents(parts);
-}
 function asString3(value) {
   return typeof value === "string" ? value : void 0;
 }
@@ -35406,14 +35362,15 @@ ${detailBlock}${ANSI_RESET4}`);
   }
 }
 async function processExpectedMessages(options) {
-  const { messages, searchRoots, repoRootPath, verbose } = options;
+  const { messages, searchRoots, verbose } = options;
   const segments = [];
   for (const message of messages) {
+    const extendedMessage = message;
     const segment = {
       role: message.role
     };
-    if (message.role === "assistant" && message.tool_calls !== void 0) {
-      segment.tool_calls = message.tool_calls;
+    if (extendedMessage.name) {
+      segment.name = extendedMessage.name;
     }
     const content = message.content;
     if (typeof content === "string") {
@@ -35461,6 +35418,13 @@ async function processExpectedMessages(options) {
         processedContent.push(cloneJsonObject(rawSegment));
       }
       segment.content = processedContent;
+    } else if (isJsonObject(content)) {
+      segment.content = cloneJsonObject(content);
+    }
+    if (extendedMessage.tool_calls && Array.isArray(extendedMessage.tool_calls)) {
+      segment.tool_calls = extendedMessage.tool_calls.map(
+        (tc) => isJsonObject(tc) ? cloneJsonObject(tc) : tc
+      );
     }
     segments.push(segment);
   }
@@ -35528,6 +35492,11 @@ async function buildPromptInputs(testCase, mode = "lm") {
           }
         }
       }
+    } else if (isJsonObject(message.content)) {
+      const rendered = JSON.stringify(message.content, null, 2);
+      if (rendered.trim().length > 0) {
+        messageSegments.push({ type: "text", value: rendered });
+      }
     }
     segmentsByMessage.push(messageSegments);
   }
@@ -35749,9 +35718,6 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       logError(`No valid expected message found for eval case: ${id}`);
       continue;
     }
-    if (expectedMessages.length > 1) {
-      logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
-    }
     const guidelinePaths = [];
     const inputTextParts = [];
     const inputSegments = await processMessages({
@@ -35771,8 +35737,19 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       verbose
     }) : [];
     const codeSnippets = extractCodeBlocks(inputSegments);
-    const expectedContent = expectedMessages[0]?.content;
-    const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
+    let referenceAnswer = "";
+    if (outputSegments.length > 1) {
+      referenceAnswer = JSON.stringify(outputSegments, null, 2);
+    } else if (outputSegments.length === 1) {
+      const singleMessage = outputSegments[0];
+      if (typeof singleMessage.content === "string") {
+        referenceAnswer = singleMessage.content;
+      } else if (singleMessage.content) {
+        referenceAnswer = JSON.stringify(singleMessage, null, 2);
+      } else if (singleMessage.tool_calls) {
+        referenceAnswer = JSON.stringify(singleMessage, null, 2);
+      }
+    }
     const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
     const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
     let evaluators;
@@ -35827,7 +35804,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
       question,
       input_messages: inputMessages,
       input_segments: inputSegments,
-      expected_segments: outputSegments,
+      expected_messages: outputSegments,
       reference_answer: referenceAnswer,
       guideline_paths: guidelinePaths.map((guidelinePath) => path62.resolve(guidelinePath)),
       guideline_patterns: guidelinePatterns,
@@ -36238,7 +36215,7 @@ var CliProvider = class {
   id;
   kind = "cli";
   targetName;
-  supportsBatch = false;
+  supportsBatch = true;
   config;
   runCommand;
   verbose;
@@ -36258,6 +36235,11 @@ var CliProvider = class {
     const outputFilePath = generateOutputFilePath(request.evalCaseId);
     const templateValues = buildTemplateValues(request, this.config, outputFilePath);
     const renderedCommand = renderTemplate2(this.config.commandTemplate, templateValues);
+    if (this.verbose) {
+      console.log(
+        `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
+      );
+    }
     const result = await this.runCommand(renderedCommand, {
       cwd: this.config.cwd,
       env: process.env,
@@ -36292,6 +36274,114 @@ var CliProvider = class {
       }
     };
   }
+  async invokeBatch(requests) {
+    if (requests.length === 0) {
+      return [];
+    }
+    for (const request of requests) {
+      if (request.signal?.aborted) {
+        throw new Error("CLI provider batch request was aborted before execution");
+      }
+    }
+    const controller = new AbortController();
+    for (const request of requests) {
+      request.signal?.addEventListener("abort", () => controller.abort(), { once: true });
+    }
+    await this.ensureHealthy(controller.signal);
+    const outputFilePath = generateOutputFilePath("batch", ".jsonl");
+    const batchInputFiles = [];
+    for (const request of requests) {
+      if (request.inputFiles && request.inputFiles.length > 0) {
+        batchInputFiles.push(...request.inputFiles);
+      }
+    }
+    const templateValues = buildTemplateValues(
+      {
+        question: "",
+        guidelines: "",
+        inputFiles: batchInputFiles,
+        evalCaseId: "batch",
+        attempt: 0
+      },
+      this.config,
+      outputFilePath
+    );
+    const renderedCommand = renderTemplate2(this.config.commandTemplate, templateValues);
+    if (this.verbose) {
+      console.log(
+        `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
+      );
+    }
+    const result = await this.runCommand(renderedCommand, {
+      cwd: this.config.cwd,
+      env: process.env,
+      timeoutMs: this.config.timeoutMs,
+      signal: controller.signal
+    });
+    if (result.failed || (result.exitCode ?? 0) !== 0) {
+      if (controller.signal.aborted) {
+        throw new Error("CLI provider request was aborted");
+      }
+      if (result.timedOut) {
+        throw new Error(
+          `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
+        );
+      }
+      const codeText = result.exitCode !== null ? result.exitCode : "unknown";
+      const detail = result.stderr.trim() || result.stdout.trim();
+      const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
+      throw new Error(message);
+    }
+    const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
+    const recordsById = this.parseJsonlBatchOutput(responseContent);
+    const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
+    const missingIds = requestedIds.filter((id) => !recordsById.has(id));
+    if (missingIds.length > 0) {
+      throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
+    }
+    const responses = requests.map((request) => {
+      const evalCaseId = request.evalCaseId;
+      if (!evalCaseId) {
+        return {
+          text: "",
+          raw: {
+            command: renderedCommand,
+            stderr: result.stderr,
+            exitCode: result.exitCode ?? 0,
+            cwd: this.config.cwd,
+            outputFile: outputFilePath
+          }
+        };
+      }
+      const parsed = recordsById.get(evalCaseId);
+      if (!parsed) {
+        return {
+          text: "",
+          raw: {
+            command: renderedCommand,
+            stderr: result.stderr,
+            exitCode: result.exitCode ?? 0,
+            cwd: this.config.cwd,
+            outputFile: outputFilePath
+          }
+        };
+      }
+      return {
+        text: parsed.text,
+        trace: parsed.trace,
+        traceRef: parsed.traceRef,
+        raw: {
+          command: renderedCommand,
+          stderr: result.stderr,
+          exitCode: result.exitCode ?? 0,
+          cwd: this.config.cwd,
+          outputFile: outputFilePath,
+          recordId: evalCaseId
+        }
+      };
+    });
+    return responses;
+  }
   /**
    * Parse output content from CLI.
    * If the content is valid JSON with a 'text' field, extract text and optional trace.
@@ -36317,6 +36407,38 @@ var CliProvider = class {
     const validEvents = trace2.filter(isTraceEvent);
     return validEvents.length > 0 ? validEvents : void 0;
   }
+  parseJsonlBatchOutput(content) {
+    const records = /* @__PURE__ */ new Map();
+    const lines = content.split(/\r?\n/).map((line2) => line2.trim()).filter((line2) => line2.length > 0);
+    for (const line2 of lines) {
+      let parsed;
+      try {
+        parsed = JSON.parse(line2);
+      } catch (error40) {
+        const reason = error40 instanceof Error ? error40.message : String(error40);
+        throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
+      }
+      if (typeof parsed !== "object" || parsed === null) {
+        throw new Error("CLI batch output JSONL line must be an object");
+      }
+      const obj = parsed;
+      const id = typeof obj.id === "string" ? obj.id : void 0;
+      if (!id || id.trim().length === 0) {
+        throw new Error("CLI batch output JSONL line missing required string field: id");
+      }
+      if (records.has(id)) {
+        throw new Error(`CLI batch output contains duplicate id: ${id}`);
+      }
+      const text2 = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
+      const traceRef = typeof obj.traceRef === "string" ? obj.traceRef : typeof obj.trace_ref === "string" ? obj.trace_ref : void 0;
+      records.set(id, {
+        text: text2,
+        trace: this.parseTrace(obj.trace),
+        traceRef
+      });
+    }
+    return records;
+  }
   async readAndCleanupOutputFile(filePath) {
     try {
       const content = await readTextFile(filePath);
@@ -36378,7 +36500,7 @@ var CliProvider = class {
     );
     if (this.verbose) {
       console.log(
-        `[cli-provider:${this.targetName}] (healthcheck) CLI_EVALS_DIR=${process.env.CLI_EVALS_DIR ?? ""} cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
+        `[cli-provider:${this.targetName}] (healthcheck) cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
       );
     }
     const result = await this.runCommand(renderedCommand, {
@@ -36446,11 +36568,11 @@ function shellEscape(value) {
   }
   return `'${value.replace(/'/g, `'"'"'`)}'`;
 }
-function generateOutputFilePath(evalCaseId) {
+function generateOutputFilePath(evalCaseId, extension = ".json") {
   const safeEvalId = evalCaseId || "unknown";
   const timestamp = Date.now();
   const random = Math.random().toString(36).substring(2, 9);
-  return path72.join(os2.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
+  return path72.join(os2.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
 }
 function formatTimeoutSuffix(timeoutMs) {
   if (!timeoutMs || timeoutMs <= 0) {
@@ -37669,7 +37791,7 @@ function createProvider(target) {
 }
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
-Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
+Use the reference_answer as a gold standard for a high-quality response (if provided). The reference_answer may be a simple text response, or it may contain a sequence of expected agent messages including tool calls. When it contains multiple messages, the last message represents the final expected answer. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
 Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
@@ -37727,7 +37849,7 @@ var LlmJudgeEvaluator = class {
     const variables = {
       [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
       [TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(
-        context.evalCase.expected_segments,
+        context.evalCase.expected_messages,
         null,
         2
       ),
@@ -37940,13 +38062,16 @@ var CodeEvaluator = class {
       {
         question: context.evalCase.question,
         expected_outcome: context.evalCase.expected_outcome,
+        expected_messages: context.evalCase.expected_messages,
         reference_answer: context.evalCase.reference_answer,
         candidate_answer: context.candidate,
         guideline_files: context.evalCase.guideline_paths,
         input_files: context.evalCase.file_paths.filter(
           (path132) => !context.evalCase.guideline_paths.includes(path132)
         ),
-        input_messages: context.evalCase.input_messages
+        input_messages: context.evalCase.input_messages,
+        candidate_trace_file: context.candidateTraceRef ?? null,
+        candidate_trace_summary: context.candidateTraceSummary ?? null
       },
       null,
       2
@@ -38212,105 +38337,6 @@ var ToolTrajectoryEvaluator = class {
     };
   }
 };
-var ExpectedToolCallsEvaluator = class {
-  kind = "expected_tool_calls";
-  evaluate(context) {
-    const { candidateTrace, evalCase } = context;
-    const expectedSegments = evalCase.expected_segments;
-    const expectedToolCalls = this.extractExpectedToolCalls(expectedSegments);
-    if (expectedToolCalls.length === 0) {
-      return {
-        score: 1,
-        verdict: "pass",
-        hits: ["No tool_calls specified in expected_messages"],
-        misses: [],
-        expectedAspectCount: 1
-      };
-    }
-    if (!candidateTrace || candidateTrace.length === 0) {
-      return {
-        score: 0,
-        verdict: "fail",
-        hits: [],
-        misses: ["No trace available to validate tool_calls"],
-        expectedAspectCount: expectedToolCalls.length
-      };
-    }
-    const actualToolCalls = candidateTrace.filter((e) => e.type === "tool_call");
-    return this.validateToolCalls(expectedToolCalls, actualToolCalls);
-  }
-  extractExpectedToolCalls(segments) {
-    if (!segments) {
-      return [];
-    }
-    const toolCalls = [];
-    for (const segment of segments) {
-      const role = segment.role;
-      const segmentToolCalls = segment.tool_calls;
-      if (role === "assistant" && Array.isArray(segmentToolCalls)) {
-        for (const tc of segmentToolCalls) {
-          if (typeof tc === "object" && tc !== null && typeof tc.tool === "string") {
-            const toolCall = tc;
-            toolCalls.push({ tool: toolCall.tool, input: toolCall.input });
-          }
-        }
-      }
-    }
-    return toolCalls;
-  }
-  validateToolCalls(expected, actual) {
-    const hits = [];
-    const misses = [];
-    for (let i = 0; i < expected.length; i++) {
-      const expectedCall = expected[i];
-      const actualCall = actual[i];
-      if (!actualCall) {
-        misses.push(
-          `tool_calls[${i}]: expected ${expectedCall.tool}, but no more tool calls in trace`
-        );
-        continue;
-      }
-      if (actualCall.name !== expectedCall.tool) {
-        misses.push(
-          `tool_calls[${i}]: expected ${expectedCall.tool}, got ${actualCall.name ?? "unknown"}`
-        );
-        continue;
-      }
-      if (expectedCall.input !== void 0) {
-        if (!this.deepEquals(expectedCall.input, actualCall.input)) {
-          misses.push(`tool_calls[${i}]: ${expectedCall.tool} input mismatch`);
-          continue;
-        }
-      }
-      hits.push(`tool_calls[${i}]: ${expectedCall.tool} matched`);
-    }
-    const totalChecks = expected.length || 1;
-    const score = hits.length / totalChecks;
-    return {
-      score,
-      verdict: score >= 0.8 ? "pass" : score >= 0.6 ? "borderline" : "fail",
-      hits,
-      misses,
-      expectedAspectCount: totalChecks
-    };
-  }
-  deepEquals(a, b) {
-    if (a === b) return true;
-    if (typeof a !== typeof b) return false;
-    if (typeof a !== "object" || a === null || b === null) return false;
-    if (Array.isArray(a) && Array.isArray(b)) {
-      if (a.length !== b.length) return false;
-      return a.every((val, i) => this.deepEquals(val, b[i]));
-    }
-    if (Array.isArray(a) || Array.isArray(b)) return false;
-    const aObj = a;
-    const bObj = b;
-    const aKeys = Object.keys(aObj);
-    const bKeys = Object.keys(bObj);
-    if (aKeys.length !== bKeys.length) return false;
-    return aKeys.every((key2) => this.deepEquals(aObj[key2], bObj[key2]));
-  }
-};
 var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
 {{EVALUATOR_RESULTS_JSON}}
@@ -39061,6 +39087,7 @@ async function runEvalCase(options) {
       judgeProvider,
       agentTimeoutMs,
       candidateTrace,
+      candidateTraceRef: providerResponse.traceRef,
       candidateTraceSummary
     });
   } catch (error40) {
@@ -39080,6 +39107,7 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   } = options;
   const gradeTimestamp = nowFn();
@@ -39095,6 +39123,7 @@ async function evaluateCandidate(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   });
   const completedAt = nowFn();
@@ -39149,6 +39178,7 @@ async function runEvaluatorsForCase(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   } = options;
   if (evalCase.evaluators && evalCase.evaluators.length > 0) {
@@ -39165,6 +39195,7 @@ async function runEvaluatorsForCase(options) {
       judgeProvider,
       agentTimeoutMs,
       candidateTrace,
+      candidateTraceRef,
       candidateTraceSummary
     });
   }
@@ -39183,6 +39214,7 @@ async function runEvaluatorsForCase(options) {
     now,
     judgeProvider,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   });
   return { score };
@@ -39201,6 +39233,7 @@ async function runEvaluatorList(options) {
     judgeProvider,
     agentTimeoutMs,
     candidateTrace,
+    candidateTraceRef,
     candidateTraceSummary
   } = options;
   const scored = [];
@@ -39247,7 +39280,9 @@ async function runEvaluatorList(options) {
           provider,
           attempt,
           promptInputs,
-          now
+          now,
+          candidateTraceRef,
+          candidateTraceSummary
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: "code_judge", weight });
@@ -39285,8 +39320,6 @@ async function runEvaluatorList(options) {
               return new ToolTrajectoryEvaluator({
                 config: memberConfig
               });
-            case "expected_tool_calls":
-              return new ExpectedToolCallsEvaluator();
             default: {
               const unknownConfig = memberConfig;
               throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -39336,32 +39369,7 @@ async function runEvaluatorList(options) {
           promptInputs,
           now,
           candidateTrace,
-          candidateTraceSummary
-        });
-        const weight = evaluator.weight ?? 1;
-        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
-        evaluatorResults.push({
-          name: evaluator.name,
-          type: evaluator.type,
-          score: score2.score,
-          weight,
-          verdict: score2.verdict,
-          hits: score2.hits,
-          misses: score2.misses,
-          reasoning: score2.reasoning
-        });
-      }
-      if (evaluator.type === "expected_tool_calls") {
-        const expectedToolCallsEvaluator = new ExpectedToolCallsEvaluator();
-        const score2 = expectedToolCallsEvaluator.evaluate({
-          evalCase,
-          candidate,
-          target,
-          provider,
-          attempt,
-          promptInputs,
-          now,
-          candidateTrace,
+          candidateTraceRef,
           candidateTraceSummary
         });
         const weight = evaluator.weight ?? 1;
@@ -40123,61 +40131,56 @@ function getDefaultExtension(format) {
 }
 // src/commands/eval/progress-display.ts
-import { stripVTControlCharacters } from "node:util";
-var ESC = "\x1B[";
-var CLEAR_LINE = `${ESC}K`;
-var MOVE_CURSOR_UP = `${ESC}1A`;
 var ProgressDisplay = class {
   workers = /* @__PURE__ */ new Map();
-  maxWorkers;
   totalTests = 0;
   completedTests = 0;
-  renderTimer;
-  renderScheduled = false;
-  isInteractive;
   logPaths = [];
   logPathSet = /* @__PURE__ */ new Set();
   hasPrintedLogHeader = false;
-  windowHeight = 0;
   started = false;
   finished = false;
-  constructor(maxWorkers) {
-    this.maxWorkers = maxWorkers;
-    this.isInteractive = process.stdout.isTTY && !process.env.CI;
+  verbose;
+  constructor(_maxWorkers, options) {
+    this.verbose = options?.verbose ?? false;
   }
   isInteractiveMode() {
-    return this.isInteractive;
+    return false;
   }
   start() {
     this.started = true;
     this.finished = false;
-    if (this.isInteractive) {
-      this.write("\n");
-      this.renderTimer = setInterval(() => {
-        this.scheduleRender();
-      }, 1e3);
-      this.renderTimer.unref?.();
-    }
   }
   setTotalTests(count) {
     this.totalTests = count;
   }
   updateWorker(progress) {
+    const previous = this.workers.get(progress.workerId);
     this.workers.set(progress.workerId, progress);
     if (progress.status === "completed" || progress.status === "failed") {
       this.completedTests++;
     }
-    if (this.isInteractive) {
-      this.scheduleRender();
-    } else {
-      const targetSuffix = progress.targetLabel ? ` | ${progress.targetLabel}` : "";
-      if (progress.status === "completed") {
-        console.log(`\u2713 Eval ${progress.evalId}${targetSuffix} completed`);
-      } else if (progress.status === "failed") {
+    const targetSuffix = progress.targetLabel ? ` | ${progress.targetLabel}` : "";
+    const countPrefix = `${this.completedTests}/${this.totalTests}`;
+    switch (progress.status) {
+      case "pending":
+        if (this.verbose && !previous) {
+          console.log(`${countPrefix}   \u23F3 ${progress.evalId}${targetSuffix}`);
+        }
+        break;
+      case "running":
+        if (!previous || previous.status === "pending") {
+          console.log(`${countPrefix}   \u{1F504} ${progress.evalId}${targetSuffix}`);
+        }
+        break;
+      case "completed":
+        console.log(`${countPrefix}   \u2705 ${progress.evalId}${targetSuffix}`);
+        break;
+      case "failed":
         console.log(
-          `\u2717 Eval ${progress.evalId}${targetSuffix} failed${progress.error ? `: ${progress.error}` : ""}`
+          `${countPrefix}   \u274C ${progress.evalId}${targetSuffix}${progress.error ? `: ${progress.error}` : ""}`
         );
-      }
+        break;
     }
   }
   addLogPaths(paths) {
@@ -40193,10 +40196,6 @@ var ProgressDisplay = class {
       return;
     }
     this.logPaths.push(...newPaths);
-    if (this.isInteractive) {
-      this.scheduleRender();
-      return;
-    }
     if (!this.hasPrintedLogHeader) {
       console.log("");
       console.log("Codex CLI logs:");
@@ -40207,112 +40206,11 @@ var ProgressDisplay = class {
       console.log(`${startIndex + offset + 1}. ${path27}`);
     });
   }
-  scheduleRender() {
-    if (this.renderScheduled || this.finished) {
-      return;
-    }
-    this.renderScheduled = true;
-    setTimeout(() => {
-      this.renderScheduled = false;
-      this.render();
-    }, 100);
-  }
-  write(content) {
-    process.stdout.write(content);
-  }
-  clearWindow() {
-    if (this.windowHeight === 0) {
-      return;
-    }
-    this.write(`\r${CLEAR_LINE}`);
-    for (let i = 1; i < this.windowHeight; i++) {
-      this.write(`${MOVE_CURSOR_UP}\r${CLEAR_LINE}`);
-    }
-    this.windowHeight = 0;
-  }
-  getRenderedRowCount(rows) {
-    const columns = process.stdout.columns || 80;
-    let count = 0;
-    for (const row of rows) {
-      const text2 = stripVTControlCharacters(row);
-      count += Math.max(1, Math.ceil(text2.length / columns));
-    }
-    return count;
-  }
-  render() {
-    if (!this.isInteractive || !this.started || this.finished) {
-      return;
-    }
-    const lines = [];
-    const sortedWorkers = Array.from(this.workers.values()).sort((a, b) => a.workerId - b.workerId);
-    for (const worker of sortedWorkers) {
-      const line2 = this.formatWorkerLine(worker);
-      lines.push(line2);
-    }
-    if (this.logPaths.length > 0) {
-      lines.push("");
-      lines.push("Codex CLI logs:");
-      this.logPaths.forEach((path27, index) => {
-        lines.push(`${index + 1}. ${path27}`);
-      });
-    }
-    const rowCount = this.getRenderedRowCount(lines);
-    this.clearWindow();
-    if (lines.length > 0) {
-      this.write(lines.join("\n"));
-    }
-    this.windowHeight = rowCount;
-  }
-  formatWorkerLine(worker) {
-    const workerLabel = `${worker.workerId}.`.padEnd(4);
-    const statusIcon = this.getStatusIcon(worker.status);
-    const targetLabel = worker.targetLabel ? `  | ${worker.targetLabel}` : "";
-    const columns = process.stdout.columns || 80;
-    const maxLineLength = Math.max(40, columns - 4);
-    const reservedLength = workerLabel.length + statusIcon.length + targetLabel.length + 4;
-    const availableLabelLength = Math.max(15, maxLineLength - reservedLength);
-    let testLabel = worker.evalId;
-    if (testLabel.length > availableLabelLength) {
-      testLabel = `${testLabel.substring(0, Math.max(0, availableLabelLength - 3))}...`;
-    }
-    return `${workerLabel} ${statusIcon} ${testLabel}${targetLabel}`;
-  }
-  getStatusIcon(status) {
-    switch (status) {
-      case "pending":
-        return "\u23F3";
-      case "running":
-        return "\u{1F504}";
-      case "completed":
-        return "\u2705";
-      case "failed":
-        return "\u274C";
-      default:
-        return "  ";
-    }
-  }
   finish() {
-    if (this.renderTimer) {
-      clearInterval(this.renderTimer);
-      this.renderTimer = void 0;
-    }
     this.finished = true;
-    if (this.isInteractive && this.started) {
-      this.clearWindow();
-      const sortedWorkers = Array.from(this.workers.values()).sort(
-        (a, b) => a.workerId - b.workerId
-      );
-      for (const worker of sortedWorkers) {
-        this.write(`${this.formatWorkerLine(worker)}
-`);
-      }
-      this.write("\n");
-    }
+    console.log("");
   }
   clear() {
-    if (this.isInteractive) {
-      this.clearWindow();
-    }
   }
 };
@@ -40649,26 +40547,6 @@ function validateMessages(messages, location, filePath, errors) {
         message: `Invalid role '${role}'. Must be one of: ${validRoles.join(", ")}`
       });
     }
-    const toolCalls = message.tool_calls;
-    if (toolCalls !== void 0) {
-      if (role !== "assistant") {
-        errors.push({
-          severity: "error",
-          filePath,
-          location: `${msgLocation}.tool_calls`,
-          message: "tool_calls can only be specified on assistant messages"
-        });
-      } else if (!Array.isArray(toolCalls)) {
-        errors.push({
-          severity: "error",
-          filePath,
-          location: `${msgLocation}.tool_calls`,
-          message: "tool_calls must be an array"
-        });
-      } else {
-        validateToolCalls(toolCalls, `${msgLocation}.tool_calls`, filePath, errors);
-      }
-    }
     const content = message.content;
     if (typeof content === "string") {
       validateContentForRoleMarkers(content, `${msgLocation}.content`, filePath, errors);
@@ -40733,30 +40611,6 @@ function validateContentForRoleMarkers(content, location, filePath, errors) {
     }
   }
 }
-function validateToolCalls(toolCalls, location, filePath, errors) {
-  for (let i = 0; i < toolCalls.length; i++) {
-    const toolCall = toolCalls[i];
-    const callLocation = `${location}[${i}]`;
-    if (!isObject2(toolCall)) {
-      errors.push({
-        severity: "error",
-        filePath,
-        location: callLocation,
-        message: "Tool call must be an object"
-      });
-      continue;
-    }
-    const tool2 = toolCall.tool;
-    if (typeof tool2 !== "string" || tool2.trim().length === 0) {
-      errors.push({
-        severity: "error",
-        filePath,
-        location: `${callLocation}.tool`,
-        message: "Missing or invalid 'tool' field (must be a non-empty string)"
-      });
-    }
-  }
-}
 function isObject22(value) {
   return typeof value === "object" && value !== null && !Array.isArray(value);
 }
@@ -40860,6 +40714,9 @@ var CLI_SETTINGS = /* @__PURE__ */ new Set([
   ...COMMON_SETTINGS,
   "command_template",
   "commandTemplate",
+  "verbose",
+  "cli_verbose",
+  "cliVerbose",
   "files_format",
   "filesFormat",
   "attachments_format",
@@ -40993,6 +40850,15 @@ async function validateTargetsFile(filePath) {
     if (healthcheck !== void 0) {
       validateCliHealthcheck(healthcheck, absolutePath2, `${location}.healthcheck`, errors2);
     }
+    const verbose = target.verbose ?? target.cli_verbose ?? target.cliVerbose;
+    if (verbose !== void 0 && typeof verbose !== "boolean") {
+      errors2.push({
+        severity: "error",
+        filePath: absolutePath2,
+        location: `${location}.verbose`,
+        message: "'verbose' must be a boolean when provided"
+      });
+    }
   }
   function validateCliHealthcheck(healthcheck, absolutePath2, location, errors2) {
     if (!isObject22(healthcheck)) {
@@ -41636,8 +41502,8 @@ function createEvaluationCache() {
     }
   };
 }
-function createProgressReporter(maxWorkers) {
-  const display = new ProgressDisplay(maxWorkers);
+function createProgressReporter(maxWorkers, options) {
+  const display = new ProgressDisplay(maxWorkers, options);
   return {
     isInteractive: display.isInteractiveMode(),
     start: () => display.start(),
@@ -41665,6 +41531,22 @@ function createDisplayIdTracker() {
     }
   };
 }
+function applyVerboseOverride(selection, cliVerbose) {
+  const { resolvedTarget } = selection;
+  if (resolvedTarget.kind !== "cli") {
+    return selection;
+  }
+  return {
+    ...selection,
+    resolvedTarget: {
+      ...resolvedTarget,
+      config: {
+        ...resolvedTarget.config,
+        verbose: cliVerbose
+      }
+    }
+  };
+}
 async function prepareFileMetadata(params) {
   const { testFilePath, repoRoot, cwd, options } = params;
   await ensureFileExists(testFilePath, "Test file");
@@ -41724,7 +41606,7 @@ async function runSingleEvalFile(params) {
     evalCases
   } = params;
   await ensureFileExists(testFilePath, "Test file");
-  const resolvedTargetSelection = selection;
+  const resolvedTargetSelection = applyVerboseOverride(selection, options.verbose);
   const providerLabel = options.dryRun ? `${resolvedTargetSelection.resolvedTarget.kind} (dry-run)` : resolvedTargetSelection.resolvedTarget.kind;
   const targetMessage = options.verbose ? `Using target (${resolvedTargetSelection.targetSource}): ${resolvedTargetSelection.targetName} [provider=${providerLabel}] via ${resolvedTargetSelection.targetsFilePath}` : `Using target: ${inlineTargetLabel}`;
   if (!progressReporter.isInteractive || options.verbose) {
@@ -41837,7 +41719,7 @@ async function runEvalCommand(input) {
   if (totalEvalCount === 0) {
     throw new Error("No eval cases matched the provided filters.");
   }
-  const progressReporter = createProgressReporter(totalWorkers);
+  const progressReporter = createProgressReporter(totalWorkers, { verbose: options.verbose });
   progressReporter.start();
   progressReporter.setTotal(totalEvalCount);
   const seenCodexLogPaths = /* @__PURE__ */ new Set();
@@ -42708,4 +42590,4 @@ export {
   app,
   runCli
 };
-//# sourceMappingURL=chunk-RIJO5WBF.js.map
+//# sourceMappingURL=chunk-6R2YRXCQ.js.map