npm - @agentv/core - Versions diffs - 1.5.0 → 2.0.1 - Mend

@agentv/core 1.5.0 → 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/README.md +77 -77
package/dist/{chunk-E2VSU4WZ.js → chunk-IBTKEEOT.js} +73 -1
package/dist/chunk-IBTKEEOT.js.map +1 -0
package/dist/evaluation/validation/index.cjs +1 -0
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +1 -1
package/dist/index.cjs +2536 -663
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +221 -10
package/dist/index.d.ts +221 -10
package/dist/index.js +2362 -568
package/dist/index.js.map +1 -1
package/package.json +5 -2
package/dist/chunk-E2VSU4WZ.js.map +0 -1

package/dist/index.js (changed)

@@ -10,7 +10,7 @@ import {
   readTextFile,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-E2VSU4WZ.js";
+} from "./chunk-IBTKEEOT.js";
 // src/evaluation/types.ts
 var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -64,7 +64,11 @@ var EVALUATOR_KIND_VALUES = [
   "llm_judge",
   "rubric",
   "composite",
-  "tool_trajectory"
+  "tool_trajectory",
+  "field_accuracy",
+  "latency",
+  "cost",
+  "token_usage"
 ];
 var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
 function isEvaluatorKind(value) {
@@ -486,7 +490,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       continue;
     }
     if (typeValue === "code_judge") {
-      const script = asString2(rawEvaluator.script);
+      let script;
+      const rawScript = rawEvaluator.script;
+      if (typeof rawScript === "string") {
+        const trimmed = rawScript.trim();
+        if (trimmed.length === 0) {
+          throw new Error(
+            `Invalid code_judge script for evaluator '${name}' in '${evalId}': script cannot be empty`
+          );
+        }
+        script = parseCommandToArgv(trimmed);
+      } else {
+        script = asStringArray(
+          rawScript,
+          `code_judge script for evaluator '${name}' in '${evalId}'`
+        );
+      }
       if (!script) {
         logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
         continue;
@@ -507,13 +526,21 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       } else {
         resolvedCwd = searchRoots[0];
       }
+      const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
+      const config = {};
+      for (const [key, value] of Object.entries(rawEvaluator)) {
+        if (!knownProps.has(key) && value !== void 0) {
+          config[key] = value;
+        }
+      }
       evaluators.push({
         name,
         type: "code",
         script,
         cwd,
         resolvedCwd,
-        ...weight2 !== void 0 ? { weight: weight2 } : {}
+        ...weight2 !== void 0 ? { weight: weight2 } : {},
+        ...Object.keys(config).length > 0 ? { config } : {}
       });
       continue;
     }
@@ -688,6 +715,140 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       evaluators.push(config);
       continue;
     }
+    if (typeValue === "field_accuracy") {
+      const rawFields = rawEvaluator.fields;
+      if (!Array.isArray(rawFields)) {
+        logWarning2(
+          `Skipping field_accuracy evaluator '${name}' in '${evalId}': missing fields array`
+        );
+        continue;
+      }
+      if (rawFields.length === 0) {
+        logWarning2(
+          `Skipping field_accuracy evaluator '${name}' in '${evalId}': fields array is empty`
+        );
+        continue;
+      }
+      const fields = [];
+      for (const rawField of rawFields) {
+        if (!isJsonObject2(rawField)) {
+          logWarning2(
+            `Skipping invalid field entry in field_accuracy evaluator '${name}' (expected object)`
+          );
+          continue;
+        }
+        const fieldPath = asString2(rawField.path);
+        const match = asString2(rawField.match);
+        if (!fieldPath) {
+          logWarning2(
+            `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
+          );
+          continue;
+        }
+        if (!match || !isValidFieldMatchType(match)) {
+          logWarning2(
+            `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code_judge evaluator.`
+          );
+          continue;
+        }
+        const fieldConfig = {
+          path: fieldPath,
+          match,
+          ...typeof rawField.required === "boolean" ? { required: rawField.required } : {},
+          ...typeof rawField.weight === "number" ? { weight: rawField.weight } : {},
+          ...typeof rawField.tolerance === "number" ? { tolerance: rawField.tolerance } : {},
+          ...typeof rawField.relative === "boolean" ? { relative: rawField.relative } : {},
+          ...Array.isArray(rawField.formats) ? { formats: rawField.formats.filter((f) => typeof f === "string") } : {}
+        };
+        fields.push(fieldConfig);
+      }
+      if (fields.length === 0) {
+        logWarning2(
+          `Skipping field_accuracy evaluator '${name}' in '${evalId}': no valid fields found`
+        );
+        continue;
+      }
+      const aggregation = asString2(rawEvaluator.aggregation);
+      const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
+      evaluators.push({
+        name,
+        type: "field_accuracy",
+        fields,
+        ...validAggregation ? { aggregation: validAggregation } : {},
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
+      });
+      continue;
+    }
+    if (typeValue === "latency") {
+      const threshold = rawEvaluator.threshold;
+      if (typeof threshold !== "number" || threshold < 0) {
+        logWarning2(
+          `Skipping latency evaluator '${name}' in '${evalId}': threshold must be a non-negative number`
+        );
+        continue;
+      }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
+      evaluators.push({
+        name,
+        type: "latency",
+        threshold,
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
+      });
+      continue;
+    }
+    if (typeValue === "cost") {
+      const budget = rawEvaluator.budget;
+      if (typeof budget !== "number" || budget < 0) {
+        logWarning2(
+          `Skipping cost evaluator '${name}' in '${evalId}': budget must be a non-negative number`
+        );
+        continue;
+      }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
+      evaluators.push({
+        name,
+        type: "cost",
+        budget,
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
+      });
+      continue;
+    }
+    if (typeValue === "token_usage") {
+      const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
+      const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
+      const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
+      const limits = [
+        ["max_total", maxTotal],
+        ["max_input", maxInput],
+        ["max_output", maxOutput]
+      ];
+      const validLimits = {};
+      for (const [key, raw] of limits) {
+        if (raw === void 0) continue;
+        if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
+          logWarning2(
+            `Skipping token_usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
+          );
+          continue;
+        }
+        validLimits[key] = raw;
+      }
+      if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
+        logWarning2(
+          `Skipping token_usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
+        );
+        continue;
+      }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
+      evaluators.push({
+        name,
+        type: "token_usage",
+        ...validLimits,
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
+      });
+      continue;
+    }
     const prompt = asString2(rawEvaluator.prompt);
     let promptPath;
     if (prompt) {
@@ -758,6 +919,34 @@ function coerceEvaluator(candidate, contextId) {
 function asString2(value) {
   return typeof value === "string" ? value : void 0;
 }
+function asStringArray(value, description) {
+  if (value === void 0) {
+    return void 0;
+  }
+  if (!Array.isArray(value)) {
+    throw new Error(`${description} must be an array of strings (argv tokens)`);
+  }
+  if (value.length === 0) {
+    throw new Error(`${description} cannot be empty`);
+  }
+  const result = [];
+  for (const [index, entry] of value.entries()) {
+    if (typeof entry !== "string") {
+      throw new Error(`${description}[${index}] must be a string`);
+    }
+    if (entry.trim().length === 0) {
+      throw new Error(`${description}[${index}] cannot be empty`);
+    }
+    result.push(entry);
+  }
+  return result;
+}
+function parseCommandToArgv(command) {
+  if (process.platform === "win32") {
+    return ["cmd.exe", "/c", command];
+  }
+  return ["sh", "-lc", command];
+}
 function isJsonObject2(value) {
   return typeof value === "object" && value !== null && !Array.isArray(value);
 }
@@ -791,6 +980,14 @@ function validateWeight(rawWeight, evaluatorName, evalId) {
   }
   return rawWeight;
 }
+var VALID_FIELD_MATCH_TYPES = /* @__PURE__ */ new Set(["exact", "numeric_tolerance", "date"]);
+function isValidFieldMatchType(value) {
+  return typeof value === "string" && VALID_FIELD_MATCH_TYPES.has(value);
+}
+var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average", "all_or_nothing"]);
+function isValidFieldAggregationType(value) {
+  return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
+}
 // src/evaluation/loaders/message-processor.ts
 import { readFile as readFile3 } from "node:fs/promises";
@@ -1750,91 +1947,992 @@ async function withRetry(fn, retryConfig, signal) {
   throw lastError;
 }
-// src/evaluation/providers/cli.ts
-import { exec as execWithCallback } from "node:child_process";
-import fs from "node:fs/promises";
-import os from "node:os";
-import path7 from "node:path";
-import { promisify } from "node:util";
-var execAsync = promisify(execWithCallback);
-var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
-async function defaultCommandRunner(command, options) {
-  const execOptions = {
-    cwd: options.cwd,
-    env: options.env,
-    timeout: options.timeoutMs,
-    signal: options.signal,
-    maxBuffer: DEFAULT_MAX_BUFFER,
-    shell: process.platform === "win32" ? "powershell.exe" : void 0
+// src/evaluation/providers/claude-code.ts
+import { spawn } from "node:child_process";
+import { randomUUID } from "node:crypto";
+import { createWriteStream } from "node:fs";
+import { mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import path8 from "node:path";
+// src/evaluation/providers/claude-code-log-tracker.ts
+var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
+var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeCodeLogSubscribers");
+function getClaudeCodeLogStore() {
+  const globalObject = globalThis;
+  const existing = globalObject[GLOBAL_LOGS_KEY];
+  if (existing) {
+    return existing;
+  }
+  const created = [];
+  globalObject[GLOBAL_LOGS_KEY] = created;
+  return created;
+}
+function getSubscriberStore() {
+  const globalObject = globalThis;
+  const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
+  if (existing) {
+    return existing;
+  }
+  const created = /* @__PURE__ */ new Set();
+  globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
+  return created;
+}
+function notifySubscribers(entry) {
+  const subscribers = Array.from(getSubscriberStore());
+  for (const listener of subscribers) {
+    try {
+      listener(entry);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      console.warn(`Claude Code log subscriber failed: ${message}`);
+    }
+  }
+}
+function recordClaudeCodeLogEntry(entry) {
+  getClaudeCodeLogStore().push(entry);
+  notifySubscribers(entry);
+}
+function consumeClaudeCodeLogEntries() {
+  const store = getClaudeCodeLogStore();
+  if (store.length === 0) {
+    return [];
+  }
+  return store.splice(0, store.length);
+}
+function subscribeToClaudeCodeLogEntries(listener) {
+  const store = getSubscriberStore();
+  store.add(listener);
+  return () => {
+    store.delete(listener);
   };
-  try {
-    const { stdout, stderr } = await execAsync(command, execOptions);
-    return {
-      stdout,
-      stderr,
-      exitCode: 0,
-      failed: false,
-      timedOut: false,
-      signal: null
-    };
-  } catch (error) {
-    const execError = error;
-    return {
-      stdout: execError.stdout ?? "",
-      stderr: execError.stderr ?? "",
-      exitCode: typeof execError.code === "number" ? execError.code : null,
-      failed: true,
-      timedOut: execError.timedOut === true || execError.killed === true,
-      signal: execError.signal ?? null
-    };
+}
+// src/evaluation/providers/preread.ts
+import path7 from "node:path";
+function buildPromptDocument(request, inputFiles, options) {
+  const parts = [];
+  const guidelineFiles = collectGuidelineFiles(
+    inputFiles,
+    options?.guidelinePatterns ?? request.guideline_patterns,
+    options?.guidelineOverrides
+  );
+  const inputFilesList = collectInputFiles(inputFiles);
+  const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
+  const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
+  if (prereadBlock.length > 0) {
+    parts.push("\n", prereadBlock);
   }
+  parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
+  return parts.join("\n").trim();
 }
-var CliProvider = class {
-  id;
-  kind = "cli";
-  targetName;
-  supportsBatch = true;
-  config;
-  runCommand;
-  verbose;
-  keepTempFiles;
-  healthcheckPromise;
-  constructor(targetName, config, runner = defaultCommandRunner) {
-    this.targetName = targetName;
-    this.id = `cli:${targetName}`;
-    this.config = config;
-    this.runCommand = runner;
-    this.verbose = config.verbose ?? false;
-    this.keepTempFiles = config.keepTempFiles ?? false;
+function normalizeInputFiles(inputFiles) {
+  if (!inputFiles || inputFiles.length === 0) {
+    return void 0;
   }
-  async invoke(request) {
-    if (request.signal?.aborted) {
-      throw new Error("CLI provider request was aborted before execution");
+  const deduped = /* @__PURE__ */ new Map();
+  for (const inputFile of inputFiles) {
+    const absolutePath = path7.resolve(inputFile);
+    if (!deduped.has(absolutePath)) {
+      deduped.set(absolutePath, absolutePath);
     }
-    await this.ensureHealthy(request.signal);
-    const outputFilePath = generateOutputFilePath(request.evalCaseId);
-    const templateValues = buildTemplateValues(request, this.config, outputFilePath);
-    const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
-    if (this.verbose) {
-      console.log(
-        `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
-      );
+  }
+  return Array.from(deduped.values());
+}
+function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
+  if (!inputFiles || inputFiles.length === 0) {
+    return [];
+  }
+  const unique = /* @__PURE__ */ new Map();
+  for (const inputFile of inputFiles) {
+    const absolutePath = path7.resolve(inputFile);
+    if (overrides?.has(absolutePath)) {
+      if (!unique.has(absolutePath)) {
+        unique.set(absolutePath, absolutePath);
+      }
+      continue;
     }
-    const startTime = Date.now();
-    const result = await this.runCommand(renderedCommand, {
-      cwd: this.config.cwd,
-      env: process.env,
-      timeoutMs: this.config.timeoutMs,
-      signal: request.signal
-    });
-    const measuredDurationMs = Date.now() - startTime;
-    if (result.failed || (result.exitCode ?? 0) !== 0) {
-      if (request.signal?.aborted) {
-        throw new Error("CLI provider request was aborted");
+    const normalized = absolutePath.split(path7.sep).join("/");
+    if (isGuidelineFile(normalized, guidelinePatterns)) {
+      if (!unique.has(absolutePath)) {
+        unique.set(absolutePath, absolutePath);
+      }
+    }
+  }
+  return Array.from(unique.values());
+}
+function collectInputFiles(inputFiles) {
+  if (!inputFiles || inputFiles.length === 0) {
+    return [];
+  }
+  const unique = /* @__PURE__ */ new Map();
+  for (const inputFile of inputFiles) {
+    const absolutePath = path7.resolve(inputFile);
+    if (!unique.has(absolutePath)) {
+      unique.set(absolutePath, absolutePath);
+    }
+  }
+  return Array.from(unique.values());
+}
+function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
+  if (guidelineFiles.length === 0 && inputFiles.length === 0) {
+    return "";
+  }
+  const buildList = (files) => files.map((absolutePath) => {
+    const fileName = path7.basename(absolutePath);
+    const fileUri = pathToFileUri(absolutePath);
+    return `* [${fileName}](${fileUri})`;
+  });
+  const sections = [];
+  if (guidelineFiles.length > 0) {
+    sections.push(`Read all guideline files:
+${buildList(guidelineFiles).join("\n")}.`);
+  }
+  if (inputFiles.length > 0) {
+    sections.push(`Read all input files:
+${buildList(inputFiles).join("\n")}.`);
+  }
+  sections.push(
+    "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
+    "Then apply system_instructions on the user query below."
+  );
+  return sections.join("\n");
+}
+function pathToFileUri(filePath) {
+  const absolutePath = path7.isAbsolute(filePath) ? filePath : path7.resolve(filePath);
+  const normalizedPath = absolutePath.replace(/\\/g, "/");
+  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
+    return `file:///${normalizedPath}`;
+  }
+  return `file://${normalizedPath}`;
+}
+// src/evaluation/providers/claude-code.ts
+var WORKSPACE_PREFIX = "agentv-claude-code-";
+var PROMPT_FILENAME = "prompt.md";
+var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
+- Do NOT create any additional output files in the workspace.
+- All intended file outputs/changes MUST be written in your response.
+- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
+This is required for evaluation scoring.`;
+var ClaudeCodeProvider = class {
+  id;
+  kind = "claude-code";
+  targetName;
+  supportsBatch = false;
+  config;
+  runClaudeCode;
+  constructor(targetName, config, runner = defaultClaudeCodeRunner) {
+    this.id = `claude-code:${targetName}`;
+    this.targetName = targetName;
+    this.config = config;
+    this.runClaudeCode = runner;
+  }
+  async invoke(request) {
+    if (request.signal?.aborted) {
+      throw new Error("Claude Code request was aborted before execution");
+    }
+    const inputFiles = normalizeInputFiles(request.inputFiles);
+    const workspaceRoot = await this.createWorkspace();
+    const logger = await this.createStreamLogger(request).catch(() => void 0);
+    try {
+      const promptFile = path8.join(workspaceRoot, PROMPT_FILENAME);
+      await writeFile(promptFile, request.question, "utf8");
+      const args = this.buildClaudeCodeArgs(request.question, inputFiles);
+      const cwd = this.resolveCwd();
+      const result = await this.executeClaudeCode(args, cwd, request.signal, logger);
+      if (result.timedOut) {
+        throw new Error(
+          `Claude Code CLI timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
+        );
+      }
+      if (result.exitCode !== 0) {
+        const detail = pickDetail(result.stderr, result.stdout);
+        const prefix = `Claude Code CLI exited with code ${result.exitCode}`;
+        if (isNestedClaudeCodeAuthError(result.stdout)) {
+          throw new Error(
+            `${prefix}: Claude Code detected a nested session and requires API key authentication. Set ANTHROPIC_API_KEY environment variable or run AgentV outside of a Claude Code session.`
+          );
+        }
+        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
+      }
+      const parsed = parseClaudeCodeJsonl(result.stdout);
+      const outputMessages = extractOutputMessages(parsed);
+      const usage = extractUsage(parsed);
+      return {
+        raw: {
+          response: parsed,
+          stdout: result.stdout,
+          stderr: result.stderr,
+          exitCode: result.exitCode,
+          args,
+          executable: this.config.executable,
+          promptFile,
+          workspace: workspaceRoot,
+          inputFiles,
+          logFile: logger?.filePath
+        },
+        outputMessages,
+        usage
+      };
+    } finally {
+      await logger?.close();
+      await this.cleanupWorkspace(workspaceRoot);
+    }
+  }
+  resolveCwd() {
+    if (!this.config.cwd) {
+      return process.cwd();
+    }
+    return path8.resolve(this.config.cwd);
+  }
+  buildClaudeCodeArgs(prompt, inputFiles) {
+    const args = [];
+    args.push("--output-format", "stream-json");
+    args.push("--verbose");
+    args.push("-p");
+    if (this.config.model) {
+      args.push("--model", this.config.model);
+    }
+    if (this.config.args && this.config.args.length > 0) {
+      args.push(...this.config.args);
+    }
+    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
+    const fullPrompt = `${systemPrompt}
+${prompt}`;
+    let finalPrompt = fullPrompt;
+    if (inputFiles && inputFiles.length > 0) {
+      const filesContext = inputFiles.map((f) => `[File: ${f}]`).join("\n");
+      finalPrompt = `${fullPrompt}
+## Input Files
+${filesContext}`;
+    }
+    args.push(finalPrompt);
+    return args;
+  }
+  buildEnv() {
+    const env = { ...process.env };
+    env.CLAUDECODE = void 0;
+    env.CLAUDE_CODE_ENTRYPOINT = void 0;
+    return env;
+  }
+  async executeClaudeCode(args, cwd, signal, logger) {
+    try {
+      return await this.runClaudeCode({
+        executable: this.config.executable,
+        args,
+        cwd,
+        timeoutMs: this.config.timeoutMs,
+        env: this.buildEnv(),
+        signal,
+        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
+        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
+      });
+    } catch (error) {
+      const err = error;
+      if (err.code === "ENOENT") {
+        throw new Error(
+          `Claude Code executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
+        );
+      }
+      throw error;
+    }
+  }
+  async createWorkspace() {
+    return await mkdtemp(path8.join(tmpdir(), WORKSPACE_PREFIX));
+  }
+  async cleanupWorkspace(workspaceRoot) {
+    try {
+      await rm(workspaceRoot, { recursive: true, force: true });
+    } catch {
+    }
+  }
+  resolveLogDirectory() {
+    const disabled = isClaudeCodeLogStreamingDisabled();
+    if (disabled) {
+      return void 0;
+    }
+    if (this.config.logDir) {
+      return path8.resolve(this.config.logDir);
+    }
+    return path8.join(process.cwd(), ".agentv", "logs", "claude-code");
+  }
+  async createStreamLogger(request) {
+    const logDir = this.resolveLogDirectory();
+    if (!logDir) {
+      return void 0;
+    }
+    try {
+      await mkdir(logDir, { recursive: true });
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
+      return void 0;
+    }
+    const filePath = path8.join(logDir, buildLogFilename(request, this.targetName));
+    try {
+      const logger = await ClaudeCodeStreamLogger.create({
+        filePath,
+        targetName: this.targetName,
+        evalCaseId: request.evalCaseId,
+        attempt: request.attempt,
+        format: this.config.logFormat ?? "summary"
+      });
+      recordClaudeCodeLogEntry({
+        filePath,
+        targetName: this.targetName,
+        evalCaseId: request.evalCaseId,
+        attempt: request.attempt
+      });
+      return logger;
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      console.warn(`Skipping Claude Code stream logging for ${filePath}: ${message}`);
+      return void 0;
+    }
+  }
+};
+var ClaudeCodeStreamLogger = class _ClaudeCodeStreamLogger {
+  filePath;
+  stream;
+  startedAt = Date.now();
+  stdoutBuffer = "";
+  stderrBuffer = "";
+  format;
+  constructor(filePath, format) {
+    this.filePath = filePath;
+    this.format = format;
+    this.stream = createWriteStream(filePath, { flags: "a" });
+  }
+  static async create(options) {
+    const logger = new _ClaudeCodeStreamLogger(options.filePath, options.format);
+    const header = [
+      "# Claude Code CLI stream log",
+      `# target: ${options.targetName}`,
+      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
+      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
+      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
+      ""
+    ].filter((line) => Boolean(line));
+    logger.writeLines(header);
+    return logger;
+  }
+  handleStdoutChunk(chunk) {
+    this.stdoutBuffer += chunk;
+    this.flushBuffer("stdout");
+  }
+  handleStderrChunk(chunk) {
+    this.stderrBuffer += chunk;
+    this.flushBuffer("stderr");
+  }
+  async close() {
+    this.flushBuffer("stdout");
+    this.flushBuffer("stderr");
+    this.flushRemainder();
+    await new Promise((resolve, reject) => {
+      this.stream.once("error", reject);
+      this.stream.end(() => resolve());
+    });
+  }
+  writeLines(lines) {
+    for (const line of lines) {
+      this.stream.write(`${line}
+`);
+    }
+  }
+  flushBuffer(source) {
+    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
+    const lines = buffer.split(/\r?\n/);
+    const remainder = lines.pop() ?? "";
+    if (source === "stdout") {
+      this.stdoutBuffer = remainder;
+    } else {
+      this.stderrBuffer = remainder;
+    }
+    for (const line of lines) {
+      const formatted = this.formatLine(line, source);
+      if (formatted) {
+        this.stream.write(formatted);
+        this.stream.write("\n");
+      }
+    }
+  }
+  formatLine(rawLine, source) {
+    const trimmed = rawLine.trim();
+    if (trimmed.length === 0) {
+      return void 0;
+    }
+    const message = this.format === "json" ? formatClaudeCodeJsonLog(trimmed) : formatClaudeCodeLogMessage(trimmed, source);
+    return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
+  }
+  flushRemainder() {
+    const stdoutRemainder = this.stdoutBuffer.trim();
+    if (stdoutRemainder.length > 0) {
+      const formatted = this.formatLine(stdoutRemainder, "stdout");
+      if (formatted) {
+        this.stream.write(formatted);
+        this.stream.write("\n");
+      }
+    }
+    const stderrRemainder = this.stderrBuffer.trim();
+    if (stderrRemainder.length > 0) {
+      const formatted = this.formatLine(stderrRemainder, "stderr");
+      if (formatted) {
+        this.stream.write(formatted);
+        this.stream.write("\n");
+      }
+    }
+    this.stdoutBuffer = "";
+    this.stderrBuffer = "";
+  }
+};
+function isClaudeCodeLogStreamingDisabled() {
+  const envValue = process.env.AGENTV_CLAUDE_CODE_STREAM_LOGS;
+  if (!envValue) {
+    return false;
+  }
+  const normalized = envValue.trim().toLowerCase();
+  return normalized === "false" || normalized === "0" || normalized === "off";
+}
+function buildLogFilename(request, targetName) {
+  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
+  const evalId = sanitizeForFilename(request.evalCaseId ?? "claude-code");
+  const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
+  const target = sanitizeForFilename(targetName);
+  return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID().slice(0, 8)}.log`;
+}
+function sanitizeForFilename(value) {
+  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
+  return sanitized.length > 0 ? sanitized : "claude-code";
+}
+function formatElapsed(startedAt) {
+  const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
+  const hours = Math.floor(elapsedSeconds / 3600);
+  const minutes = Math.floor(elapsedSeconds % 3600 / 60);
+  const seconds = elapsedSeconds % 60;
+  if (hours > 0) {
+    return `${hours.toString().padStart(2, "0")}:${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
+  }
+  return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
+}
+function formatClaudeCodeLogMessage(rawLine, source) {
+  const parsed = tryParseJsonValue(rawLine);
+  if (parsed) {
+    const summary = summarizeClaudeCodeEvent(parsed);
+    if (summary) {
+      return summary;
+    }
+  }
+  if (source === "stderr") {
+    return `stderr: ${rawLine}`;
+  }
+  return rawLine;
+}
+function formatClaudeCodeJsonLog(rawLine) {
+  const parsed = tryParseJsonValue(rawLine);
+  if (!parsed) {
+    return rawLine;
+  }
+  try {
+    return JSON.stringify(parsed, null, 2);
+  } catch {
+    return rawLine;
+  }
+}
+function summarizeClaudeCodeEvent(event) {
+  if (!event || typeof event !== "object") {
+    return void 0;
+  }
+  const record = event;
+  const type = typeof record.type === "string" ? record.type : void 0;
+  if (!type) {
+    return void 0;
+  }
+  switch (type) {
+    case "system":
+      return "system: init";
+    case "assistant": {
+      const message = record.message;
+      if (message) {
+        const content = message.content;
+        if (Array.isArray(content) && content.length > 0) {
+          const first = content[0];
+          if (first?.type === "tool_use") {
+            return `assistant: tool_use (${first.name})`;
+          }
+          if (first?.type === "text") {
+            const text = first.text;
+            if (typeof text === "string") {
+              const preview = text.length > 50 ? `${text.slice(0, 50)}...` : text;
+              return `assistant: ${preview}`;
+            }
+          }
+        }
+      }
+      return "assistant";
+    }
+    case "user": {
+      const message = record.message;
+      if (message) {
+        const content = message.content;
+        if (Array.isArray(content) && content.length > 0) {
+          const first = content[0];
+          if (first?.type === "tool_result") {
+            return `user: tool_result (${first.tool_use_id})`;
+          }
+        }
+      }
+      return "user";
+    }
+    case "result": {
+      const cost = record.cost_usd;
+      const duration = record.duration_ms;
+      if (typeof cost === "number" && typeof duration === "number") {
+        return `result: $${cost.toFixed(4)}, ${Math.round(duration)}ms`;
+      }
+      return "result";
+    }
+    default:
+      return type;
+  }
+}
+function tryParseJsonValue(rawLine) {
+  try {
+    return JSON.parse(rawLine);
+  } catch {
+    return void 0;
+  }
+}
+/**
+ * Parse JSONL output from the Claude Code CLI into an array of values.
+ *
+ * Blank lines and lines that are not valid JSON are skipped.
+ *
+ * @param {string} output - Raw CLI output.
+ * @returns {unknown[]} The parsed values, in order of appearance.
+ * @throws {Error} When the output is blank or contains no parseable line.
+ */
+function parseClaudeCodeJsonl(output) {
+  const body = output.trim();
+  if (body.length === 0) {
+    throw new Error("Claude Code CLI produced no output");
+  }
+  const events = body.split(/\r?\n/).flatMap((rawLine) => {
+    const candidate = rawLine.trim();
+    if (candidate.length === 0) {
+      return [];
+    }
+    try {
+      // Wrapped in an array so a parsed array value survives flatMap intact.
+      return [JSON.parse(candidate)];
+    } catch {
+      return [];
+    }
+  });
+  if (events.length === 0) {
+    throw new Error("Claude Code CLI produced no valid JSON output");
+  }
+  return events;
+}
+/**
+ * Collect output messages from the "assistant" and "user" events of a stream.
+ *
+ * @param {Iterable<unknown>} events - Parsed stream events.
+ * @returns {object[]} One converted message per qualifying event, in order.
+ */
+function extractOutputMessages(events) {
+  const collected = [];
+  for (const entry of events) {
+    const isObject = Boolean(entry) && typeof entry === "object";
+    if (!isObject) {
+      continue;
+    }
+    const kind = entry.type;
+    if (kind !== "assistant" && kind !== "user") {
+      continue;
+    }
+    const payload = entry.message;
+    if (!payload) {
+      continue;
+    }
+    const converted = convertClaudeCodeMessage(payload, kind);
+    if (converted) {
+      collected.push(converted);
+    }
+  }
+  return collected;
+}
+/**
+ * Convert a Claude Code message into an output message record.
+ *
+ * @param {object} message - The event's `message` payload; its `content` is read.
+ * @param {string} type - Event type; anything other than "assistant" maps to the "user" role.
+ * @returns {{ role: string, content: (string | undefined), toolCalls: (object[] | undefined) }}
+ *   `toolCalls` is undefined when no tool calls were found.
+ */
+function convertClaudeCodeMessage(message, type) {
+  const rawContent = message.content;
+  const text = extractTextContent(rawContent);
+  const calls = extractToolCalls(rawContent);
+  const converted = {
+    role: type === "assistant" ? "assistant" : "user",
+    content: text,
+    toolCalls: void 0
+  };
+  if (calls.length > 0) {
+    converted.toolCalls = calls;
+  }
+  return converted;
+}
+/**
+ * Extract the text of a message's content.
+ *
+ * @param {unknown} content - A plain string, or an array of content parts.
+ * @returns {string | undefined} The string itself, or the `text` of every
+ *   `type: "text"` part joined with newlines; undefined when there is none.
+ */
+function extractTextContent(content) {
+  if (typeof content === "string") {
+    return content;
+  }
+  if (!Array.isArray(content)) {
+    return void 0;
+  }
+  const isTextPart = (part) => Boolean(part) && typeof part === "object" && part.type === "text" && typeof part.text === "string";
+  const pieces = content.filter(isTextPart).map((part) => part.text);
+  return pieces.length === 0 ? void 0 : pieces.join("\n");
+}
+/**
+ * Collect tool-call records from a message's content parts.
+ *
+ * `tool_use` parts with a string `name` become `{ tool, input, id }`;
+ * `tool_result` parts with a string `tool_use_id` become
+ * `{ tool: "tool_result", output, id }`.
+ *
+ * @param {unknown} content - Message content; anything but an array yields no calls.
+ * @returns {object[]} The tool-call records, in order of appearance.
+ */
+function extractToolCalls(content) {
+  const calls = [];
+  if (!Array.isArray(content)) {
+    return calls;
+  }
+  for (const block of content) {
+    if (!block || typeof block !== "object") {
+      continue;
+    }
+    switch (block.type) {
+      case "tool_use":
+        if (typeof block.name === "string") {
+          calls.push({
+            tool: block.name,
+            input: block.input,
+            id: typeof block.id === "string" ? block.id : void 0
+          });
+        }
+        break;
+      case "tool_result":
+        if (typeof block.tool_use_id === "string") {
+          calls.push({
+            tool: "tool_result",
+            output: block.content,
+            id: block.tool_use_id
+          });
+        }
+        break;
+      default:
+        break;
+    }
+  }
+  return calls;
+}
+/**
+ * Read usage metrics from the last "result" event in the stream.
+ *
+ * Only the final "result" event is consulted; earlier ones are never examined.
+ *
+ * @param {unknown[]} events - Parsed stream events.
+ * @returns {object | undefined} The subset of cost_usd, duration_ms,
+ *   duration_api_ms, input_tokens, output_tokens (numbers) and session_id
+ *   (string) present on that event; undefined when there is no "result" event
+ *   or it carries none of these fields.
+ */
+function extractUsage(events) {
+  const numericFields = ["cost_usd", "duration_ms", "duration_api_ms", "input_tokens", "output_tokens"];
+  let cursor = events.length;
+  while (cursor-- > 0) {
+    const candidate = events[cursor];
+    const isResult = Boolean(candidate) && typeof candidate === "object" && candidate.type === "result";
+    if (!isResult) {
+      continue;
+    }
+    const usage = {};
+    for (const field of numericFields) {
+      if (typeof candidate[field] === "number") {
+        usage[field] = candidate[field];
+      }
+    }
+    if (typeof candidate.session_id === "string") {
+      usage.session_id = candidate.session_id;
+    }
+    return Object.keys(usage).length > 0 ? usage : void 0;
+  }
+  return void 0;
+}
+/**
+ * Choose the text to attach to an error message, preferring stderr over stdout.
+ *
+ * @param {string} stderr - Captured standard error.
+ * @param {string} stdout - Captured standard output.
+ * @returns {string | undefined} The first of the two that is non-blank once
+ *   trimmed, or undefined when both are blank.
+ */
+function pickDetail(stderr, stdout) {
+  for (const stream of [stderr, stdout]) {
+    const text = stream.trim();
+    if (text.length > 0) {
+      return text;
+    }
+  }
+  return void 0;
+}
+/**
+ * Format a timeout as a message suffix such as " after 30s".
+ *
+ * @param {number | undefined} timeoutMs - Timeout in milliseconds.
+ * @returns {string} The suffix with seconds rounded up, or "" when the
+ *   timeout is missing, zero or negative.
+ */
+function formatTimeoutSuffix(timeoutMs) {
+  const disabled = !timeoutMs || timeoutMs <= 0;
+  return disabled ? "" : ` after ${Math.ceil(timeoutMs / 1e3)}s`;
+}
+/**
+ * Scan JSONL stdout for the combination of an API key taken from
+ * ANTHROPIC_API_KEY and a failed run.
+ *
+ * @param {string} stdout - Raw stdout, one JSON event per line.
+ * @returns {boolean} True only when some line is a "system" event whose
+ *   `apiKeySource` is "ANTHROPIC_API_KEY" AND some line either has
+ *   `error === "authentication_failed"` or is a "result" event with a truthy
+ *   `is_error`. Unparseable lines are ignored; any other failure yields false.
+ */
+function isNestedClaudeCodeAuthError(stdout) {
+  try {
+    const flags = { apiKeyFromEnv: false, failed: false };
+    const rows = stdout.split("\n").map((row) => row.trim()).filter(Boolean);
+    for (const row of rows) {
+      try {
+        const evt = JSON.parse(row);
+        if (evt.type === "system" && evt.apiKeySource === "ANTHROPIC_API_KEY") {
+          flags.apiKeyFromEnv = true;
+        }
+        const isErrorResult = evt.type === "result" && evt.is_error;
+        if (evt.error === "authentication_failed" || isErrorResult) {
+          flags.failed = true;
+        }
+      } catch {
+        // Not JSON, or not an object (e.g. `null`): skip this line.
+      }
+    }
+    return flags.apiKeyFromEnv && flags.failed;
+  } catch {
+    return false;
+  }
+}
+/**
+ * Quote a string as a single shell word using single quotes.
+ *
+ * @param {string} arg - The raw argument.
+ * @returns {string} `arg` wrapped in single quotes, with each embedded single
+ *   quote rewritten as `'\''` (close quote, escaped quote, reopen quote).
+ */
+function escapeShellArg(arg) {
+  const quoted = arg.replaceAll("'", "'\\''");
+  return `'${quoted}'`;
+}
+/**
+ * Default runner: execute Claude Code using four uniquely named temp files
+ * (stdout, stderr, exit status, PID) and remove them afterwards.
+ *
+ * @param {object} options - Forwarded unchanged to runClaudeCodeWithTempFiles.
+ * @returns {Promise<object>} Whatever runClaudeCodeWithTempFiles resolves to.
+ */
+async function defaultClaudeCodeRunner(options) {
+  const runId = randomUUID();
+  const inTmp = (fileName) => path8.join(tmpdir(), fileName);
+  const stdoutFile = inTmp(`agentv-cc-${runId}-stdout`);
+  const stderrFile = inTmp(`agentv-cc-${runId}-stderr`);
+  const exitFile = inTmp(`agentv-cc-${runId}-exit`);
+  const pidFile = inTmp(`agentv-cc-${runId}-pid`);
+  const tempFiles = [stdoutFile, stderrFile, exitFile, pidFile];
+  try {
+    return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
+  } finally {
+    // Best-effort cleanup: a file that cannot be removed must not mask the run's outcome.
+    for (const tempFile of tempFiles) {
+      try {
+        await rm(tempFile, { force: true });
+      } catch {
+      }
+    }
+  }
+}
+/**
+ * Run the Claude Code CLI through a detached bash wrapper and collect its
+ * results via temp files.
+ *
+ * The wrapper redirects the command's stdout/stderr into files, records the
+ * command's PID, waits for it, and finally writes its exit status. This
+ * function polls every 100 ms until the exit file appears, the timeout
+ * elapses, the abort signal fires, or the wrapper fails to start.
+ *
+ * @param {object} options
+ * @param {string} options.executable - Command to run; split on whitespace, so it may carry leading arguments.
+ * @param {string[]} options.args - Further arguments; each one is shell-quoted.
+ * @param {string} [options.cwd] - Working directory passed to spawn.
+ * @param {object} [options.env] - Environment passed to spawn.
+ * @param {number} [options.timeoutMs] - Polling stops and the process is signalled once this many ms pass.
+ * @param {AbortSignal} [options.signal] - Cancels the run.
+ * @param {(chunk: string) => void} [options.onStdoutChunk] - Receives newly appended stdout text.
+ * @param {(text: string) => void} [options.onStderrChunk] - Receives the full stderr text once, at the end.
+ * @param {string} stdoutFile - File the command's stdout is redirected to.
+ * @param {string} stderrFile - File the command's stderr is redirected to.
+ * @param {string} exitFile - File the wrapper writes the exit status to.
+ * @param {string} pidFile - File the wrapper writes the command's PID to.
+ * @returns {Promise<{ stdout: string, stderr: string, exitCode: number, timedOut: boolean }>}
+ *   `exitCode` is -1 when no exit status was written (timeout, abort, failed start).
+ */
+async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile) {
+  // Resolve the fs helpers once instead of re-importing them on every poll.
+  const { readFile: readFile7, access: access4 } = await import("node:fs/promises");
+  // Check for cancellation BEFORE spawning: the PID file cannot exist yet, so a
+  // process started for an already-aborted request could never be stopped.
+  if (options.signal?.aborted) {
+    return { stdout: "", stderr: "Aborted", exitCode: -1, timedOut: false };
+  }
+  const parts = options.executable.split(/\s+/);
+  const executable = parts[0];
+  const executableArgs = parts.slice(1);
+  const allArgs = [...executableArgs, ...options.args];
+  const escapedArgs = allArgs.map((arg) => escapeShellArg(arg));
+  const fullCommand = [escapeShellArg(executable), ...escapedArgs].join(" ");
+  const bashScript = `
+    unset CLAUDECODE CLAUDE_CODE_ENTRYPOINT 2>/dev/null
+    ${fullCommand} >${escapeShellArg(stdoutFile)} 2>${escapeShellArg(stderrFile)} &
+    CHILD_PID=$!
+    echo $CHILD_PID > ${escapeShellArg(pidFile)}
+    wait $CHILD_PID
+    echo $? > ${escapeShellArg(exitFile)}
+  `;
+  const child = spawn("setsid", ["bash", "-c", bashScript], {
+    cwd: options.cwd,
+    env: options.env,
+    detached: true,
+    stdio: "ignore"
+  });
+  // A failed spawn (for example `setsid` or `bash` not installed) is reported
+  // through an "error" event. Without a listener that event is thrown as an
+  // uncaught exception; record it so the poll loop can stop instead.
+  let spawnError;
+  child.on("error", (error) => {
+    spawnError = error;
+  });
+  child.unref();
+  const pollInterval = 100;
+  const startTime = Date.now();
+  let timedOut = false;
+  let lastStdoutSize = 0;
+  // A missing or unreadable file reads as "".
+  const readFileIfExists = async (filePath) => {
+    try {
+      return await readFile7(filePath, "utf8");
+    } catch {
+      return "";
+    }
+  };
+  const fileExists4 = async (filePath) => {
+    try {
+      await access4(filePath);
+      return true;
+    } catch {
+      return false;
+    }
+  };
+  // Best effort: send SIGTERM to the PID recorded by the wrapper, if any.
+  const killProcess = async () => {
+    try {
+      const pid = await readFileIfExists(pidFile);
+      if (pid.trim()) {
+        process.kill(Number.parseInt(pid.trim(), 10), "SIGTERM");
+      }
+    } catch {
+    }
+  };
+  const abortHandler = () => {
+    killProcess().catch(() => {
+    });
+  };
+  options.signal?.addEventListener("abort", abortHandler, { once: true });
+  try {
+    while (true) {
+      if (spawnError) {
+        // The wrapper never started, so no exit file will ever appear.
+        break;
+      }
+      if (options.timeoutMs && Date.now() - startTime > options.timeoutMs) {
+        timedOut = true;
+        await killProcess();
+        break;
+      }
+      if (options.signal?.aborted) {
+        await killProcess();
+        break;
+      }
+      if (options.onStdoutChunk) {
+        // Forward only the text appended since the previous poll.
+        const currentStdout = await readFileIfExists(stdoutFile);
+        if (currentStdout.length > lastStdoutSize) {
+          options.onStdoutChunk(currentStdout.slice(lastStdoutSize));
+          lastStdoutSize = currentStdout.length;
+        }
+      }
+      if (await fileExists4(exitFile)) {
+        break;
+      }
+      await new Promise((resolve) => setTimeout(resolve, pollInterval));
+    }
+    const stdout = await readFileIfExists(stdoutFile);
+    const capturedStderr = await readFileIfExists(stderrFile);
+    // Surface a failed start through stderr so callers see why the exit code is -1.
+    const stderr = spawnError && !capturedStderr ? `Failed to start Claude Code CLI: ${spawnError.message}` : capturedStderr;
+    const exitCodeStr = await readFileIfExists(exitFile);
+    const exitCode = exitCodeStr.trim() ? Number.parseInt(exitCodeStr.trim(), 10) : -1;
+    if (options.onStdoutChunk && stdout.length > lastStdoutSize) {
+      options.onStdoutChunk(stdout.slice(lastStdoutSize));
+    }
+    if (options.onStderrChunk && stderr) {
+      options.onStderrChunk(stderr);
+    }
+    return { stdout, stderr, exitCode, timedOut };
+  } finally {
+    options.signal?.removeEventListener("abort", abortHandler);
+  }
+}
+// src/evaluation/providers/cli.ts
+import { exec as execWithCallback } from "node:child_process";
+import fs from "node:fs/promises";
+import os from "node:os";
+import path9 from "node:path";
+import { promisify } from "node:util";
+import { z } from "zod";
+// Zod schemas describing the JSON a CLI target may write to its output file.
+// Field names are snake_case here; convertOutputMessages maps them to camelCase.
+//
+// One tool invocation: only `tool` is required.
+var ToolCallSchema = z.object({
+  tool: z.string(),
+  input: z.unknown().optional(),
+  output: z.unknown().optional(),
+  id: z.string().optional(),
+  timestamp: z.string().optional()
+});
+// One message of the target's output; only `role` is required.
+var OutputMessageInputSchema = z.object({
+  role: z.string(),
+  name: z.string().optional(),
+  content: z.unknown().optional(),
+  tool_calls: z.array(ToolCallSchema).optional(),
+  timestamp: z.string().optional(),
+  metadata: z.record(z.unknown()).optional()
+});
+// Token counts; `input` and `output` are required whenever the object is present.
+var TokenUsageSchema = z.object({
+  input: z.number(),
+  output: z.number(),
+  cached: z.number().optional()
+});
+// A complete single-response payload; every field is optional.
+var CliOutputSchema = z.object({
+  text: z.unknown().optional(),
+  output_messages: z.array(OutputMessageInputSchema).optional(),
+  token_usage: TokenUsageSchema.optional(),
+  cost_usd: z.number().optional(),
+  duration_ms: z.number().optional()
+});
+// One line of batch (JSONL) output: the single-response payload plus a non-empty `id`.
+var CliJsonlRecordSchema = CliOutputSchema.extend({
+  id: z.string().min(1)
+});
+/**
+ * Drop negative cost and duration values, logging a warning for each one dropped.
+ *
+ * @param {number | undefined} costUsd - Reported cost in USD.
+ * @param {number | undefined} durationMs - Reported duration in milliseconds.
+ * @param {string} context - Label included in the warning text.
+ * @returns {{ costUsd: (number | undefined), durationMs: (number | undefined) }}
+ *   The inputs, with each negative value replaced by undefined.
+ */
+function validateMetrics(costUsd, durationMs, context) {
+  const costIsNegative = costUsd !== void 0 && costUsd < 0;
+  if (costIsNegative) {
+    console.warn(`[cli-provider] ${context}: ignoring negative cost_usd value (${costUsd})`);
+  }
+  const durationIsNegative = durationMs !== void 0 && durationMs < 0;
+  if (durationIsNegative) {
+    console.warn(`[cli-provider] ${context}: ignoring negative duration_ms value (${durationMs})`);
+  }
+  return {
+    costUsd: costIsNegative ? void 0 : costUsd,
+    durationMs: durationIsNegative ? void 0 : durationMs
+  };
+}
+/**
+ * Convert schema-validated output messages to the internal message shape.
+ *
+ * The only rename is `tool_calls` -> `toolCalls`; other fields are copied as-is.
+ *
+ * @param {object[] | undefined} messages - Messages in snake_case form.
+ * @returns {object[] | undefined} Converted messages, or undefined when
+ *   `messages` is missing or empty.
+ */
+function convertOutputMessages(messages) {
+  if (!messages) {
+    return void 0;
+  }
+  if (messages.length === 0) {
+    return void 0;
+  }
+  const toInternal = ({ role, name, content, tool_calls: toolCalls, timestamp, metadata }) => ({
+    role,
+    name,
+    content,
+    toolCalls,
+    timestamp,
+    metadata
+  });
+  return messages.map((message) => toInternal(message));
+}
+// Promise-returning form of child_process.exec.
+var execAsync = promisify(execWithCallback);
+// Upper bound on captured stdout/stderr: 10 MiB.
+var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
+/**
+ * Default command runner: execute a shell command and report the outcome as a
+ * plain result object instead of throwing.
+ *
+ * @param {string} command - Full command line to run.
+ * @param {{ cwd?: string, env?: object, timeoutMs?: number, signal?: AbortSignal }} options
+ * @returns {Promise<{ stdout: string, stderr: string, exitCode: (number | null), failed: boolean, timedOut: boolean, signal: (string | null) }>}
+ *   On failure `exitCode` is the numeric exit code when there is one, and
+ *   `timedOut` is true when the error is flagged `timedOut` or `killed`.
+ */
+async function defaultCommandRunner(command, options) {
+  const { cwd, env, timeoutMs, signal } = options;
+  const execOptions = {
+    cwd,
+    env,
+    timeout: timeoutMs,
+    signal,
+    maxBuffer: DEFAULT_MAX_BUFFER,
+    // PowerShell on Windows; the platform default shell elsewhere.
+    shell: process.platform === "win32" ? "powershell.exe" : void 0
+  };
+  let outcome;
+  try {
+    const completed = await execAsync(command, execOptions);
+    outcome = {
+      stdout: completed.stdout,
+      stderr: completed.stderr,
+      exitCode: 0,
+      failed: false,
+      timedOut: false,
+      signal: null
+    };
+  } catch (failure) {
+    const hasNumericCode = typeof failure.code === "number";
+    outcome = {
+      stdout: failure.stdout ?? "",
+      stderr: failure.stderr ?? "",
+      exitCode: hasNumericCode ? failure.code : null,
+      failed: true,
+      timedOut: failure.timedOut === true || failure.killed === true,
+      signal: failure.signal ?? null
+    };
+  }
+  return outcome;
+}
+var CliProvider = class {
+  id;
+  kind = "cli";
+  targetName;
+  supportsBatch = true;
+  config;
+  runCommand;
+  verbose;
+  keepTempFiles;
+  healthcheckPromise;
+  constructor(targetName, config, runner = defaultCommandRunner) {
+    this.targetName = targetName;
+    this.id = `cli:${targetName}`;
+    this.config = config;
+    this.runCommand = runner;
+    this.verbose = config.verbose ?? false;
+    this.keepTempFiles = config.keepTempFiles ?? false;
+  }
+  async invoke(request) {
+    if (request.signal?.aborted) {
+      throw new Error("CLI provider request was aborted before execution");
+    }
+    await this.ensureHealthy(request.signal);
+    const outputFilePath = generateOutputFilePath(request.evalCaseId);
+    const templateValues = buildTemplateValues(request, this.config, outputFilePath);
+    const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
+    if (this.verbose) {
+      console.log(
+        `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
+      );
+    }
+    const startTime = Date.now();
+    const result = await this.runCommand(renderedCommand, {
+      cwd: this.config.cwd,
+      env: process.env,
+      timeoutMs: this.config.timeoutMs,
+      signal: request.signal
+    });
+    const measuredDurationMs = Date.now() - startTime;
+    if (result.failed || (result.exitCode ?? 0) !== 0) {
+      if (request.signal?.aborted) {
+        throw new Error("CLI provider request was aborted");
       }
       if (result.timedOut) {
         throw new Error(
-          `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
+          `CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
         );
       }
       const codeText = result.exitCode !== null ? result.exitCode : "unknown";
@@ -1910,7 +3008,7 @@ var CliProvider = class {
       }
       if (result.timedOut) {
         throw new Error(
-          `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
+          `CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
         );
       }
       const codeText = result.exitCode !== null ? result.exitCode : "unknown";
@@ -1920,11 +3018,6 @@ var CliProvider = class {
     }
     const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
     const recordsById = this.parseJsonlBatchOutput(responseContent);
-    const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
-    const missingIds = requestedIds.filter((id) => !recordsById.has(id));
-    if (missingIds.length > 0) {
-      throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
-    }
     const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
     const responses = requests.map((request) => {
       const evalCaseId = request.evalCaseId;
@@ -1943,15 +3036,20 @@ var CliProvider = class {
       }
       const parsed = recordsById.get(evalCaseId);
       if (!parsed) {
+        const errorMessage = `Batch output missing id '${evalCaseId}'`;
+        if (this.verbose) {
+          console.warn(`[cli-provider:${this.targetName}] ${errorMessage}`);
+        }
         return {
-          outputMessages: [],
+          outputMessages: [{ role: "assistant", content: `Error: ${errorMessage}` }],
           durationMs: perRequestFallbackMs,
           raw: {
             command: renderedCommand,
             stderr: result.stderr,
             exitCode: result.exitCode ?? 0,
             cwd: this.config.cwd,
-            outputFile: outputFilePath
+            outputFile: outputFilePath,
+            error: errorMessage
           }
         };
       }
@@ -1984,101 +3082,37 @@ var CliProvider = class {
    * - duration_ms: number
    */
   parseOutputContent(content) {
+    let parsed;
     try {
-      const parsed = JSON.parse(content);
-      if (typeof parsed === "object" && parsed !== null) {
-        const obj = parsed;
-        const tokenUsage = this.parseTokenUsage(obj.token_usage);
-        const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
-        const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
-        const outputMessages = this.parseOutputMessages(obj.output_messages);
-        if (outputMessages && outputMessages.length > 0) {
-          return { outputMessages, tokenUsage, costUsd, durationMs };
-        }
-        if ("text" in obj) {
-          const text = typeof obj.text === "string" ? obj.text : String(obj.text);
-          return {
-            outputMessages: [{ role: "assistant", content: text }],
-            tokenUsage,
-            costUsd,
-            durationMs
-          };
-        }
-      }
+      parsed = JSON.parse(content);
     } catch {
+      return { outputMessages: [{ role: "assistant", content }] };
     }
-    return { outputMessages: [{ role: "assistant", content }] };
-  }
-  /**
-   * Parse token_usage from CLI output.
-   */
-  parseTokenUsage(tokenUsage) {
-    if (typeof tokenUsage !== "object" || tokenUsage === null) {
-      return void 0;
-    }
-    const obj = tokenUsage;
-    if (typeof obj.input !== "number" || typeof obj.output !== "number") {
-      return void 0;
-    }
-    return {
-      input: obj.input,
-      output: obj.output,
-      cached: typeof obj.cached === "number" ? obj.cached : void 0
-    };
-  }
-  /**
-   * Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
-   */
-  parseOutputMessages(outputMessages) {
-    if (!Array.isArray(outputMessages)) {
-      return void 0;
+    const result = CliOutputSchema.safeParse(parsed);
+    if (!result.success) {
+      return { outputMessages: [{ role: "assistant", content }] };
     }
-    const messages = [];
-    for (const msg of outputMessages) {
-      if (typeof msg !== "object" || msg === null) {
-        continue;
-      }
-      const rawMsg = msg;
-      if (typeof rawMsg.role !== "string") {
-        continue;
-      }
-      const message = {
-        role: rawMsg.role,
-        name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
-        content: rawMsg.content,
-        toolCalls: this.parseToolCalls(rawMsg.tool_calls),
-        timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
-        metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
+    const obj = result.data;
+    const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, "parsing output");
+    const outputMessages = convertOutputMessages(obj.output_messages);
+    if (outputMessages && outputMessages.length > 0) {
+      return {
+        outputMessages,
+        tokenUsage: obj.token_usage,
+        costUsd: metrics.costUsd,
+        durationMs: metrics.durationMs
       };
-      messages.push(message);
-    }
-    return messages.length > 0 ? messages : void 0;
-  }
-  /**
-   * Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
-   */
-  parseToolCalls(toolCalls) {
-    if (!Array.isArray(toolCalls)) {
-      return void 0;
     }
-    const calls = [];
-    for (const call of toolCalls) {
-      if (typeof call !== "object" || call === null) {
-        continue;
-      }
-      const rawCall = call;
-      if (typeof rawCall.tool !== "string") {
-        continue;
-      }
-      calls.push({
-        tool: rawCall.tool,
-        input: rawCall.input,
-        output: rawCall.output,
-        id: typeof rawCall.id === "string" ? rawCall.id : void 0,
-        timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
-      });
+    if (obj.text !== void 0) {
+      const text = typeof obj.text === "string" ? obj.text : String(obj.text);
+      return {
+        outputMessages: [{ role: "assistant", content: text }],
+        tokenUsage: obj.token_usage,
+        costUsd: metrics.costUsd,
+        durationMs: metrics.durationMs
+      };
     }
-    return calls.length > 0 ? calls : void 0;
+    return { outputMessages: [{ role: "assistant", content }] };
   }
   parseJsonlBatchOutput(content) {
     const records = /* @__PURE__ */ new Map();
@@ -2091,33 +3125,32 @@ var CliProvider = class {
         const reason = error instanceof Error ? error.message : String(error);
         throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
       }
-      if (typeof parsed !== "object" || parsed === null) {
+      const result = CliJsonlRecordSchema.safeParse(parsed);
+      if (!result.success) {
+        const firstError = result.error.errors[0];
+        if (firstError?.path.includes("id")) {
+          throw new Error("CLI batch output JSONL line missing required string field: id");
+        }
         throw new Error("CLI batch output JSONL line must be an object");
       }
-      const obj = parsed;
-      const id = typeof obj.id === "string" ? obj.id : void 0;
-      if (!id || id.trim().length === 0) {
-        throw new Error("CLI batch output JSONL line missing required string field: id");
-      }
-      if (records.has(id)) {
-        throw new Error(`CLI batch output contains duplicate id: ${id}`);
-      }
-      const tokenUsage = this.parseTokenUsage(obj.token_usage);
-      const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
-      const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
-      const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
-      let outputMessages;
-      if (parsedOutputMessages && parsedOutputMessages.length > 0) {
-        outputMessages = parsedOutputMessages;
+      const obj = result.data;
+      if (records.has(obj.id)) {
+        throw new Error(`CLI batch output contains duplicate id: ${obj.id}`);
+      }
+      const outputMessages = convertOutputMessages(obj.output_messages);
+      let finalOutputMessages;
+      if (outputMessages && outputMessages.length > 0) {
+        finalOutputMessages = outputMessages;
       } else {
         const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
-        outputMessages = text ? [{ role: "assistant", content: text }] : [];
-      }
-      records.set(id, {
-        outputMessages,
-        tokenUsage,
-        costUsd,
-        durationMs
+        finalOutputMessages = text ? [{ role: "assistant", content: text }] : [];
+      }
+      const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, `batch record '${obj.id}'`);
+      records.set(obj.id, {
+        outputMessages: finalOutputMessages,
+        tokenUsage: obj.token_usage,
+        costUsd: metrics.costUsd,
+        durationMs: metrics.durationMs
       });
     }
     return records;
@@ -2203,7 +3236,7 @@ var CliProvider = class {
   }
 };
 function buildTemplateValues(request, config, outputFilePath) {
-  const inputFiles = normalizeInputFiles(request.inputFiles);
+  const inputFiles = normalizeInputFiles2(request.inputFiles);
   return {
     PROMPT: shellEscape(request.question ?? ""),
     GUIDELINES: shellEscape(request.guidelines ?? ""),
@@ -2213,13 +3246,13 @@ function buildTemplateValues(request, config, outputFilePath) {
     OUTPUT_FILE: shellEscape(outputFilePath)
   };
 }
-function normalizeInputFiles(inputFiles) {
+function normalizeInputFiles2(inputFiles) {
   if (!inputFiles || inputFiles.length === 0) {
     return void 0;
   }
   const unique = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = path7.resolve(inputFile);
+    const absolutePath = path9.resolve(inputFile);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -2233,7 +3266,7 @@ function formatFileList(files, template) {
   const formatter = template ?? "{path}";
   return files.map((filePath) => {
     const escapedPath = shellEscape(filePath);
-    const escapedName = shellEscape(path7.basename(filePath));
+    const escapedName = shellEscape(path9.basename(filePath));
     return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
   }).join(" ");
 }
@@ -2257,9 +3290,9 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
   const safeEvalId = evalCaseId || "unknown";
   const timestamp = Date.now();
   const random = Math.random().toString(36).substring(2, 9);
-  return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
+  return path9.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
 }
-function formatTimeoutSuffix(timeoutMs) {
+function formatTimeoutSuffix2(timeoutMs) {
   if (!timeoutMs || timeoutMs <= 0) {
     return "";
   }
@@ -2268,39 +3301,39 @@ function formatTimeoutSuffix(timeoutMs) {
 }
 // src/evaluation/providers/codex.ts
-import { exec as execCallback, spawn } from "node:child_process";
-import { randomUUID } from "node:crypto";
-import { constants as constants2, createWriteStream } from "node:fs";
-import { access as access2, mkdir, mkdtemp, rm, writeFile } from "node:fs/promises";
-import { tmpdir } from "node:os";
-import path9 from "node:path";
+import { exec as execCallback, spawn as spawn2 } from "node:child_process";
+import { randomUUID as randomUUID2 } from "node:crypto";
+import { constants as constants2, createWriteStream as createWriteStream2 } from "node:fs";
+import { access as access2, mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
+import { tmpdir as tmpdir2 } from "node:os";
+import path10 from "node:path";
 import { promisify as promisify2 } from "node:util";
 // src/evaluation/providers/codex-log-tracker.ts
-var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
-var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
+var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.codexLogs");
+var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.codexLogSubscribers");
 function getCodexLogStore() {
   const globalObject = globalThis;
-  const existing = globalObject[GLOBAL_LOGS_KEY];
+  const existing = globalObject[GLOBAL_LOGS_KEY2];
   if (existing) {
     return existing;
   }
   const created = [];
-  globalObject[GLOBAL_LOGS_KEY] = created;
+  globalObject[GLOBAL_LOGS_KEY2] = created;
   return created;
 }
-function getSubscriberStore() {
+function getSubscriberStore2() {
   const globalObject = globalThis;
-  const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
+  const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
   if (existing) {
     return existing;
   }
   const created = /* @__PURE__ */ new Set();
-  globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
+  globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
   return created;
 }
-function notifySubscribers(entry) {
-  const subscribers = Array.from(getSubscriberStore());
+function notifySubscribers2(entry) {
+  const subscribers = Array.from(getSubscriberStore2());
   for (const listener of subscribers) {
     try {
       listener(entry);
@@ -2312,7 +3345,7 @@ function notifySubscribers(entry) {
 }
 function recordCodexLogEntry(entry) {
   getCodexLogStore().push(entry);
-  notifySubscribers(entry);
+  notifySubscribers2(entry);
 }
 function consumeCodexLogEntries() {
   const store = getCodexLogStore();
@@ -2322,118 +3355,19 @@ function consumeCodexLogEntries() {
   return store.splice(0, store.length);
 }
 function subscribeToCodexLogEntries(listener) {
-  const store = getSubscriberStore();
+  const store = getSubscriberStore2();
   store.add(listener);
   return () => {
     store.delete(listener);
   };
 }
-// src/evaluation/providers/preread.ts
-import path8 from "node:path";
-function buildPromptDocument(request, inputFiles, options) {
-  const parts = [];
-  const guidelineFiles = collectGuidelineFiles(
-    inputFiles,
-    options?.guidelinePatterns ?? request.guideline_patterns,
-    options?.guidelineOverrides
-  );
-  const inputFilesList = collectInputFiles(inputFiles);
-  const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
-  const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
-  if (prereadBlock.length > 0) {
-    parts.push("\n", prereadBlock);
-  }
-  parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
-  return parts.join("\n").trim();
-}
-function normalizeInputFiles2(inputFiles) {
-  if (!inputFiles || inputFiles.length === 0) {
-    return void 0;
-  }
-  const deduped = /* @__PURE__ */ new Map();
-  for (const inputFile of inputFiles) {
-    const absolutePath = path8.resolve(inputFile);
-    if (!deduped.has(absolutePath)) {
-      deduped.set(absolutePath, absolutePath);
-    }
-  }
-  return Array.from(deduped.values());
-}
-function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
-  if (!inputFiles || inputFiles.length === 0) {
-    return [];
-  }
-  const unique = /* @__PURE__ */ new Map();
-  for (const inputFile of inputFiles) {
-    const absolutePath = path8.resolve(inputFile);
-    if (overrides?.has(absolutePath)) {
-      if (!unique.has(absolutePath)) {
-        unique.set(absolutePath, absolutePath);
-      }
-      continue;
-    }
-    const normalized = absolutePath.split(path8.sep).join("/");
-    if (isGuidelineFile(normalized, guidelinePatterns)) {
-      if (!unique.has(absolutePath)) {
-        unique.set(absolutePath, absolutePath);
-      }
-    }
-  }
-  return Array.from(unique.values());
-}
-function collectInputFiles(inputFiles) {
-  if (!inputFiles || inputFiles.length === 0) {
-    return [];
-  }
-  const unique = /* @__PURE__ */ new Map();
-  for (const inputFile of inputFiles) {
-    const absolutePath = path8.resolve(inputFile);
-    if (!unique.has(absolutePath)) {
-      unique.set(absolutePath, absolutePath);
-    }
-  }
-  return Array.from(unique.values());
-}
-function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
-  if (guidelineFiles.length === 0 && inputFiles.length === 0) {
-    return "";
-  }
-  const buildList = (files) => files.map((absolutePath) => {
-    const fileName = path8.basename(absolutePath);
-    const fileUri = pathToFileUri(absolutePath);
-    return `* [${fileName}](${fileUri})`;
-  });
-  const sections = [];
-  if (guidelineFiles.length > 0) {
-    sections.push(`Read all guideline files:
-${buildList(guidelineFiles).join("\n")}.`);
-  }
-  if (inputFiles.length > 0) {
-    sections.push(`Read all input files:
-${buildList(inputFiles).join("\n")}.`);
-  }
-  sections.push(
-    "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
-    "Then apply system_instructions on the user query below."
-  );
-  return sections.join("\n");
-}
-function pathToFileUri(filePath) {
-  const absolutePath = path8.isAbsolute(filePath) ? filePath : path8.resolve(filePath);
-  const normalizedPath = absolutePath.replace(/\\/g, "/");
-  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
-    return `file:///${normalizedPath}`;
-  }
-  return `file://${normalizedPath}`;
-}
 // src/evaluation/providers/codex.ts
 var execAsync2 = promisify2(execCallback);
-var WORKSPACE_PREFIX = "agentv-codex-";
-var PROMPT_FILENAME = "prompt.md";
+var WORKSPACE_PREFIX2 = "agentv-codex-";
+var PROMPT_FILENAME2 = "prompt.md";
 var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
-var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
+var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
 - Do NOT create any additional output files in the workspace.
 - All intended file outputs/changes MUST be written in your response.
 - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
@@ -2458,27 +3392,27 @@ var CodexProvider = class {
       throw new Error("Codex provider request was aborted before execution");
     }
     await this.ensureEnvironmentReady();
-    const inputFiles = normalizeInputFiles2(request.inputFiles);
+    const inputFiles = normalizeInputFiles(request.inputFiles);
     const workspaceRoot = await this.createWorkspace();
     const logger = await this.createStreamLogger(request).catch(() => void 0);
     try {
       const basePrompt = buildPromptDocument(request, inputFiles);
-      const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
+      const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
       const promptContent = `${systemPrompt}
 ${basePrompt}`;
-      const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
-      await writeFile(promptFile, promptContent, "utf8");
+      const promptFile = path10.join(workspaceRoot, PROMPT_FILENAME2);
+      await writeFile2(promptFile, promptContent, "utf8");
       const args = this.buildCodexArgs();
       const cwd = this.resolveCwd(workspaceRoot);
       const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
       if (result.timedOut) {
         throw new Error(
-          `Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
+          `Codex CLI timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
         );
       }
       if (result.exitCode !== 0) {
-        const detail = pickDetail(result.stderr, result.stdout);
+        const detail = pickDetail2(result.stderr, result.stdout);
         const prefix = `Codex CLI exited with code ${result.exitCode}`;
         throw new Error(detail ? `${prefix}: ${detail}` : prefix);
       }
@@ -2517,7 +3451,7 @@ ${basePrompt}`;
     if (!this.config.cwd) {
       return workspaceRoot;
     }
-    return path9.resolve(this.config.cwd);
+    return path10.resolve(this.config.cwd);
   }
   buildCodexArgs() {
     const args = [
@@ -2559,11 +3493,11 @@ ${basePrompt}`;
     }
   }
   async createWorkspace() {
-    return await mkdtemp(path9.join(tmpdir(), WORKSPACE_PREFIX));
+    return await mkdtemp2(path10.join(tmpdir2(), WORKSPACE_PREFIX2));
   }
   async cleanupWorkspace(workspaceRoot) {
     try {
-      await rm(workspaceRoot, { recursive: true, force: true });
+      await rm2(workspaceRoot, { recursive: true, force: true });
     } catch {
     }
   }
@@ -2573,9 +3507,9 @@ ${basePrompt}`;
       return void 0;
     }
     if (this.config.logDir) {
-      return path9.resolve(this.config.logDir);
+      return path10.resolve(this.config.logDir);
     }
-    return path9.join(process.cwd(), ".agentv", "logs", "codex");
+    return path10.join(process.cwd(), ".agentv", "logs", "codex");
   }
   async createStreamLogger(request) {
     const logDir = this.resolveLogDirectory();
@@ -2583,13 +3517,13 @@ ${basePrompt}`;
       return void 0;
     }
     try {
-      await mkdir(logDir, { recursive: true });
+      await mkdir2(logDir, { recursive: true });
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
       return void 0;
     }
-    const filePath = path9.join(logDir, buildLogFilename(request, this.targetName));
+    const filePath = path10.join(logDir, buildLogFilename2(request, this.targetName));
     try {
       const logger = await CodexStreamLogger.create({
         filePath,
@@ -2622,7 +3556,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
   constructor(filePath, format) {
     this.filePath = filePath;
     this.format = format;
-    this.stream = createWriteStream(filePath, { flags: "a" });
+    this.stream = createWriteStream2(filePath, { flags: "a" });
   }
   static async create(options) {
     const logger = new _CodexStreamLogger(options.filePath, options.format);
@@ -2683,7 +3617,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
       return void 0;
     }
     const message = this.format === "json" ? formatCodexJsonLog(trimmed) : formatCodexLogMessage(trimmed, source);
-    return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
+    return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
   }
   flushRemainder() {
     const stdoutRemainder = this.stdoutBuffer.trim();
@@ -2714,18 +3648,18 @@ function isCodexLogStreamingDisabled() {
   const normalized = envValue.trim().toLowerCase();
   return normalized === "false" || normalized === "0" || normalized === "off";
 }
-function buildLogFilename(request, targetName) {
+function buildLogFilename2(request, targetName) {
   const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
-  const evalId = sanitizeForFilename(request.evalCaseId ?? "codex");
+  const evalId = sanitizeForFilename2(request.evalCaseId ?? "codex");
   const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
-  const target = sanitizeForFilename(targetName);
-  return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID().slice(0, 8)}.log`;
+  const target = sanitizeForFilename2(targetName);
+  return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID2().slice(0, 8)}.log`;
 }
-function sanitizeForFilename(value) {
+function sanitizeForFilename2(value) {
   const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
   return sanitized.length > 0 ? sanitized : "codex";
 }
-function formatElapsed(startedAt) {
+function formatElapsed2(startedAt) {
   const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
   const hours = Math.floor(elapsedSeconds / 3600);
   const minutes = Math.floor(elapsedSeconds % 3600 / 60);
@@ -2736,7 +3670,7 @@ function formatElapsed(startedAt) {
   return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
 }
 function formatCodexLogMessage(rawLine, source) {
-  const parsed = tryParseJsonValue(rawLine);
+  const parsed = tryParseJsonValue2(rawLine);
   if (parsed) {
     const summary = summarizeCodexEvent(parsed);
     if (summary) {
@@ -2749,7 +3683,7 @@ function formatCodexLogMessage(rawLine, source) {
   return rawLine;
 }
 function formatCodexJsonLog(rawLine) {
-  const parsed = tryParseJsonValue(rawLine);
+  const parsed = tryParseJsonValue2(rawLine);
   if (!parsed) {
     return rawLine;
   }
@@ -2794,7 +3728,7 @@ function summarizeCodexEvent(event) {
   }
   return type;
 }
-function tryParseJsonValue(rawLine) {
+function tryParseJsonValue2(rawLine) {
   try {
     return JSON.parse(rawLine);
   } catch {
@@ -2804,7 +3738,7 @@ function tryParseJsonValue(rawLine) {
 async function locateExecutable(candidate) {
   const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
   if (includesPathSeparator) {
-    const resolved = path9.isAbsolute(candidate) ? candidate : path9.resolve(candidate);
+    const resolved = path10.isAbsolute(candidate) ? candidate : path10.resolve(candidate);
     const executablePath = await ensureWindowsExecutableVariant(resolved);
     await access2(executablePath, constants2.F_OK);
     return executablePath;
@@ -3023,7 +3957,7 @@ function parseJsonLines(output) {
   }
   return parsed;
 }
-function pickDetail(stderr, stdout) {
+function pickDetail2(stderr, stdout) {
   const errorText = stderr.trim();
   if (errorText.length > 0) {
     return errorText;
@@ -3031,7 +3965,7 @@ function pickDetail(stderr, stdout) {
   const stdoutText = stdout.trim();
   return stdoutText.length > 0 ? stdoutText : void 0;
 }
-function formatTimeoutSuffix2(timeoutMs) {
+function formatTimeoutSuffix3(timeoutMs) {
   if (!timeoutMs || timeoutMs <= 0) {
     return "";
   }
@@ -3040,7 +3974,7 @@ function formatTimeoutSuffix2(timeoutMs) {
 }
 async function defaultCodexRunner(options) {
   return await new Promise((resolve, reject) => {
-    const child = spawn(options.executable, options.args, {
+    const child = spawn2(options.executable, options.args, {
       cwd: options.cwd,
       env: options.env,
       stdio: ["pipe", "pipe", "pipe"],
@@ -3151,38 +4085,38 @@ var MockProvider = class {
 };
 // src/evaluation/providers/pi-coding-agent.ts
-import { spawn as spawn2 } from "node:child_process";
-import { randomUUID as randomUUID2 } from "node:crypto";
-import { createWriteStream as createWriteStream2 } from "node:fs";
-import { mkdir as mkdir2, mkdtemp as mkdtemp2, rm as rm2, writeFile as writeFile2 } from "node:fs/promises";
-import { tmpdir as tmpdir2 } from "node:os";
-import path10 from "node:path";
+import { spawn as spawn3 } from "node:child_process";
+import { randomUUID as randomUUID3 } from "node:crypto";
+import { createWriteStream as createWriteStream3 } from "node:fs";
+import { mkdir as mkdir3, mkdtemp as mkdtemp3, rm as rm3, writeFile as writeFile3 } from "node:fs/promises";
+import { tmpdir as tmpdir3 } from "node:os";
+import path11 from "node:path";
 // src/evaluation/providers/pi-log-tracker.ts
-var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
-var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");
+var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
+var GLOBAL_SUBSCRIBERS_KEY3 = Symbol.for("agentv.piLogSubscribers");
 function getPiLogStore() {
   const globalObject = globalThis;
-  const existing = globalObject[GLOBAL_LOGS_KEY2];
+  const existing = globalObject[GLOBAL_LOGS_KEY3];
   if (existing) {
     return existing;
   }
   const created = [];
-  globalObject[GLOBAL_LOGS_KEY2] = created;
+  globalObject[GLOBAL_LOGS_KEY3] = created;
   return created;
 }
-function getSubscriberStore2() {
+function getSubscriberStore3() {
   const globalObject = globalThis;
-  const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
+  const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY3];
   if (existing) {
     return existing;
   }
   const created = /* @__PURE__ */ new Set();
-  globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
+  globalObject[GLOBAL_SUBSCRIBERS_KEY3] = created;
   return created;
 }
-function notifySubscribers2(entry) {
-  const subscribers = Array.from(getSubscriberStore2());
+function notifySubscribers3(entry) {
+  const subscribers = Array.from(getSubscriberStore3());
   for (const listener of subscribers) {
     try {
       listener(entry);
@@ -3194,7 +4128,7 @@ function notifySubscribers2(entry) {
 }
 function recordPiLogEntry(entry) {
   getPiLogStore().push(entry);
-  notifySubscribers2(entry);
+  notifySubscribers3(entry);
 }
 function consumePiLogEntries() {
   const store = getPiLogStore();
@@ -3204,7 +4138,7 @@ function consumePiLogEntries() {
   return store.splice(0, store.length);
 }
 function subscribeToPiLogEntries(listener) {
-  const store = getSubscriberStore2();
+  const store = getSubscriberStore3();
   store.add(listener);
   return () => {
     store.delete(listener);
@@ -3212,9 +4146,9 @@ function subscribeToPiLogEntries(listener) {
 }
 // src/evaluation/providers/pi-coding-agent.ts
-var WORKSPACE_PREFIX2 = "agentv-pi-";
-var PROMPT_FILENAME2 = "prompt.md";
-var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
+var WORKSPACE_PREFIX3 = "agentv-pi-";
+var PROMPT_FILENAME3 = "prompt.md";
+var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
 - Do NOT create any additional output files in the workspace.
 - All intended file outputs/changes MUST be written in your response.
 - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
@@ -3236,27 +4170,27 @@ var PiCodingAgentProvider = class {
     if (request.signal?.aborted) {
       throw new Error("Pi coding agent request was aborted before execution");
     }
-    const inputFiles = normalizeInputFiles2(request.inputFiles);
+    const inputFiles = normalizeInputFiles(request.inputFiles);
     const workspaceRoot = await this.createWorkspace();
     const logger = await this.createStreamLogger(request).catch(() => void 0);
     try {
-      const promptFile = path10.join(workspaceRoot, PROMPT_FILENAME2);
-      await writeFile2(promptFile, request.question, "utf8");
+      const promptFile = path11.join(workspaceRoot, PROMPT_FILENAME3);
+      await writeFile3(promptFile, request.question, "utf8");
       const args = this.buildPiArgs(request.question, inputFiles);
       const cwd = this.resolveCwd(workspaceRoot);
       const result = await this.executePi(args, cwd, request.signal, logger);
       if (result.timedOut) {
         throw new Error(
-          `Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
+          `Pi coding agent timed out${formatTimeoutSuffix4(this.config.timeoutMs ?? void 0)}`
         );
       }
       if (result.exitCode !== 0) {
-        const detail = pickDetail2(result.stderr, result.stdout);
+        const detail = pickDetail3(result.stderr, result.stdout);
         const prefix = `Pi coding agent exited with code ${result.exitCode}`;
         throw new Error(detail ? `${prefix}: ${detail}` : prefix);
       }
       const parsed = parsePiJsonl(result.stdout);
-      const outputMessages = extractOutputMessages(parsed);
+      const outputMessages = extractOutputMessages2(parsed);
       const assistantText = extractAssistantText2(outputMessages);
       return {
         raw: {
@@ -3282,7 +4216,7 @@ var PiCodingAgentProvider = class {
     if (!this.config.cwd) {
       return workspaceRoot;
     }
-    return path10.resolve(this.config.cwd);
+    return path11.resolve(this.config.cwd);
   }
   buildPiArgs(prompt, inputFiles) {
     const args = [];
@@ -3312,7 +4246,7 @@ var PiCodingAgentProvider = class {
         args.push(`@${file}`);
       }
     }
-    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
+    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT4;
     const fullPrompt = `${systemPrompt}
 ${prompt}`;
@@ -3371,19 +4305,19 @@ ${prompt}`;
     return env;
   }
   async createWorkspace() {
-    return await mkdtemp2(path10.join(tmpdir2(), WORKSPACE_PREFIX2));
+    return await mkdtemp3(path11.join(tmpdir3(), WORKSPACE_PREFIX3));
   }
   async cleanupWorkspace(workspaceRoot) {
     try {
-      await rm2(workspaceRoot, { recursive: true, force: true });
+      await rm3(workspaceRoot, { recursive: true, force: true });
     } catch {
     }
   }
   resolveLogDirectory() {
     if (this.config.logDir) {
-      return path10.resolve(this.config.logDir);
+      return path11.resolve(this.config.logDir);
     }
-    return path10.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
+    return path11.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
   }
   async createStreamLogger(request) {
     const logDir = this.resolveLogDirectory();
@@ -3391,13 +4325,13 @@ ${prompt}`;
       return void 0;
     }
     try {
-      await mkdir2(logDir, { recursive: true });
+      await mkdir3(logDir, { recursive: true });
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
       return void 0;
     }
-    const filePath = path10.join(logDir, buildLogFilename2(request, this.targetName));
+    const filePath = path11.join(logDir, buildLogFilename3(request, this.targetName));
     try {
       const logger = await PiStreamLogger.create({
         filePath,
@@ -3430,7 +4364,7 @@ var PiStreamLogger = class _PiStreamLogger {
   constructor(filePath, format) {
     this.filePath = filePath;
     this.format = format;
-    this.stream = createWriteStream2(filePath, { flags: "a" });
+    this.stream = createWriteStream3(filePath, { flags: "a" });
   }
   static async create(options) {
     const logger = new _PiStreamLogger(options.filePath, options.format);
@@ -3491,7 +4425,7 @@ var PiStreamLogger = class _PiStreamLogger {
       return void 0;
     }
     const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
-    return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
+    return `[+${formatElapsed3(this.startedAt)}] [${source}] ${message}`;
   }
   flushRemainder() {
     const stdoutRemainder = this.stdoutBuffer.trim();
@@ -3514,18 +4448,18 @@ var PiStreamLogger = class _PiStreamLogger {
     this.stderrBuffer = "";
   }
 };
-function buildLogFilename2(request, targetName) {
+function buildLogFilename3(request, targetName) {
   const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
-  const evalId = sanitizeForFilename2(request.evalCaseId ?? "pi");
+  const evalId = sanitizeForFilename3(request.evalCaseId ?? "pi");
   const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
-  const target = sanitizeForFilename2(targetName);
-  return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID2().slice(0, 8)}.log`;
+  const target = sanitizeForFilename3(targetName);
+  return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID3().slice(0, 8)}.log`;
 }
-function sanitizeForFilename2(value) {
+function sanitizeForFilename3(value) {
   const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
   return sanitized.length > 0 ? sanitized : "pi";
 }
-function formatElapsed2(startedAt) {
+function formatElapsed3(startedAt) {
   const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
   const hours = Math.floor(elapsedSeconds / 3600);
   const minutes = Math.floor(elapsedSeconds % 3600 / 60);
@@ -3536,7 +4470,7 @@ function formatElapsed2(startedAt) {
   return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
 }
 function formatPiLogMessage(rawLine, source) {
-  const parsed = tryParseJsonValue2(rawLine);
+  const parsed = tryParseJsonValue3(rawLine);
   if (parsed) {
     const summary = summarizePiEvent(parsed);
     if (summary) {
@@ -3549,7 +4483,7 @@ function formatPiLogMessage(rawLine, source) {
   return rawLine;
 }
 function formatPiJsonLog(rawLine) {
-  const parsed = tryParseJsonValue2(rawLine);
+  const parsed = tryParseJsonValue3(rawLine);
   if (!parsed) {
     return rawLine;
   }
@@ -3599,7 +4533,7 @@ function summarizePiEvent(event) {
       return type;
   }
 }
-function tryParseJsonValue2(rawLine) {
+function tryParseJsonValue3(rawLine) {
   try {
     return JSON.parse(rawLine);
   } catch {
@@ -3624,7 +4558,7 @@ function parsePiJsonl(output) {
   }
   return parsed;
 }
-function extractOutputMessages(events) {
+function extractOutputMessages2(events) {
   for (let i = events.length - 1; i >= 0; i--) {
     const event = events[i];
     if (!event || typeof event !== "object") {
@@ -3665,8 +4599,8 @@ function convertPiMessage(message) {
   if (typeof role !== "string") {
     return void 0;
   }
-  const content = extractTextContent(msg.content);
-  const toolCalls = extractToolCalls(msg.content);
+  const content = extractTextContent2(msg.content);
+  const toolCalls = extractToolCalls2(msg.content);
   const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
   const metadata = {};
   if (msg.api) metadata.api = msg.api;
@@ -3682,7 +4616,7 @@ function convertPiMessage(message) {
     metadata: Object.keys(metadata).length > 0 ? metadata : void 0
   };
 }
-function extractTextContent(content) {
+function extractTextContent2(content) {
   if (typeof content === "string") {
     return content;
   }
@@ -3701,7 +4635,7 @@ function extractTextContent(content) {
   }
   return textParts.length > 0 ? textParts.join("\n") : void 0;
 }
-function extractToolCalls(content) {
+function extractToolCalls2(content) {
   if (!Array.isArray(content)) {
     return [];
   }
@@ -3746,7 +4680,7 @@ function extractAssistantText2(messages) {
 function escapeAtSymbols(prompt) {
   return prompt.replace(/@\[([^\]]+)\]:/g, "[[$1]]:");
 }
-function pickDetail2(stderr, stdout) {
+function pickDetail3(stderr, stdout) {
   const errorText = stderr.trim();
   if (errorText.length > 0) {
     return errorText;
@@ -3754,7 +4688,7 @@ function pickDetail2(stderr, stdout) {
   const stdoutText = stdout.trim();
   return stdoutText.length > 0 ? stdoutText : void 0;
 }
-function formatTimeoutSuffix3(timeoutMs) {
+function formatTimeoutSuffix4(timeoutMs) {
   if (!timeoutMs || timeoutMs <= 0) {
     return "";
   }
@@ -3767,7 +4701,7 @@ async function defaultPiRunner(options) {
     const executable = parts[0];
     const executableArgs = parts.slice(1);
     const allArgs = [...executableArgs, ...options.args];
-    const child = spawn2(executable, allArgs, {
+    const child = spawn3(executable, allArgs, {
       cwd: options.cwd,
       env: options.env,
       stdio: ["pipe", "pipe", "pipe"],
@@ -3830,7 +4764,7 @@ async function defaultPiRunner(options) {
 }
 // src/evaluation/providers/vscode.ts
-import path11 from "node:path";
+import path12 from "node:path";
 import {
   dispatchAgentSession,
   dispatchBatchAgent,
@@ -4005,7 +4939,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
     return "";
   }
   const buildList = (files) => files.map((absolutePath) => {
-    const fileName = path11.basename(absolutePath);
+    const fileName = path12.basename(absolutePath);
     const fileUri = pathToFileUri2(absolutePath);
     return `* [${fileName}](${fileUri})`;
   });
@@ -4030,8 +4964,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = path11.resolve(attachment);
-    const normalized = absolutePath.split(path11.sep).join("/");
+    const absolutePath = path12.resolve(attachment);
+    const normalized = absolutePath.split(path12.sep).join("/");
     if (isGuidelineFile(normalized, guidelinePatterns)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
@@ -4046,7 +4980,7 @@ function collectAttachmentFiles(attachments) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = path11.resolve(attachment);
+    const absolutePath = path12.resolve(attachment);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -4054,7 +4988,7 @@ function collectAttachmentFiles(attachments) {
   return Array.from(unique.values());
 }
 function pathToFileUri2(filePath) {
-  const absolutePath = path11.isAbsolute(filePath) ? filePath : path11.resolve(filePath);
+  const absolutePath = path12.isAbsolute(filePath) ? filePath : path12.resolve(filePath);
   const normalizedPath = absolutePath.replace(/\\/g, "/");
   if (/^[a-zA-Z]:\//.test(normalizedPath)) {
     return `file:///${normalizedPath}`;
@@ -4067,7 +5001,7 @@ function normalizeAttachments(attachments) {
   }
   const deduped = /* @__PURE__ */ new Set();
   for (const attachment of attachments) {
-    deduped.add(path11.resolve(attachment));
+    deduped.add(path12.resolve(attachment));
   }
   return Array.from(deduped);
 }
@@ -4076,7 +5010,7 @@ function mergeAttachments(all) {
   for (const list of all) {
     if (!list) continue;
     for (const inputFile of list) {
-      deduped.add(path11.resolve(inputFile));
+      deduped.add(path12.resolve(inputFile));
     }
   }
   return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -4125,7 +5059,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
 // src/evaluation/providers/targets-file.ts
 import { constants as constants3 } from "node:fs";
 import { access as access3, readFile as readFile6 } from "node:fs/promises";
-import path12 from "node:path";
+import path13 from "node:path";
 import { parse as parse3 } from "yaml";
 function isRecord(value) {
   return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -4162,7 +5096,7 @@ async function fileExists3(filePath) {
   }
 }
 async function readTargetDefinitions(filePath) {
-  const absolutePath = path12.resolve(filePath);
+  const absolutePath = path13.resolve(filePath);
   if (!await fileExists3(absolutePath)) {
     throw new Error(`targets.yaml not found at ${absolutePath}`);
   }
@@ -4196,6 +5130,8 @@ function createProvider(target) {
       return new CodexProvider(target.name, target.config);
     case "pi-coding-agent":
       return new PiCodingAgentProvider(target.name, target.config);
+    case "claude-code":
+      return new ClaudeCodeProvider(target.name, target.config);
     case "mock":
       return new MockProvider(target.name, target.config);
     case "vscode":
@@ -4214,73 +5150,193 @@ function resolveAndCreateProvider(definition, env = process.env) {
 // src/evaluation/evaluators.ts
 import { generateText as generateText2 } from "ai";
-import { z } from "zod";
+import { z as z2 } from "zod";
 // src/runtime/exec.ts
-function getBunSpawn() {
-  const bunSpawn = globalThis.Bun?.spawn;
-  return typeof bunSpawn === "function" ? bunSpawn : void 0;
+function shellEscapePath(value) {
+  if (process.platform === "win32") {
+    return `"${value.replaceAll('"', '""')}"`;
+  }
+  return `'${value.replaceAll("'", `'"'"'`)}'`;
 }
-async function execShellWithStdin(command, stdinPayload, options = {}) {
-  const bunSpawn = getBunSpawn();
-  if (bunSpawn) {
-    const encoder = new TextEncoder();
-    const proc = bunSpawn({
-      cmd: ["sh", "-c", command],
-      cwd: options.cwd,
-      stdin: encoder.encode(stdinPayload),
-      stdout: "pipe",
-      stderr: "pipe"
-    });
-    const timeout = options.timeoutMs ? setTimeout(() => {
-      proc.kill();
-    }, options.timeoutMs) : void 0;
-    try {
-      const stdout = await new Response(proc.stdout).text();
-      const stderr = await new Response(proc.stderr).text();
-      const exitCode = await proc.exited;
-      return { stdout, stderr, exitCode };
-    } finally {
-      if (timeout !== void 0) {
-        clearTimeout(timeout);
-      }
+async function execFileWithStdin(argv, stdinPayload, options = {}) {
+  if (argv.length === 0) {
+    throw new Error("Executable argv must include at least one entry");
+  }
+  if (typeof Bun !== "undefined") {
+    return execFileWithStdinBun(argv, stdinPayload, options);
+  }
+  return execFileWithStdinNode(argv, stdinPayload, options);
+}
+async function execFileWithStdinBun(argv, stdinPayload, options) {
+  const command = [...argv];
+  const encoder = new TextEncoder();
+  const proc = Bun.spawn(command, {
+    cwd: options.cwd,
+    stdin: encoder.encode(stdinPayload),
+    stdout: "pipe",
+    stderr: "pipe"
+  });
+  let timedOut = false;
+  const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
+    timedOut = true;
+    proc.kill("SIGKILL");
+  }, options.timeoutMs) : void 0;
+  try {
+    const stdoutPromise = proc.stdout ? new Response(proc.stdout).text() : Promise.resolve("");
+    const stderrPromise = proc.stderr ? new Response(proc.stderr).text() : Promise.resolve("");
+    const [stdout, stderr, exitCode] = await Promise.all([
+      stdoutPromise,
+      stderrPromise,
+      proc.exited
+    ]);
+    if (timedOut) {
+      throw new Error(`Process timed out after ${options.timeoutMs}ms`);
+    }
+    return {
+      stdout: stdout.replace(/\r\n/g, "\n"),
+      stderr: stderr.replace(/\r\n/g, "\n"),
+      exitCode
+    };
+  } finally {
+    if (timeout !== void 0) {
+      clearTimeout(timeout);
     }
   }
-  const { spawn: spawn3 } = await import("node:child_process");
-  return await new Promise((resolve, reject) => {
-    const child = spawn3(command, {
-      shell: true,
+}
+async function execFileWithStdinNode(argv, stdinPayload, options) {
+  const { spawn: spawn4 } = await import("node:child_process");
+  return new Promise((resolve, reject) => {
+    const [cmd, ...args] = argv;
+    const child = spawn4(cmd, args, {
       cwd: options.cwd,
       stdio: ["pipe", "pipe", "pipe"]
     });
-    let stdout = "";
-    let stderr = "";
-    const timeout = options.timeoutMs ? setTimeout(() => {
-      child.kill();
-      reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
+    const stdoutChunks = [];
+    const stderrChunks = [];
+    child.stdout?.on("data", (chunk) => stdoutChunks.push(chunk));
+    child.stderr?.on("data", (chunk) => stderrChunks.push(chunk));
+    let timedOut = false;
+    const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
+      timedOut = true;
+      child.kill("SIGKILL");
     }, options.timeoutMs) : void 0;
-    child.stdout?.on("data", (data) => {
-      stdout += data.toString();
-    });
-    child.stderr?.on("data", (data) => {
-      stderr += data.toString();
-    });
     child.on("error", (error) => {
-      if (timeout !== void 0) {
-        clearTimeout(timeout);
-      }
+      if (timeout !== void 0) clearTimeout(timeout);
       reject(error);
     });
-    child.on("exit", (code) => {
-      if (timeout !== void 0) {
-        clearTimeout(timeout);
+    child.on("close", (code) => {
+      if (timeout !== void 0) clearTimeout(timeout);
+      if (timedOut) {
+        reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
+        return;
       }
-      resolve({ stdout, stderr, exitCode: code ?? 0 });
+      const stdout = Buffer.concat(stdoutChunks).toString("utf8").replace(/\r\n/g, "\n");
+      const stderr = Buffer.concat(stderrChunks).toString("utf8").replace(/\r\n/g, "\n");
+      resolve({
+        stdout,
+        stderr,
+        exitCode: code ?? 0
+      });
     });
-    child.stdin?.write(stdinPayload);
-    child.stdin?.end();
+    if (child.stdin) {
+      child.stdin.write(stdinPayload);
+      child.stdin.end();
+    }
   });
 }
+async function execShellWithStdin(command, stdinPayload, options = {}) {
+  const { mkdir: mkdir4, readFile: readFile7, rm: rm4, writeFile: writeFile4 } = await import("node:fs/promises");
+  const { tmpdir: tmpdir4 } = await import("node:os");
+  const path15 = await import("node:path");
+  const { randomUUID: randomUUID4 } = await import("node:crypto");
+  const dir = path15.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
+  await mkdir4(dir, { recursive: true });
+  const stdinPath = path15.join(dir, "stdin.txt");
+  const stdoutPath = path15.join(dir, "stdout.txt");
+  const stderrPath = path15.join(dir, "stderr.txt");
+  await writeFile4(stdinPath, stdinPayload, "utf8");
+  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
+  const { spawn: spawn4 } = await import("node:child_process");
+  try {
+    const exitCode = await new Promise((resolve, reject) => {
+      const child = spawn4(wrappedCommand, {
+        shell: true,
+        cwd: options.cwd,
+        stdio: ["ignore", "ignore", "ignore"]
+      });
+      const timeout = options.timeoutMs ? setTimeout(() => {
+        child.kill();
+        reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
+      }, options.timeoutMs) : void 0;
+      child.on("error", (error) => {
+        if (timeout !== void 0) {
+          clearTimeout(timeout);
+        }
+        reject(error);
+      });
+      child.on("exit", (code) => {
+        if (timeout !== void 0) {
+          clearTimeout(timeout);
+        }
+        resolve(code ?? 0);
+      });
+    });
+    const stdout = (await readFile7(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
+    const stderr = (await readFile7(stderrPath, "utf8")).replace(/\r\n/g, "\n");
+    return { stdout, stderr, exitCode };
+  } finally {
+    await rm4(dir, { recursive: true, force: true });
+  }
+}
+// src/evaluation/case-conversion.ts
+function toSnakeCase(str) {
+  if (/^[A-Z]/.test(str)) {
+    return str;
+  }
+  return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
+}
+function toCamelCase(str) {
+  if (/^[A-Z]/.test(str)) {
+    return str;
+  }
+  return str.replace(/_([a-z0-9])/g, (_, letter) => letter.toUpperCase());
+}
+function toSnakeCaseDeep(obj) {
+  if (obj === null || obj === void 0) {
+    return obj;
+  }
+  if (Array.isArray(obj)) {
+    return obj.map((item) => toSnakeCaseDeep(item));
+  }
+  if (typeof obj === "object") {
+    const result = {};
+    for (const [key, value] of Object.entries(obj)) {
+      const snakeKey = toSnakeCase(key);
+      result[snakeKey] = toSnakeCaseDeep(value);
+    }
+    return result;
+  }
+  return obj;
+}
+function toCamelCaseDeep(obj) {
+  if (obj === null || obj === void 0) {
+    return obj;
+  }
+  if (Array.isArray(obj)) {
+    return obj.map((item) => toCamelCaseDeep(item));
+  }
+  if (typeof obj === "object") {
+    const result = {};
+    for (const [key, value] of Object.entries(obj)) {
+      const camelKey = toCamelCase(key);
+      result[camelKey] = toCamelCaseDeep(value);
+    }
+    return result;
+  }
+  return obj;
+}
 // src/evaluation/evaluators.ts
 var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
@@ -4300,20 +5356,20 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
 [[ ## candidate_answer ## ]]
 {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
-var freeformEvaluationSchema = z.object({
-  score: z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
-  hits: z.array(z.string()).describe("Brief specific achievements").optional(),
-  misses: z.array(z.string()).describe("Brief failures or omissions").optional(),
-  reasoning: z.string().describe("Concise explanation (1-2 sentences)").optional()
+var freeformEvaluationSchema = z2.object({
+  score: z2.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
+  hits: z2.array(z2.string()).describe("Brief specific achievements").optional(),
+  misses: z2.array(z2.string()).describe("Brief failures or omissions").optional(),
+  reasoning: z2.string().describe("Concise explanation (1-2 sentences)").optional()
 });
-var rubricCheckResultSchema = z.object({
-  id: z.string().describe("The ID of the rubric item being checked"),
-  satisfied: z.boolean().describe("Whether this rubric requirement is met"),
-  reasoning: z.string().describe("Brief explanation (1-2 sentences) for this check")
+var rubricCheckResultSchema = z2.object({
+  id: z2.string().describe("The ID of the rubric item being checked"),
+  satisfied: z2.boolean().describe("Whether this rubric requirement is met"),
+  reasoning: z2.string().describe("Brief explanation (1-2 sentences) for this check")
 });
-var rubricEvaluationSchema = z.object({
-  checks: z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
-  overall_reasoning: z.string().describe("Overall assessment summary (1-2 sentences)")
+var rubricEvaluationSchema = z2.object({
+  checks: z2.array(rubricCheckResultSchema).describe("Results for each rubric item"),
+  overall_reasoning: z2.string().describe("Overall assessment summary (1-2 sentences)")
 });
 var LlmJudgeEvaluator = class {
   kind = "llm_judge";
@@ -4549,30 +5605,30 @@ var CodeEvaluator = class {
   script;
   cwd;
   agentTimeoutMs;
+  config;
   constructor(options) {
     this.script = options.script;
     this.cwd = options.cwd;
     this.agentTimeoutMs = options.agentTimeoutMs;
+    this.config = options.config;
   }
   async evaluate(context) {
-    const inputPayload = JSON.stringify(
-      {
-        question: context.evalCase.question,
-        expectedOutcome: context.evalCase.expected_outcome,
-        expectedMessages: context.evalCase.expected_messages,
-        referenceAnswer: context.evalCase.reference_answer,
-        candidateAnswer: context.candidate,
-        outputMessages: context.outputMessages ?? null,
-        guidelineFiles: context.evalCase.guideline_paths,
-        inputFiles: context.evalCase.file_paths.filter(
-          (path14) => !context.evalCase.guideline_paths.includes(path14)
-        ),
-        inputMessages: context.evalCase.input_messages,
-        traceSummary: context.traceSummary ?? null
-      },
-      null,
-      2
-    );
+    const payload = {
+      question: context.evalCase.question,
+      expectedOutcome: context.evalCase.expected_outcome,
+      expectedMessages: context.evalCase.expected_messages,
+      referenceAnswer: context.evalCase.reference_answer,
+      candidateAnswer: context.candidate,
+      outputMessages: context.outputMessages ?? null,
+      guidelineFiles: context.evalCase.guideline_paths,
+      inputFiles: context.evalCase.file_paths.filter(
+        (path15) => !context.evalCase.guideline_paths.includes(path15)
+      ),
+      inputMessages: context.evalCase.input_messages,
+      traceSummary: context.traceSummary ?? null,
+      config: this.config ?? null
+    };
+    const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
     try {
       const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
       const parsed = parseJsonSafe(stdout);
@@ -4638,18 +5694,25 @@ function calculateRubricScore(result, rubrics) {
   return { score, verdict, hits, misses };
 }
 async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
-  const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
-    cwd,
-    timeoutMs: agentTimeoutMs
-  });
+  const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
   if (exitCode !== 0) {
-    const trimmedErr = stderr.trim();
+    const trimmedErr = formatStderr(stderr);
     throw new Error(
       trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
     );
   }
   return stdout.trim();
 }
+/**
+ * Trim stderr output and cap it for inclusion in an error message.
+ *
+ * Output of at most 2000 characters (after trimming) is returned as-is.
+ * Longer output is reduced to its last 2000 characters, preceded by a
+ * one-line truncation notice.
+ *
+ * @param {string} stderr - Raw stderr text.
+ * @returns {string} The trimmed, possibly truncated text.
+ */
+function formatStderr(stderr) {
+  const limit = 2e3;
+  const text = stderr.trim();
+  const fits = text.length <= limit;
+  return fits ? text : `...(truncated, last ${limit} chars)\n${text.slice(-limit)}`;
+}
 function parseJsonSafe(payload) {
   try {
     return JSON.parse(payload);
@@ -4881,22 +5944,438 @@ var ToolTrajectoryEvaluator = class {
           misses.push(`Position ${i}: ${expectedTool} args mismatch`);
         }
       } else {
-        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
+        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
+      }
+    }
+    for (let i = checkLength; i < expected.length; i++) {
+      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
+    }
+    const score = hits.length / expected.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: expected.length
+    };
+  }
+};
+// Fallback list of date format names, used by compareDate when a field config
+// supplies no `formats`. In the code visible here, parseDate does not use
+// these strings as parsing templates: it only checks whether any entry
+// contains "MM/DD" / "MM-DD" (US order) or "DD/MM" / "DD-MM" (EU order) to
+// decide how to read numeric day/month strings. This default list contains
+// both orders, so such strings go through parseDate's ambiguity heuristic.
+var DEFAULT_DATE_FORMATS = [
+  "YYYY-MM-DDTHH:mm:ssZ",
+  // ISO with timezone
+  "YYYY-MM-DDTHH:mm:ss",
+  // ISO with time
+  "YYYY-MM-DD",
+  // ISO date
+  "DD-MMM-YYYY",
+  // Localized (e.g., "15-JAN-2025")
+  "MM/DD/YYYY",
+  // US format
+  "DD/MM/YYYY",
+  // EU format
+  "MM-DD-YYYY",
+  // US with dashes
+  "DD-MM-YYYY"
+  // EU with dashes
+];
+// Lookup from lowercase English month names and abbreviations (including the
+// four-letter "sept") to zero-based month indices (jan = 0 ... dec = 11), the
+// numbering expected by `new Date(year, month, day)`. parseDate lowercases
+// the month token before looking it up here.
+var MONTH_NAMES = {
+  jan: 0,
+  january: 0,
+  feb: 1,
+  february: 1,
+  mar: 2,
+  march: 2,
+  apr: 3,
+  april: 3,
+  may: 4,
+  jun: 5,
+  june: 5,
+  jul: 6,
+  july: 6,
+  aug: 7,
+  august: 7,
+  sep: 8,
+  sept: 8,
+  september: 8,
+  oct: 9,
+  october: 9,
+  nov: 10,
+  november: 10,
+  dec: 11,
+  december: 11
+};
+var FieldAccuracyEvaluator = class {
+  kind = "field_accuracy";
+  config;
+  /**
+   * @param {{ config: object }} options - Holds the field_accuracy evaluator config;
+   *   the methods of this class read `config.fields` and `config.aggregation`.
+   */
+  constructor(options) {
+    const { config } = options;
+    this.config = config;
+  }
+  /**
+   * Score the candidate answer field by field against the expected data.
+   *
+   * The candidate text is parsed as JSON and the expected data is taken from
+   * `evalCase.expected_messages`. If either step yields nothing, a failing
+   * result with score 0 is returned; otherwise every configured field is
+   * evaluated and the per-field results are aggregated.
+   *
+   * @param {object} context - Evaluation context; reads `evalCase` and `candidate`.
+   * @returns {object} Evaluation result (score, verdict, hits, misses, ...).
+   */
+  evaluate(context) {
+    const { evalCase, candidate } = context;
+    // Common shape of the two early-exit failures; config is read only when one is built.
+    const failWith = (miss, reasoning) => ({
+      score: 0,
+      verdict: "fail",
+      hits: [],
+      misses: [miss],
+      expectedAspectCount: this.config.fields.length,
+      reasoning
+    });
+    let candidateData;
+    try {
+      candidateData = parseJsonFromTextSafe(candidate);
+    } catch {
+      return failWith(
+        "Failed to parse candidate answer as JSON",
+        "Candidate answer is not valid JSON"
+      );
+    }
+    const expectedData = this.extractExpectedData(evalCase.expected_messages);
+    if (!expectedData) {
+      return failWith(
+        "No expected data found in expected_messages",
+        "Could not extract expected data from expected_messages"
+      );
+    }
+    const fieldResults = [];
+    for (const fieldConfig of this.config.fields) {
+      fieldResults.push(this.evaluateField(fieldConfig, candidateData, expectedData));
+    }
+    return this.aggregateResults(fieldResults);
+  }
+  /**
+   * Find the expected data in an expected_messages array.
+   *
+   * Messages are scanned from last to first. The first assistant message
+   * with truthy content wins when that content is an object (returned as-is)
+   * or a string that parses as JSON. String content that fails to parse is
+   * skipped and the scan continues with earlier messages.
+   *
+   * @param {Array<object>} expectedMessages - Messages with `role` and `content`.
+   * @returns {unknown} The expected data, or undefined when none is found.
+   */
+  extractExpectedData(expectedMessages) {
+    let index = expectedMessages.length;
+    while (index-- > 0) {
+      const { role, content } = expectedMessages[index];
+      if (role !== "assistant" || !content) {
+        continue;
+      }
+      if (typeof content === "object") {
+        // content is truthy here, so it cannot be null.
+        return content;
+      }
+      if (typeof content !== "string") {
+        continue;
+      }
+      try {
+        return parseJsonFromTextSafe(content);
+      } catch {
+        // Not JSON: fall through and keep looking at earlier messages.
+      }
+    }
+    return void 0;
+  }
+  /**
+   * Evaluate one configured field against its expected value.
+   *
+   * Both values are looked up with the field's path. A field with no expected
+   * value counts as a hit. A missing candidate value is a miss when the field
+   * is required (the default), and otherwise a zero-weight hit. Any other
+   * field is delegated to the comparator named by `match`.
+   *
+   * @param {object} fieldConfig - Field settings: `path`, `match`, `required`, `weight`.
+   * @param {unknown} candidateData - Parsed candidate answer.
+   * @param {unknown} expectedData - Parsed expected data.
+   * @returns {{path: string, score: number, weight: number, hit: boolean, message: string}}
+   */
+  evaluateField(fieldConfig, candidateData, expectedData) {
+    const { path: fieldPath, match, required = true, weight = 1 } = fieldConfig;
+    const candidateValue = resolvePath(candidateData, fieldPath);
+    const expectedValue = resolvePath(expectedData, fieldPath);
+    if (expectedValue === void 0) {
+      // No expected value means no comparison is needed.
+      return {
+        path: fieldPath,
+        score: 1,
+        weight,
+        hit: true,
+        message: `${fieldPath}: no expected value`
+      };
+    }
+    if (candidateValue === void 0) {
+      return required ? {
+        path: fieldPath,
+        score: 0,
+        weight,
+        hit: false,
+        message: `${fieldPath} (required, missing)`
+      } : {
+        path: fieldPath,
+        // A missing optional field is not penalized ...
+        score: 1,
+        // ... and its zero weight keeps it out of the weighted score.
+        weight: 0,
+        hit: true,
+        message: `${fieldPath}: optional field missing`
+      };
+    }
+    if (match === "exact") {
+      return this.compareExact(fieldPath, candidateValue, expectedValue, weight);
+    }
+    if (match === "numeric_tolerance") {
+      return this.compareNumericTolerance(
+        fieldPath,
+        candidateValue,
+        expectedValue,
+        fieldConfig,
+        weight
+      );
+    }
+    if (match === "date") {
+      return this.compareDate(fieldPath, candidateValue, expectedValue, fieldConfig, weight);
+    }
+    return {
+      path: fieldPath,
+      score: 0,
+      weight,
+      hit: false,
+      message: `${fieldPath}: unknown match type "${match}"`
+    };
+  }
+  /**
+   * Compare two values for deep equality.
+   *
+   * On a mismatch the message says whether the `typeof` of the two values
+   * differs (type mismatch) or only their contents (value mismatch).
+   *
+   * @param {string} path15 - Field path, used in the result and its message.
+   * @param {unknown} candidateValue - Value found in the candidate answer.
+   * @param {unknown} expectedValue - Value found in the expected data.
+   * @param {number} weight - Weight carried into the result.
+   * @returns {{path: string, score: number, weight: number, hit: boolean, message: string}}
+   */
+  compareExact(path15, candidateValue, expectedValue, weight) {
+    const matched = deepEqual(candidateValue, expectedValue);
+    if (matched) {
+      return { path: path15, score: 1, weight, hit: true, message: path15 };
+    }
+    const candidateType = typeof candidateValue;
+    const expectedType = typeof expectedValue;
+    const detail = candidateType !== expectedType ? `type mismatch: got ${candidateType}, expected ${expectedType}` : "value mismatch";
+    return {
+      path: path15,
+      score: 0,
+      weight,
+      hit: false,
+      message: `${path15} (${detail})`
+    };
+  }
+  /**
+   * Compare two values numerically with an absolute or relative tolerance.
+   *
+   * Both values are converted with toNumber. Non-numeric or non-finite values
+   * are misses. In relative mode the difference is divided by |expected|,
+   * except when expected is 0, where the absolute difference is used.
+   *
+   * @param {string} path15 - Field path, used in the result and its message.
+   * @param {unknown} candidateValue - Value found in the candidate answer.
+   * @param {unknown} expectedValue - Value found in the expected data.
+   * @param {object} fieldConfig - Reads `tolerance` (default 0) and `relative` (default false).
+   * @param {number} weight - Weight carried into the result.
+   * @returns {{path: string, score: number, weight: number, hit: boolean, message: string}}
+   */
+  compareNumericTolerance(path15, candidateValue, expectedValue, fieldConfig, weight) {
+    const { tolerance = 0, relative = false } = fieldConfig;
+    const miss = (detail) => ({
+      path: path15,
+      score: 0,
+      weight,
+      hit: false,
+      message: `${path15} (${detail})`
+    });
+    const candidateNum = toNumber(candidateValue);
+    const expectedNum = toNumber(expectedValue);
+    if (candidateNum === null || expectedNum === null) {
+      return miss("non-numeric value");
+    }
+    if (!(Number.isFinite(candidateNum) && Number.isFinite(expectedNum))) {
+      return miss("invalid numeric value");
+    }
+    const diff = Math.abs(candidateNum - expectedNum);
+    // Relative mode cannot divide by a zero expectation, so it falls back to the raw diff.
+    const measured = relative && expectedNum !== 0 ? diff / Math.abs(expectedNum) : diff;
+    const withinTolerance = measured <= tolerance;
+    if (!withinTolerance) {
+      return miss(`outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance}`);
+    }
+    return {
+      path: path15,
+      score: 1,
+      weight,
+      hit: true,
+      message: `${path15} (within tolerance: diff=${diff.toFixed(2)})`
+    };
+  }
+  /**
+   * Compare two values as calendar dates.
+   *
+   * Both values are stringified and parsed with parseDate, using the field's
+   * `formats` or DEFAULT_DATE_FORMATS. Two dates match when their local year,
+   * month and day are equal; any time-of-day component is ignored.
+   *
+   * @param {string} path15 - Field path, used in the result and its message.
+   * @param {unknown} candidateValue - Value found in the candidate answer.
+   * @param {unknown} expectedValue - Value found in the expected data.
+   * @param {object} fieldConfig - Reads the optional `formats` list.
+   * @param {number} weight - Weight carried into the result.
+   * @returns {{path: string, score: number, weight: number, hit: boolean, message: string}}
+   */
+  compareDate(path15, candidateValue, expectedValue, fieldConfig, weight) {
+    const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
+    const miss = (detail) => ({
+      path: path15,
+      score: 0,
+      weight,
+      hit: false,
+      message: `${path15} (${detail})`
+    });
+    const candidateDate = parseDate(String(candidateValue), formats);
+    const expectedDate = parseDate(String(expectedValue), formats);
+    if (candidateDate === null) {
+      return miss("unparseable candidate date");
+    }
+    if (expectedDate === null) {
+      return miss("unparseable expected date");
+    }
+    const sameYear = candidateDate.getFullYear() === expectedDate.getFullYear();
+    const sameMonth = candidateDate.getMonth() === expectedDate.getMonth();
+    const sameDay = candidateDate.getDate() === expectedDate.getDate();
+    if (!(sameYear && sameMonth && sameDay)) {
+      return miss(
+        `date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)}`
+      );
+    }
+    return {
+      path: path15,
+      score: 1,
+      weight,
+      hit: true,
+      message: path15
+    };
+  }
+  /**
+   * Aggregate field results using configured strategy.
+   */
+  aggregateResults(results) {
+    const aggregation = this.config.aggregation ?? "weighted_average";
+    const hits = [];
+    const misses = [];
+    for (const result of results) {
+      if (result.hit) {
+        hits.push(result.message);
+      } else {
+        misses.push(result.message);
       }
     }
-    for (let i = checkLength; i < expected.length; i++) {
-      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
+    let score;
+    if (aggregation === "all_or_nothing") {
+      score = misses.length === 0 ? 1 : 0;
+    } else {
+      const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
+      if (totalWeight === 0) {
+        score = results.length === 0 ? 1 : 0;
+      } else {
+        const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
+        score = weightedSum / totalWeight;
+      }
     }
-    const score = hits.length / expected.length;
+    const reasoning = `${hits.length}/${results.length} fields matched`;
     return {
-      score,
+      score: clampScore(score),
       verdict: scoreToVerdict(score),
-      hits,
-      misses,
-      expectedAspectCount: expected.length
+      hits: hits.slice(0, 4),
+      misses: misses.slice(0, 4),
+      expectedAspectCount: results.length,
+      reasoning
     };
   }
 };
+/**
+ * Look up a nested value by a dotted / bracketed path such as "a.b[0].c".
+ *
+ * The path is split on ".", "[" and "]", with empty segments dropped. An
+ * all-digit segment is used as a numeric index when the current value is an
+ * array and as a plain property key otherwise.
+ *
+ * @param {unknown} obj - Root value to walk.
+ * @param {string} path15 - Path to resolve.
+ * @returns {unknown} The value at the path, or undefined when the path or root
+ *   is falsy or the walk reaches a non-object before the path is consumed.
+ */
+function resolvePath(obj, path15) {
+  if (!path15 || !obj) {
+    return void 0;
+  }
+  const segments = path15.split(/\.|\[|\]/).filter((segment) => segment.length > 0);
+  let cursor = obj;
+  for (const segment of segments) {
+    // null, undefined and primitives cannot be descended into.
+    if (cursor === null || typeof cursor !== "object") {
+      return void 0;
+    }
+    const useIndex = Array.isArray(cursor) && /^\d+$/.test(segment);
+    const key = useIndex ? Number.parseInt(segment, 10) : segment;
+    cursor = cursor[key];
+  }
+  return cursor;
+}
+/**
+ * Convert a value to a number for tolerance comparison.
+ *
+ * Numbers are returned as-is (including NaN and Infinity; callers check
+ * finiteness). Strings must be numeric in their entirety after trimming:
+ * unlike Number.parseFloat, which accepts a numeric prefix, this rejects
+ * inputs such as "12abc" or "1,000" instead of silently reading them as 12
+ * or 1. Empty and whitespace-only strings are rejected as well.
+ *
+ * @param {unknown} value - Value to convert.
+ * @returns {number | null} The numeric value, or null when it is not numeric.
+ */
+function toNumber(value) {
+  if (typeof value === "number") {
+    return value;
+  }
+  if (typeof value !== "string") {
+    return null;
+  }
+  const trimmed = value.trim();
+  if (trimmed.length === 0) {
+    // Number("") is 0, which would wrongly treat an empty field as numeric.
+    return null;
+  }
+  const num = Number(trimmed);
+  return Number.isNaN(num) ? null : num;
+}
+/**
+ * Parse a date string into a Date.
+ *
+ * Recognized shapes, tried in this order:
+ *   1. "YYYY-MM-DD"            - built as a local-time date.
+ *   2. "D-MMM-YYYY"            - English month name looked up in MONTH_NAMES.
+ *   3. "N/N/YYYY" / "N-N-YYYY" - day/month order decided by `formats`.
+ *   4. anything else           - handed to the Date constructor.
+ *
+ * The explicit shapes are checked before the Date constructor so that the
+ * configured `formats` are honored: the constructor reads "05/04/2025" as
+ * month/day on its own, which would otherwise bypass an EU-only config.
+ * Shapes 1-3 produce local-time dates so that callers comparing local
+ * year/month/day see the same calendar day for every shape, and they reject
+ * impossible dates (e.g. 31 February) instead of letting them roll over.
+ *
+ * `formats` is only inspected for the substrings "MM/DD" / "MM-DD" (US order)
+ * and "DD/MM" / "DD-MM" (EU order). When exactly one order is configured it
+ * is applied strictly. Otherwise the string is read as day/month only when
+ * the first number exceeds 12 and the second does not, and as month/day in
+ * all other cases.
+ *
+ * @param {string} dateStr - Text to parse.
+ * @param {string[]} formats - Format names used to disambiguate day/month order.
+ * @returns {Date | null} The parsed date, or null when the text is not a valid date.
+ */
+function parseDate(dateStr, formats) {
+  if (!dateStr) return null;
+  const trimmed = dateStr.trim();
+  // Build a local-time date and reject calendar overflow: the Date
+  // constructor would silently turn 31 February into a day in March.
+  const buildLocalDate = (year, monthIndex, day) => {
+    const date = new Date(year, monthIndex, day);
+    const isSameDay = date.getFullYear() === year && date.getMonth() === monthIndex && date.getDate() === day;
+    return isSameDay ? date : null;
+  };
+  const isoDateMatch = trimmed.match(/^(\d{4})-(\d{2})-(\d{2})$/);
+  if (isoDateMatch) {
+    // new Date("YYYY-MM-DD") would be UTC midnight, which is the previous
+    // local day in timezones behind UTC.
+    return buildLocalDate(
+      Number.parseInt(isoDateMatch[1], 10),
+      Number.parseInt(isoDateMatch[2], 10) - 1,
+      Number.parseInt(isoDateMatch[3], 10)
+    );
+  }
+  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
+  if (localizedMatch) {
+    const day = Number.parseInt(localizedMatch[1], 10);
+    const month = MONTH_NAMES[localizedMatch[2].toLowerCase()];
+    const year = Number.parseInt(localizedMatch[3], 10);
+    if (month !== void 0) {
+      return buildLocalDate(year, month, day);
+    }
+  }
+  const numericMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
+  if (numericMatch) {
+    const first = Number.parseInt(numericMatch[1], 10);
+    const second = Number.parseInt(numericMatch[2], 10);
+    const year = Number.parseInt(numericMatch[3], 10);
+    const hasUSFormat = formats.some((f) => f.includes("MM/DD") || f.includes("MM-DD"));
+    const hasEUFormat = formats.some((f) => f.includes("DD/MM") || f.includes("DD-MM"));
+    if (hasUSFormat && !hasEUFormat) {
+      return buildLocalDate(year, first - 1, second);
+    }
+    if (hasEUFormat && !hasUSFormat) {
+      return buildLocalDate(year, second - 1, first);
+    }
+    // Both or neither order configured: only a first number above 12 proves day/month.
+    if (first > 12 && second <= 12) {
+      return buildLocalDate(year, second - 1, first);
+    }
+    return buildLocalDate(year, first - 1, second);
+  }
+  // Everything else (e.g. full ISO timestamps) is left to the Date constructor.
+  const fallback = new Date(trimmed);
+  return Number.isNaN(fallback.getTime()) ? null : fallback;
+}
+/**
+ * Format a Date as "YYYY-MM-DD" using its local calendar fields.
+ *
+ * Local fields are used rather than toISOString() (which is UTC) because the
+ * dates in this module are built and compared with local year/month/day; a
+ * UTC rendering can show the neighbouring day depending on the timezone.
+ *
+ * @param {Date} date - Date to format.
+ * @returns {string} The local calendar day as "YYYY-MM-DD".
+ */
+function formatDateISO(date) {
+  const year = String(date.getFullYear()).padStart(4, "0");
+  const month = String(date.getMonth() + 1).padStart(2, "0");
+  const day = String(date.getDate()).padStart(2, "0");
+  return `${year}-${month}-${day}`;
+}
+/**
+ * Parse JSON embedded in free text.
+ *
+ * Markdown code-fence markers are stripped first. If the remaining text
+ * contains a "{ ... }" span (first "{" through last "}"), that span is
+ * parsed; otherwise the whole cleaned text is parsed. Non-string input is
+ * treated as empty text.
+ *
+ * @param {unknown} text - Text that may contain a JSON object.
+ * @returns {unknown} The parsed JSON value.
+ * @throws {SyntaxError} When the selected text is not valid JSON.
+ */
+function parseJsonFromTextSafe(text) {
+  let cleaned = "";
+  if (typeof text === "string") {
+    cleaned = text.replace(/```json\n?|```/g, "").trim();
+  }
+  const objectSpan = cleaned.match(/\{[\s\S]*\}/);
+  const blob = objectSpan === null ? cleaned : objectSpan[0];
+  return JSON.parse(blob);
+}
 var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
 {{EVALUATOR_RESULTS_JSON}}
@@ -5121,11 +6600,175 @@ var CompositeEvaluator = class {
     }
   }
 };
+/**
+ * Pass/fail evaluator that checks the execution duration against a threshold.
+ *
+ * Reads `config.threshold` and `context.traceSummary.durationMs`. The result
+ * passes (score 1) when the duration is at most the threshold and fails
+ * (score 0) otherwise, or when no duration is present in the trace summary.
+ */
+var LatencyEvaluator = class {
+  kind = "latency";
+  config;
+  constructor(options) {
+    this.config = options.config;
+  }
+  /**
+   * @param {object} context - Evaluation context; reads `traceSummary?.durationMs`.
+   * @returns {object} Evaluation result including the raw threshold/duration pair.
+   */
+  evaluate(context) {
+    const { threshold } = this.config;
+    const durationMs = context.traceSummary?.durationMs;
+    const rawRequest = (reportedMs) => ({
+      type: "latency",
+      threshold,
+      durationMs: reportedMs
+    });
+    if (durationMs === void 0) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No duration data available in trace"],
+        expectedAspectCount: 1,
+        reasoning: "Execution duration not reported by provider",
+        evaluatorRawRequest: rawRequest(null)
+      };
+    }
+    const hits = [];
+    const misses = [];
+    const passed = durationMs <= threshold;
+    if (passed) {
+      hits.push(`Duration ${durationMs}ms <= ${threshold}ms threshold`);
+    } else {
+      misses.push(`Duration ${durationMs}ms > ${threshold}ms threshold`);
+    }
+    return {
+      score: passed ? 1 : 0,
+      verdict: passed ? "pass" : "fail",
+      hits,
+      misses,
+      expectedAspectCount: 1,
+      reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
+      evaluatorRawRequest: rawRequest(durationMs)
+    };
+  }
+};
+/**
+ * Pass/fail evaluator that checks the execution cost against a budget.
+ *
+ * Reads `config.budget` and `context.traceSummary.costUsd`. The result passes
+ * (score 1) when the cost is at most the budget and fails (score 0)
+ * otherwise, or when no cost is present in the trace summary. Amounts in
+ * messages are rendered with a "$" prefix and four decimals.
+ */
+var CostEvaluator = class {
+  kind = "cost";
+  config;
+  constructor(options) {
+    this.config = options.config;
+  }
+  /**
+   * @param {object} context - Evaluation context; reads `traceSummary?.costUsd`.
+   * @returns {object} Evaluation result including the raw budget/cost pair.
+   */
+  evaluate(context) {
+    const { budget } = this.config;
+    const costUsd = context.traceSummary?.costUsd;
+    const rawRequest = (reportedCost) => ({
+      type: "cost",
+      budget,
+      costUsd: reportedCost
+    });
+    if (costUsd === void 0) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No cost data available in trace"],
+        expectedAspectCount: 1,
+        reasoning: "Execution cost not reported by provider",
+        evaluatorRawRequest: rawRequest(null)
+      };
+    }
+    const passed = costUsd <= budget;
+    const formatCost = (n) => `$${n.toFixed(4)}`;
+    const hits = [];
+    const misses = [];
+    if (passed) {
+      hits.push(`Cost ${formatCost(costUsd)} <= ${formatCost(budget)} budget`);
+    } else {
+      misses.push(`Cost ${formatCost(costUsd)} > ${formatCost(budget)} budget`);
+    }
+    return {
+      score: passed ? 1 : 0,
+      verdict: passed ? "pass" : "fail",
+      hits,
+      misses,
+      expectedAspectCount: 1,
+      reasoning: `Execution cost ${formatCost(costUsd)} (budget: ${formatCost(budget)})`,
+      evaluatorRawRequest: rawRequest(costUsd)
+    };
+  }
+};
+/**
+ * Pass/fail evaluator that checks token usage against optional limits.
+ *
+ * Reads `config.max_input`, `config.max_output` and `config.max_total`; only
+ * limits that are numbers are checked. The total is input + output + cached
+ * (cached defaults to 0). The result passes only when no configured limit is
+ * exceeded, and fails when the trace summary carries no token usage.
+ */
+var TokenUsageEvaluator = class {
+  kind = "token_usage";
+  config;
+  constructor(options) {
+    this.config = options.config;
+  }
+  /**
+   * @param {object} context - Evaluation context; reads `traceSummary?.tokenUsage`.
+   * @returns {object} Evaluation result including the raw limits and usage.
+   */
+  evaluate(context) {
+    const usage = context.traceSummary?.tokenUsage;
+    const { max_total: maxTotal, max_input: maxInput, max_output: maxOutput } = this.config;
+    const isLimit = (v) => typeof v === "number";
+    const configuredLimits = [maxTotal, maxInput, maxOutput].filter(isLimit).length;
+    // At least one aspect is always reported, even when no limit is configured.
+    const expectedAspectCount = Math.max(configuredLimits, 1);
+    const rawRequest = (tokenUsage) => ({
+      type: "token_usage",
+      max_total: maxTotal ?? null,
+      max_input: maxInput ?? null,
+      max_output: maxOutput ?? null,
+      tokenUsage
+    });
+    if (!usage) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No token usage data available in trace"],
+        expectedAspectCount,
+        reasoning: "Token usage not reported by provider",
+        evaluatorRawRequest: rawRequest(null)
+      };
+    }
+    const { input, output } = usage;
+    const cached = usage.cached ?? 0;
+    const total = input + output + cached;
+    const hits = [];
+    const misses = [];
+    // Checked in a fixed order so hits and misses are always listed the same way.
+    const checks = [
+      ["Input", input, maxInput],
+      ["Output", output, maxOutput],
+      ["Total", total, maxTotal]
+    ];
+    for (const [label, actual, limit] of checks) {
+      if (!isLimit(limit)) {
+        continue;
+      }
+      if (actual <= limit) {
+        hits.push(`${label} tokens ${actual} <= ${limit}`);
+      } else {
+        misses.push(`${label} tokens ${actual} > ${limit}`);
+      }
+    }
+    const passed = misses.length === 0;
+    return {
+      score: passed ? 1 : 0,
+      verdict: passed ? "pass" : "fail",
+      hits,
+      misses,
+      expectedAspectCount,
+      reasoning: `token_usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
+      evaluatorRawRequest: rawRequest({
+        input,
+        output,
+        cached,
+        total
+      })
+    };
+  }
+};
 // src/evaluation/orchestrator.ts
-import { createHash, randomUUID as randomUUID3 } from "node:crypto";
-import { mkdir as mkdir3, writeFile as writeFile3 } from "node:fs/promises";
-import path13 from "node:path";
+import { createHash } from "node:crypto";
+import path14 from "node:path";
 // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
 var Node = class {
@@ -5267,6 +6910,9 @@ function validateConcurrency(concurrency) {
 }
 // src/evaluation/orchestrator.ts
+/**
+ * Decide whether prompts for this provider use the "agent" formatting mode.
+ *
+ * True for providers accepted by isAgentProvider and for providers whose
+ * `kind` is "cli"; callers map a truthy result to "agent" and anything else
+ * to "lm".
+ *
+ * @param {object} provider - Provider instance; reads `kind`.
+ * @returns {boolean} Whether the provider uses the "agent" formatting mode.
+ */
+function usesFileReferencePrompt(provider) {
+  const agentLike = isAgentProvider(provider);
+  if (agentLike) {
+    return agentLike;
+  }
+  return provider.kind === "cli";
+}
 async function runEvaluation(options) {
   const {
     testFilePath: evalFilePath,
@@ -5278,7 +6924,6 @@ async function runEvaluation(options) {
     evaluators,
     maxRetries,
     agentTimeoutMs,
-    promptDumpDir,
     cache,
     useCache,
     now,
@@ -5358,7 +7003,6 @@ async function runEvaluation(options) {
         provider: primaryProvider,
         target,
         evaluatorRegistry,
-        promptDumpDir,
         nowFn: now ?? (() => /* @__PURE__ */ new Date()),
         onProgress,
         onResult,
@@ -5400,7 +7044,6 @@ async function runEvaluation(options) {
           evaluators: evaluatorRegistry,
           maxRetries,
           agentTimeoutMs,
-          promptDumpDir,
           cache,
           useCache,
           now,
@@ -5443,7 +7086,8 @@ async function runEvaluation(options) {
       results.push(outcome.value);
     } else {
       const evalCase = filteredEvalCases[i];
-      const promptInputs = await buildPromptInputs(evalCase);
+      const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
+      const promptInputs = await buildPromptInputs(evalCase, formattingMode);
       const errorResult = buildErrorResult(
         evalCase,
         target.name,
@@ -5466,7 +7110,6 @@ async function runBatchEvaluation(options) {
     provider,
     target,
     evaluatorRegistry,
-    promptDumpDir,
     nowFn,
     onProgress,
     onResult,
@@ -5474,12 +7117,9 @@ async function runBatchEvaluation(options) {
     agentTimeoutMs
   } = options;
   const promptInputsList = [];
-  const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
+  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
   for (const evalCase of evalCases) {
     const promptInputs = await buildPromptInputs(evalCase, formattingMode);
-    if (promptDumpDir) {
-      await dumpPrompt(promptDumpDir, evalCase, promptInputs);
-    }
     promptInputsList.push(promptInputs);
   }
   const batchRequests = evalCases.map((evalCase, index) => {
@@ -5521,13 +7161,20 @@ async function runBatchEvaluation(options) {
     const promptInputs = promptInputsList[i];
     const providerResponse = batchResponse[i];
     const outputMessages = providerResponse.outputMessages;
-    const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
+    const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
+    const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
+      eventCount: 0,
+      toolNames: [],
+      toolCallsByName: {},
+      errorCount: 0
+    } : void 0;
     const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
       tokenUsage: providerResponse.tokenUsage,
       costUsd: providerResponse.costUsd,
       durationMs: providerResponse.durationMs
     }) : void 0;
     const candidate = extractLastAssistantContent(outputMessages);
+    const providerError = extractProviderError(providerResponse);
     let result;
     try {
       result = await evaluateCandidate({
@@ -5544,6 +7191,9 @@ async function runBatchEvaluation(options) {
         outputMessages,
         traceSummary
       });
+      if (providerError) {
+        result = { ...result, error: providerError };
+      }
     } catch (error) {
       const errorResult = buildErrorResult(
         evalCase,
@@ -5576,9 +7226,10 @@ async function runBatchEvaluation(options) {
       await onProgress({
         workerId: 1,
         evalId: evalCase.id,
-        status: "completed",
+        status: result.error ? "failed" : "completed",
         startedAt: 0,
-        completedAt: Date.now()
+        completedAt: Date.now(),
+        error: result.error
       });
     }
   }
@@ -5593,17 +7244,13 @@ async function runEvalCase(options) {
     now,
     maxRetries,
     agentTimeoutMs,
-    promptDumpDir,
     cache,
     useCache,
     signal,
     judgeProvider
   } = options;
-  const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
+  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
   const promptInputs = await buildPromptInputs(evalCase, formattingMode);
-  if (promptDumpDir) {
-    await dumpPrompt(promptDumpDir, evalCase, promptInputs);
-  }
   const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
   let cachedResponse;
   if (cacheKey && cache) {
@@ -5647,15 +7294,22 @@ async function runEvalCase(options) {
     await cache.set(cacheKey, providerResponse);
   }
   const outputMessages = providerResponse.outputMessages;
-  const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
+  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
+  const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
+    eventCount: 0,
+    toolNames: [],
+    toolCallsByName: {},
+    errorCount: 0
+  } : void 0;
   const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
     tokenUsage: providerResponse.tokenUsage,
     costUsd: providerResponse.costUsd,
     durationMs: providerResponse.durationMs
   }) : void 0;
   const candidate = extractLastAssistantContent(outputMessages);
+  const providerError = extractProviderError(providerResponse);
   try {
-    return await evaluateCandidate({
+    const result = await evaluateCandidate({
       evalCase,
       candidate,
       target,
@@ -5669,6 +7323,7 @@ async function runEvalCase(options) {
       outputMessages,
       traceSummary
     });
+    return providerError ? { ...result, error: providerError } : result;
   } catch (error) {
     return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
   }
@@ -5734,7 +7389,6 @@ async function evaluateCandidate(options) {
     candidateAnswer: candidate,
     target: target.name,
     reasoning: score.reasoning,
-    rawAspects: score.rawAspects,
     agentProviderRequest,
     lmProviderRequest,
     evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
@@ -5844,7 +7498,8 @@ async function runEvaluatorList(options) {
         const codeEvaluator = new CodeEvaluator({
           script: evaluator.script,
           cwd: evaluator.resolvedCwd ?? evaluator.cwd,
-          agentTimeoutMs
+          agentTimeoutMs,
+          config: evaluator.config
         });
         const score2 = await codeEvaluator.evaluate({
           evalCase,
@@ -5872,7 +7527,7 @@ async function runEvaluatorList(options) {
         });
       }
       if (evaluator.type === "composite") {
-        const evalFileDir = evalCase.guideline_paths[0] ? path13.dirname(evalCase.guideline_paths[0]) : process.cwd();
+        const evalFileDir = evalCase.guideline_paths[0] ? path14.dirname(evalCase.guideline_paths[0]) : process.cwd();
         const createEvaluator = (memberConfig) => {
           switch (memberConfig.type) {
             case "llm_judge":
@@ -5881,7 +7536,8 @@ async function runEvaluatorList(options) {
               return new CodeEvaluator({
                 script: memberConfig.script,
                 cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
-                agentTimeoutMs
+                agentTimeoutMs,
+                config: memberConfig.config
               });
             case "composite":
               return new CompositeEvaluator({
@@ -5893,6 +7549,22 @@ async function runEvaluatorList(options) {
               return new ToolTrajectoryEvaluator({
                 config: memberConfig
               });
+            case "field_accuracy":
+              return new FieldAccuracyEvaluator({
+                config: memberConfig
+              });
+            case "latency":
+              return new LatencyEvaluator({
+                config: memberConfig
+              });
+            case "cost":
+              return new CostEvaluator({
+                config: memberConfig
+              });
+            case "token_usage":
+              return new TokenUsageEvaluator({
+                config: memberConfig
+              });
             default: {
               const unknownConfig = memberConfig;
               throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -5912,7 +7584,9 @@ async function runEvaluatorList(options) {
           attempt,
           promptInputs,
           now,
-          judgeProvider
+          judgeProvider,
+          outputMessages,
+          traceSummary
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -5957,6 +7631,118 @@ async function runEvaluatorList(options) {
           reasoning: score2.reasoning
         });
       }
+      if (evaluator.type === "field_accuracy") {
+        const fieldAccuracyEvaluator = new FieldAccuracyEvaluator({
+          config: evaluator
+        });
+        const score2 = fieldAccuracyEvaluator.evaluate({
+          evalCase,
+          candidate,
+          target,
+          provider,
+          attempt,
+          promptInputs,
+          now,
+          outputMessages,
+          traceSummary
+        });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
+        evaluatorResults.push({
+          name: evaluator.name,
+          type: evaluator.type,
+          score: score2.score,
+          weight,
+          verdict: score2.verdict,
+          hits: score2.hits,
+          misses: score2.misses,
+          reasoning: score2.reasoning
+        });
+      }
+      if (evaluator.type === "latency") {
+        const latencyEvaluator = new LatencyEvaluator({
+          config: evaluator
+        });
+        const score2 = latencyEvaluator.evaluate({
+          evalCase,
+          candidate,
+          target,
+          provider,
+          attempt,
+          promptInputs,
+          now,
+          outputMessages,
+          traceSummary
+        });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
+        evaluatorResults.push({
+          name: evaluator.name,
+          type: evaluator.type,
+          score: score2.score,
+          weight,
+          verdict: score2.verdict,
+          hits: score2.hits,
+          misses: score2.misses,
+          reasoning: score2.reasoning
+        });
+      }
+      if (evaluator.type === "cost") {
+        const costEvaluator = new CostEvaluator({
+          config: evaluator
+        });
+        const score2 = costEvaluator.evaluate({
+          evalCase,
+          candidate,
+          target,
+          provider,
+          attempt,
+          promptInputs,
+          now,
+          outputMessages,
+          traceSummary
+        });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
+        evaluatorResults.push({
+          name: evaluator.name,
+          type: evaluator.type,
+          score: score2.score,
+          weight,
+          verdict: score2.verdict,
+          hits: score2.hits,
+          misses: score2.misses,
+          reasoning: score2.reasoning
+        });
+      }
+      if (evaluator.type === "token_usage") {
+        const tokenUsageEvaluator = new TokenUsageEvaluator({
+          config: evaluator
+        });
+        const score2 = tokenUsageEvaluator.evaluate({
+          evalCase,
+          candidate,
+          target,
+          provider,
+          attempt,
+          promptInputs,
+          now,
+          outputMessages,
+          traceSummary
+        });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
+        evaluatorResults.push({
+          name: evaluator.name,
+          type: evaluator.type,
+          score: score2.score,
+          weight,
+          verdict: score2.verdict,
+          hits: score2.hits,
+          misses: score2.misses,
+          reasoning: score2.reasoning
+        });
+      }
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       const fallbackScore = {
@@ -5996,7 +7782,6 @@ async function runEvaluatorList(options) {
     (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
     0
   );
-  const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
   const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
   const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
   const score = {
@@ -6005,8 +7790,7 @@ async function runEvaluatorList(options) {
     hits,
     misses,
     expectedAspectCount,
-    reasoning,
-    rawAspects: rawAspects.length > 0 ? rawAspects : void 0
+    reasoning
   };
   return { score, evaluatorResults };
 }
@@ -6081,26 +7865,6 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
     llm_judge: llmJudge
   };
 }
-async function dumpPrompt(directory, evalCase, promptInputs) {
-  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
-  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
-  const filePath = path13.resolve(directory, filename);
-  await mkdir3(path13.dirname(filePath), { recursive: true });
-  const payload = {
-    eval_id: evalCase.id,
-    question: promptInputs.question,
-    guidelines: promptInputs.guidelines,
-    guideline_paths: evalCase.guideline_paths
-  };
-  await writeFile3(filePath, JSON.stringify(payload, null, 2), "utf8");
-}
-function sanitizeFilename(value) {
-  if (!value) {
-    return "prompt";
-  }
-  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
-  return sanitized.length > 0 ? sanitized : randomUUID3();
-}
 async function invokeProvider(provider, options) {
   const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
   const controller = new AbortController();
@@ -6164,12 +7928,23 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
     misses: [`Error: ${message}`],
     candidateAnswer: `Error occurred: ${message}`,
     target: targetName,
-    rawAspects: [],
     agentProviderRequest,
     lmProviderRequest,
     error: message
   };
 }
+function extractProviderError(response) {
+  const raw = response.raw;
+  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
+    return void 0;
+  }
+  const error = raw.error;
+  if (typeof error !== "string") {
+    return void 0;
+  }
+  const trimmed = error.trim();
+  return trimmed.length > 0 ? trimmed : void 0;
+}
 function createCacheKey(provider, target, evalCase, promptInputs) {
   const hash = createHash("sha256");
   hash.update(provider.id);
@@ -6228,15 +8003,15 @@ function computeWeightedMean(entries) {
 // src/evaluation/generators/rubric-generator.ts
 import { generateText as generateText3 } from "ai";
-import { z as z2 } from "zod";
-var rubricItemSchema = z2.object({
-  id: z2.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
-  description: z2.string().describe("What this rubric checks for"),
-  weight: z2.number().default(1).describe("Relative importance (default 1.0)"),
-  required: z2.boolean().default(true).describe("Whether this is a mandatory requirement")
+import { z as z3 } from "zod";
+var rubricItemSchema = z3.object({
+  id: z3.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
+  description: z3.string().describe("What this rubric checks for"),
+  weight: z3.number().default(1).describe("Relative importance (default 1.0)"),
+  required: z3.boolean().default(true).describe("Whether this is a mandatory requirement")
 });
-var rubricGenerationSchema = z2.object({
-  rubrics: z2.array(rubricItemSchema).describe("List of evaluation rubrics")
+var rubricGenerationSchema = z3.object({
+  rubrics: z3.array(rubricItemSchema).describe("List of evaluation rubrics")
 });
 async function generateRubrics(options) {
   const { expectedOutcome, question, referenceAnswer, provider } = options;
@@ -6306,6 +8081,17 @@ function buildPrompt(expectedOutcome, question, referenceAnswer) {
   return parts.join("\n");
 }
+// src/evaluation/code-judge-sdk.ts
+import { readFileSync } from "node:fs";
+function parseCodeJudgePayload(payload) {
+  const parsed = JSON.parse(payload);
+  return toCamelCaseDeep(parsed);
+}
+function readCodeJudgePayload() {
+  const stdin = readFileSync(0, "utf8");
+  return parseCodeJudgePayload(stdin);
+}
 // src/index.ts
 function createAgentKernel() {
   return { status: "stub" };
@@ -6313,15 +8099,20 @@ function createAgentKernel() {
 export {
   CodeEvaluator,
   CompositeEvaluator,
+  CostEvaluator,
   DEFAULT_EXPLORATION_TOOLS,
+  FieldAccuracyEvaluator,
+  LatencyEvaluator,
   LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES,
+  TokenUsageEvaluator,
   ToolTrajectoryEvaluator,
   avgToolDurationMs,
   buildDirectoryChain,
   buildPromptInputs,
   buildSearchRoots,
   computeTraceSummary,
+  consumeClaudeCodeLogEntries,
   consumeCodexLogEntries,
   consumePiLogEntries,
   createAgentKernel,
@@ -6343,6 +8134,8 @@ export {
   loadEvalCases,
   mergeExecutionMetrics,
   normalizeLineEndings,
+  parseCodeJudgePayload,
+  readCodeJudgePayload,
   readJsonFile,
   readTargetDefinitions,
   readTestSuiteMetadata,
@@ -6352,6 +8145,7 @@ export {
   resolveTargetDefinition,
   runEvalCase,
   runEvaluation,
+  subscribeToClaudeCodeLogEntries,
   subscribeToCodexLogEntries,
   subscribeToPiLogEntries,
   tokensPerTool