npm - @agentv/core - Versions diffs - 1.5.0 → 2.0.2 - Mend

@agentv/core 1.5.0 → 2.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14) hide show

package/README.md +77 -77
package/dist/{chunk-E2VSU4WZ.js → chunk-KDEP4I7G.js} +116 -1
package/dist/chunk-KDEP4I7G.js.map +1 -0
package/dist/evaluation/validation/index.cjs +2 -0
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +1 -1
package/dist/index.cjs +2715 -675
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +207 -10
package/dist/index.d.ts +207 -10
package/dist/index.js +2491 -570
package/dist/index.js.map +1 -1
package/package.json +8 -2
package/dist/chunk-E2VSU4WZ.js.map +0 -1

package/dist/index.cjs CHANGED Viewed

@@ -32,15 +32,20 @@ var index_exports = {};
 __export(index_exports, {
   CodeEvaluator: () => CodeEvaluator,
   CompositeEvaluator: () => CompositeEvaluator,
+  CostEvaluator: () => CostEvaluator,
   DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
+  FieldAccuracyEvaluator: () => FieldAccuracyEvaluator,
+  LatencyEvaluator: () => LatencyEvaluator,
   LlmJudgeEvaluator: () => LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
+  TokenUsageEvaluator: () => TokenUsageEvaluator,
   ToolTrajectoryEvaluator: () => ToolTrajectoryEvaluator,
   avgToolDurationMs: () => avgToolDurationMs,
   buildDirectoryChain: () => buildDirectoryChain2,
   buildPromptInputs: () => buildPromptInputs,
   buildSearchRoots: () => buildSearchRoots2,
   computeTraceSummary: () => computeTraceSummary,
+  consumeClaudeCodeLogEntries: () => consumeClaudeCodeLogEntries,
   consumeCodexLogEntries: () => consumeCodexLogEntries,
   consumePiLogEntries: () => consumePiLogEntries,
   createAgentKernel: () => createAgentKernel,
@@ -71,6 +76,7 @@ __export(index_exports, {
   resolveTargetDefinition: () => resolveTargetDefinition,
   runEvalCase: () => runEvalCase,
   runEvaluation: () => runEvaluation,
+  subscribeToClaudeCodeLogEntries: () => subscribeToClaudeCodeLogEntries,
   subscribeToCodexLogEntries: () => subscribeToCodexLogEntries,
   subscribeToPiLogEntries: () => subscribeToPiLogEntries,
   tokensPerTool: () => tokensPerTool
@@ -129,7 +135,11 @@ var EVALUATOR_KIND_VALUES = [
   "llm_judge",
   "rubric",
   "composite",
-  "tool_trajectory"
+  "tool_trajectory",
+  "field_accuracy",
+  "latency",
+  "cost",
+  "token_usage"
 ];
 var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
 function isEvaluatorKind(value) {
@@ -551,7 +561,22 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       continue;
     }
     if (typeValue === "code_judge") {
-      const script = asString2(rawEvaluator.script);
+      let script;
+      const rawScript = rawEvaluator.script;
+      if (typeof rawScript === "string") {
+        const trimmed = rawScript.trim();
+        if (trimmed.length === 0) {
+          throw new Error(
+            `Invalid code_judge script for evaluator '${name}' in '${evalId}': script cannot be empty`
+          );
+        }
+        script = parseCommandToArgv(trimmed);
+      } else {
+        script = asStringArray(
+          rawScript,
+          `code_judge script for evaluator '${name}' in '${evalId}'`
+        );
+      }
       if (!script) {
         logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
         continue;
@@ -572,13 +597,21 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       } else {
         resolvedCwd = searchRoots[0];
       }
+      const knownProps = /* @__PURE__ */ new Set(["name", "type", "script", "cwd", "weight"]);
+      const config = {};
+      for (const [key, value] of Object.entries(rawEvaluator)) {
+        if (!knownProps.has(key) && value !== void 0) {
+          config[key] = value;
+        }
+      }
       evaluators.push({
         name,
         type: "code",
         script,
         cwd,
         resolvedCwd,
-        ...weight2 !== void 0 ? { weight: weight2 } : {}
+        ...weight2 !== void 0 ? { weight: weight2 } : {},
+        ...Object.keys(config).length > 0 ? { config } : {}
       });
       continue;
     }
@@ -753,6 +786,140 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       evaluators.push(config);
       continue;
     }
+    if (typeValue === "field_accuracy") {
+      const rawFields = rawEvaluator.fields;
+      if (!Array.isArray(rawFields)) {
+        logWarning2(
+          `Skipping field_accuracy evaluator '${name}' in '${evalId}': missing fields array`
+        );
+        continue;
+      }
+      if (rawFields.length === 0) {
+        logWarning2(
+          `Skipping field_accuracy evaluator '${name}' in '${evalId}': fields array is empty`
+        );
+        continue;
+      }
+      const fields = [];
+      for (const rawField of rawFields) {
+        if (!isJsonObject2(rawField)) {
+          logWarning2(
+            `Skipping invalid field entry in field_accuracy evaluator '${name}' (expected object)`
+          );
+          continue;
+        }
+        const fieldPath = asString2(rawField.path);
+        const match = asString2(rawField.match);
+        if (!fieldPath) {
+          logWarning2(
+            `Skipping field without path in field_accuracy evaluator '${name}' in '${evalId}'`
+          );
+          continue;
+        }
+        if (!match || !isValidFieldMatchType(match)) {
+          logWarning2(
+            `Skipping field '${fieldPath}' with invalid match type '${match}' in evaluator '${name}' (must be exact, numeric_tolerance, or date). For fuzzy matching, use a code_judge evaluator.`
+          );
+          continue;
+        }
+        const fieldConfig = {
+          path: fieldPath,
+          match,
+          ...typeof rawField.required === "boolean" ? { required: rawField.required } : {},
+          ...typeof rawField.weight === "number" ? { weight: rawField.weight } : {},
+          ...typeof rawField.tolerance === "number" ? { tolerance: rawField.tolerance } : {},
+          ...typeof rawField.relative === "boolean" ? { relative: rawField.relative } : {},
+          ...Array.isArray(rawField.formats) ? { formats: rawField.formats.filter((f) => typeof f === "string") } : {}
+        };
+        fields.push(fieldConfig);
+      }
+      if (fields.length === 0) {
+        logWarning2(
+          `Skipping field_accuracy evaluator '${name}' in '${evalId}': no valid fields found`
+        );
+        continue;
+      }
+      const aggregation = asString2(rawEvaluator.aggregation);
+      const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
+      evaluators.push({
+        name,
+        type: "field_accuracy",
+        fields,
+        ...validAggregation ? { aggregation: validAggregation } : {},
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
+      });
+      continue;
+    }
+    if (typeValue === "latency") {
+      const threshold = rawEvaluator.threshold;
+      if (typeof threshold !== "number" || threshold < 0) {
+        logWarning2(
+          `Skipping latency evaluator '${name}' in '${evalId}': threshold must be a non-negative number`
+        );
+        continue;
+      }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
+      evaluators.push({
+        name,
+        type: "latency",
+        threshold,
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
+      });
+      continue;
+    }
+    if (typeValue === "cost") {
+      const budget = rawEvaluator.budget;
+      if (typeof budget !== "number" || budget < 0) {
+        logWarning2(
+          `Skipping cost evaluator '${name}' in '${evalId}': budget must be a non-negative number`
+        );
+        continue;
+      }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
+      evaluators.push({
+        name,
+        type: "cost",
+        budget,
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
+      });
+      continue;
+    }
+    if (typeValue === "token_usage") {
+      const maxTotal = rawEvaluator.max_total ?? rawEvaluator.maxTotal;
+      const maxInput = rawEvaluator.max_input ?? rawEvaluator.maxInput;
+      const maxOutput = rawEvaluator.max_output ?? rawEvaluator.maxOutput;
+      const limits = [
+        ["max_total", maxTotal],
+        ["max_input", maxInput],
+        ["max_output", maxOutput]
+      ];
+      const validLimits = {};
+      for (const [key, raw] of limits) {
+        if (raw === void 0) continue;
+        if (typeof raw !== "number" || !Number.isFinite(raw) || raw < 0) {
+          logWarning2(
+            `Skipping token_usage evaluator '${name}' in '${evalId}': ${key} must be a non-negative finite number`
+          );
+          continue;
+        }
+        validLimits[key] = raw;
+      }
+      if (validLimits.max_total === void 0 && validLimits.max_input === void 0 && validLimits.max_output === void 0) {
+        logWarning2(
+          `Skipping token_usage evaluator '${name}' in '${evalId}': must set at least one of max_total, max_input, max_output`
+        );
+        continue;
+      }
+      const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
+      evaluators.push({
+        name,
+        type: "token_usage",
+        ...validLimits,
+        ...weight2 !== void 0 ? { weight: weight2 } : {}
+      });
+      continue;
+    }
     const prompt = asString2(rawEvaluator.prompt);
     let promptPath;
     if (prompt) {
@@ -823,6 +990,34 @@ function coerceEvaluator(candidate, contextId) {
 function asString2(value) {
   return typeof value === "string" ? value : void 0;
 }
+function asStringArray(value, description) {
+  if (value === void 0) {
+    return void 0;
+  }
+  if (!Array.isArray(value)) {
+    throw new Error(`${description} must be an array of strings (argv tokens)`);
+  }
+  if (value.length === 0) {
+    throw new Error(`${description} cannot be empty`);
+  }
+  const result = [];
+  for (const [index, entry] of value.entries()) {
+    if (typeof entry !== "string") {
+      throw new Error(`${description}[${index}] must be a string`);
+    }
+    if (entry.trim().length === 0) {
+      throw new Error(`${description}[${index}] cannot be empty`);
+    }
+    result.push(entry);
+  }
+  return result;
+}
+function parseCommandToArgv(command) {
+  if (process.platform === "win32") {
+    return ["cmd.exe", "/c", command];
+  }
+  return ["sh", "-lc", command];
+}
 function isJsonObject2(value) {
   return typeof value === "object" && value !== null && !Array.isArray(value);
 }
@@ -856,6 +1051,14 @@ function validateWeight(rawWeight, evaluatorName, evalId) {
   }
   return rawWeight;
 }
+var VALID_FIELD_MATCH_TYPES = /* @__PURE__ */ new Set(["exact", "numeric_tolerance", "date"]);
+function isValidFieldMatchType(value) {
+  return typeof value === "string" && VALID_FIELD_MATCH_TYPES.has(value);
+}
+var VALID_FIELD_AGGREGATION_TYPES = /* @__PURE__ */ new Set(["weighted_average", "all_or_nothing"]);
+function isValidFieldAggregationType(value) {
+  return typeof value === "string" && VALID_FIELD_AGGREGATION_TYPES.has(value);
+}
 // src/evaluation/loaders/message-processor.ts
 var import_promises4 = require("fs/promises");
@@ -1930,92 +2133,993 @@ async function withRetry(fn, retryConfig, signal) {
   throw lastError;
 }
-// src/evaluation/providers/cli.ts
+// src/evaluation/providers/claude-code.ts
 var import_node_child_process = require("child_process");
-var import_promises8 = __toESM(require("fs/promises"), 1);
-var import_node_os = __toESM(require("os"), 1);
-var import_node_path8 = __toESM(require("path"), 1);
-var import_node_util = require("util");
-var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
-var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
-async function defaultCommandRunner(command, options) {
-  const execOptions = {
-    cwd: options.cwd,
-    env: options.env,
-    timeout: options.timeoutMs,
-    signal: options.signal,
-    maxBuffer: DEFAULT_MAX_BUFFER,
-    shell: process.platform === "win32" ? "powershell.exe" : void 0
+var import_node_crypto = require("crypto");
+var import_node_fs3 = require("fs");
+var import_promises8 = require("fs/promises");
+var import_node_os = require("os");
+var import_node_path9 = __toESM(require("path"), 1);
+// src/evaluation/providers/claude-code-log-tracker.ts
+var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeCodeLogs");
+var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeCodeLogSubscribers");
+function getClaudeCodeLogStore() {
+  const globalObject = globalThis;
+  const existing = globalObject[GLOBAL_LOGS_KEY];
+  if (existing) {
+    return existing;
+  }
+  const created = [];
+  globalObject[GLOBAL_LOGS_KEY] = created;
+  return created;
+}
+function getSubscriberStore() {
+  const globalObject = globalThis;
+  const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
+  if (existing) {
+    return existing;
+  }
+  const created = /* @__PURE__ */ new Set();
+  globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
+  return created;
+}
+function notifySubscribers(entry) {
+  const subscribers = Array.from(getSubscriberStore());
+  for (const listener of subscribers) {
+    try {
+      listener(entry);
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      console.warn(`Claude Code log subscriber failed: ${message}`);
+    }
+  }
+}
+function recordClaudeCodeLogEntry(entry) {
+  getClaudeCodeLogStore().push(entry);
+  notifySubscribers(entry);
+}
+function consumeClaudeCodeLogEntries() {
+  const store = getClaudeCodeLogStore();
+  if (store.length === 0) {
+    return [];
+  }
+  return store.splice(0, store.length);
+}
+function subscribeToClaudeCodeLogEntries(listener) {
+  const store = getSubscriberStore();
+  store.add(listener);
+  return () => {
+    store.delete(listener);
   };
-  try {
-    const { stdout, stderr } = await execAsync(command, execOptions);
-    return {
-      stdout,
-      stderr,
-      exitCode: 0,
-      failed: false,
-      timedOut: false,
-      signal: null
-    };
-  } catch (error) {
-    const execError = error;
-    return {
-      stdout: execError.stdout ?? "",
-      stderr: execError.stderr ?? "",
-      exitCode: typeof execError.code === "number" ? execError.code : null,
-      failed: true,
-      timedOut: execError.timedOut === true || execError.killed === true,
-      signal: execError.signal ?? null
-    };
+}
+// src/evaluation/providers/preread.ts
+var import_node_path8 = __toESM(require("path"), 1);
+function buildPromptDocument(request, inputFiles, options) {
+  const parts = [];
+  const guidelineFiles = collectGuidelineFiles(
+    inputFiles,
+    options?.guidelinePatterns ?? request.guideline_patterns,
+    options?.guidelineOverrides
+  );
+  const inputFilesList = collectInputFiles(inputFiles);
+  const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
+  const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
+  if (prereadBlock.length > 0) {
+    parts.push("\n", prereadBlock);
   }
+  parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
+  return parts.join("\n").trim();
 }
-var CliProvider = class {
-  id;
-  kind = "cli";
-  targetName;
-  supportsBatch = true;
-  config;
-  runCommand;
-  verbose;
-  keepTempFiles;
-  healthcheckPromise;
-  constructor(targetName, config, runner = defaultCommandRunner) {
-    this.targetName = targetName;
-    this.id = `cli:${targetName}`;
-    this.config = config;
-    this.runCommand = runner;
-    this.verbose = config.verbose ?? false;
-    this.keepTempFiles = config.keepTempFiles ?? false;
+function normalizeInputFiles(inputFiles) {
+  if (!inputFiles || inputFiles.length === 0) {
+    return void 0;
   }
-  async invoke(request) {
-    if (request.signal?.aborted) {
-      throw new Error("CLI provider request was aborted before execution");
+  const deduped = /* @__PURE__ */ new Map();
+  for (const inputFile of inputFiles) {
+    const absolutePath = import_node_path8.default.resolve(inputFile);
+    if (!deduped.has(absolutePath)) {
+      deduped.set(absolutePath, absolutePath);
     }
-    await this.ensureHealthy(request.signal);
-    const outputFilePath = generateOutputFilePath(request.evalCaseId);
-    const templateValues = buildTemplateValues(request, this.config, outputFilePath);
-    const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
-    if (this.verbose) {
-      console.log(
-        `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
-      );
+  }
+  return Array.from(deduped.values());
+}
+function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
+  if (!inputFiles || inputFiles.length === 0) {
+    return [];
+  }
+  const unique = /* @__PURE__ */ new Map();
+  for (const inputFile of inputFiles) {
+    const absolutePath = import_node_path8.default.resolve(inputFile);
+    if (overrides?.has(absolutePath)) {
+      if (!unique.has(absolutePath)) {
+        unique.set(absolutePath, absolutePath);
+      }
+      continue;
     }
-    const startTime = Date.now();
-    const result = await this.runCommand(renderedCommand, {
-      cwd: this.config.cwd,
-      env: process.env,
-      timeoutMs: this.config.timeoutMs,
-      signal: request.signal
-    });
-    const measuredDurationMs = Date.now() - startTime;
-    if (result.failed || (result.exitCode ?? 0) !== 0) {
-      if (request.signal?.aborted) {
-        throw new Error("CLI provider request was aborted");
+    const normalized = absolutePath.split(import_node_path8.default.sep).join("/");
+    if (isGuidelineFile(normalized, guidelinePatterns)) {
+      if (!unique.has(absolutePath)) {
+        unique.set(absolutePath, absolutePath);
       }
-      if (result.timedOut) {
-        throw new Error(
-          `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
-        );
+    }
+  }
+  return Array.from(unique.values());
+}
+function collectInputFiles(inputFiles) {
+  if (!inputFiles || inputFiles.length === 0) {
+    return [];
+  }
+  const unique = /* @__PURE__ */ new Map();
+  for (const inputFile of inputFiles) {
+    const absolutePath = import_node_path8.default.resolve(inputFile);
+    if (!unique.has(absolutePath)) {
+      unique.set(absolutePath, absolutePath);
+    }
+  }
+  return Array.from(unique.values());
+}
+function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
+  if (guidelineFiles.length === 0 && inputFiles.length === 0) {
+    return "";
+  }
+  const buildList = (files) => files.map((absolutePath) => {
+    const fileName = import_node_path8.default.basename(absolutePath);
+    const fileUri = pathToFileUri(absolutePath);
+    return `* [${fileName}](${fileUri})`;
+  });
+  const sections = [];
+  if (guidelineFiles.length > 0) {
+    sections.push(`Read all guideline files:
+${buildList(guidelineFiles).join("\n")}.`);
+  }
+  if (inputFiles.length > 0) {
+    sections.push(`Read all input files:
+${buildList(inputFiles).join("\n")}.`);
+  }
+  sections.push(
+    "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
+    "Then apply system_instructions on the user query below."
+  );
+  return sections.join("\n");
+}
+function pathToFileUri(filePath) {
+  const absolutePath = import_node_path8.default.isAbsolute(filePath) ? filePath : import_node_path8.default.resolve(filePath);
+  const normalizedPath = absolutePath.replace(/\\/g, "/");
+  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
+    return `file:///${normalizedPath}`;
+  }
+  return `file://${normalizedPath}`;
+}
+// src/evaluation/providers/claude-code.ts
+var WORKSPACE_PREFIX = "agentv-claude-code-";
+var PROMPT_FILENAME = "prompt.md";
+var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
+- Do NOT create any additional output files in the workspace.
+- All intended file outputs/changes MUST be written in your response.
+- For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
+This is required for evaluation scoring.`;
+var ClaudeCodeProvider = class {
+  id;
+  kind = "claude-code";
+  targetName;
+  supportsBatch = false;
+  config;
+  runClaudeCode;
+  constructor(targetName, config, runner = defaultClaudeCodeRunner) {
+    this.id = `claude-code:${targetName}`;
+    this.targetName = targetName;
+    this.config = config;
+    this.runClaudeCode = runner;
+  }
+  async invoke(request) {
+    if (request.signal?.aborted) {
+      throw new Error("Claude Code request was aborted before execution");
+    }
+    const inputFiles = normalizeInputFiles(request.inputFiles);
+    const workspaceRoot = await this.createWorkspace();
+    const logger = await this.createStreamLogger(request).catch(() => void 0);
+    try {
+      const promptFile = import_node_path9.default.join(workspaceRoot, PROMPT_FILENAME);
+      await (0, import_promises8.writeFile)(promptFile, request.question, "utf8");
+      const args = this.buildClaudeCodeArgs(request.question, inputFiles);
+      const cwd = this.resolveCwd();
+      const result = await this.executeClaudeCode(args, cwd, request.signal, logger);
+      if (result.timedOut) {
+        throw new Error(
+          `Claude Code CLI timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
+        );
+      }
+      if (result.exitCode !== 0) {
+        const detail = pickDetail(result.stderr, result.stdout);
+        const prefix = `Claude Code CLI exited with code ${result.exitCode}`;
+        if (isNestedClaudeCodeAuthError(result.stdout)) {
+          throw new Error(
+            `${prefix}: Claude Code detected a nested session and requires API key authentication. Set ANTHROPIC_API_KEY environment variable or run AgentV outside of a Claude Code session.`
+          );
+        }
+        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
+      }
+      const parsed = parseClaudeCodeJsonl(result.stdout);
+      const outputMessages = extractOutputMessages(parsed);
+      const usage = extractUsage(parsed);
+      return {
+        raw: {
+          response: parsed,
+          stdout: result.stdout,
+          stderr: result.stderr,
+          exitCode: result.exitCode,
+          args,
+          executable: this.config.executable,
+          promptFile,
+          workspace: workspaceRoot,
+          inputFiles,
+          logFile: logger?.filePath
+        },
+        outputMessages,
+        usage
+      };
+    } finally {
+      await logger?.close();
+      await this.cleanupWorkspace(workspaceRoot);
+    }
+  }
+  resolveCwd() {
+    if (!this.config.cwd) {
+      return process.cwd();
+    }
+    return import_node_path9.default.resolve(this.config.cwd);
+  }
+  buildClaudeCodeArgs(prompt, inputFiles) {
+    const args = [];
+    args.push("--output-format", "stream-json");
+    args.push("--verbose");
+    args.push("-p");
+    if (this.config.model) {
+      args.push("--model", this.config.model);
+    }
+    if (this.config.args && this.config.args.length > 0) {
+      args.push(...this.config.args);
+    }
+    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
+    const fullPrompt = `${systemPrompt}
+${prompt}`;
+    let finalPrompt = fullPrompt;
+    if (inputFiles && inputFiles.length > 0) {
+      const filesContext = inputFiles.map((f) => `[File: ${f}]`).join("\n");
+      finalPrompt = `${fullPrompt}
+## Input Files
+${filesContext}`;
+    }
+    args.push(finalPrompt);
+    return args;
+  }
+  buildEnv() {
+    const env = { ...process.env };
+    env.CLAUDECODE = void 0;
+    env.CLAUDE_CODE_ENTRYPOINT = void 0;
+    return env;
+  }
+  async executeClaudeCode(args, cwd, signal, logger) {
+    try {
+      return await this.runClaudeCode({
+        executable: this.config.executable,
+        args,
+        cwd,
+        timeoutMs: this.config.timeoutMs,
+        env: this.buildEnv(),
+        signal,
+        onStdoutChunk: logger ? (chunk) => logger.handleStdoutChunk(chunk) : void 0,
+        onStderrChunk: logger ? (chunk) => logger.handleStderrChunk(chunk) : void 0
+      });
+    } catch (error) {
+      const err = error;
+      if (err.code === "ENOENT") {
+        throw new Error(
+          `Claude Code executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
+        );
+      }
+      throw error;
+    }
+  }
+  async createWorkspace() {
+    return await (0, import_promises8.mkdtemp)(import_node_path9.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
+  }
+  async cleanupWorkspace(workspaceRoot) {
+    try {
+      await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
+    } catch {
+    }
+  }
+  resolveLogDirectory() {
+    const disabled = isClaudeCodeLogStreamingDisabled();
+    if (disabled) {
+      return void 0;
+    }
+    if (this.config.logDir) {
+      return import_node_path9.default.resolve(this.config.logDir);
+    }
+    return import_node_path9.default.join(process.cwd(), ".agentv", "logs", "claude-code");
+  }
+  async createStreamLogger(request) {
+    const logDir = this.resolveLogDirectory();
+    if (!logDir) {
+      return void 0;
+    }
+    try {
+      await (0, import_promises8.mkdir)(logDir, { recursive: true });
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      console.warn(`Skipping Claude Code stream logging (could not create ${logDir}): ${message}`);
+      return void 0;
+    }
+    const filePath = import_node_path9.default.join(logDir, buildLogFilename(request, this.targetName));
+    try {
+      const logger = await ClaudeCodeStreamLogger.create({
+        filePath,
+        targetName: this.targetName,
+        evalCaseId: request.evalCaseId,
+        attempt: request.attempt,
+        format: this.config.logFormat ?? "summary"
+      });
+      recordClaudeCodeLogEntry({
+        filePath,
+        targetName: this.targetName,
+        evalCaseId: request.evalCaseId,
+        attempt: request.attempt
+      });
+      return logger;
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      console.warn(`Skipping Claude Code stream logging for ${filePath}: ${message}`);
+      return void 0;
+    }
+  }
+};
+var ClaudeCodeStreamLogger = class _ClaudeCodeStreamLogger {
+  filePath;
+  stream;
+  startedAt = Date.now();
+  stdoutBuffer = "";
+  stderrBuffer = "";
+  format;
+  constructor(filePath, format) {
+    this.filePath = filePath;
+    this.format = format;
+    this.stream = (0, import_node_fs3.createWriteStream)(filePath, { flags: "a" });
+  }
+  static async create(options) {
+    const logger = new _ClaudeCodeStreamLogger(options.filePath, options.format);
+    const header = [
+      "# Claude Code CLI stream log",
+      `# target: ${options.targetName}`,
+      options.evalCaseId ? `# eval: ${options.evalCaseId}` : void 0,
+      options.attempt !== void 0 ? `# attempt: ${options.attempt + 1}` : void 0,
+      `# started: ${(/* @__PURE__ */ new Date()).toISOString()}`,
+      ""
+    ].filter((line) => Boolean(line));
+    logger.writeLines(header);
+    return logger;
+  }
+  handleStdoutChunk(chunk) {
+    this.stdoutBuffer += chunk;
+    this.flushBuffer("stdout");
+  }
+  handleStderrChunk(chunk) {
+    this.stderrBuffer += chunk;
+    this.flushBuffer("stderr");
+  }
+  async close() {
+    this.flushBuffer("stdout");
+    this.flushBuffer("stderr");
+    this.flushRemainder();
+    await new Promise((resolve, reject) => {
+      this.stream.once("error", reject);
+      this.stream.end(() => resolve());
+    });
+  }
+  writeLines(lines) {
+    for (const line of lines) {
+      this.stream.write(`${line}
+`);
+    }
+  }
+  flushBuffer(source) {
+    const buffer = source === "stdout" ? this.stdoutBuffer : this.stderrBuffer;
+    const lines = buffer.split(/\r?\n/);
+    const remainder = lines.pop() ?? "";
+    if (source === "stdout") {
+      this.stdoutBuffer = remainder;
+    } else {
+      this.stderrBuffer = remainder;
+    }
+    for (const line of lines) {
+      const formatted = this.formatLine(line, source);
+      if (formatted) {
+        this.stream.write(formatted);
+        this.stream.write("\n");
+      }
+    }
+  }
+  formatLine(rawLine, source) {
+    const trimmed = rawLine.trim();
+    if (trimmed.length === 0) {
+      return void 0;
+    }
+    const message = this.format === "json" ? formatClaudeCodeJsonLog(trimmed) : formatClaudeCodeLogMessage(trimmed, source);
+    return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
+  }
+  flushRemainder() {
+    const stdoutRemainder = this.stdoutBuffer.trim();
+    if (stdoutRemainder.length > 0) {
+      const formatted = this.formatLine(stdoutRemainder, "stdout");
+      if (formatted) {
+        this.stream.write(formatted);
+        this.stream.write("\n");
+      }
+    }
+    const stderrRemainder = this.stderrBuffer.trim();
+    if (stderrRemainder.length > 0) {
+      const formatted = this.formatLine(stderrRemainder, "stderr");
+      if (formatted) {
+        this.stream.write(formatted);
+        this.stream.write("\n");
+      }
+    }
+    this.stdoutBuffer = "";
+    this.stderrBuffer = "";
+  }
+};
+function isClaudeCodeLogStreamingDisabled() {
+  const envValue = process.env.AGENTV_CLAUDE_CODE_STREAM_LOGS;
+  if (!envValue) {
+    return false;
+  }
+  const normalized = envValue.trim().toLowerCase();
+  return normalized === "false" || normalized === "0" || normalized === "off";
+}
+function buildLogFilename(request, targetName) {
+  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
+  const evalId = sanitizeForFilename(request.evalCaseId ?? "claude-code");
+  const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
+  const target = sanitizeForFilename(targetName);
+  return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto.randomUUID)().slice(0, 8)}.log`;
+}
+function sanitizeForFilename(value) {
+  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
+  return sanitized.length > 0 ? sanitized : "claude-code";
+}
+function formatElapsed(startedAt) {
+  const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
+  const hours = Math.floor(elapsedSeconds / 3600);
+  const minutes = Math.floor(elapsedSeconds % 3600 / 60);
+  const seconds = elapsedSeconds % 60;
+  if (hours > 0) {
+    return `${hours.toString().padStart(2, "0")}:${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
+  }
+  return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
+}
+function formatClaudeCodeLogMessage(rawLine, source) {
+  const parsed = tryParseJsonValue(rawLine);
+  if (parsed) {
+    const summary = summarizeClaudeCodeEvent(parsed);
+    if (summary) {
+      return summary;
+    }
+  }
+  if (source === "stderr") {
+    return `stderr: ${rawLine}`;
+  }
+  return rawLine;
+}
+function formatClaudeCodeJsonLog(rawLine) {
+  const parsed = tryParseJsonValue(rawLine);
+  if (!parsed) {
+    return rawLine;
+  }
+  try {
+    return JSON.stringify(parsed, null, 2);
+  } catch {
+    return rawLine;
+  }
+}
+/**
+ * Produces a short one-line label for a parsed Claude Code stream event.
+ *
+ * - "system"    -> "system: init"
+ * - "assistant" -> tool name or a 50-character text preview of the first content part
+ * - "user"      -> tool_use_id when the first content part is a tool_result
+ * - "result"    -> cost and duration when both are numbers
+ * - any other string type is returned as-is
+ *
+ * @param event - Parsed JSON value.
+ * @returns The label, or undefined when `event` is not an object with a non-empty string `type`.
+ */
+function summarizeClaudeCodeEvent(event) {
+  if (!event || typeof event !== "object") {
+    return void 0;
+  }
+  const kind = typeof event.type === "string" ? event.type : void 0;
+  if (!kind) {
+    return void 0;
+  }
+  // First entry of message.content, or undefined when there is no non-empty content array.
+  const leadingPart = () => {
+    const message = event.message;
+    if (!message) {
+      return void 0;
+    }
+    const parts = message.content;
+    return Array.isArray(parts) && parts.length > 0 ? parts[0] : void 0;
+  };
+  if (kind === "system") {
+    return "system: init";
+  }
+  if (kind === "assistant") {
+    const part = leadingPart();
+    if (part?.type === "tool_use") {
+      return `assistant: tool_use (${part.name})`;
+    }
+    if (part?.type === "text" && typeof part.text === "string") {
+      const text = part.text;
+      const preview = text.length > 50 ? `${text.slice(0, 50)}...` : text;
+      return `assistant: ${preview}`;
+    }
+    return "assistant";
+  }
+  if (kind === "user") {
+    const part = leadingPart();
+    return part?.type === "tool_result" ? `user: tool_result (${part.tool_use_id})` : "user";
+  }
+  if (kind === "result") {
+    const cost = event.cost_usd;
+    const duration = event.duration_ms;
+    const hasMetrics = typeof cost === "number" && typeof duration === "number";
+    return hasMetrics ? `result: $${cost.toFixed(4)}, ${Math.round(duration)}ms` : "result";
+  }
+  return kind;
+}
+/**
+ * Parses a line as JSON without throwing.
+ *
+ * @param rawLine - Text to parse.
+ * @returns The parsed value, or undefined when the text is not valid JSON.
+ */
+function tryParseJsonValue(rawLine) {
+  let value;
+  try {
+    value = JSON.parse(rawLine);
+  } catch {
+    value = void 0;
+  }
+  return value;
+}
+/**
+ * Parses JSONL output from the Claude Code CLI into an array of values.
+ *
+ * Blank lines are ignored and lines that are not valid JSON are skipped, so that
+ * stray non-JSON output does not discard the events that did parse.
+ *
+ * @param output - Raw CLI stdout.
+ * @returns The parsed values, in line order.
+ * @throws {Error} When the output is empty or contains no valid JSON line.
+ */
+function parseClaudeCodeJsonl(output) {
+  const text = output.trim();
+  if (text.length === 0) {
+    throw new Error("Claude Code CLI produced no output");
+  }
+  const events = [];
+  for (const rawLine of text.split(/\r?\n/)) {
+    const line = rawLine.trim();
+    if (line.length === 0) {
+      continue;
+    }
+    try {
+      events.push(JSON.parse(line));
+    } catch {
+      // Not JSON: skip this line and keep going.
+    }
+  }
+  if (events.length === 0) {
+    throw new Error("Claude Code CLI produced no valid JSON output");
+  }
+  return events;
+}
+/**
+ * Collects the conversation messages carried by "assistant" and "user" events.
+ *
+ * Events that are not objects, have another type, or carry no `message` are ignored.
+ *
+ * @param events - Parsed stream events.
+ * @returns Converted messages in event order.
+ */
+function extractOutputMessages(events) {
+  const collected = [];
+  for (const event of events) {
+    if (!event || typeof event !== "object") {
+      continue;
+    }
+    const kind = event.type;
+    if (kind !== "assistant" && kind !== "user") {
+      continue;
+    }
+    const payload = event.message;
+    if (!payload) {
+      continue;
+    }
+    const converted = convertClaudeCodeMessage(payload, kind);
+    if (converted) {
+      collected.push(converted);
+    }
+  }
+  return collected;
+}
+/**
+ * Converts a Claude Code message payload into an output message.
+ *
+ * @param message - The event's `message` object; its `content` is read.
+ * @param type - Event type; anything other than "assistant" maps to the "user" role.
+ * @returns An object with `role`, `content` (joined text or undefined) and
+ *   `toolCalls` (undefined when the message carries none).
+ */
+function convertClaudeCodeMessage(message, type) {
+  const rawContent = message.content;
+  const text = extractTextContent(rawContent);
+  const calls = extractToolCalls(rawContent);
+  const converted = {
+    role: type === "assistant" ? "assistant" : "user",
+    content: text,
+    toolCalls: void 0
+  };
+  if (calls.length > 0) {
+    converted.toolCalls = calls;
+  }
+  return converted;
+}
+/**
+ * Extracts the plain text of a message's content.
+ *
+ * @param content - Either a string or an array of content parts.
+ * @returns The string itself, or the `text` of every `{ type: "text" }` part joined
+ *   with newlines; undefined when there is no text or the content has another shape.
+ */
+function extractTextContent(content) {
+  if (typeof content === "string") {
+    return content;
+  }
+  if (!Array.isArray(content)) {
+    return void 0;
+  }
+  const texts = content
+    .filter((part) => part && typeof part === "object" && part.type === "text" && typeof part.text === "string")
+    .map((part) => part.text);
+  return texts.length > 0 ? texts.join("\n") : void 0;
+}
+/**
+ * Extracts tool activity from a message's content parts.
+ *
+ * `tool_use` parts with a string `name` become `{ tool, input, id }`. `tool_result`
+ * parts with a string `tool_use_id` become `{ tool: "tool_result", output, id }`.
+ * Every other part is ignored.
+ *
+ * @param content - Message content; anything but an array yields no calls.
+ * @returns The tool call entries in content order (possibly empty).
+ */
+function extractToolCalls(content) {
+  if (!Array.isArray(content)) {
+    return [];
+  }
+  return content.flatMap((part) => {
+    if (!part || typeof part !== "object") {
+      return [];
+    }
+    if (part.type === "tool_use" && typeof part.name === "string") {
+      const callId = typeof part.id === "string" ? part.id : void 0;
+      return [{ tool: part.name, input: part.input, id: callId }];
+    }
+    if (part.type === "tool_result" && typeof part.tool_use_id === "string") {
+      return [{ tool: "tool_result", output: part.content, id: part.tool_use_id }];
+    }
+    return [];
+  });
+}
+/**
+ * Extracts cost, timing and token metrics from the last "result" event.
+ *
+ * Top-level `cost_usd`, `input_tokens` and `output_tokens` are preferred. When they
+ * are absent the function falls back to `total_cost_usd` and to the nested
+ * `usage.input_tokens` / `usage.output_tokens`, so result events that report their
+ * metrics in that shape are not silently dropped. Only numeric values (and a string
+ * `session_id`) are copied.
+ *
+ * @param events - Parsed stream events.
+ * @returns An object with any of `cost_usd`, `duration_ms`, `duration_api_ms`,
+ *   `input_tokens`, `output_tokens`, `session_id`; undefined when there is no
+ *   "result" event or the last one carries none of these fields.
+ */
+function extractUsage(events) {
+  // First candidate that is a number wins; undefined when none is.
+  const firstNumber = (...candidates) => candidates.find((candidate) => typeof candidate === "number");
+  for (let i = events.length - 1; i >= 0; i--) {
+    const event = events[i];
+    if (!event || typeof event !== "object" || event.type !== "result") {
+      continue;
+    }
+    const nested = event.usage && typeof event.usage === "object" ? event.usage : {};
+    const numericFields = [
+      ["cost_usd", firstNumber(event.cost_usd, event.total_cost_usd)],
+      ["duration_ms", firstNumber(event.duration_ms)],
+      ["duration_api_ms", firstNumber(event.duration_api_ms)],
+      ["input_tokens", firstNumber(event.input_tokens, nested.input_tokens)],
+      ["output_tokens", firstNumber(event.output_tokens, nested.output_tokens)]
+    ];
+    const usage = {};
+    for (const [key, value] of numericFields) {
+      if (value !== void 0) {
+        usage[key] = value;
+      }
+    }
+    if (typeof event.session_id === "string") {
+      usage.session_id = event.session_id;
+    }
+    // Only the most recent result event is considered, even when it is empty.
+    return Object.keys(usage).length > 0 ? usage : void 0;
+  }
+  return void 0;
+}
+/**
+ * Picks the most useful error detail from a process's output: trimmed stderr when
+ * it is non-empty, otherwise trimmed stdout.
+ *
+ * @param stderr - Captured stderr.
+ * @param stdout - Captured stdout.
+ * @returns The chosen text, or undefined when both are blank.
+ */
+function pickDetail(stderr, stdout) {
+  for (const stream of [stderr, stdout]) {
+    const text = stream.trim();
+    if (text.length > 0) {
+      return text;
+    }
+  }
+  return void 0;
+}
+/**
+ * Formats a timeout as a message suffix such as " after 30s".
+ *
+ * @param timeoutMs - Timeout in milliseconds.
+ * @returns The suffix with the timeout rounded up to whole seconds, or "" when the
+ *   timeout is missing, zero or negative.
+ */
+function formatTimeoutSuffix(timeoutMs) {
+  const disabled = !timeoutMs || timeoutMs <= 0;
+  return disabled ? "" : ` after ${Math.ceil(timeoutMs / 1e3)}s`;
+}
+/**
+ * Detects the failure pattern where the CLI picked its key up from the
+ * ANTHROPIC_API_KEY environment variable and the run then failed.
+ *
+ * Returns true only when stdout contains both a "system" event whose
+ * `apiKeySource` is "ANTHROPIC_API_KEY" and an event that either has
+ * `error === "authentication_failed"` or is a "result" event with a truthy `is_error`.
+ *
+ * @param stdout - Raw JSONL stdout of the CLI.
+ * @returns Whether both markers were seen; false on any unexpected failure.
+ */
+function isNestedClaudeCodeAuthError(stdout) {
+  let sawEnvApiKey = false;
+  let sawAuthFailure = false;
+  try {
+    for (const rawLine of stdout.split("\n")) {
+      const line = rawLine.trim();
+      if (!line) continue;
+      try {
+        // Property reads stay inside this try so a `null` line is skipped, not fatal.
+        const event = JSON.parse(line);
+        const fromEnvKey = event.type === "system" && event.apiKeySource === "ANTHROPIC_API_KEY";
+        const failed = event.error === "authentication_failed" || event.type === "result" && event.is_error;
+        if (fromEnvKey) {
+          sawEnvApiKey = true;
+        }
+        if (failed) {
+          sawAuthFailure = true;
+        }
+      } catch {
+        // Unparseable line: ignore it.
+      }
+    }
+  } catch {
+    return false;
+  }
+  return sawEnvApiKey && sawAuthFailure;
+}
+/**
+ * Quotes a string for use as a single shell argument by wrapping it in single
+ * quotes and rewriting each embedded single quote as `'\''`.
+ *
+ * @param arg - Raw argument.
+ * @returns The single-quoted argument.
+ */
+function escapeShellArg(arg) {
+  const escaped = arg.split("'").join("'\\''");
+  return `'${escaped}'`;
+}
+/**
+ * Default runner: executes the Claude Code CLI with its stdout, stderr, exit code
+ * and pid captured in per-run temp files, then removes those files.
+ *
+ * The temp files live in the OS temp directory and share a random UUID, named
+ * `agentv-cc-<uuid>-{stdout,stderr,exit,pid}`. Cleanup is best-effort and always
+ * runs, whether the run resolves or throws.
+ *
+ * @param options - Run options, forwarded unchanged to the temp-file runner.
+ * @returns Whatever `runClaudeCodeWithTempFiles` resolves to.
+ */
+async function defaultClaudeCodeRunner(options) {
+  const runId = (0, import_node_crypto.randomUUID)();
+  const tempPath = (suffix) => import_node_path9.default.join((0, import_node_os.tmpdir)(), `agentv-cc-${runId}-${suffix}`);
+  const tempFiles = ["stdout", "stderr", "exit", "pid"].map((suffix) => tempPath(suffix));
+  const [stdoutFile, stderrFile, exitFile, pidFile] = tempFiles;
+  try {
+    return await runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile);
+  } finally {
+    for (const file of tempFiles) {
+      try {
+        await (0, import_promises8.rm)(file, { force: true });
+      } catch {
+        // Best-effort cleanup: a leftover temp file must not mask the run's outcome.
+      }
+    }
+  }
+}
+/**
+ * Runs the Claude Code CLI in its own session and observes it through temp files.
+ *
+ * A bash wrapper (started via `setsid`, detached, stdio ignored) unsets CLAUDECODE
+ * and CLAUDE_CODE_ENTRYPOINT, launches the command in the background with stdout
+ * and stderr redirected to `stdoutFile` / `stderrFile`, writes the command's pid to
+ * `pidFile`, waits for it, and finally writes its exit status to `exitFile`. This
+ * function polls every 100 ms until `exitFile` appears, the timeout elapses, or the
+ * abort signal fires.
+ *
+ * @param options - `executable` (split on whitespace into command and leading
+ *   arguments), `args`, `cwd`, `env`, optional `timeoutMs`, optional `signal`,
+ *   optional `onStdoutChunk` (called with newly appended stdout while polling) and
+ *   optional `onStderrChunk` (called once with the complete stderr at the end).
+ * @param stdoutFile - Path the command's stdout is redirected to.
+ * @param stderrFile - Path the command's stderr is redirected to.
+ * @param exitFile - Path the wrapper writes the exit status to.
+ * @param pidFile - Path the wrapper writes the command's pid to.
+ * @returns `{ stdout, stderr, exitCode, timedOut }`; `exitCode` is -1 when no exit
+ *   status was recorded (timeout, abort, or an unreadable exit file).
+ */
+async function runClaudeCodeWithTempFiles(options, stdoutFile, stderrFile, exitFile, pidFile) {
+  // Check for an already-aborted signal BEFORE spawning. Checking afterwards cannot
+  // reliably stop the process: the pid file is usually not written yet, so the kill
+  // is a no-op and the detached CLI keeps running after "Aborted" is reported.
+  if (options.signal?.aborted) {
+    return { stdout: "", stderr: "Aborted", exitCode: -1, timedOut: false };
+  }
+  const parts = options.executable.split(/\s+/);
+  const executable = parts[0];
+  const executableArgs = parts.slice(1);
+  const allArgs = [...executableArgs, ...options.args];
+  const escapedArgs = allArgs.map((arg) => escapeShellArg(arg));
+  const fullCommand = [escapeShellArg(executable), ...escapedArgs].join(" ");
+  const bashScript = `
+    unset CLAUDECODE CLAUDE_CODE_ENTRYPOINT 2>/dev/null
+    ${fullCommand} >${escapeShellArg(stdoutFile)} 2>${escapeShellArg(stderrFile)} &
+    CHILD_PID=$!
+    echo $CHILD_PID > ${escapeShellArg(pidFile)}
+    wait $CHILD_PID
+    echo $? > ${escapeShellArg(exitFile)}
+  `;
+  const child = (0, import_node_child_process.spawn)("setsid", ["bash", "-c", bashScript], {
+    cwd: options.cwd,
+    env: options.env,
+    detached: true,
+    stdio: "ignore"
+  });
+  // Do not let the detached wrapper keep this process's event loop alive.
+  child.unref();
+  const pollInterval = 100;
+  const startTime = Date.now();
+  let timedOut = false;
+  let lastStdoutSize = 0;
+  // Reads a file as UTF-8; any failure (including a missing file) yields "".
+  const readFileIfExists = async (filePath) => {
+    try {
+      const { readFile: readFile8 } = await import("fs/promises");
+      return await readFile8(filePath, "utf8");
+    } catch {
+      return "";
+    }
+  };
+  const fileExists4 = async (filePath) => {
+    try {
+      const { access: access5 } = await import("fs/promises");
+      await access5(filePath);
+      return true;
+    } catch {
+      return false;
+    }
+  };
+  // Sends SIGTERM to the pid recorded by the wrapper; silently does nothing when
+  // the pid file is missing or the process is already gone.
+  const killProcess = async () => {
+    try {
+      const pid = await readFileIfExists(pidFile);
+      if (pid.trim()) {
+        process.kill(Number.parseInt(pid.trim(), 10), "SIGTERM");
+      }
+    } catch {
+    }
+  };
+  const abortHandler = () => {
+    killProcess().catch(() => {
+    });
+  };
+  options.signal?.addEventListener("abort", abortHandler, { once: true });
+  try {
+    while (true) {
+      if (options.timeoutMs && Date.now() - startTime > options.timeoutMs) {
+        timedOut = true;
+        await killProcess();
+        break;
+      }
+      if (options.signal?.aborted) {
+        await killProcess();
+        break;
+      }
+      if (options.onStdoutChunk) {
+        // Forward only the part of stdout that appeared since the previous poll.
+        const currentStdout = await readFileIfExists(stdoutFile);
+        if (currentStdout.length > lastStdoutSize) {
+          options.onStdoutChunk(currentStdout.slice(lastStdoutSize));
+          lastStdoutSize = currentStdout.length;
+        }
+      }
+      // The exit file is written last by the wrapper, so its presence means "done".
+      if (await fileExists4(exitFile)) {
+        break;
+      }
+      await new Promise((resolve) => setTimeout(resolve, pollInterval));
+    }
+    const stdout = await readFileIfExists(stdoutFile);
+    const stderr = await readFileIfExists(stderrFile);
+    const exitCodeStr = await readFileIfExists(exitFile);
+    const exitCode = exitCodeStr.trim() ? Number.parseInt(exitCodeStr.trim(), 10) : -1;
+    // Flush whatever stdout arrived after the last poll.
+    if (options.onStdoutChunk && stdout.length > lastStdoutSize) {
+      options.onStdoutChunk(stdout.slice(lastStdoutSize));
+    }
+    if (options.onStderrChunk && stderr) {
+      options.onStderrChunk(stderr);
+    }
+    return { stdout, stderr, exitCode, timedOut };
+  } finally {
+    options.signal?.removeEventListener("abort", abortHandler);
+  }
+}
+// src/evaluation/providers/cli.ts
+var import_node_child_process2 = require("child_process");
+var import_promises9 = __toESM(require("fs/promises"), 1);
+var import_node_os2 = __toESM(require("os"), 1);
+var import_node_path10 = __toESM(require("path"), 1);
+var import_node_util = require("util");
+var import_zod = require("zod");
+// One tool call inside an output message: `tool` is required, everything else optional.
+var ToolCallSchema = import_zod.z.object({
+  tool: import_zod.z.string(),
+  input: import_zod.z.unknown().optional(),
+  output: import_zod.z.unknown().optional(),
+  id: import_zod.z.string().optional(),
+  timestamp: import_zod.z.string().optional()
+});
+// One output message as written by the CLI (snake_case `tool_calls`); only `role` is required.
+var OutputMessageInputSchema = import_zod.z.object({
+  role: import_zod.z.string(),
+  name: import_zod.z.string().optional(),
+  content: import_zod.z.unknown().optional(),
+  tool_calls: import_zod.z.array(ToolCallSchema).optional(),
+  timestamp: import_zod.z.string().optional(),
+  metadata: import_zod.z.record(import_zod.z.unknown()).optional()
+});
+// Token counts: `input` and `output` are required numbers, `cached` is optional.
+var TokenUsageSchema = import_zod.z.object({
+  input: import_zod.z.number(),
+  output: import_zod.z.number(),
+  cached: import_zod.z.number().optional()
+});
+// Shape of a single CLI output document; every field is optional.
+var CliOutputSchema = import_zod.z.object({
+  text: import_zod.z.unknown().optional(),
+  output_messages: import_zod.z.array(OutputMessageInputSchema).optional(),
+  token_usage: TokenUsageSchema.optional(),
+  cost_usd: import_zod.z.number().optional(),
+  duration_ms: import_zod.z.number().optional()
+});
+// A batch (JSONL) record: the output shape plus a required, non-empty string `id`.
+var CliJsonlRecordSchema = CliOutputSchema.extend({
+  id: import_zod.z.string().min(1)
+});
+/**
+ * Drops negative cost/duration metrics, logging a warning for each one dropped.
+ *
+ * @param costUsd - Reported cost in USD, or undefined.
+ * @param durationMs - Reported duration in milliseconds, or undefined.
+ * @param context - Short description included in the warning text.
+ * @returns `{ costUsd, durationMs }` with negative values replaced by undefined.
+ */
+function validateMetrics(costUsd, durationMs, context) {
+  const isNegative = (value) => value !== void 0 && value < 0;
+  const checked = { costUsd, durationMs };
+  if (isNegative(costUsd)) {
+    console.warn(`[cli-provider] ${context}: ignoring negative cost_usd value (${costUsd})`);
+    checked.costUsd = void 0;
+  }
+  if (isNegative(durationMs)) {
+    console.warn(`[cli-provider] ${context}: ignoring negative duration_ms value (${durationMs})`);
+    checked.durationMs = void 0;
+  }
+  return checked;
+}
+/**
+ * Converts CLI output messages (snake_case `tool_calls`) into the internal message
+ * shape (camelCase `toolCalls`); all other fields are copied as-is.
+ *
+ * @param messages - Messages from the CLI output, possibly undefined.
+ * @returns The converted messages, or undefined when there are none.
+ */
+function convertOutputMessages(messages) {
+  const hasMessages = Boolean(messages) && messages.length !== 0;
+  if (!hasMessages) {
+    return void 0;
+  }
+  return messages.map(({ role, name, content, tool_calls, timestamp, metadata }) => ({
+    role,
+    name,
+    content,
+    toolCalls: tool_calls,
+    timestamp,
+    metadata
+  }));
+}
+// Promise-returning wrapper around child_process.exec.
+var execAsync = (0, import_node_util.promisify)(import_node_child_process2.exec);
+// Output buffer limit passed to exec as `maxBuffer`: 10 MiB.
+var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
+/**
+ * Default command runner: executes a shell command with `exec` and reports the
+ * outcome as a plain result object instead of throwing.
+ *
+ * On Windows the command is run through powershell.exe; elsewhere the default
+ * shell is used.
+ *
+ * @param command - Full command line to execute.
+ * @param options - `cwd`, `env`, `timeoutMs` and `signal` for the child process.
+ * @returns `{ stdout, stderr, exitCode, failed, timedOut, signal }`. On failure,
+ *   `exitCode` is the numeric error code (or null) and `timedOut` is true when the
+ *   error is flagged as timed out or the process was killed.
+ */
+async function defaultCommandRunner(command, options) {
+  const shell = process.platform === "win32" ? "powershell.exe" : void 0;
+  const execOptions = {
+    cwd: options.cwd,
+    env: options.env,
+    timeout: options.timeoutMs,
+    signal: options.signal,
+    maxBuffer: DEFAULT_MAX_BUFFER,
+    shell
+  };
+  try {
+    const outcome = await execAsync(command, execOptions);
+    return {
+      stdout: outcome.stdout,
+      stderr: outcome.stderr,
+      exitCode: 0,
+      failed: false,
+      timedOut: false,
+      signal: null
+    };
+  } catch (error) {
+    const numericCode = typeof error.code === "number" ? error.code : null;
+    const wasStopped = error.timedOut === true || error.killed === true;
+    return {
+      stdout: error.stdout ?? "",
+      stderr: error.stderr ?? "",
+      exitCode: numericCode,
+      failed: true,
+      timedOut: wasStopped,
+      signal: error.signal ?? null
+    };
+  }
+}
+var CliProvider = class {
+  id;
+  kind = "cli";
+  targetName;
+  supportsBatch = true;
+  config;
+  runCommand;
+  verbose;
+  keepTempFiles;
+  healthcheckPromise;
+  /**
+   * @param targetName - Name of the target; also used to build the provider id `cli:<targetName>`.
+   * @param config - Provider configuration; `verbose` and `keepTempFiles` default to false.
+   * @param runner - Function used to execute commands (defaults to `defaultCommandRunner`).
+   */
+  constructor(targetName, config, runner = defaultCommandRunner) {
+    this.id = `cli:${targetName}`;
+    this.targetName = targetName;
+    this.runCommand = runner;
+    this.config = config;
+    this.keepTempFiles = config.keepTempFiles ?? false;
+    this.verbose = config.verbose ?? false;
+  }
+  async invoke(request) {
+    if (request.signal?.aborted) {
+      throw new Error("CLI provider request was aborted before execution");
+    }
+    await this.ensureHealthy(request.signal);
+    const outputFilePath = generateOutputFilePath(request.evalCaseId);
+    const templateValues = buildTemplateValues(request, this.config, outputFilePath);
+    const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
+    if (this.verbose) {
+      console.log(
+        `[cli-provider:${this.targetName}] cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
+      );
+    }
+    const startTime = Date.now();
+    const result = await this.runCommand(renderedCommand, {
+      cwd: this.config.cwd,
+      env: process.env,
+      timeoutMs: this.config.timeoutMs,
+      signal: request.signal
+    });
+    const measuredDurationMs = Date.now() - startTime;
+    if (result.failed || (result.exitCode ?? 0) !== 0) {
+      if (request.signal?.aborted) {
+        throw new Error("CLI provider request was aborted");
+      }
+      if (result.timedOut) {
+        throw new Error(
+          `CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
+        );
       }
       const codeText = result.exitCode !== null ? result.exitCode : "unknown";
       const detail = result.stderr.trim() || result.stdout.trim();
@@ -2090,7 +3194,7 @@ var CliProvider = class {
       }
       if (result.timedOut) {
         throw new Error(
-          `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
+          `CLI provider timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
         );
       }
       const codeText = result.exitCode !== null ? result.exitCode : "unknown";
@@ -2100,11 +3204,6 @@ var CliProvider = class {
     }
     const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
     const recordsById = this.parseJsonlBatchOutput(responseContent);
-    const requestedIds = requests.map((request) => request.evalCaseId).filter((id) => typeof id === "string" && id.trim().length > 0);
-    const missingIds = requestedIds.filter((id) => !recordsById.has(id));
-    if (missingIds.length > 0) {
-      throw new Error(`CLI batch output missing ids: ${missingIds.join(", ")}`);
-    }
     const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
     const responses = requests.map((request) => {
       const evalCaseId = request.evalCaseId;
@@ -2123,15 +3222,20 @@ var CliProvider = class {
       }
       const parsed = recordsById.get(evalCaseId);
       if (!parsed) {
+        const errorMessage = `Batch output missing id '${evalCaseId}'`;
+        if (this.verbose) {
+          console.warn(`[cli-provider:${this.targetName}] ${errorMessage}`);
+        }
         return {
-          outputMessages: [],
+          outputMessages: [{ role: "assistant", content: `Error: ${errorMessage}` }],
           durationMs: perRequestFallbackMs,
           raw: {
             command: renderedCommand,
             stderr: result.stderr,
             exitCode: result.exitCode ?? 0,
             cwd: this.config.cwd,
-            outputFile: outputFilePath
+            outputFile: outputFilePath,
+            error: errorMessage
           }
         };
       }
@@ -2164,101 +3268,37 @@ var CliProvider = class {
    * - duration_ms: number
    */
   parseOutputContent(content) {
+    let parsed;
     try {
-      const parsed = JSON.parse(content);
-      if (typeof parsed === "object" && parsed !== null) {
-        const obj = parsed;
-        const tokenUsage = this.parseTokenUsage(obj.token_usage);
-        const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
-        const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
-        const outputMessages = this.parseOutputMessages(obj.output_messages);
-        if (outputMessages && outputMessages.length > 0) {
-          return { outputMessages, tokenUsage, costUsd, durationMs };
-        }
-        if ("text" in obj) {
-          const text = typeof obj.text === "string" ? obj.text : String(obj.text);
-          return {
-            outputMessages: [{ role: "assistant", content: text }],
-            tokenUsage,
-            costUsd,
-            durationMs
-          };
-        }
-      }
+      parsed = JSON.parse(content);
     } catch {
+      return { outputMessages: [{ role: "assistant", content }] };
     }
-    return { outputMessages: [{ role: "assistant", content }] };
-  }
-  /**
-   * Parse token_usage from CLI output.
-   */
-  parseTokenUsage(tokenUsage) {
-    if (typeof tokenUsage !== "object" || tokenUsage === null) {
-      return void 0;
-    }
-    const obj = tokenUsage;
-    if (typeof obj.input !== "number" || typeof obj.output !== "number") {
-      return void 0;
-    }
-    return {
-      input: obj.input,
-      output: obj.output,
-      cached: typeof obj.cached === "number" ? obj.cached : void 0
-    };
-  }
-  /**
-   * Parse output_messages from JSONL (snake_case) and convert to OutputMessage[] (camelCase).
-   */
-  parseOutputMessages(outputMessages) {
-    if (!Array.isArray(outputMessages)) {
-      return void 0;
+    const result = CliOutputSchema.safeParse(parsed);
+    if (!result.success) {
+      return { outputMessages: [{ role: "assistant", content }] };
     }
-    const messages = [];
-    for (const msg of outputMessages) {
-      if (typeof msg !== "object" || msg === null) {
-        continue;
-      }
-      const rawMsg = msg;
-      if (typeof rawMsg.role !== "string") {
-        continue;
-      }
-      const message = {
-        role: rawMsg.role,
-        name: typeof rawMsg.name === "string" ? rawMsg.name : void 0,
-        content: rawMsg.content,
-        toolCalls: this.parseToolCalls(rawMsg.tool_calls),
-        timestamp: typeof rawMsg.timestamp === "string" ? rawMsg.timestamp : void 0,
-        metadata: typeof rawMsg.metadata === "object" && rawMsg.metadata !== null ? rawMsg.metadata : void 0
+    const obj = result.data;
+    const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, "parsing output");
+    const outputMessages = convertOutputMessages(obj.output_messages);
+    if (outputMessages && outputMessages.length > 0) {
+      return {
+        outputMessages,
+        tokenUsage: obj.token_usage,
+        costUsd: metrics.costUsd,
+        durationMs: metrics.durationMs
       };
-      messages.push(message);
-    }
-    return messages.length > 0 ? messages : void 0;
-  }
-  /**
-   * Parse tool_calls from JSONL (snake_case) and convert to ToolCall[] format.
-   */
-  parseToolCalls(toolCalls) {
-    if (!Array.isArray(toolCalls)) {
-      return void 0;
     }
-    const calls = [];
-    for (const call of toolCalls) {
-      if (typeof call !== "object" || call === null) {
-        continue;
-      }
-      const rawCall = call;
-      if (typeof rawCall.tool !== "string") {
-        continue;
-      }
-      calls.push({
-        tool: rawCall.tool,
-        input: rawCall.input,
-        output: rawCall.output,
-        id: typeof rawCall.id === "string" ? rawCall.id : void 0,
-        timestamp: typeof rawCall.timestamp === "string" ? rawCall.timestamp : void 0
-      });
+    if (obj.text !== void 0) {
+      const text = typeof obj.text === "string" ? obj.text : String(obj.text);
+      return {
+        outputMessages: [{ role: "assistant", content: text }],
+        tokenUsage: obj.token_usage,
+        costUsd: metrics.costUsd,
+        durationMs: metrics.durationMs
+      };
     }
-    return calls.length > 0 ? calls : void 0;
+    return { outputMessages: [{ role: "assistant", content }] };
   }
   parseJsonlBatchOutput(content) {
     const records = /* @__PURE__ */ new Map();
@@ -2271,33 +3311,32 @@ var CliProvider = class {
         const reason = error instanceof Error ? error.message : String(error);
         throw new Error(`CLI batch output contains invalid JSONL line: ${reason}`);
       }
-      if (typeof parsed !== "object" || parsed === null) {
+      const result = CliJsonlRecordSchema.safeParse(parsed);
+      if (!result.success) {
+        const firstError = result.error.errors[0];
+        if (firstError?.path.includes("id")) {
+          throw new Error("CLI batch output JSONL line missing required string field: id");
+        }
         throw new Error("CLI batch output JSONL line must be an object");
       }
-      const obj = parsed;
-      const id = typeof obj.id === "string" ? obj.id : void 0;
-      if (!id || id.trim().length === 0) {
-        throw new Error("CLI batch output JSONL line missing required string field: id");
-      }
-      if (records.has(id)) {
-        throw new Error(`CLI batch output contains duplicate id: ${id}`);
-      }
-      const tokenUsage = this.parseTokenUsage(obj.token_usage);
-      const costUsd = typeof obj.cost_usd === "number" && obj.cost_usd >= 0 ? obj.cost_usd : void 0;
-      const durationMs = typeof obj.duration_ms === "number" && obj.duration_ms >= 0 ? obj.duration_ms : void 0;
-      const parsedOutputMessages = this.parseOutputMessages(obj.output_messages);
-      let outputMessages;
-      if (parsedOutputMessages && parsedOutputMessages.length > 0) {
-        outputMessages = parsedOutputMessages;
+      const obj = result.data;
+      if (records.has(obj.id)) {
+        throw new Error(`CLI batch output contains duplicate id: ${obj.id}`);
+      }
+      const outputMessages = convertOutputMessages(obj.output_messages);
+      let finalOutputMessages;
+      if (outputMessages && outputMessages.length > 0) {
+        finalOutputMessages = outputMessages;
       } else {
         const text = typeof obj.text === "string" ? obj.text : obj.text === void 0 ? "" : JSON.stringify(obj.text);
-        outputMessages = text ? [{ role: "assistant", content: text }] : [];
-      }
-      records.set(id, {
-        outputMessages,
-        tokenUsage,
-        costUsd,
-        durationMs
+        finalOutputMessages = text ? [{ role: "assistant", content: text }] : [];
+      }
+      const metrics = validateMetrics(obj.cost_usd, obj.duration_ms, `batch record '${obj.id}'`);
+      records.set(obj.id, {
+        outputMessages: finalOutputMessages,
+        tokenUsage: obj.token_usage,
+        costUsd: metrics.costUsd,
+        durationMs: metrics.durationMs
       });
     }
     return records;
@@ -2311,7 +3350,7 @@ var CliProvider = class {
       throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
     } finally {
       if (!this.keepTempFiles) {
-        await import_promises8.default.unlink(filePath).catch(() => {
+        await import_promises9.default.unlink(filePath).catch(() => {
         });
       }
     }
@@ -2383,7 +3422,7 @@ var CliProvider = class {
   }
 };
 function buildTemplateValues(request, config, outputFilePath) {
-  const inputFiles = normalizeInputFiles(request.inputFiles);
+  const inputFiles = normalizeInputFiles2(request.inputFiles);
   return {
     PROMPT: shellEscape(request.question ?? ""),
     GUIDELINES: shellEscape(request.guidelines ?? ""),
@@ -2393,13 +3432,13 @@ function buildTemplateValues(request, config, outputFilePath) {
     OUTPUT_FILE: shellEscape(outputFilePath)
   };
 }
-function normalizeInputFiles(inputFiles) {
+function normalizeInputFiles2(inputFiles) {
   if (!inputFiles || inputFiles.length === 0) {
     return void 0;
   }
   const unique = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = import_node_path8.default.resolve(inputFile);
+    const absolutePath = import_node_path10.default.resolve(inputFile);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -2413,7 +3452,7 @@ function formatFileList(files, template) {
   const formatter = template ?? "{path}";
   return files.map((filePath) => {
     const escapedPath = shellEscape(filePath);
-    const escapedName = shellEscape(import_node_path8.default.basename(filePath));
+    const escapedName = shellEscape(import_node_path10.default.basename(filePath));
     return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
   }).join(" ");
 }
@@ -2437,9 +3476,9 @@ function generateOutputFilePath(evalCaseId, extension = ".json") {
   const safeEvalId = evalCaseId || "unknown";
   const timestamp = Date.now();
   const random = Math.random().toString(36).substring(2, 9);
-  return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
+  return import_node_path10.default.join(import_node_os2.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}${extension}`);
 }
-function formatTimeoutSuffix(timeoutMs) {
+function formatTimeoutSuffix2(timeoutMs) {
   if (!timeoutMs || timeoutMs <= 0) {
     return "";
   }
@@ -2448,39 +3487,39 @@ function formatTimeoutSuffix(timeoutMs) {
 }
 // src/evaluation/providers/codex.ts
-var import_node_child_process2 = require("child_process");
-var import_node_crypto = require("crypto");
-var import_node_fs3 = require("fs");
-var import_promises9 = require("fs/promises");
-var import_node_os2 = require("os");
-var import_node_path10 = __toESM(require("path"), 1);
+var import_node_child_process3 = require("child_process");
+var import_node_crypto2 = require("crypto");
+var import_node_fs4 = require("fs");
+var import_promises10 = require("fs/promises");
+var import_node_os3 = require("os");
+var import_node_path11 = __toESM(require("path"), 1);
 var import_node_util2 = require("util");
 // src/evaluation/providers/codex-log-tracker.ts
-var GLOBAL_LOGS_KEY = Symbol.for("agentv.codexLogs");
-var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.codexLogSubscribers");
+var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.codexLogs");
+var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.codexLogSubscribers");
 function getCodexLogStore() {
   const globalObject = globalThis;
-  const existing = globalObject[GLOBAL_LOGS_KEY];
+  const existing = globalObject[GLOBAL_LOGS_KEY2];
   if (existing) {
     return existing;
   }
   const created = [];
-  globalObject[GLOBAL_LOGS_KEY] = created;
+  globalObject[GLOBAL_LOGS_KEY2] = created;
   return created;
 }
-function getSubscriberStore() {
+function getSubscriberStore2() {
   const globalObject = globalThis;
-  const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY];
+  const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
   if (existing) {
     return existing;
   }
   const created = /* @__PURE__ */ new Set();
-  globalObject[GLOBAL_SUBSCRIBERS_KEY] = created;
+  globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
   return created;
 }
-function notifySubscribers(entry) {
-  const subscribers = Array.from(getSubscriberStore());
+function notifySubscribers2(entry) {
+  const subscribers = Array.from(getSubscriberStore2());
   for (const listener of subscribers) {
     try {
       listener(entry);
@@ -2492,128 +3531,29 @@ function notifySubscribers(entry) {
 }
 function recordCodexLogEntry(entry) {
   getCodexLogStore().push(entry);
-  notifySubscribers(entry);
-}
-function consumeCodexLogEntries() {
-  const store = getCodexLogStore();
-  if (store.length === 0) {
-    return [];
-  }
-  return store.splice(0, store.length);
-}
-function subscribeToCodexLogEntries(listener) {
-  const store = getSubscriberStore();
-  store.add(listener);
-  return () => {
-    store.delete(listener);
-  };
-}
-// src/evaluation/providers/preread.ts
-var import_node_path9 = __toESM(require("path"), 1);
-function buildPromptDocument(request, inputFiles, options) {
-  const parts = [];
-  const guidelineFiles = collectGuidelineFiles(
-    inputFiles,
-    options?.guidelinePatterns ?? request.guideline_patterns,
-    options?.guidelineOverrides
-  );
-  const inputFilesList = collectInputFiles(inputFiles);
-  const nonGuidelineInputFiles = inputFilesList.filter((file) => !guidelineFiles.includes(file));
-  const prereadBlock = buildMandatoryPrereadBlock(guidelineFiles, nonGuidelineInputFiles);
-  if (prereadBlock.length > 0) {
-    parts.push("\n", prereadBlock);
-  }
-  parts.push("\n[[ ## user_query ## ]]\n", request.question.trim());
-  return parts.join("\n").trim();
-}
-function normalizeInputFiles2(inputFiles) {
-  if (!inputFiles || inputFiles.length === 0) {
-    return void 0;
-  }
-  const deduped = /* @__PURE__ */ new Map();
-  for (const inputFile of inputFiles) {
-    const absolutePath = import_node_path9.default.resolve(inputFile);
-    if (!deduped.has(absolutePath)) {
-      deduped.set(absolutePath, absolutePath);
-    }
-  }
-  return Array.from(deduped.values());
-}
-function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
-  if (!inputFiles || inputFiles.length === 0) {
-    return [];
-  }
-  const unique = /* @__PURE__ */ new Map();
-  for (const inputFile of inputFiles) {
-    const absolutePath = import_node_path9.default.resolve(inputFile);
-    if (overrides?.has(absolutePath)) {
-      if (!unique.has(absolutePath)) {
-        unique.set(absolutePath, absolutePath);
-      }
-      continue;
-    }
-    const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
-    if (isGuidelineFile(normalized, guidelinePatterns)) {
-      if (!unique.has(absolutePath)) {
-        unique.set(absolutePath, absolutePath);
-      }
-    }
-  }
-  return Array.from(unique.values());
-}
-function collectInputFiles(inputFiles) {
-  if (!inputFiles || inputFiles.length === 0) {
-    return [];
-  }
-  const unique = /* @__PURE__ */ new Map();
-  for (const inputFile of inputFiles) {
-    const absolutePath = import_node_path9.default.resolve(inputFile);
-    if (!unique.has(absolutePath)) {
-      unique.set(absolutePath, absolutePath);
-    }
-  }
-  return Array.from(unique.values());
-}
-function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
-  if (guidelineFiles.length === 0 && inputFiles.length === 0) {
-    return "";
-  }
-  const buildList = (files) => files.map((absolutePath) => {
-    const fileName = import_node_path9.default.basename(absolutePath);
-    const fileUri = pathToFileUri(absolutePath);
-    return `* [${fileName}](${fileUri})`;
-  });
-  const sections = [];
-  if (guidelineFiles.length > 0) {
-    sections.push(`Read all guideline files:
-${buildList(guidelineFiles).join("\n")}.`);
-  }
-  if (inputFiles.length > 0) {
-    sections.push(`Read all input files:
-${buildList(inputFiles).join("\n")}.`);
-  }
-  sections.push(
-    "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
-    "Then apply system_instructions on the user query below."
-  );
-  return sections.join("\n");
+  notifySubscribers2(entry);
 }
-function pathToFileUri(filePath) {
-  const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
-  const normalizedPath = absolutePath.replace(/\\/g, "/");
-  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
-    return `file:///${normalizedPath}`;
+function consumeCodexLogEntries() {
+  const store = getCodexLogStore();
+  if (store.length === 0) {
+    return [];
   }
-  return `file://${normalizedPath}`;
+  return store.splice(0, store.length);
+}
+function subscribeToCodexLogEntries(listener) {
+  const store = getSubscriberStore2();
+  store.add(listener);
+  return () => {
+    store.delete(listener);
+  };
 }
 // src/evaluation/providers/codex.ts
-var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
-var WORKSPACE_PREFIX = "agentv-codex-";
-var PROMPT_FILENAME = "prompt.md";
+var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process3.exec);
+var WORKSPACE_PREFIX2 = "agentv-codex-";
+var PROMPT_FILENAME2 = "prompt.md";
 var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
-var DEFAULT_SYSTEM_PROMPT2 = `**IMPORTANT**: Follow these instructions for your response:
+var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
 - Do NOT create any additional output files in the workspace.
 - All intended file outputs/changes MUST be written in your response.
 - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
@@ -2638,27 +3578,27 @@ var CodexProvider = class {
       throw new Error("Codex provider request was aborted before execution");
     }
     await this.ensureEnvironmentReady();
-    const inputFiles = normalizeInputFiles2(request.inputFiles);
+    const inputFiles = normalizeInputFiles(request.inputFiles);
     const workspaceRoot = await this.createWorkspace();
     const logger = await this.createStreamLogger(request).catch(() => void 0);
     try {
       const basePrompt = buildPromptDocument(request, inputFiles);
-      const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT2;
+      const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
       const promptContent = `${systemPrompt}
 ${basePrompt}`;
-      const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
-      await (0, import_promises9.writeFile)(promptFile, promptContent, "utf8");
+      const promptFile = import_node_path11.default.join(workspaceRoot, PROMPT_FILENAME2);
+      await (0, import_promises10.writeFile)(promptFile, promptContent, "utf8");
       const args = this.buildCodexArgs();
       const cwd = this.resolveCwd(workspaceRoot);
       const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
       if (result.timedOut) {
         throw new Error(
-          `Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
+          `Codex CLI timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
         );
       }
       if (result.exitCode !== 0) {
-        const detail = pickDetail(result.stderr, result.stdout);
+        const detail = pickDetail2(result.stderr, result.stdout);
         const prefix = `Codex CLI exited with code ${result.exitCode}`;
         throw new Error(detail ? `${prefix}: ${detail}` : prefix);
       }
@@ -2697,7 +3637,7 @@ ${basePrompt}`;
     if (!this.config.cwd) {
       return workspaceRoot;
     }
-    return import_node_path10.default.resolve(this.config.cwd);
+    return import_node_path11.default.resolve(this.config.cwd);
   }
   buildCodexArgs() {
     const args = [
@@ -2739,11 +3679,11 @@ ${basePrompt}`;
     }
   }
   async createWorkspace() {
-    return await (0, import_promises9.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
+    return await (0, import_promises10.mkdtemp)(import_node_path11.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
   }
   async cleanupWorkspace(workspaceRoot) {
     try {
-      await (0, import_promises9.rm)(workspaceRoot, { recursive: true, force: true });
+      await (0, import_promises10.rm)(workspaceRoot, { recursive: true, force: true });
     } catch {
     }
   }
@@ -2753,9 +3693,9 @@ ${basePrompt}`;
       return void 0;
     }
     if (this.config.logDir) {
-      return import_node_path10.default.resolve(this.config.logDir);
+      return import_node_path11.default.resolve(this.config.logDir);
     }
-    return import_node_path10.default.join(process.cwd(), ".agentv", "logs", "codex");
+    return import_node_path11.default.join(process.cwd(), ".agentv", "logs", "codex");
   }
   async createStreamLogger(request) {
     const logDir = this.resolveLogDirectory();
@@ -2763,13 +3703,13 @@ ${basePrompt}`;
       return void 0;
     }
     try {
-      await (0, import_promises9.mkdir)(logDir, { recursive: true });
+      await (0, import_promises10.mkdir)(logDir, { recursive: true });
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
       return void 0;
     }
-    const filePath = import_node_path10.default.join(logDir, buildLogFilename(request, this.targetName));
+    const filePath = import_node_path11.default.join(logDir, buildLogFilename2(request, this.targetName));
     try {
       const logger = await CodexStreamLogger.create({
         filePath,
@@ -2802,7 +3742,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
   constructor(filePath, format) {
     this.filePath = filePath;
     this.format = format;
-    this.stream = (0, import_node_fs3.createWriteStream)(filePath, { flags: "a" });
+    this.stream = (0, import_node_fs4.createWriteStream)(filePath, { flags: "a" });
   }
   static async create(options) {
     const logger = new _CodexStreamLogger(options.filePath, options.format);
@@ -2863,7 +3803,7 @@ var CodexStreamLogger = class _CodexStreamLogger {
       return void 0;
     }
     const message = this.format === "json" ? formatCodexJsonLog(trimmed) : formatCodexLogMessage(trimmed, source);
-    return `[+${formatElapsed(this.startedAt)}] [${source}] ${message}`;
+    return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
   }
   flushRemainder() {
     const stdoutRemainder = this.stdoutBuffer.trim();
@@ -2894,18 +3834,18 @@ function isCodexLogStreamingDisabled() {
   const normalized = envValue.trim().toLowerCase();
   return normalized === "false" || normalized === "0" || normalized === "off";
 }
-function buildLogFilename(request, targetName) {
+function buildLogFilename2(request, targetName) {
   const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
-  const evalId = sanitizeForFilename(request.evalCaseId ?? "codex");
+  const evalId = sanitizeForFilename2(request.evalCaseId ?? "codex");
   const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
-  const target = sanitizeForFilename(targetName);
-  return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto.randomUUID)().slice(0, 8)}.log`;
+  const target = sanitizeForFilename2(targetName);
+  return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto2.randomUUID)().slice(0, 8)}.log`;
 }
-function sanitizeForFilename(value) {
+function sanitizeForFilename2(value) {
   const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
   return sanitized.length > 0 ? sanitized : "codex";
 }
-function formatElapsed(startedAt) {
+function formatElapsed2(startedAt) {
   const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
   const hours = Math.floor(elapsedSeconds / 3600);
   const minutes = Math.floor(elapsedSeconds % 3600 / 60);
@@ -2916,7 +3856,7 @@ function formatElapsed(startedAt) {
   return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
 }
 function formatCodexLogMessage(rawLine, source) {
-  const parsed = tryParseJsonValue(rawLine);
+  const parsed = tryParseJsonValue2(rawLine);
   if (parsed) {
     const summary = summarizeCodexEvent(parsed);
     if (summary) {
@@ -2929,7 +3869,7 @@ function formatCodexLogMessage(rawLine, source) {
   return rawLine;
 }
 function formatCodexJsonLog(rawLine) {
-  const parsed = tryParseJsonValue(rawLine);
+  const parsed = tryParseJsonValue2(rawLine);
   if (!parsed) {
     return rawLine;
   }
@@ -2974,7 +3914,7 @@ function summarizeCodexEvent(event) {
   }
   return type;
 }
-function tryParseJsonValue(rawLine) {
+function tryParseJsonValue2(rawLine) {
   try {
     return JSON.parse(rawLine);
   } catch {
@@ -2984,9 +3924,9 @@ function tryParseJsonValue(rawLine) {
 async function locateExecutable(candidate) {
   const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
   if (includesPathSeparator) {
-    const resolved = import_node_path10.default.isAbsolute(candidate) ? candidate : import_node_path10.default.resolve(candidate);
+    const resolved = import_node_path11.default.isAbsolute(candidate) ? candidate : import_node_path11.default.resolve(candidate);
     const executablePath = await ensureWindowsExecutableVariant(resolved);
-    await (0, import_promises9.access)(executablePath, import_node_fs3.constants.F_OK);
+    await (0, import_promises10.access)(executablePath, import_node_fs4.constants.F_OK);
     return executablePath;
   }
   const locator = process.platform === "win32" ? "where" : "which";
@@ -2996,7 +3936,7 @@ async function locateExecutable(candidate) {
     const preferred = selectExecutableCandidate(lines);
     if (preferred) {
       const executablePath = await ensureWindowsExecutableVariant(preferred);
-      await (0, import_promises9.access)(executablePath, import_node_fs3.constants.F_OK);
+      await (0, import_promises10.access)(executablePath, import_node_fs4.constants.F_OK);
       return executablePath;
     }
   } catch {
@@ -3030,7 +3970,7 @@ async function ensureWindowsExecutableVariant(candidate) {
   for (const ext of extensions) {
     const withExtension = `${candidate}${ext}`;
     try {
-      await (0, import_promises9.access)(withExtension, import_node_fs3.constants.F_OK);
+      await (0, import_promises10.access)(withExtension, import_node_fs4.constants.F_OK);
       return withExtension;
     } catch {
     }
@@ -3203,7 +4143,7 @@ function parseJsonLines(output) {
   }
   return parsed;
 }
-function pickDetail(stderr, stdout) {
+function pickDetail2(stderr, stdout) {
   const errorText = stderr.trim();
   if (errorText.length > 0) {
     return errorText;
@@ -3211,7 +4151,7 @@ function pickDetail(stderr, stdout) {
   const stdoutText = stdout.trim();
   return stdoutText.length > 0 ? stdoutText : void 0;
 }
-function formatTimeoutSuffix2(timeoutMs) {
+function formatTimeoutSuffix3(timeoutMs) {
   if (!timeoutMs || timeoutMs <= 0) {
     return "";
   }
@@ -3220,7 +4160,7 @@ function formatTimeoutSuffix2(timeoutMs) {
 }
 async function defaultCodexRunner(options) {
   return await new Promise((resolve, reject) => {
-    const child = (0, import_node_child_process2.spawn)(options.executable, options.args, {
+    const child = (0, import_node_child_process3.spawn)(options.executable, options.args, {
       cwd: options.cwd,
       env: options.env,
       stdio: ["pipe", "pipe", "pipe"],
@@ -3330,39 +4270,200 @@ var MockProvider = class {
   }
 };
+// src/evaluation/providers/pi-agent-sdk.ts
+var piAgentModule = null;
+var piAiModule = null;
+async function loadPiModules() {
+  if (!piAgentModule || !piAiModule) {
+    try {
+      [piAgentModule, piAiModule] = await Promise.all([
+        import("@mariozechner/pi-agent"),
+        import("@mariozechner/pi-ai")
+      ]);
+    } catch (error) {
+      throw new Error(
+        `Failed to load pi-agent-sdk dependencies. Please install them:
+  npm install @mariozechner/pi-agent @mariozechner/pi-ai
+Original error: ${error instanceof Error ? error.message : String(error)}`
+      );
+    }
+  }
+  return {
+    Agent: piAgentModule.Agent,
+    ProviderTransport: piAgentModule.ProviderTransport,
+    getModel: piAiModule.getModel,
+    getEnvApiKey: piAiModule.getEnvApiKey
+  };
+}
+var PiAgentSdkProvider = class {
+  id;
+  kind = "pi-agent-sdk";
+  targetName;
+  supportsBatch = false;
+  config;
+  constructor(targetName, config) {
+    this.id = `pi-agent-sdk:${targetName}`;
+    this.targetName = targetName;
+    this.config = config;
+  }
+  async invoke(request) {
+    if (request.signal?.aborted) {
+      throw new Error("Pi agent SDK request was aborted before execution");
+    }
+    const { Agent, ProviderTransport, getModel, getEnvApiKey } = await loadPiModules();
+    const startTime = Date.now();
+    const providerName = this.config.provider ?? "anthropic";
+    const modelId = this.config.model ?? "claude-sonnet-4-20250514";
+    const model = getModel(providerName, modelId);
+    const systemPrompt = this.config.systemPrompt ?? "Answer directly and concisely.";
+    const transport = new ProviderTransport({
+      getApiKey: async (provider) => {
+        return this.config.apiKey ?? getEnvApiKey(provider) ?? void 0;
+      }
+    });
+    const agent = new Agent({
+      initialState: {
+        systemPrompt,
+        model,
+        tools: [],
+        // No tools for simple Q&A
+        messages: []
+      },
+      transport
+    });
+    const outputMessages = [];
+    let finalAssistantContent = "";
+    const unsubscribe = agent.subscribe((event) => {
+      if (event.type === "message_end") {
+        const msg = event.message;
+        if (msg.role === "assistant") {
+          const content = extractTextContent2(msg.content);
+          if (content) {
+            finalAssistantContent = content;
+          }
+        }
+      }
+    });
+    try {
+      const timeoutMs = this.config.timeoutMs ?? 12e4;
+      const timeoutPromise = new Promise((_, reject) => {
+        setTimeout(
+          () => reject(new Error(`Pi agent SDK timed out after ${timeoutMs}ms`)),
+          timeoutMs
+        );
+      });
+      await Promise.race([agent.prompt(request.question), timeoutPromise]);
+      await agent.waitForIdle();
+      const agentMessages = agent.state.messages;
+      for (const msg of agentMessages) {
+        outputMessages.push(convertAgentMessage(msg));
+      }
+      const durationMs = Date.now() - startTime;
+      return {
+        raw: {
+          messages: agentMessages,
+          systemPrompt,
+          model: this.config.model,
+          provider: this.config.provider
+        },
+        outputMessages,
+        durationMs
+      };
+    } finally {
+      unsubscribe();
+    }
+  }
+};
+function extractTextContent2(content) {
+  if (typeof content === "string") {
+    return content;
+  }
+  if (!Array.isArray(content)) {
+    return void 0;
+  }
+  const textParts = [];
+  for (const part of content) {
+    if (!part || typeof part !== "object") {
+      continue;
+    }
+    const p = part;
+    if (p.type === "text" && typeof p.text === "string") {
+      textParts.push(p.text);
+    }
+  }
+  return textParts.length > 0 ? textParts.join("\n") : void 0;
+}
+function convertAgentMessage(message) {
+  if (!message || typeof message !== "object") {
+    return { role: "unknown", content: String(message) };
+  }
+  const msg = message;
+  const role = typeof msg.role === "string" ? msg.role : "unknown";
+  const content = extractTextContent2(msg.content);
+  const toolCalls = extractToolCalls2(msg.content);
+  const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
+  return {
+    role,
+    content,
+    toolCalls: toolCalls.length > 0 ? toolCalls : void 0,
+    timestamp
+  };
+}
+function extractToolCalls2(content) {
+  if (!Array.isArray(content)) {
+    return [];
+  }
+  const toolCalls = [];
+  for (const part of content) {
+    if (!part || typeof part !== "object") {
+      continue;
+    }
+    const p = part;
+    if (p.type === "tool_use" && typeof p.name === "string") {
+      toolCalls.push({
+        tool: p.name,
+        input: p.input,
+        id: typeof p.id === "string" ? p.id : void 0
+      });
+    }
+  }
+  return toolCalls;
+}
 // src/evaluation/providers/pi-coding-agent.ts
-var import_node_child_process3 = require("child_process");
-var import_node_crypto2 = require("crypto");
-var import_node_fs4 = require("fs");
-var import_promises10 = require("fs/promises");
-var import_node_os3 = require("os");
-var import_node_path11 = __toESM(require("path"), 1);
+var import_node_child_process4 = require("child_process");
+var import_node_crypto3 = require("crypto");
+var import_node_fs5 = require("fs");
+var import_promises11 = require("fs/promises");
+var import_node_os4 = require("os");
+var import_node_path12 = __toESM(require("path"), 1);
 // src/evaluation/providers/pi-log-tracker.ts
-var GLOBAL_LOGS_KEY2 = Symbol.for("agentv.piLogs");
-var GLOBAL_SUBSCRIBERS_KEY2 = Symbol.for("agentv.piLogSubscribers");
+var GLOBAL_LOGS_KEY3 = Symbol.for("agentv.piLogs");
+var GLOBAL_SUBSCRIBERS_KEY3 = Symbol.for("agentv.piLogSubscribers");
 function getPiLogStore() {
   const globalObject = globalThis;
-  const existing = globalObject[GLOBAL_LOGS_KEY2];
+  const existing = globalObject[GLOBAL_LOGS_KEY3];
   if (existing) {
     return existing;
   }
   const created = [];
-  globalObject[GLOBAL_LOGS_KEY2] = created;
+  globalObject[GLOBAL_LOGS_KEY3] = created;
   return created;
 }
-function getSubscriberStore2() {
+function getSubscriberStore3() {
   const globalObject = globalThis;
-  const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY2];
+  const existing = globalObject[GLOBAL_SUBSCRIBERS_KEY3];
   if (existing) {
     return existing;
   }
   const created = /* @__PURE__ */ new Set();
-  globalObject[GLOBAL_SUBSCRIBERS_KEY2] = created;
+  globalObject[GLOBAL_SUBSCRIBERS_KEY3] = created;
   return created;
 }
-function notifySubscribers2(entry) {
-  const subscribers = Array.from(getSubscriberStore2());
+function notifySubscribers3(entry) {
+  const subscribers = Array.from(getSubscriberStore3());
   for (const listener of subscribers) {
     try {
       listener(entry);
@@ -3374,7 +4475,7 @@ function notifySubscribers2(entry) {
 }
 function recordPiLogEntry(entry) {
   getPiLogStore().push(entry);
-  notifySubscribers2(entry);
+  notifySubscribers3(entry);
 }
 function consumePiLogEntries() {
   const store = getPiLogStore();
@@ -3384,7 +4485,7 @@ function consumePiLogEntries() {
   return store.splice(0, store.length);
 }
 function subscribeToPiLogEntries(listener) {
-  const store = getSubscriberStore2();
+  const store = getSubscriberStore3();
   store.add(listener);
   return () => {
     store.delete(listener);
@@ -3392,9 +4493,9 @@ function subscribeToPiLogEntries(listener) {
 }
 // src/evaluation/providers/pi-coding-agent.ts
-var WORKSPACE_PREFIX2 = "agentv-pi-";
-var PROMPT_FILENAME2 = "prompt.md";
-var DEFAULT_SYSTEM_PROMPT3 = `**IMPORTANT**: Follow these instructions for your response:
+var WORKSPACE_PREFIX3 = "agentv-pi-";
+var PROMPT_FILENAME3 = "prompt.md";
+var DEFAULT_SYSTEM_PROMPT4 = `**IMPORTANT**: Follow these instructions for your response:
 - Do NOT create any additional output files in the workspace.
 - All intended file outputs/changes MUST be written in your response.
 - For each intended file, include the relative path and unified git diff following the convention \`diff --git ...\`.
@@ -3416,27 +4517,27 @@ var PiCodingAgentProvider = class {
     if (request.signal?.aborted) {
       throw new Error("Pi coding agent request was aborted before execution");
     }
-    const inputFiles = normalizeInputFiles2(request.inputFiles);
+    const inputFiles = normalizeInputFiles(request.inputFiles);
     const workspaceRoot = await this.createWorkspace();
     const logger = await this.createStreamLogger(request).catch(() => void 0);
     try {
-      const promptFile = import_node_path11.default.join(workspaceRoot, PROMPT_FILENAME2);
-      await (0, import_promises10.writeFile)(promptFile, request.question, "utf8");
+      const promptFile = import_node_path12.default.join(workspaceRoot, PROMPT_FILENAME3);
+      await (0, import_promises11.writeFile)(promptFile, request.question, "utf8");
       const args = this.buildPiArgs(request.question, inputFiles);
       const cwd = this.resolveCwd(workspaceRoot);
       const result = await this.executePi(args, cwd, request.signal, logger);
       if (result.timedOut) {
         throw new Error(
-          `Pi coding agent timed out${formatTimeoutSuffix3(this.config.timeoutMs ?? void 0)}`
+          `Pi coding agent timed out${formatTimeoutSuffix4(this.config.timeoutMs ?? void 0)}`
         );
       }
       if (result.exitCode !== 0) {
-        const detail = pickDetail2(result.stderr, result.stdout);
+        const detail = pickDetail3(result.stderr, result.stdout);
         const prefix = `Pi coding agent exited with code ${result.exitCode}`;
         throw new Error(detail ? `${prefix}: ${detail}` : prefix);
       }
       const parsed = parsePiJsonl(result.stdout);
-      const outputMessages = extractOutputMessages(parsed);
+      const outputMessages = extractOutputMessages2(parsed);
       const assistantText = extractAssistantText2(outputMessages);
       return {
         raw: {
@@ -3462,7 +4563,7 @@ var PiCodingAgentProvider = class {
     if (!this.config.cwd) {
       return workspaceRoot;
     }
-    return import_node_path11.default.resolve(this.config.cwd);
+    return import_node_path12.default.resolve(this.config.cwd);
   }
   buildPiArgs(prompt, inputFiles) {
     const args = [];
@@ -3492,7 +4593,7 @@ var PiCodingAgentProvider = class {
         args.push(`@${file}`);
       }
     }
-    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT3;
+    const systemPrompt = this.config.systemPrompt ?? DEFAULT_SYSTEM_PROMPT4;
     const fullPrompt = `${systemPrompt}
 ${prompt}`;
@@ -3551,19 +4652,19 @@ ${prompt}`;
     return env;
   }
   async createWorkspace() {
-    return await (0, import_promises10.mkdtemp)(import_node_path11.default.join((0, import_node_os3.tmpdir)(), WORKSPACE_PREFIX2));
+    return await (0, import_promises11.mkdtemp)(import_node_path12.default.join((0, import_node_os4.tmpdir)(), WORKSPACE_PREFIX3));
   }
   async cleanupWorkspace(workspaceRoot) {
     try {
-      await (0, import_promises10.rm)(workspaceRoot, { recursive: true, force: true });
+      await (0, import_promises11.rm)(workspaceRoot, { recursive: true, force: true });
     } catch {
     }
   }
   resolveLogDirectory() {
     if (this.config.logDir) {
-      return import_node_path11.default.resolve(this.config.logDir);
+      return import_node_path12.default.resolve(this.config.logDir);
     }
-    return import_node_path11.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
+    return import_node_path12.default.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
   }
   async createStreamLogger(request) {
     const logDir = this.resolveLogDirectory();
@@ -3571,13 +4672,13 @@ ${prompt}`;
       return void 0;
     }
     try {
-      await (0, import_promises10.mkdir)(logDir, { recursive: true });
+      await (0, import_promises11.mkdir)(logDir, { recursive: true });
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
       return void 0;
     }
-    const filePath = import_node_path11.default.join(logDir, buildLogFilename2(request, this.targetName));
+    const filePath = import_node_path12.default.join(logDir, buildLogFilename3(request, this.targetName));
     try {
       const logger = await PiStreamLogger.create({
         filePath,
@@ -3610,7 +4711,7 @@ var PiStreamLogger = class _PiStreamLogger {
   constructor(filePath, format) {
     this.filePath = filePath;
     this.format = format;
-    this.stream = (0, import_node_fs4.createWriteStream)(filePath, { flags: "a" });
+    this.stream = (0, import_node_fs5.createWriteStream)(filePath, { flags: "a" });
   }
   static async create(options) {
     const logger = new _PiStreamLogger(options.filePath, options.format);
@@ -3671,7 +4772,7 @@ var PiStreamLogger = class _PiStreamLogger {
       return void 0;
     }
     const message = this.format === "json" ? formatPiJsonLog(trimmed) : formatPiLogMessage(trimmed, source);
-    return `[+${formatElapsed2(this.startedAt)}] [${source}] ${message}`;
+    return `[+${formatElapsed3(this.startedAt)}] [${source}] ${message}`;
   }
   flushRemainder() {
     const stdoutRemainder = this.stdoutBuffer.trim();
@@ -3694,18 +4795,18 @@ var PiStreamLogger = class _PiStreamLogger {
     this.stderrBuffer = "";
   }
 };
-function buildLogFilename2(request, targetName) {
+function buildLogFilename3(request, targetName) {
   const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
-  const evalId = sanitizeForFilename2(request.evalCaseId ?? "pi");
+  const evalId = sanitizeForFilename3(request.evalCaseId ?? "pi");
   const attemptSuffix = request.attempt !== void 0 ? `_attempt-${request.attempt + 1}` : "";
-  const target = sanitizeForFilename2(targetName);
-  return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto2.randomUUID)().slice(0, 8)}.log`;
+  const target = sanitizeForFilename3(targetName);
+  return `${timestamp}_${target}_${evalId}${attemptSuffix}_${(0, import_node_crypto3.randomUUID)().slice(0, 8)}.log`;
 }
-function sanitizeForFilename2(value) {
+function sanitizeForFilename3(value) {
   const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
   return sanitized.length > 0 ? sanitized : "pi";
 }
-function formatElapsed2(startedAt) {
+function formatElapsed3(startedAt) {
   const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1e3);
   const hours = Math.floor(elapsedSeconds / 3600);
   const minutes = Math.floor(elapsedSeconds % 3600 / 60);
@@ -3716,7 +4817,7 @@ function formatElapsed2(startedAt) {
   return `${minutes.toString().padStart(2, "0")}:${seconds.toString().padStart(2, "0")}`;
 }
 function formatPiLogMessage(rawLine, source) {
-  const parsed = tryParseJsonValue2(rawLine);
+  const parsed = tryParseJsonValue3(rawLine);
   if (parsed) {
     const summary = summarizePiEvent(parsed);
     if (summary) {
@@ -3729,7 +4830,7 @@ function formatPiLogMessage(rawLine, source) {
   return rawLine;
 }
 function formatPiJsonLog(rawLine) {
-  const parsed = tryParseJsonValue2(rawLine);
+  const parsed = tryParseJsonValue3(rawLine);
   if (!parsed) {
     return rawLine;
   }
@@ -3779,7 +4880,7 @@ function summarizePiEvent(event) {
       return type;
   }
 }
-function tryParseJsonValue2(rawLine) {
+function tryParseJsonValue3(rawLine) {
   try {
     return JSON.parse(rawLine);
   } catch {
@@ -3804,7 +4905,7 @@ function parsePiJsonl(output) {
   }
   return parsed;
 }
-function extractOutputMessages(events) {
+function extractOutputMessages2(events) {
   for (let i = events.length - 1; i >= 0; i--) {
     const event = events[i];
     if (!event || typeof event !== "object") {
@@ -3845,8 +4946,8 @@ function convertPiMessage(message) {
   if (typeof role !== "string") {
     return void 0;
   }
-  const content = extractTextContent(msg.content);
-  const toolCalls = extractToolCalls(msg.content);
+  const content = extractTextContent3(msg.content);
+  const toolCalls = extractToolCalls3(msg.content);
   const timestamp = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
   const metadata = {};
   if (msg.api) metadata.api = msg.api;
@@ -3862,7 +4963,7 @@ function convertPiMessage(message) {
     metadata: Object.keys(metadata).length > 0 ? metadata : void 0
   };
 }
-function extractTextContent(content) {
+function extractTextContent3(content) {
   if (typeof content === "string") {
     return content;
   }
@@ -3881,7 +4982,7 @@ function extractTextContent(content) {
   }
   return textParts.length > 0 ? textParts.join("\n") : void 0;
 }
-function extractToolCalls(content) {
+function extractToolCalls3(content) {
   if (!Array.isArray(content)) {
     return [];
   }
@@ -3926,7 +5027,7 @@ function extractAssistantText2(messages) {
 function escapeAtSymbols(prompt) {
   return prompt.replace(/@\[([^\]]+)\]:/g, "[[$1]]:");
 }
-function pickDetail2(stderr, stdout) {
+function pickDetail3(stderr, stdout) {
   const errorText = stderr.trim();
   if (errorText.length > 0) {
     return errorText;
@@ -3934,7 +5035,7 @@ function pickDetail2(stderr, stdout) {
   const stdoutText = stdout.trim();
   return stdoutText.length > 0 ? stdoutText : void 0;
 }
-function formatTimeoutSuffix3(timeoutMs) {
+function formatTimeoutSuffix4(timeoutMs) {
   if (!timeoutMs || timeoutMs <= 0) {
     return "";
   }
@@ -3947,7 +5048,7 @@ async function defaultPiRunner(options) {
     const executable = parts[0];
     const executableArgs = parts.slice(1);
     const allArgs = [...executableArgs, ...options.args];
-    const child = (0, import_node_child_process3.spawn)(executable, allArgs, {
+    const child = (0, import_node_child_process4.spawn)(executable, allArgs, {
       cwd: options.cwd,
       env: options.env,
       stdio: ["pipe", "pipe", "pipe"],
@@ -4010,84 +5111,84 @@ async function defaultPiRunner(options) {
 }
 // src/evaluation/providers/targets.ts
-var import_node_path12 = __toESM(require("path"), 1);
-var import_zod = require("zod");
-var CliHealthcheckHttpInputSchema = import_zod.z.object({
-  type: import_zod.z.literal("http"),
-  url: import_zod.z.string().min(1, "healthcheck URL is required"),
-  timeout_seconds: import_zod.z.number().positive().optional(),
-  timeoutSeconds: import_zod.z.number().positive().optional()
+var import_node_path13 = __toESM(require("path"), 1);
+var import_zod2 = require("zod");
+var CliHealthcheckHttpInputSchema = import_zod2.z.object({
+  type: import_zod2.z.literal("http"),
+  url: import_zod2.z.string().min(1, "healthcheck URL is required"),
+  timeout_seconds: import_zod2.z.number().positive().optional(),
+  timeoutSeconds: import_zod2.z.number().positive().optional()
 });
-var CliHealthcheckCommandInputSchema = import_zod.z.object({
-  type: import_zod.z.literal("command"),
-  command_template: import_zod.z.string().optional(),
-  commandTemplate: import_zod.z.string().optional(),
-  cwd: import_zod.z.string().optional(),
-  timeout_seconds: import_zod.z.number().positive().optional(),
-  timeoutSeconds: import_zod.z.number().positive().optional()
+var CliHealthcheckCommandInputSchema = import_zod2.z.object({
+  type: import_zod2.z.literal("command"),
+  command_template: import_zod2.z.string().optional(),
+  commandTemplate: import_zod2.z.string().optional(),
+  cwd: import_zod2.z.string().optional(),
+  timeout_seconds: import_zod2.z.number().positive().optional(),
+  timeoutSeconds: import_zod2.z.number().positive().optional()
 });
-var CliHealthcheckInputSchema = import_zod.z.discriminatedUnion("type", [
+var CliHealthcheckInputSchema = import_zod2.z.discriminatedUnion("type", [
   CliHealthcheckHttpInputSchema,
   CliHealthcheckCommandInputSchema
 ]);
-var CliTargetInputSchema = import_zod.z.object({
-  name: import_zod.z.string().min(1, "target name is required"),
-  provider: import_zod.z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
+var CliTargetInputSchema = import_zod2.z.object({
+  name: import_zod2.z.string().min(1, "target name is required"),
+  provider: import_zod2.z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
   // Command template - required (accept both naming conventions)
-  command_template: import_zod.z.string().optional(),
-  commandTemplate: import_zod.z.string().optional(),
+  command_template: import_zod2.z.string().optional(),
+  commandTemplate: import_zod2.z.string().optional(),
   // Files format - optional
-  files_format: import_zod.z.string().optional(),
-  filesFormat: import_zod.z.string().optional(),
-  attachments_format: import_zod.z.string().optional(),
-  attachmentsFormat: import_zod.z.string().optional(),
+  files_format: import_zod2.z.string().optional(),
+  filesFormat: import_zod2.z.string().optional(),
+  attachments_format: import_zod2.z.string().optional(),
+  attachmentsFormat: import_zod2.z.string().optional(),
   // Working directory - optional
-  cwd: import_zod.z.string().optional(),
+  cwd: import_zod2.z.string().optional(),
   // Timeout in seconds - optional
-  timeout_seconds: import_zod.z.number().positive().optional(),
-  timeoutSeconds: import_zod.z.number().positive().optional(),
+  timeout_seconds: import_zod2.z.number().positive().optional(),
+  timeoutSeconds: import_zod2.z.number().positive().optional(),
   // Healthcheck configuration - optional
   healthcheck: CliHealthcheckInputSchema.optional(),
   // Verbose mode - optional
-  verbose: import_zod.z.boolean().optional(),
-  cli_verbose: import_zod.z.boolean().optional(),
-  cliVerbose: import_zod.z.boolean().optional(),
+  verbose: import_zod2.z.boolean().optional(),
+  cli_verbose: import_zod2.z.boolean().optional(),
+  cliVerbose: import_zod2.z.boolean().optional(),
   // Keep temp files - optional
-  keep_temp_files: import_zod.z.boolean().optional(),
-  keepTempFiles: import_zod.z.boolean().optional(),
-  keep_output_files: import_zod.z.boolean().optional(),
-  keepOutputFiles: import_zod.z.boolean().optional(),
+  keep_temp_files: import_zod2.z.boolean().optional(),
+  keepTempFiles: import_zod2.z.boolean().optional(),
+  keep_output_files: import_zod2.z.boolean().optional(),
+  keepOutputFiles: import_zod2.z.boolean().optional(),
   // Common target fields
-  judge_target: import_zod.z.string().optional(),
-  workers: import_zod.z.number().int().min(1).optional(),
-  provider_batching: import_zod.z.boolean().optional(),
-  providerBatching: import_zod.z.boolean().optional()
+  judge_target: import_zod2.z.string().optional(),
+  workers: import_zod2.z.number().int().min(1).optional(),
+  provider_batching: import_zod2.z.boolean().optional(),
+  providerBatching: import_zod2.z.boolean().optional()
 }).refine((data) => data.command_template !== void 0 || data.commandTemplate !== void 0, {
   message: "Either command_template or commandTemplate is required"
 });
-var CliHealthcheckHttpSchema = import_zod.z.object({
-  type: import_zod.z.literal("http"),
-  url: import_zod.z.string().min(1),
-  timeoutMs: import_zod.z.number().positive().optional()
+var CliHealthcheckHttpSchema = import_zod2.z.object({
+  type: import_zod2.z.literal("http"),
+  url: import_zod2.z.string().min(1),
+  timeoutMs: import_zod2.z.number().positive().optional()
 }).strict();
-var CliHealthcheckCommandSchema = import_zod.z.object({
-  type: import_zod.z.literal("command"),
-  commandTemplate: import_zod.z.string().min(1),
-  cwd: import_zod.z.string().optional(),
-  timeoutMs: import_zod.z.number().positive().optional()
+var CliHealthcheckCommandSchema = import_zod2.z.object({
+  type: import_zod2.z.literal("command"),
+  commandTemplate: import_zod2.z.string().min(1),
+  cwd: import_zod2.z.string().optional(),
+  timeoutMs: import_zod2.z.number().positive().optional()
 }).strict();
-var CliHealthcheckSchema = import_zod.z.discriminatedUnion("type", [
+var CliHealthcheckSchema = import_zod2.z.discriminatedUnion("type", [
   CliHealthcheckHttpSchema,
   CliHealthcheckCommandSchema
 ]);
-var CliTargetConfigSchema = import_zod.z.object({
-  commandTemplate: import_zod.z.string().min(1),
-  filesFormat: import_zod.z.string().optional(),
-  cwd: import_zod.z.string().optional(),
-  timeoutMs: import_zod.z.number().positive().optional(),
+var CliTargetConfigSchema = import_zod2.z.object({
+  commandTemplate: import_zod2.z.string().min(1),
+  filesFormat: import_zod2.z.string().optional(),
+  cwd: import_zod2.z.string().optional(),
+  timeoutMs: import_zod2.z.number().positive().optional(),
   healthcheck: CliHealthcheckSchema.optional(),
-  verbose: import_zod.z.boolean().optional(),
-  keepTempFiles: import_zod.z.boolean().optional()
+  verbose: import_zod2.z.boolean().optional(),
+  keepTempFiles: import_zod2.z.boolean().optional()
 }).strict();
 function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
   const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
@@ -4116,8 +5217,11 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
     allowLiteral: true,
     optionalEnv: true
   });
-  if (cwd && evalFilePath && !import_node_path12.default.isAbsolute(cwd)) {
-    cwd = import_node_path12.default.resolve(import_node_path12.default.dirname(import_node_path12.default.resolve(evalFilePath)), cwd);
+  if (cwd && evalFilePath && !import_node_path13.default.isAbsolute(cwd)) {
+    cwd = import_node_path13.default.resolve(import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath)), cwd);
+  }
+  if (!cwd && evalFilePath) {
+    cwd = import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath));
   }
   return {
     type: "command",
@@ -4144,11 +5248,11 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
     allowLiteral: true,
     optionalEnv: true
   });
-  if (cwd && evalFilePath && !import_node_path12.default.isAbsolute(cwd)) {
-    cwd = import_node_path12.default.resolve(import_node_path12.default.dirname(import_node_path12.default.resolve(evalFilePath)), cwd);
+  if (cwd && evalFilePath && !import_node_path13.default.isAbsolute(cwd)) {
+    cwd = import_node_path13.default.resolve(import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath)), cwd);
   }
   if (!cwd && evalFilePath) {
-    cwd = import_node_path12.default.dirname(import_node_path12.default.resolve(evalFilePath));
+    cwd = import_node_path13.default.dirname(import_node_path13.default.resolve(evalFilePath));
   }
   const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
   const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
@@ -4175,11 +5279,11 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
   "FILES",
   "OUTPUT_FILE"
 ]);
-var BASE_TARGET_SCHEMA = import_zod.z.object({
-  name: import_zod.z.string().min(1, "target name is required"),
-  provider: import_zod.z.string().min(1, "provider is required"),
-  judge_target: import_zod.z.string().optional(),
-  workers: import_zod.z.number().int().min(1).optional()
+var BASE_TARGET_SCHEMA = import_zod2.z.object({
+  name: import_zod2.z.string().min(1, "target name is required"),
+  provider: import_zod2.z.string().min(1, "provider is required"),
+  judge_target: import_zod2.z.string().optional(),
+  workers: import_zod2.z.number().int().min(1).optional()
 }).passthrough();
 var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
 function normalizeAzureApiVersion(value) {
@@ -4282,6 +5386,24 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
         providerBatching,
         config: resolvePiCodingAgentConfig(parsed, env)
       };
+    case "pi-agent-sdk":
+      return {
+        kind: "pi-agent-sdk",
+        name: parsed.name,
+        judgeTarget: parsed.judge_target,
+        workers: parsed.workers,
+        providerBatching,
+        config: resolvePiAgentSdkConfig(parsed, env)
+      };
+    case "claude-code":
+      return {
+        kind: "claude-code",
+        name: parsed.name,
+        judgeTarget: parsed.judge_target,
+        workers: parsed.workers,
+        providerBatching,
+        config: resolveClaudeCodeConfig(parsed, env)
+      };
     case "mock":
       return {
         kind: "mock",
@@ -4459,41 +5581,132 @@ function resolvePiCodingAgentConfig(target, env) {
     allowLiteral: false,
     optionalEnv: true
   });
-  const tools = resolveOptionalString(toolsSource, env, `${target.name} pi tools`, {
-    allowLiteral: true,
+  const tools = resolveOptionalString(toolsSource, env, `${target.name} pi tools`, {
+    allowLiteral: true,
+    optionalEnv: true
+  });
+  const thinking = resolveOptionalString(thinkingSource, env, `${target.name} pi thinking`, {
+    allowLiteral: true,
+    optionalEnv: true
+  });
+  const args = resolveOptionalStringArray(argsSource, env, `${target.name} pi args`);
+  const cwd = resolveOptionalString(cwdSource, env, `${target.name} pi cwd`, {
+    allowLiteral: true,
+    optionalEnv: true
+  });
+  const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi timeout`);
+  const logDir = resolveOptionalString(logDirSource, env, `${target.name} pi log directory`, {
+    allowLiteral: true,
+    optionalEnv: true
+  });
+  const logFormat = logFormatSource === "json" || logFormatSource === "summary" ? logFormatSource : void 0;
+  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
+  return {
+    executable,
+    provider,
+    model,
+    apiKey,
+    tools,
+    thinking,
+    args,
+    cwd,
+    timeoutMs,
+    logDir,
+    logFormat,
+    systemPrompt
+  };
+}
+function resolvePiAgentSdkConfig(target, env) {
+  const providerSource = target.pi_provider ?? target.piProvider ?? target.llm_provider;
+  const modelSource = target.model ?? target.pi_model ?? target.piModel;
+  const apiKeySource = target.api_key ?? target.apiKey;
+  const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
+  const systemPromptSource = target.system_prompt ?? target.systemPrompt;
+  const provider = resolveOptionalString(
+    providerSource,
+    env,
+    `${target.name} pi-agent-sdk provider`,
+    {
+      allowLiteral: true,
+      optionalEnv: true
+    }
+  );
+  const model = resolveOptionalString(modelSource, env, `${target.name} pi-agent-sdk model`, {
+    allowLiteral: true,
+    optionalEnv: true
+  });
+  const apiKey = resolveOptionalString(apiKeySource, env, `${target.name} pi-agent-sdk api key`, {
+    allowLiteral: false,
     optionalEnv: true
   });
-  const thinking = resolveOptionalString(thinkingSource, env, `${target.name} pi thinking`, {
+  const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi-agent-sdk timeout`);
+  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
+  return {
+    provider,
+    model,
+    apiKey,
+    timeoutMs,
+    systemPrompt
+  };
+}
+function resolveClaudeCodeConfig(target, env) {
+  const executableSource = target.executable ?? target.command ?? target.binary;
+  const modelSource = target.model;
+  const argsSource = target.args ?? target.arguments;
+  const cwdSource = target.cwd;
+  const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
+  const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
+  const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_CODE_LOG_FORMAT;
+  const systemPromptSource = target.system_prompt ?? target.systemPrompt;
+  const executable = resolveOptionalString(executableSource, env, `${target.name} claude-code executable`, {
     allowLiteral: true,
     optionalEnv: true
-  });
-  const args = resolveOptionalStringArray(argsSource, env, `${target.name} pi args`);
-  const cwd = resolveOptionalString(cwdSource, env, `${target.name} pi cwd`, {
+  }) ?? "claude";
+  const model = resolveOptionalString(modelSource, env, `${target.name} claude-code model`, {
     allowLiteral: true,
     optionalEnv: true
   });
-  const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} pi timeout`);
-  const logDir = resolveOptionalString(logDirSource, env, `${target.name} pi log directory`, {
+  const args = resolveOptionalStringArray(argsSource, env, `${target.name} claude-code args`);
+  const cwd = resolveOptionalString(cwdSource, env, `${target.name} claude-code cwd`, {
     allowLiteral: true,
     optionalEnv: true
   });
-  const logFormat = logFormatSource === "json" || logFormatSource === "summary" ? logFormatSource : void 0;
+  const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} claude-code timeout`);
+  const logDir = resolveOptionalString(
+    logDirSource,
+    env,
+    `${target.name} claude-code log directory`,
+    {
+      allowLiteral: true,
+      optionalEnv: true
+    }
+  );
+  const logFormat = normalizeClaudeCodeLogFormat(logFormatSource);
   const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
   return {
     executable,
-    provider,
     model,
-    apiKey,
-    tools,
-    thinking,
+    systemPrompt,
     args,
     cwd,
     timeoutMs,
     logDir,
-    logFormat,
-    systemPrompt
+    logFormat
   };
 }
+function normalizeClaudeCodeLogFormat(value) {
+  if (value === void 0 || value === null) {
+    return void 0;
+  }
+  if (typeof value !== "string") {
+    throw new Error("claude-code log format must be 'summary' or 'json'");
+  }
+  const normalized = value.trim().toLowerCase();
+  if (normalized === "json" || normalized === "summary") {
+    return normalized;
+  }
+  throw new Error("claude-code log format must be 'summary' or 'json'");
+}
 function resolveMockConfig(target) {
   const response = typeof target.response === "string" ? target.response : void 0;
   return { response };
@@ -4529,13 +5742,13 @@ function resolveVSCodeConfig(target, env, insiders) {
   };
 }
 var cliErrorMap = (issue, ctx) => {
-  if (issue.code === import_zod.z.ZodIssueCode.unrecognized_keys) {
+  if (issue.code === import_zod2.z.ZodIssueCode.unrecognized_keys) {
     return { message: `Unknown CLI provider settings: ${issue.keys.join(", ")}` };
   }
-  if (issue.code === import_zod.z.ZodIssueCode.invalid_union_discriminator) {
+  if (issue.code === import_zod2.z.ZodIssueCode.invalid_union_discriminator) {
     return { message: "healthcheck type must be 'http' or 'command'" };
   }
-  if (issue.code === import_zod.z.ZodIssueCode.invalid_type && issue.expected === "string") {
+  if (issue.code === import_zod2.z.ZodIssueCode.invalid_type && issue.expected === "string") {
     return { message: `${ctx.defaultError} (expected a string value)` };
   }
   return { message: ctx.defaultError };
@@ -4544,8 +5757,8 @@ function resolveCliConfig(target, env, evalFilePath) {
   const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
   if (!parseResult.success) {
     const firstError = parseResult.error.errors[0];
-    const path16 = firstError?.path.join(".") || "";
-    const prefix = path16 ? `${target.name} ${path16}: ` : `${target.name}: `;
+    const path17 = firstError?.path.join(".") || "";
+    const prefix = path17 ? `${target.name} ${path17}: ` : `${target.name}: `;
     throw new Error(`${prefix}${firstError?.message}`);
   }
   const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -4733,7 +5946,7 @@ function resolveOptionalNumberArray(source, description) {
 }
 // src/evaluation/providers/vscode.ts
-var import_node_path13 = __toESM(require("path"), 1);
+var import_node_path14 = __toESM(require("path"), 1);
 var import_subagent = require("subagent");
 // src/evaluation/providers/vscode-templates.ts
@@ -4903,7 +6116,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
     return "";
   }
   const buildList = (files) => files.map((absolutePath) => {
-    const fileName = import_node_path13.default.basename(absolutePath);
+    const fileName = import_node_path14.default.basename(absolutePath);
     const fileUri = pathToFileUri2(absolutePath);
     return `* [${fileName}](${fileUri})`;
   });
@@ -4928,8 +6141,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = import_node_path13.default.resolve(attachment);
-    const normalized = absolutePath.split(import_node_path13.default.sep).join("/");
+    const absolutePath = import_node_path14.default.resolve(attachment);
+    const normalized = absolutePath.split(import_node_path14.default.sep).join("/");
     if (isGuidelineFile(normalized, guidelinePatterns)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
@@ -4944,7 +6157,7 @@ function collectAttachmentFiles(attachments) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = import_node_path13.default.resolve(attachment);
+    const absolutePath = import_node_path14.default.resolve(attachment);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -4952,7 +6165,7 @@ function collectAttachmentFiles(attachments) {
   return Array.from(unique.values());
 }
 function pathToFileUri2(filePath) {
-  const absolutePath = import_node_path13.default.isAbsolute(filePath) ? filePath : import_node_path13.default.resolve(filePath);
+  const absolutePath = import_node_path14.default.isAbsolute(filePath) ? filePath : import_node_path14.default.resolve(filePath);
   const normalizedPath = absolutePath.replace(/\\/g, "/");
   if (/^[a-zA-Z]:\//.test(normalizedPath)) {
     return `file:///${normalizedPath}`;
@@ -4965,7 +6178,7 @@ function normalizeAttachments(attachments) {
   }
   const deduped = /* @__PURE__ */ new Set();
   for (const attachment of attachments) {
-    deduped.add(import_node_path13.default.resolve(attachment));
+    deduped.add(import_node_path14.default.resolve(attachment));
   }
   return Array.from(deduped);
 }
@@ -4974,7 +6187,7 @@ function mergeAttachments(all) {
   for (const list of all) {
     if (!list) continue;
     for (const inputFile of list) {
-      deduped.add(import_node_path13.default.resolve(inputFile));
+      deduped.add(import_node_path14.default.resolve(inputFile));
     }
   }
   return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -5021,9 +6234,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
 }
 // src/evaluation/providers/targets-file.ts
-var import_node_fs5 = require("fs");
-var import_promises11 = require("fs/promises");
-var import_node_path14 = __toESM(require("path"), 1);
+var import_node_fs6 = require("fs");
+var import_promises12 = require("fs/promises");
+var import_node_path15 = __toESM(require("path"), 1);
 var import_yaml3 = require("yaml");
 function isRecord(value) {
   return typeof value === "object" && value !== null && !Array.isArray(value);
@@ -5053,18 +6266,18 @@ function assertTargetDefinition(value, index, filePath) {
 }
 async function fileExists3(filePath) {
   try {
-    await (0, import_promises11.access)(filePath, import_node_fs5.constants.F_OK);
+    await (0, import_promises12.access)(filePath, import_node_fs6.constants.F_OK);
     return true;
   } catch {
     return false;
   }
 }
 async function readTargetDefinitions(filePath) {
-  const absolutePath = import_node_path14.default.resolve(filePath);
+  const absolutePath = import_node_path15.default.resolve(filePath);
   if (!await fileExists3(absolutePath)) {
     throw new Error(`targets.yaml not found at ${absolutePath}`);
   }
-  const raw = await (0, import_promises11.readFile)(absolutePath, "utf8");
+  const raw = await (0, import_promises12.readFile)(absolutePath, "utf8");
   const parsed = (0, import_yaml3.parse)(raw);
   if (!isRecord(parsed)) {
     throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with a 'targets' field`);
@@ -5094,6 +6307,10 @@ function createProvider(target) {
       return new CodexProvider(target.name, target.config);
     case "pi-coding-agent":
       return new PiCodingAgentProvider(target.name, target.config);
+    case "pi-agent-sdk":
+      return new PiAgentSdkProvider(target.name, target.config);
+    case "claude-code":
+      return new ClaudeCodeProvider(target.name, target.config);
     case "mock":
       return new MockProvider(target.name, target.config);
     case "vscode":
@@ -5112,78 +6329,176 @@ function resolveAndCreateProvider(definition, env = process.env) {
 // src/evaluation/evaluators.ts
 var import_ai2 = require("ai");
-var import_zod2 = require("zod");
+var import_zod3 = require("zod");
 // src/runtime/exec.ts
-function getBunSpawn() {
-  const bunSpawn = globalThis.Bun?.spawn;
-  return typeof bunSpawn === "function" ? bunSpawn : void 0;
+function shellEscapePath(value) {
+  if (process.platform === "win32") {
+    return `"${value.replaceAll('"', '""')}"`;
+  }
+  return `'${value.replaceAll("'", `'"'"'`)}'`;
 }
-async function execShellWithStdin(command, stdinPayload, options = {}) {
-  const bunSpawn = getBunSpawn();
-  if (bunSpawn) {
-    const encoder = new TextEncoder();
-    const proc = bunSpawn({
-      cmd: ["sh", "-c", command],
-      cwd: options.cwd,
-      stdin: encoder.encode(stdinPayload),
-      stdout: "pipe",
-      stderr: "pipe"
-    });
-    const timeout = options.timeoutMs ? setTimeout(() => {
-      proc.kill();
-    }, options.timeoutMs) : void 0;
-    try {
-      const stdout = await new Response(proc.stdout).text();
-      const stderr = await new Response(proc.stderr).text();
-      const exitCode = await proc.exited;
-      return { stdout, stderr, exitCode };
-    } finally {
-      if (timeout !== void 0) {
-        clearTimeout(timeout);
-      }
+async function execFileWithStdin(argv, stdinPayload, options = {}) {
+  if (argv.length === 0) {
+    throw new Error("Executable argv must include at least one entry");
+  }
+  if (typeof Bun !== "undefined") {
+    return execFileWithStdinBun(argv, stdinPayload, options);
+  }
+  return execFileWithStdinNode(argv, stdinPayload, options);
+}
+async function execFileWithStdinBun(argv, stdinPayload, options) {
+  const command = [...argv];
+  const encoder = new TextEncoder();
+  const proc = Bun.spawn(command, {
+    cwd: options.cwd,
+    stdin: encoder.encode(stdinPayload),
+    stdout: "pipe",
+    stderr: "pipe"
+  });
+  let timedOut = false;
+  const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
+    timedOut = true;
+    proc.kill("SIGKILL");
+  }, options.timeoutMs) : void 0;
+  try {
+    const stdoutPromise = proc.stdout ? new Response(proc.stdout).text() : Promise.resolve("");
+    const stderrPromise = proc.stderr ? new Response(proc.stderr).text() : Promise.resolve("");
+    const [stdout, stderr, exitCode] = await Promise.all([
+      stdoutPromise,
+      stderrPromise,
+      proc.exited
+    ]);
+    if (timedOut) {
+      throw new Error(`Process timed out after ${options.timeoutMs}ms`);
+    }
+    return {
+      stdout: stdout.replace(/\r\n/g, "\n"),
+      stderr: stderr.replace(/\r\n/g, "\n"),
+      exitCode
+    };
+  } finally {
+    if (timeout !== void 0) {
+      clearTimeout(timeout);
     }
   }
-  const { spawn: spawn3 } = await import("child_process");
-  return await new Promise((resolve, reject) => {
-    const child = spawn3(command, {
-      shell: true,
+}
+async function execFileWithStdinNode(argv, stdinPayload, options) {
+  const { spawn: spawn4 } = await import("child_process");
+  return new Promise((resolve, reject) => {
+    const [cmd, ...args] = argv;
+    const child = spawn4(cmd, args, {
       cwd: options.cwd,
       stdio: ["pipe", "pipe", "pipe"]
     });
-    let stdout = "";
-    let stderr = "";
-    const timeout = options.timeoutMs ? setTimeout(() => {
-      child.kill();
-      reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
+    const stdoutChunks = [];
+    const stderrChunks = [];
+    child.stdout?.on("data", (chunk) => stdoutChunks.push(chunk));
+    child.stderr?.on("data", (chunk) => stderrChunks.push(chunk));
+    let timedOut = false;
+    const timeout = options.timeoutMs !== void 0 ? setTimeout(() => {
+      timedOut = true;
+      child.kill("SIGKILL");
     }, options.timeoutMs) : void 0;
-    child.stdout?.on("data", (data) => {
-      stdout += data.toString();
-    });
-    child.stderr?.on("data", (data) => {
-      stderr += data.toString();
-    });
     child.on("error", (error) => {
-      if (timeout !== void 0) {
-        clearTimeout(timeout);
-      }
+      if (timeout !== void 0) clearTimeout(timeout);
       reject(error);
     });
-    child.on("exit", (code) => {
-      if (timeout !== void 0) {
-        clearTimeout(timeout);
+    child.on("close", (code) => {
+      if (timeout !== void 0) clearTimeout(timeout);
+      if (timedOut) {
+        reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
+        return;
       }
-      resolve({ stdout, stderr, exitCode: code ?? 0 });
+      const stdout = Buffer.concat(stdoutChunks).toString("utf8").replace(/\r\n/g, "\n");
+      const stderr = Buffer.concat(stderrChunks).toString("utf8").replace(/\r\n/g, "\n");
+      resolve({
+        stdout,
+        stderr,
+        exitCode: code ?? 0
+      });
     });
-    child.stdin?.write(stdinPayload);
-    child.stdin?.end();
+    if (child.stdin) {
+      child.stdin.write(stdinPayload);
+      child.stdin.end();
+    }
   });
 }
+async function execShellWithStdin(command, stdinPayload, options = {}) {
+  const { mkdir: mkdir4, readFile: readFile8, rm: rm4, writeFile: writeFile4 } = await import("fs/promises");
+  const { tmpdir: tmpdir4 } = await import("os");
+  const path17 = await import("path");
+  const { randomUUID: randomUUID4 } = await import("crypto");
+  const dir = path17.join(tmpdir4(), `agentv-exec-${randomUUID4()}`);
+  await mkdir4(dir, { recursive: true });
+  const stdinPath = path17.join(dir, "stdin.txt");
+  const stdoutPath = path17.join(dir, "stdout.txt");
+  const stderrPath = path17.join(dir, "stderr.txt");
+  await writeFile4(stdinPath, stdinPayload, "utf8");
+  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
+  const { spawn: spawn4 } = await import("child_process");
+  try {
+    const exitCode = await new Promise((resolve, reject) => {
+      const child = spawn4(wrappedCommand, {
+        shell: true,
+        cwd: options.cwd,
+        stdio: ["ignore", "ignore", "ignore"]
+      });
+      const timeout = options.timeoutMs ? setTimeout(() => {
+        child.kill();
+        reject(new Error(`Process timed out after ${options.timeoutMs}ms`));
+      }, options.timeoutMs) : void 0;
+      child.on("error", (error) => {
+        if (timeout !== void 0) {
+          clearTimeout(timeout);
+        }
+        reject(error);
+      });
+      child.on("exit", (code) => {
+        if (timeout !== void 0) {
+          clearTimeout(timeout);
+        }
+        resolve(code ?? 0);
+      });
+    });
+    const stdout = (await readFile8(stdoutPath, "utf8")).replace(/\r\n/g, "\n");
+    const stderr = (await readFile8(stderrPath, "utf8")).replace(/\r\n/g, "\n");
+    return { stdout, stderr, exitCode };
+  } finally {
+    await rm4(dir, { recursive: true, force: true });
+  }
+}
+// src/evaluation/case-conversion.ts
+function toSnakeCase(str) {
+  if (/^[A-Z]/.test(str)) {
+    return str;
+  }
+  return str.replace(/[A-Z]/g, (letter) => `_${letter.toLowerCase()}`);
+}
+function toSnakeCaseDeep(obj) {
+  if (obj === null || obj === void 0) {
+    return obj;
+  }
+  if (Array.isArray(obj)) {
+    return obj.map((item) => toSnakeCaseDeep(item));
+  }
+  if (typeof obj === "object") {
+    const result = {};
+    for (const [key, value] of Object.entries(obj)) {
+      const snakeKey = toSnakeCase(key);
+      result[snakeKey] = toSnakeCaseDeep(value);
+    }
+    return result;
+  }
+  return obj;
+}
 // src/evaluation/providers/types.ts
 var AGENT_PROVIDER_KINDS = [
   "codex",
   "pi-coding-agent",
+  "claude-code",
   "vscode",
   "vscode-insiders"
 ];
@@ -5224,20 +6539,20 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
 [[ ## candidate_answer ## ]]
 {{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
-var freeformEvaluationSchema = import_zod2.z.object({
-  score: import_zod2.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
-  hits: import_zod2.z.array(import_zod2.z.string()).describe("Brief specific achievements").optional(),
-  misses: import_zod2.z.array(import_zod2.z.string()).describe("Brief failures or omissions").optional(),
-  reasoning: import_zod2.z.string().describe("Concise explanation (1-2 sentences)").optional()
+// Free-form evaluation shape: `score` is a required number in [0, 1];
+// `hits`, `misses` (string arrays) and `reasoning` (string) are optional.
+var freeformEvaluationSchema = import_zod3.z.object({
+  score: import_zod3.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
+  hits: import_zod3.z.array(import_zod3.z.string()).describe("Brief specific achievements").optional(),
+  misses: import_zod3.z.array(import_zod3.z.string()).describe("Brief failures or omissions").optional(),
+  reasoning: import_zod3.z.string().describe("Concise explanation (1-2 sentences)").optional()
 });
-var rubricCheckResultSchema = import_zod2.z.object({
-  id: import_zod2.z.string().describe("The ID of the rubric item being checked"),
-  satisfied: import_zod2.z.boolean().describe("Whether this rubric requirement is met"),
-  reasoning: import_zod2.z.string().describe("Brief explanation (1-2 sentences) for this check")
+// One rubric check: `id`, `satisfied` and `reasoning` are all required.
+var rubricCheckResultSchema = import_zod3.z.object({
+  id: import_zod3.z.string().describe("The ID of the rubric item being checked"),
+  satisfied: import_zod3.z.boolean().describe("Whether this rubric requirement is met"),
+  reasoning: import_zod3.z.string().describe("Brief explanation (1-2 sentences) for this check")
 });
-var rubricEvaluationSchema = import_zod2.z.object({
-  checks: import_zod2.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
-  overall_reasoning: import_zod2.z.string().describe("Overall assessment summary (1-2 sentences)")
+// Full rubric evaluation: an array of rubricCheckResultSchema entries plus a
+// required `overall_reasoning` string.
+var rubricEvaluationSchema = import_zod3.z.object({
+  checks: import_zod3.z.array(rubricCheckResultSchema).describe("Results for each rubric item"),
+  overall_reasoning: import_zod3.z.string().describe("Overall assessment summary (1-2 sentences)")
 });
 var LlmJudgeEvaluator = class {
   kind = "llm_judge";
@@ -5473,30 +6788,30 @@ var CodeEvaluator = class {
   script;
   cwd;
   agentTimeoutMs;
+  config;
   constructor(options) {
     this.script = options.script;
     this.cwd = options.cwd;
     this.agentTimeoutMs = options.agentTimeoutMs;
+    this.config = options.config;
   }
   async evaluate(context) {
-    const inputPayload = JSON.stringify(
-      {
-        question: context.evalCase.question,
-        expectedOutcome: context.evalCase.expected_outcome,
-        expectedMessages: context.evalCase.expected_messages,
-        referenceAnswer: context.evalCase.reference_answer,
-        candidateAnswer: context.candidate,
-        outputMessages: context.outputMessages ?? null,
-        guidelineFiles: context.evalCase.guideline_paths,
-        inputFiles: context.evalCase.file_paths.filter(
-          (path16) => !context.evalCase.guideline_paths.includes(path16)
-        ),
-        inputMessages: context.evalCase.input_messages,
-        traceSummary: context.traceSummary ?? null
-      },
-      null,
-      2
-    );
+    const payload = {
+      question: context.evalCase.question,
+      expectedOutcome: context.evalCase.expected_outcome,
+      expectedMessages: context.evalCase.expected_messages,
+      referenceAnswer: context.evalCase.reference_answer,
+      candidateAnswer: context.candidate,
+      outputMessages: context.outputMessages ?? null,
+      guidelineFiles: context.evalCase.guideline_paths,
+      inputFiles: context.evalCase.file_paths.filter(
+        (path17) => !context.evalCase.guideline_paths.includes(path17)
+      ),
+      inputMessages: context.evalCase.input_messages,
+      traceSummary: context.traceSummary ?? null,
+      config: this.config ?? null
+    };
+    const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
     try {
       const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
       const parsed = parseJsonSafe(stdout);
@@ -5562,18 +6877,25 @@ function calculateRubricScore(result, rubrics) {
   return { score, verdict, hits, misses };
 }
+/**
+ * Run a code-evaluator script, feeding `input` on stdin, and return its
+ * trimmed stdout.
+ *
+ * A string `scriptPath` is run through execShellWithStdin; any other value
+ * is handed to execFileWithStdin. Both receive `cwd` and `agentTimeoutMs`
+ * (as `timeoutMs`).
+ *
+ * @throws {Error} When the exit code is non-zero; the message includes the
+ *   stderr text as shortened by formatStderr when it is non-empty.
+ */
 async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
-  const { stdout, stderr, exitCode } = await execShellWithStdin(scriptPath, input, {
-    cwd,
-    timeoutMs: agentTimeoutMs
-  });
+  const { stdout, stderr, exitCode } = typeof scriptPath === "string" ? await execShellWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs }) : await execFileWithStdin(scriptPath, input, { cwd, timeoutMs: agentTimeoutMs });
   if (exitCode !== 0) {
-    const trimmedErr = stderr.trim();
+    const trimmedErr = formatStderr(stderr);
     throw new Error(
       trimmedErr.length > 0 ? `Code evaluator exited with code ${exitCode}: ${trimmedErr}` : `Code evaluator exited with code ${exitCode}`
     );
   }
   return stdout.trim();
 }
+/**
+ * Trim stderr text and cap it at 2000 characters.
+ *
+ * Longer text is cut down to its final 2000 characters and prefixed with a
+ * one-line truncation notice.
+ *
+ * @param {string} stderr - Raw stderr text.
+ * @returns {string} Trimmed, possibly truncated text.
+ */
+function formatStderr(stderr) {
+  const MAX_CHARS = 2e3;
+  const text = stderr.trim();
+  return text.length > MAX_CHARS ? `...(truncated, last ${MAX_CHARS} chars)\n${text.slice(-MAX_CHARS)}` : text;
+}
 function parseJsonSafe(payload) {
   try {
     return JSON.parse(payload);
@@ -5805,22 +7127,438 @@ var ToolTrajectoryEvaluator = class {
           misses.push(`Position ${i}: ${expectedTool} args mismatch`);
         }
       } else {
-        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
+        misses.push(`Position ${i}: expected ${expectedTool}, got ${actualTool}`);
+      }
+    }
+    for (let i = checkLength; i < expected.length; i++) {
+      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
+    }
+    const score = hits.length / expected.length;
+    return {
+      score,
+      verdict: scoreToVerdict(score),
+      hits,
+      misses,
+      expectedAspectCount: expected.length
+    };
+  }
+};
+// Date layouts used by compareDate when a field config supplies no `formats`.
+var DEFAULT_DATE_FORMATS = [
+  "YYYY-MM-DDTHH:mm:ssZ", // ISO with timezone
+  "YYYY-MM-DDTHH:mm:ss", // ISO with time
+  "YYYY-MM-DD", // ISO date
+  "DD-MMM-YYYY", // Localized (e.g., "15-JAN-2025")
+  "MM/DD/YYYY", // US format
+  "DD/MM/YYYY", // EU format
+  "MM-DD-YYYY", // US with dashes
+  "DD-MM-YYYY" // EU with dashes
+];
+// Lowercase month name or abbreviation -> zero-based month index.
+var MONTH_NAMES = {
+  jan: 0, january: 0,
+  feb: 1, february: 1,
+  mar: 2, march: 2,
+  apr: 3, april: 3,
+  may: 4,
+  jun: 5, june: 5,
+  jul: 6, july: 6,
+  aug: 7, august: 7,
+  sep: 8, sept: 8, september: 8,
+  oct: 9, october: 9,
+  nov: 10, november: 10,
+  dec: 11, december: 11
+};
+var FieldAccuracyEvaluator = class {
+  kind = "field_accuracy";
+  config;
+  /**
+   * @param options - Object whose `config` property is stored on the
+   *   instance unchanged.
+   */
+  constructor(options) {
+    const { config } = options;
+    this.config = config;
+  }
+  /**
+   * Score the candidate answer field by field against the expected data.
+   *
+   * Fails outright (score 0) when the candidate cannot be parsed as JSON or
+   * when no expected data can be extracted from
+   * `evalCase.expected_messages`; otherwise every entry of
+   * `this.config.fields` is evaluated and the results are aggregated.
+   *
+   * @param context - Reads `evalCase` and `candidate`.
+   * @returns The aggregated evaluation result.
+   */
+  evaluate(context) {
+    const { evalCase, candidate } = context;
+    // Common shape of the two early-exit failures below.
+    const failure = (miss, reasoning) => ({
+      score: 0,
+      verdict: "fail",
+      hits: [],
+      misses: [miss],
+      expectedAspectCount: this.config.fields.length,
+      reasoning
+    });
+    let candidateData;
+    try {
+      candidateData = parseJsonFromTextSafe(candidate);
+    } catch {
+      return failure(
+        "Failed to parse candidate answer as JSON",
+        "Candidate answer is not valid JSON"
+      );
+    }
+    const expectedData = this.extractExpectedData(evalCase.expected_messages);
+    if (!expectedData) {
+      return failure(
+        "No expected data found in expected_messages",
+        "Could not extract expected data from expected_messages"
+      );
+    }
+    const fieldResults = this.config.fields.map(
+      (fieldConfig) => this.evaluateField(fieldConfig, candidateData, expectedData)
+    );
+    return this.aggregateResults(fieldResults);
+  }
+  /**
+   * Pull the expected data out of the expected_messages array.
+   *
+   * Walks the messages from last to first and uses the first assistant
+   * message with truthy content: object content is returned as-is, string
+   * content is parsed as JSON. A string that fails to parse is skipped and
+   * the search continues with earlier messages.
+   *
+   * @returns The expected data, or undefined when nothing usable is found.
+   */
+  extractExpectedData(expectedMessages) {
+    const newestFirst = [...expectedMessages].reverse();
+    for (const { role, content } of newestFirst) {
+      if (role !== "assistant" || !content) {
+        continue;
+      }
+      if (typeof content === "object") {
+        return content;
+      }
+      if (typeof content !== "string") {
+        continue;
+      }
+      try {
+        return parseJsonFromTextSafe(content);
+      } catch {
+        // Not JSON: keep looking at earlier messages.
+      }
+    }
+    return void 0;
+  }
+  /**
+   * Compare one configured field of the candidate with the expected value.
+   *
+   * `fieldConfig.required` defaults to true and `fieldConfig.weight` to 1.
+   * A path with no expected value counts as a hit. A missing candidate value
+   * is a miss when required, and a zero-weight hit when optional. Otherwise
+   * the comparison is dispatched on `fieldConfig.match`.
+   *
+   * @returns A field result with `path`, `score`, `weight`, `hit`, `message`.
+   */
+  evaluateField(fieldConfig, candidateData, expectedData) {
+    const { path: path17, match, required = true, weight = 1 } = fieldConfig;
+    const outcome = (score, appliedWeight, hit, message) => ({
+      path: path17,
+      score,
+      weight: appliedWeight,
+      hit,
+      message
+    });
+    const candidateValue = resolvePath(candidateData, path17);
+    const expectedValue = resolvePath(expectedData, path17);
+    if (expectedValue === void 0) {
+      // No expected value means no comparison needed
+      return outcome(1, weight, true, `${path17}: no expected value`);
+    }
+    if (candidateValue === void 0) {
+      // A missing optional field gets zero weight so it cannot move the score.
+      return required ? outcome(0, weight, false, `${path17} (required, missing)`) : outcome(1, 0, true, `${path17}: optional field missing`);
+    }
+    if (match === "exact") {
+      return this.compareExact(path17, candidateValue, expectedValue, weight);
+    }
+    if (match === "numeric_tolerance") {
+      return this.compareNumericTolerance(path17, candidateValue, expectedValue, fieldConfig, weight);
+    }
+    if (match === "date") {
+      return this.compareDate(path17, candidateValue, expectedValue, fieldConfig, weight);
+    }
+    return outcome(0, weight, false, `${path17}: unknown match type "${match}"`);
+  }
+  /**
+   * Exact comparison via deepEqual.
+   *
+   * On a mismatch the message distinguishes a `typeof` difference from a
+   * plain value difference.
+   */
+  compareExact(path17, candidateValue, expectedValue, weight) {
+    const matched = Boolean(deepEqual(candidateValue, expectedValue));
+    let message = path17;
+    if (!matched) {
+      const gotType = typeof candidateValue;
+      const wantType = typeof expectedValue;
+      message = gotType !== wantType ? `${path17} (type mismatch: got ${gotType}, expected ${wantType})` : `${path17} (value mismatch)`;
+    }
+    return {
+      path: path17,
+      score: matched ? 1 : 0,
+      weight,
+      hit: matched,
+      message
+    };
+  }
+  /**
+   * Numeric comparison with absolute or relative tolerance.
+   *
+   * `fieldConfig.tolerance` defaults to 0 and `fieldConfig.relative` to
+   * false. In relative mode the difference is divided by |expected|, except
+   * when expected is 0, where the absolute difference is used instead.
+   */
+  compareNumericTolerance(path17, candidateValue, expectedValue, fieldConfig, weight) {
+    const { tolerance = 0, relative = false } = fieldConfig;
+    const miss = (message) => ({
+      path: path17,
+      score: 0,
+      weight,
+      hit: false,
+      message
+    });
+    const candidateNum = toNumber(candidateValue);
+    const expectedNum = toNumber(expectedValue);
+    if (candidateNum === null || expectedNum === null) {
+      return miss(`${path17} (non-numeric value)`);
+    }
+    if (!(Number.isFinite(candidateNum) && Number.isFinite(expectedNum))) {
+      return miss(`${path17} (invalid numeric value)`);
+    }
+    const diff = Math.abs(candidateNum - expectedNum);
+    const measured = relative && expectedNum !== 0 ? diff / Math.abs(expectedNum) : diff;
+    const withinTolerance = measured <= tolerance;
+    if (!withinTolerance) {
+      return miss(`${path17} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`);
+    }
+    return {
+      path: path17,
+      score: 1,
+      weight,
+      hit: true,
+      message: `${path17} (within tolerance: diff=${diff.toFixed(2)})`
+    };
+  }
+  /**
+   * Date comparison with format normalization.
+   *
+   * Both values are stringified and parsed with parseDate, using
+   * `fieldConfig.formats` or DEFAULT_DATE_FORMATS. Two dates match when their
+   * local year, month and day-of-month agree; the time of day is ignored.
+   */
+  compareDate(path17, candidateValue, expectedValue, fieldConfig, weight) {
+    const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
+    const report = (hit, message) => ({
+      path: path17,
+      score: hit ? 1 : 0,
+      weight,
+      hit,
+      message
+    });
+    const candidateDate = parseDate(String(candidateValue), formats);
+    const expectedDate = parseDate(String(expectedValue), formats);
+    if (candidateDate === null) {
+      return report(false, `${path17} (unparseable candidate date)`);
+    }
+    if (expectedDate === null) {
+      return report(false, `${path17} (unparseable expected date)`);
+    }
+    const sameCalendarDay = ["getFullYear", "getMonth", "getDate"].every(
+      (getter) => candidateDate[getter]() === expectedDate[getter]()
+    );
+    if (sameCalendarDay) {
+      return report(true, path17);
+    }
+    return report(
+      false,
+      `${path17} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
+    );
+  }
+  /**
+   * Aggregate field results using configured strategy.
+   *
+   * `this.config.aggregation` selects the strategy and defaults to
+   * "weighted_average". "all_or_nothing" yields 1 only when no result is a
+   * miss; any other value yields sum(score * weight) / sum(weight). When the
+   * weights sum to zero the score is 1 for an empty result list and 0
+   * otherwise.
+   *
+   * @param results - Per-field results; `hit`, `message`, `score` and
+   *   `weight` are read.
+   * @returns Object with score, verdict, hits, misses, expectedAspectCount
+   *   and reasoning.
+   */
+  aggregateResults(results) {
+    const aggregation = this.config.aggregation ?? "weighted_average";
+    const hits = [];
+    const misses = [];
+    for (const result of results) {
+      if (result.hit) {
+        hits.push(result.message);
+      } else {
+        misses.push(result.message);
       }
     }
-    for (let i = checkLength; i < expected.length; i++) {
-      misses.push(`Position ${i}: expected ${expected[i].tool}, got nothing`);
+    let score;
+    if (aggregation === "all_or_nothing") {
+      score = misses.length === 0 ? 1 : 0;
+    } else {
+      const totalWeight = results.reduce((sum, r) => sum + r.weight, 0);
+      if (totalWeight === 0) {
+        // Avoid dividing by zero: nothing carried weight.
+        score = results.length === 0 ? 1 : 0;
+      } else {
+        const weightedSum = results.reduce((sum, r) => sum + r.score * r.weight, 0);
+        score = weightedSum / totalWeight;
+      }
     }
-    const score = hits.length / expected.length;
+    // Counts every field, even though only four messages per list are returned below.
+    const reasoning = `${hits.length}/${results.length} fields matched`;
     return {
-      score,
+      // The reported score goes through clampScore; the verdict below is
+      // derived from the unclamped value.
+      score: clampScore(score),
       verdict: scoreToVerdict(score),
-      hits,
-      misses,
-      expectedAspectCount: expected.length
+      hits: hits.slice(0, 4),
+      misses: misses.slice(0, 4),
+      expectedAspectCount: results.length,
+      reasoning
     };
   }
 };
+/**
+ * Look up a nested value by a dotted / bracketed path such as "a.b[0].c".
+ *
+ * Returns undefined for an empty path, a falsy root, or as soon as a
+ * non-object is reached while segments remain. An all-digit segment applied
+ * to an array is used as a numeric index.
+ *
+ * @param {*} obj - Root value to walk.
+ * @param {string} path17 - Path expression.
+ * @returns {*} The resolved value, or undefined.
+ */
+function resolvePath(obj, path17) {
+  if (!path17 || !obj) {
+    return void 0;
+  }
+  const segments = path17.split(/[.\[\]]/).filter(Boolean);
+  return segments.reduce((node, segment) => {
+    if (node === null || typeof node !== "object") {
+      return void 0;
+    }
+    const useIndex = Array.isArray(node) && /^\d+$/.test(segment);
+    return useIndex ? node[Number.parseInt(segment, 10)] : node[segment];
+  }, obj);
+}
+/**
+ * Coerce a value to a number for tolerance comparison.
+ *
+ * Numbers are returned unchanged, strings go through Number.parseFloat, and
+ * everything else (or an unparseable string) gives null.
+ *
+ * @param {*} value - Value to coerce.
+ * @returns {number|null} The number, or null when no number can be read.
+ */
+function toNumber(value) {
+  switch (typeof value) {
+    case "number":
+      return value;
+    case "string": {
+      const parsed = Number.parseFloat(value);
+      return Number.isNaN(parsed) ? null : parsed;
+    }
+    default:
+      return null;
+  }
+}
+/**
+ * Parse a date string into a Date.
+ *
+ * Recognised calendar-day layouts are tried first and produce a Date at
+ * local midnight of the named day, so that the local getters used for
+ * comparison are independent of the host time zone:
+ *   1. ISO date-only, "YYYY-MM-DD";
+ *   2. localized "D-MMM-YYYY" with a name from MONTH_NAMES;
+ *   3. numeric "D/M/YYYY" or "D-M-YYYY", read month-first or day-first
+ *      according to `formats`.
+ * Anything else falls back to the engine's own `new Date(string)` parsing
+ * (this covers ISO strings that carry a time).
+ *
+ * Explicit layouts run before the engine parser because the latter is
+ * implementation-defined for non-ISO input and would otherwise pre-empt the
+ * `formats` preference.
+ *
+ * @param {string} dateStr - Text to parse.
+ * @param {string[]} formats - Accepted layouts; only consulted to decide
+ *   between month-first and day-first numeric dates.
+ * @returns {Date|null} The parsed date, or null when the text is empty,
+ *   unparseable, or names a day that does not exist (e.g. 31 February).
+ */
+function parseDate(dateStr, formats) {
+  if (!dateStr) return null;
+  const trimmed = dateStr.trim();
+  // Build a local-midnight Date and reject parts that do not round-trip;
+  // `new Date(2025, 1, 31)` would otherwise silently roll over into March.
+  const localDate = (year, monthIndex, day) => {
+    const built = new Date(year, monthIndex, day);
+    const isRealDay = built.getFullYear() === year && built.getMonth() === monthIndex && built.getDate() === day;
+    return isRealDay ? built : null;
+  };
+  const toInt = (digits) => Number.parseInt(digits, 10);
+  // Date-only ISO strings are parsed by the engine as UTC midnight, which
+  // lands on the previous local day west of Greenwich; build them locally.
+  const isoDateOnly = trimmed.match(/^(\d{4})-(\d{2})-(\d{2})$/);
+  if (isoDateOnly) {
+    return localDate(toInt(isoDateOnly[1]), toInt(isoDateOnly[2]) - 1, toInt(isoDateOnly[3]));
+  }
+  const localizedMatch = trimmed.match(/^(\d{1,2})-([A-Za-z]{3,9})-(\d{4})$/);
+  if (localizedMatch) {
+    const month = MONTH_NAMES[localizedMatch[2].toLowerCase()];
+    // typeof guard: inherited keys such as "constructor" are not months.
+    if (typeof month === "number") {
+      return localDate(toInt(localizedMatch[3]), month, toInt(localizedMatch[1]));
+    }
+  }
+  const numericMatch = trimmed.match(/^(\d{1,2})[\/\-](\d{1,2})[\/\-](\d{4})$/);
+  if (numericMatch) {
+    const first = toInt(numericMatch[1]);
+    const second = toInt(numericMatch[2]);
+    const year = toInt(numericMatch[3]);
+    // Anchored so "YYYY-MM-DD" is not taken for month-first and
+    // "DD-MMM-YYYY" is not taken for day-first.
+    const hasUSFormat = formats.some((f) => /^MM[\/\-]DD(?!D)/.test(f));
+    const hasEUFormat = formats.some((f) => /^DD[\/\-]MM(?!M)/.test(f));
+    let monthFirst;
+    if (hasUSFormat && !hasEUFormat) {
+      monthFirst = true;
+    } else if (hasEUFormat && !hasUSFormat) {
+      monthFirst = false;
+    } else {
+      // Ambiguous configuration: day-first only when the first number cannot
+      // be a month; month-first otherwise.
+      monthFirst = !(first > 12 && second <= 12);
+    }
+    return monthFirst ? localDate(year, first - 1, second) : localDate(year, second - 1, first);
+  }
+  const fallback = new Date(trimmed);
+  return Number.isNaN(fallback.getTime()) ? null : fallback;
+}
+/**
+ * Format a Date as "YYYY-MM-DD" using its local calendar fields, i.e. the
+ * same fields that date comparison reads (toISOString would print the UTC
+ * day, which differs from the local day for local-midnight dates east of
+ * Greenwich).
+ *
+ * @param {Date} date - Date to format.
+ * @returns {string} The local calendar day.
+ */
+function formatDateISO(date) {
+  const year = String(date.getFullYear()).padStart(4, "0");
+  const month = String(date.getMonth() + 1).padStart(2, "0");
+  const day = String(date.getDate()).padStart(2, "0");
+  return `${year}-${month}-${day}`;
+}
+/**
+ * Parse JSON out of free text such as a model reply.
+ *
+ * Markdown code fences are stripped first. The cleaned text is parsed
+ * directly when it is valid JSON on its own, so top-level arrays and
+ * primitives survive intact. Only when that fails is the span from the first
+ * "{" to the last "}" extracted and parsed.
+ *
+ * @param {*} text - Text to parse; non-strings are treated as "".
+ * @returns {*} The parsed JSON value.
+ * @throws {SyntaxError} When no parseable JSON is found. Despite the name,
+ *   callers are expected to catch.
+ */
+function parseJsonFromTextSafe(text) {
+  const cleaned = typeof text === "string" ? text.replace(/```json\n?|```/g, "").trim() : "";
+  try {
+    return JSON.parse(cleaned);
+  } catch {
+    // Not pure JSON (e.g. prose around an object): fall back to extraction.
+  }
+  const match = cleaned.match(/\{[\s\S]*\}/);
+  const blob = match?.[0] ?? cleaned;
+  return JSON.parse(blob);
+}
 var DEFAULT_COMPOSITE_AGGREGATOR_PROMPT = `Review the following evaluation results:
 {{EVALUATOR_RESULTS_JSON}}
@@ -6045,11 +7783,175 @@ var CompositeEvaluator = class {
     }
   }
 };
+/**
+ * Pass/fail evaluator that compares `traceSummary.durationMs` with
+ * `config.threshold`. A missing duration is a failure.
+ */
+var LatencyEvaluator = class {
+  kind = "latency";
+  config;
+  constructor(options) {
+    this.config = options.config;
+  }
+  /**
+   * @param context - Reads `traceSummary.durationMs`.
+   * @returns Result with score 1 when the duration is at or below the
+   *   threshold, else 0.
+   */
+  evaluate(context) {
+    const threshold = this.config.threshold;
+    const durationMs = context.traceSummary?.durationMs;
+    const rawRequest = (reportedDuration) => ({
+      type: "latency",
+      threshold,
+      durationMs: reportedDuration
+    });
+    if (durationMs === void 0) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No duration data available in trace"],
+        expectedAspectCount: 1,
+        reasoning: "Execution duration not reported by provider",
+        evaluatorRawRequest: rawRequest(null)
+      };
+    }
+    const withinLimit = durationMs <= threshold;
+    const comparison = `Duration ${durationMs}ms ${withinLimit ? "<=" : ">"} ${threshold}ms threshold`;
+    return {
+      score: withinLimit ? 1 : 0,
+      verdict: withinLimit ? "pass" : "fail",
+      hits: withinLimit ? [comparison] : [],
+      misses: withinLimit ? [] : [comparison],
+      expectedAspectCount: 1,
+      reasoning: `Execution took ${durationMs}ms (threshold: ${threshold}ms)`,
+      evaluatorRawRequest: rawRequest(durationMs)
+    };
+  }
+};
+/**
+ * Pass/fail evaluator that compares `traceSummary.costUsd` with
+ * `config.budget`. A missing cost is a failure.
+ */
+var CostEvaluator = class {
+  kind = "cost";
+  config;
+  constructor(options) {
+    this.config = options.config;
+  }
+  /**
+   * @param context - Reads `traceSummary.costUsd`.
+   * @returns Result with score 1 when the cost is at or below the budget,
+   *   else 0. Amounts in messages are printed with four decimals.
+   */
+  evaluate(context) {
+    const budget = this.config.budget;
+    const costUsd = context.traceSummary?.costUsd;
+    const rawRequest = (reportedCost) => ({
+      type: "cost",
+      budget,
+      costUsd: reportedCost
+    });
+    if (costUsd === void 0) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No cost data available in trace"],
+        expectedAspectCount: 1,
+        reasoning: "Execution cost not reported by provider",
+        evaluatorRawRequest: rawRequest(null)
+      };
+    }
+    const withinBudget = costUsd <= budget;
+    const asDollars = (amount) => `$${amount.toFixed(4)}`;
+    const comparison = `Cost ${asDollars(costUsd)} ${withinBudget ? "<=" : ">"} ${asDollars(budget)} budget`;
+    return {
+      score: withinBudget ? 1 : 0,
+      verdict: withinBudget ? "pass" : "fail",
+      hits: withinBudget ? [comparison] : [],
+      misses: withinBudget ? [] : [comparison],
+      expectedAspectCount: 1,
+      reasoning: `Execution cost ${asDollars(costUsd)} (budget: ${asDollars(budget)})`,
+      evaluatorRawRequest: rawRequest(costUsd)
+    };
+  }
+};
+/**
+ * Pass/fail evaluator that checks `traceSummary.tokenUsage` against the
+ * optional limits `config.max_input`, `config.max_output` and
+ * `config.max_total`. Only limits given as numbers are enforced; missing
+ * usage data is a failure.
+ */
+var TokenUsageEvaluator = class {
+  kind = "token_usage";
+  config;
+  constructor(options) {
+    this.config = options.config;
+  }
+  /**
+   * @param context - Reads `traceSummary.tokenUsage` (`input`, `output`,
+   *   optional `cached`).
+   * @returns Result with score 1 when no enforced limit is exceeded, else 0.
+   *   The total is input + output + cached.
+   */
+  evaluate(context) {
+    const usage = context.traceSummary?.tokenUsage;
+    const { max_total: maxTotal, max_input: maxInput, max_output: maxOutput } = this.config;
+    const isLimit = (value) => typeof value === "number";
+    const configuredLimits = [maxTotal, maxInput, maxOutput].filter(isLimit).length;
+    const expectedAspectCount = Math.max(configuredLimits, 1);
+    const rawRequest = (tokenUsage) => ({
+      type: "token_usage",
+      max_total: maxTotal ?? null,
+      max_input: maxInput ?? null,
+      max_output: maxOutput ?? null,
+      tokenUsage
+    });
+    if (!usage) {
+      return {
+        score: 0,
+        verdict: "fail",
+        hits: [],
+        misses: ["No token usage data available in trace"],
+        expectedAspectCount,
+        reasoning: "Token usage not reported by provider",
+        evaluatorRawRequest: rawRequest(null)
+      };
+    }
+    const { input, output } = usage;
+    const cached = usage.cached ?? 0;
+    const total = input + output + cached;
+    const hits = [];
+    const misses = [];
+    // Checked in this order so the messages appear as input, output, total.
+    const checks = [
+      ["Input", input, maxInput],
+      ["Output", output, maxOutput],
+      ["Total", total, maxTotal]
+    ];
+    for (const [label, used, limit] of checks) {
+      if (!isLimit(limit)) {
+        continue;
+      }
+      if (used <= limit) {
+        hits.push(`${label} tokens ${used} <= ${limit}`);
+      } else {
+        misses.push(`${label} tokens ${used} > ${limit}`);
+      }
+    }
+    const passed = misses.length === 0;
+    return {
+      score: passed ? 1 : 0,
+      verdict: passed ? "pass" : "fail",
+      hits,
+      misses,
+      expectedAspectCount,
+      reasoning: `token_usage input=${input}, output=${output}, cached=${cached}, total=${total}`,
+      evaluatorRawRequest: rawRequest({ input, output, cached, total })
+    };
+  }
+};
 // src/evaluation/orchestrator.ts
-var import_node_crypto3 = require("crypto");
-var import_promises12 = require("fs/promises");
-var import_node_path15 = __toESM(require("path"), 1);
+var import_node_crypto4 = require("crypto");
+var import_node_path16 = __toESM(require("path"), 1);
 // ../../node_modules/.bun/yocto-queue@1.2.2/node_modules/yocto-queue/index.js
 var Node = class {
@@ -6191,6 +8093,9 @@ function validateConcurrency(concurrency) {
 }
 // src/evaluation/orchestrator.ts
+/**
+ * Tell whether prompts for this provider use the "agent" formatting mode:
+ * true for agent providers (per isAgentProvider) and for providers whose
+ * `kind` is "cli".
+ */
+function usesFileReferencePrompt(provider) {
+  const agentProvider = isAgentProvider(provider);
+  if (agentProvider) {
+    return agentProvider;
+  }
+  return provider.kind === "cli";
+}
 async function runEvaluation(options) {
   const {
     testFilePath: evalFilePath,
@@ -6202,7 +8107,6 @@ async function runEvaluation(options) {
     evaluators,
     maxRetries,
     agentTimeoutMs,
-    promptDumpDir,
     cache,
     useCache,
     now,
@@ -6282,7 +8186,6 @@ async function runEvaluation(options) {
         provider: primaryProvider,
         target,
         evaluatorRegistry,
-        promptDumpDir,
         nowFn: now ?? (() => /* @__PURE__ */ new Date()),
         onProgress,
         onResult,
@@ -6324,7 +8227,6 @@ async function runEvaluation(options) {
           evaluators: evaluatorRegistry,
           maxRetries,
           agentTimeoutMs,
-          promptDumpDir,
           cache,
           useCache,
           now,
@@ -6367,7 +8269,8 @@ async function runEvaluation(options) {
       results.push(outcome.value);
     } else {
       const evalCase = filteredEvalCases[i];
-      const promptInputs = await buildPromptInputs(evalCase);
+      const formattingMode = usesFileReferencePrompt(primaryProvider) ? "agent" : "lm";
+      const promptInputs = await buildPromptInputs(evalCase, formattingMode);
       const errorResult = buildErrorResult(
         evalCase,
         target.name,
@@ -6390,7 +8293,6 @@ async function runBatchEvaluation(options) {
     provider,
     target,
     evaluatorRegistry,
-    promptDumpDir,
     nowFn,
     onProgress,
     onResult,
@@ -6398,12 +8300,9 @@ async function runBatchEvaluation(options) {
     agentTimeoutMs
   } = options;
   const promptInputsList = [];
-  const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
+  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
   for (const evalCase of evalCases) {
     const promptInputs = await buildPromptInputs(evalCase, formattingMode);
-    if (promptDumpDir) {
-      await dumpPrompt(promptDumpDir, evalCase, promptInputs);
-    }
     promptInputsList.push(promptInputs);
   }
   const batchRequests = evalCases.map((evalCase, index) => {
@@ -6445,13 +8344,20 @@ async function runBatchEvaluation(options) {
     const promptInputs = promptInputsList[i];
     const providerResponse = batchResponse[i];
     const outputMessages = providerResponse.outputMessages;
-    const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
+    const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
+    const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
+      eventCount: 0,
+      toolNames: [],
+      toolCallsByName: {},
+      errorCount: 0
+    } : void 0;
     const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
       tokenUsage: providerResponse.tokenUsage,
       costUsd: providerResponse.costUsd,
       durationMs: providerResponse.durationMs
     }) : void 0;
     const candidate = extractLastAssistantContent(outputMessages);
+    const providerError = extractProviderError(providerResponse);
     let result;
     try {
       result = await evaluateCandidate({
@@ -6468,6 +8374,9 @@ async function runBatchEvaluation(options) {
         outputMessages,
         traceSummary
       });
+      if (providerError) {
+        result = { ...result, error: providerError };
+      }
     } catch (error) {
       const errorResult = buildErrorResult(
         evalCase,
@@ -6500,9 +8409,10 @@ async function runBatchEvaluation(options) {
       await onProgress({
         workerId: 1,
         evalId: evalCase.id,
-        status: "completed",
+        status: result.error ? "failed" : "completed",
         startedAt: 0,
-        completedAt: Date.now()
+        completedAt: Date.now(),
+        error: result.error
       });
     }
   }
@@ -6517,17 +8427,13 @@ async function runEvalCase(options) {
     now,
     maxRetries,
     agentTimeoutMs,
-    promptDumpDir,
     cache,
     useCache,
     signal,
     judgeProvider
   } = options;
-  const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
+  const formattingMode = usesFileReferencePrompt(provider) ? "agent" : "lm";
   const promptInputs = await buildPromptInputs(evalCase, formattingMode);
-  if (promptDumpDir) {
-    await dumpPrompt(promptDumpDir, evalCase, promptInputs);
-  }
   const cacheKey = useCache ? createCacheKey(provider, target, evalCase, promptInputs) : void 0;
   let cachedResponse;
   if (cacheKey && cache) {
@@ -6571,15 +8477,22 @@ async function runEvalCase(options) {
     await cache.set(cacheKey, providerResponse);
   }
   const outputMessages = providerResponse.outputMessages;
-  const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : void 0;
+  const hasExecutionMetrics = providerResponse.tokenUsage !== void 0 || providerResponse.costUsd !== void 0 || providerResponse.durationMs !== void 0;
+  const baseSummary = outputMessages ? computeTraceSummary(outputMessages) : hasExecutionMetrics ? {
+    eventCount: 0,
+    toolNames: [],
+    toolCallsByName: {},
+    errorCount: 0
+  } : void 0;
   const traceSummary = baseSummary ? mergeExecutionMetrics(baseSummary, {
     tokenUsage: providerResponse.tokenUsage,
     costUsd: providerResponse.costUsd,
     durationMs: providerResponse.durationMs
   }) : void 0;
   const candidate = extractLastAssistantContent(outputMessages);
+  const providerError = extractProviderError(providerResponse);
   try {
-    return await evaluateCandidate({
+    const result = await evaluateCandidate({
       evalCase,
       candidate,
       target,
@@ -6593,6 +8506,7 @@ async function runEvalCase(options) {
       outputMessages,
       traceSummary
     });
+    return providerError ? { ...result, error: providerError } : result;
   } catch (error) {
     return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs, provider);
   }
@@ -6658,7 +8572,6 @@ async function evaluateCandidate(options) {
     candidateAnswer: candidate,
     target: target.name,
     reasoning: score.reasoning,
-    rawAspects: score.rawAspects,
     agentProviderRequest,
     lmProviderRequest,
     evaluatorProviderRequest: evaluatorResults ? void 0 : score.evaluatorRawRequest,
@@ -6768,7 +8681,8 @@ async function runEvaluatorList(options) {
         const codeEvaluator = new CodeEvaluator({
           script: evaluator.script,
           cwd: evaluator.resolvedCwd ?? evaluator.cwd,
-          agentTimeoutMs
+          agentTimeoutMs,
+          config: evaluator.config
         });
         const score2 = await codeEvaluator.evaluate({
           evalCase,
@@ -6796,7 +8710,7 @@ async function runEvaluatorList(options) {
         });
       }
       if (evaluator.type === "composite") {
-        const evalFileDir = evalCase.guideline_paths[0] ? import_node_path15.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
+        const evalFileDir = evalCase.guideline_paths[0] ? import_node_path16.default.dirname(evalCase.guideline_paths[0]) : process.cwd();
         const createEvaluator = (memberConfig) => {
           switch (memberConfig.type) {
             case "llm_judge":
@@ -6805,7 +8719,8 @@ async function runEvaluatorList(options) {
               return new CodeEvaluator({
                 script: memberConfig.script,
                 cwd: memberConfig.resolvedCwd ?? memberConfig.cwd,
-                agentTimeoutMs
+                agentTimeoutMs,
+                config: memberConfig.config
               });
             case "composite":
               return new CompositeEvaluator({
@@ -6817,6 +8732,22 @@ async function runEvaluatorList(options) {
               return new ToolTrajectoryEvaluator({
                 config: memberConfig
               });
+            case "field_accuracy":
+              return new FieldAccuracyEvaluator({
+                config: memberConfig
+              });
+            case "latency":
+              return new LatencyEvaluator({
+                config: memberConfig
+              });
+            case "cost":
+              return new CostEvaluator({
+                config: memberConfig
+              });
+            case "token_usage":
+              return new TokenUsageEvaluator({
+                config: memberConfig
+              });
             default: {
               const unknownConfig = memberConfig;
               throw new Error(`Unsupported evaluator type in composite: ${unknownConfig.type}`);
@@ -6836,7 +8767,9 @@ async function runEvaluatorList(options) {
           attempt,
           promptInputs,
           now,
-          judgeProvider
+          judgeProvider,
+          outputMessages,
+          traceSummary
         });
         const weight = evaluator.weight ?? 1;
         scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
@@ -6881,6 +8814,118 @@ async function runEvaluatorList(options) {
           reasoning: score2.reasoning
         });
       }
+      if (evaluator.type === "field_accuracy") {
+        const fieldAccuracyEvaluator = new FieldAccuracyEvaluator({
+          config: evaluator
+        });
+        const score2 = fieldAccuracyEvaluator.evaluate({
+          evalCase,
+          candidate,
+          target,
+          provider,
+          attempt,
+          promptInputs,
+          now,
+          outputMessages,
+          traceSummary
+        });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
+        evaluatorResults.push({
+          name: evaluator.name,
+          type: evaluator.type,
+          score: score2.score,
+          weight,
+          verdict: score2.verdict,
+          hits: score2.hits,
+          misses: score2.misses,
+          reasoning: score2.reasoning
+        });
+      }
+      if (evaluator.type === "latency") {
+        const latencyEvaluator = new LatencyEvaluator({
+          config: evaluator
+        });
+        const score2 = latencyEvaluator.evaluate({
+          evalCase,
+          candidate,
+          target,
+          provider,
+          attempt,
+          promptInputs,
+          now,
+          outputMessages,
+          traceSummary
+        });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
+        evaluatorResults.push({
+          name: evaluator.name,
+          type: evaluator.type,
+          score: score2.score,
+          weight,
+          verdict: score2.verdict,
+          hits: score2.hits,
+          misses: score2.misses,
+          reasoning: score2.reasoning
+        });
+      }
+      if (evaluator.type === "cost") {
+        const costEvaluator = new CostEvaluator({
+          config: evaluator
+        });
+        const score2 = costEvaluator.evaluate({
+          evalCase,
+          candidate,
+          target,
+          provider,
+          attempt,
+          promptInputs,
+          now,
+          outputMessages,
+          traceSummary
+        });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
+        evaluatorResults.push({
+          name: evaluator.name,
+          type: evaluator.type,
+          score: score2.score,
+          weight,
+          verdict: score2.verdict,
+          hits: score2.hits,
+          misses: score2.misses,
+          reasoning: score2.reasoning
+        });
+      }
+      if (evaluator.type === "token_usage") {
+        const tokenUsageEvaluator = new TokenUsageEvaluator({
+          config: evaluator
+        });
+        const score2 = tokenUsageEvaluator.evaluate({
+          evalCase,
+          candidate,
+          target,
+          provider,
+          attempt,
+          promptInputs,
+          now,
+          outputMessages,
+          traceSummary
+        });
+        const weight = evaluator.weight ?? 1;
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type, weight });
+        evaluatorResults.push({
+          name: evaluator.name,
+          type: evaluator.type,
+          score: score2.score,
+          weight,
+          verdict: score2.verdict,
+          hits: score2.hits,
+          misses: score2.misses,
+          reasoning: score2.reasoning
+        });
+      }
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       const fallbackScore = {
@@ -6920,7 +8965,6 @@ async function runEvaluatorList(options) {
     (total, entry) => total + (entry.score.expectedAspectCount ?? 0),
     0
   );
-  const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
   const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
   const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
   const score = {
@@ -6929,8 +8973,7 @@ async function runEvaluatorList(options) {
     hits,
     misses,
     expectedAspectCount,
-    reasoning,
-    rawAspects: rawAspects.length > 0 ? rawAspects : void 0
+    reasoning
   };
   return { score, evaluatorResults };
 }
@@ -7005,26 +9048,6 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
     llm_judge: llmJudge
   };
 }
-async function dumpPrompt(directory, evalCase, promptInputs) {
-  const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
-  const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
-  const filePath = import_node_path15.default.resolve(directory, filename);
-  await (0, import_promises12.mkdir)(import_node_path15.default.dirname(filePath), { recursive: true });
-  const payload = {
-    eval_id: evalCase.id,
-    question: promptInputs.question,
-    guidelines: promptInputs.guidelines,
-    guideline_paths: evalCase.guideline_paths
-  };
-  await (0, import_promises12.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
-}
-function sanitizeFilename(value) {
-  if (!value) {
-    return "prompt";
-  }
-  const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
-  return sanitized.length > 0 ? sanitized : (0, import_node_crypto3.randomUUID)();
-}
 async function invokeProvider(provider, options) {
   const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
   const controller = new AbortController();
@@ -7088,14 +9111,25 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
     misses: [`Error: ${message}`],
     candidateAnswer: `Error occurred: ${message}`,
     target: targetName,
-    rawAspects: [],
     agentProviderRequest,
     lmProviderRequest,
     error: message
   };
 }
+function extractProviderError(response) {
+  const raw = response.raw;
+  if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
+    return void 0;
+  }
+  const error = raw.error;
+  if (typeof error !== "string") {
+    return void 0;
+  }
+  const trimmed = error.trim();
+  return trimmed.length > 0 ? trimmed : void 0;
+}
 function createCacheKey(provider, target, evalCase, promptInputs) {
-  const hash = (0, import_node_crypto3.createHash)("sha256");
+  const hash = (0, import_node_crypto4.createHash)("sha256");
   hash.update(provider.id);
   hash.update(target.name);
   hash.update(evalCase.id);
@@ -7152,15 +9186,15 @@ function computeWeightedMean(entries) {
 // src/evaluation/generators/rubric-generator.ts
 var import_ai3 = require("ai");
-var import_zod3 = require("zod");
-var rubricItemSchema = import_zod3.z.object({
-  id: import_zod3.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
-  description: import_zod3.z.string().describe("What this rubric checks for"),
-  weight: import_zod3.z.number().default(1).describe("Relative importance (default 1.0)"),
-  required: import_zod3.z.boolean().default(true).describe("Whether this is a mandatory requirement")
+var import_zod4 = require("zod");
+var rubricItemSchema = import_zod4.z.object({
+  id: import_zod4.z.string().describe("Short identifier for this rubric (e.g., clarity, completeness)"),
+  description: import_zod4.z.string().describe("What this rubric checks for"),
+  weight: import_zod4.z.number().default(1).describe("Relative importance (default 1.0)"),
+  required: import_zod4.z.boolean().default(true).describe("Whether this is a mandatory requirement")
 });
-var rubricGenerationSchema = import_zod3.z.object({
-  rubrics: import_zod3.z.array(rubricItemSchema).describe("List of evaluation rubrics")
+var rubricGenerationSchema = import_zod4.z.object({
+  rubrics: import_zod4.z.array(rubricItemSchema).describe("List of evaluation rubrics")
 });
 async function generateRubrics(options) {
   const { expectedOutcome, question, referenceAnswer, provider } = options;
@@ -7238,15 +9272,20 @@ function createAgentKernel() {
 0 && (module.exports = {
   CodeEvaluator,
   CompositeEvaluator,
+  CostEvaluator,
   DEFAULT_EXPLORATION_TOOLS,
+  FieldAccuracyEvaluator,
+  LatencyEvaluator,
   LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES,
+  TokenUsageEvaluator,
   ToolTrajectoryEvaluator,
   avgToolDurationMs,
   buildDirectoryChain,
   buildPromptInputs,
   buildSearchRoots,
   computeTraceSummary,
+  consumeClaudeCodeLogEntries,
   consumeCodexLogEntries,
   consumePiLogEntries,
   createAgentKernel,
@@ -7277,6 +9316,7 @@ function createAgentKernel() {
   resolveTargetDefinition,
   runEvalCase,
   runEvaluation,
+  subscribeToClaudeCodeLogEntries,
   subscribeToCodexLogEntries,
   subscribeToPiLogEntries,
   tokensPerTool