npm - @agentv/core - Version diff - 0.2.11 → 0.5.0 - Mend

@agentv/core 0.2.11 → 0.5.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (16)

package/dist/{chunk-P4GOYWYH.js → chunk-NL7K4CAK.js} +5 -1
package/dist/chunk-NL7K4CAK.js.map +1 -0
package/dist/evaluation/validation/index.cjs +186 -1
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +183 -2
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +1519 -396
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +107 -63
package/dist/index.d.ts +107 -63
package/dist/index.js +1519 -395
package/dist/index.js.map +1 -1
package/package.json +1 -1
package/dist/chunk-P4GOYWYH.js.map +0 -1
package/dist/chunk-XXNQA4EW.js +0 -140
package/dist/chunk-XXNQA4EW.js.map +0 -1

package/dist/index.cjs (changed)

@@ -30,25 +30,20 @@ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: tru
 // src/index.ts
 var index_exports = {};
 __export(index_exports, {
-  GRADER_KINDS: () => GRADER_KINDS,
-  HeuristicGrader: () => HeuristicGrader,
-  QualityGrader: () => QualityGrader,
+  CodeEvaluator: () => CodeEvaluator,
+  LlmJudgeEvaluator: () => LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
   buildDirectoryChain: () => buildDirectoryChain,
   buildPromptInputs: () => buildPromptInputs,
   buildSearchRoots: () => buildSearchRoots,
-  calculateHits: () => calculateHits,
-  calculateMisses: () => calculateMisses,
   createAgentKernel: () => createAgentKernel,
   createProvider: () => createProvider,
   ensureVSCodeSubagents: () => ensureVSCodeSubagents,
-  extractAspects: () => extractAspects,
   extractCodeBlocks: () => extractCodeBlocks,
   fileExists: () => fileExists,
   findGitRoot: () => findGitRoot,
   getHitCount: () => getHitCount,
-  isErrorLike: () => isErrorLike,
-  isGraderKind: () => isGraderKind,
+  isEvaluatorKind: () => isEvaluatorKind,
   isGuidelineFile: () => isGuidelineFile,
   isJsonObject: () => isJsonObject,
   isJsonValue: () => isJsonValue,
@@ -61,8 +56,7 @@ __export(index_exports, {
   resolveFileReference: () => resolveFileReference,
   resolveTargetDefinition: () => resolveTargetDefinition,
   runEvalCase: () => runEvalCase,
-  runEvaluation: () => runEvaluation,
-  scoreCandidateResponse: () => scoreCandidateResponse
+  runEvaluation: () => runEvaluation
 });
 module.exports = __toCommonJS(index_exports);
@@ -107,11 +101,10 @@ function isTestMessage(value) {
   }
   return candidate.content.every(isJsonObject);
 }
-var GRADER_KIND_VALUES = ["heuristic", "llm_judge"];
-var GRADER_KINDS = GRADER_KIND_VALUES;
-var GRADER_KIND_SET = new Set(GRADER_KIND_VALUES);
-function isGraderKind(value) {
-  return typeof value === "string" && GRADER_KIND_SET.has(value);
+var EVALUATOR_KIND_VALUES = ["code", "llm_judge"];
+var EVALUATOR_KIND_SET = new Set(EVALUATOR_KIND_VALUES);
+function isEvaluatorKind(value) {
+  return typeof value === "string" && EVALUATOR_KIND_SET.has(value);
 }
 function getHitCount(result) {
   return result.hits.length;
@@ -325,7 +318,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
   if (!Array.isArray(rawTestcases)) {
     throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
   }
-  const globalGrader = coerceGrader(suite.grader) ?? "llm_judge";
+  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
   const results = [];
   for (const rawEvalcase of rawTestcases) {
     if (!isJsonObject(rawEvalcase)) {
@@ -448,7 +441,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
     const assistantContent = assistantMessages[0]?.content;
     const expectedAssistantRaw = await resolveAssistantContent(assistantContent, searchRoots, verbose);
     const userTextPrompt = userTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
-    const testCaseGrader = coerceGrader(evalcase.grader) ?? globalGrader;
+    const testCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
+    const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
     const userFilePaths = [];
     for (const segment of userSegments) {
       if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -471,7 +465,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
       file_paths: allFilePaths,
       code_snippets: codeSnippets,
       outcome,
-      grader: testCaseGrader
+      evaluator: testCaseEvaluatorKind,
+      evaluators
     };
     if (verbose) {
       console.log(`
@@ -632,14 +627,88 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
   }
   return parts.join(" ");
 }
-function coerceGrader(candidate) {
+async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
+  const execution = rawEvalCase.execution;
+  const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators;
+  if (candidateEvaluators === void 0) {
+    return void 0;
+  }
+  if (!Array.isArray(candidateEvaluators)) {
+    logWarning(`Skipping evaluators for '${evalId}': expected array`);
+    return void 0;
+  }
+  const evaluators = [];
+  for (const rawEvaluator of candidateEvaluators) {
+    if (!isJsonObject(rawEvaluator)) {
+      logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
+      continue;
+    }
+    const name = asString(rawEvaluator.name);
+    const typeValue = rawEvaluator.type;
+    if (!name || !isEvaluatorKind(typeValue)) {
+      logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
+      continue;
+    }
+    if (typeValue === "code") {
+      const script = asString(rawEvaluator.script);
+      if (!script) {
+        logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
+        continue;
+      }
+      const cwd = asString(rawEvaluator.cwd);
+      let resolvedCwd;
+      if (cwd) {
+        const resolved = await resolveFileReference(cwd, searchRoots);
+        if (resolved.resolvedPath) {
+          resolvedCwd = import_node_path2.default.resolve(resolved.resolvedPath);
+        } else {
+          logWarning(
+            `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
+            resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => `  Tried: ${attempt}`) : void 0
+          );
+        }
+      }
+      evaluators.push({
+        name,
+        type: "code",
+        script,
+        cwd,
+        resolvedCwd
+      });
+      continue;
+    }
+    const prompt = asString(rawEvaluator.prompt);
+    let promptPath;
+    if (prompt) {
+      const resolved = await resolveFileReference(prompt, searchRoots);
+      if (resolved.resolvedPath) {
+        promptPath = import_node_path2.default.resolve(resolved.resolvedPath);
+      } else {
+        logWarning(
+          `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
+          resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => `  Tried: ${attempt}`) : void 0
+        );
+      }
+    }
+    const model = asString(rawEvaluator.model);
+    evaluators.push({
+      name,
+      type: "llm_judge",
+      prompt,
+      promptPath,
+      model
+    });
+  }
+  return evaluators.length > 0 ? evaluators : void 0;
+}
+function coerceEvaluator(candidate, contextId) {
   if (typeof candidate !== "string") {
     return void 0;
   }
-  if (isGraderKind(candidate)) {
+  if (isEvaluatorKind(candidate)) {
     return candidate;
   }
-  logWarning(`Unknown grader '${candidate}', falling back to default`);
+  logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
   return void 0;
 }
 function logWarning(message, details) {
@@ -835,6 +904,214 @@ var GeminiProvider = class {
   }
 };
+// src/evaluation/providers/cli.ts
+var import_node_child_process = require("child_process");
+var import_node_path3 = __toESM(require("path"), 1);
+var import_node_util = require("util");
+var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
+var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
+async function defaultCommandRunner(command, options) {
+  const execOptions = {
+    cwd: options.cwd,
+    env: options.env,
+    timeout: options.timeoutMs,
+    signal: options.signal,
+    maxBuffer: DEFAULT_MAX_BUFFER,
+    shell: process.platform === "win32" ? "powershell.exe" : void 0
+  };
+  try {
+    const { stdout, stderr } = await execAsync(command, execOptions);
+    return {
+      stdout,
+      stderr,
+      exitCode: 0,
+      failed: false,
+      timedOut: false,
+      signal: null
+    };
+  } catch (error) {
+    const execError = error;
+    return {
+      stdout: execError.stdout ?? "",
+      stderr: execError.stderr ?? "",
+      exitCode: typeof execError.code === "number" ? execError.code : null,
+      failed: true,
+      timedOut: execError.timedOut === true || execError.killed === true,
+      signal: execError.signal ?? null
+    };
+  }
+}
+var CliProvider = class {
+  id;
+  kind = "cli";
+  targetName;
+  supportsBatch = false;
+  config;
+  runCommand;
+  healthcheckPromise;
+  constructor(targetName, config, runner = defaultCommandRunner) {
+    this.targetName = targetName;
+    this.id = `cli:${targetName}`;
+    this.config = config;
+    this.runCommand = runner;
+  }
+  async invoke(request) {
+    if (request.signal?.aborted) {
+      throw new Error("CLI provider request was aborted before execution");
+    }
+    await this.ensureHealthy(request.signal);
+    const templateValues = buildTemplateValues(request, this.config);
+    const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
+    const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
+    const result = await this.runCommand(renderedCommand, {
+      cwd: this.config.cwd,
+      env,
+      timeoutMs: this.config.timeoutMs,
+      signal: request.signal
+    });
+    if (result.failed || (result.exitCode ?? 0) !== 0) {
+      if (request.signal?.aborted) {
+        throw new Error("CLI provider request was aborted");
+      }
+      if (result.timedOut) {
+        throw new Error(
+          `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
+        );
+      }
+      const codeText = result.exitCode !== null ? result.exitCode : "unknown";
+      const detail = result.stderr.trim() || result.stdout.trim();
+      const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
+      throw new Error(message);
+    }
+    return {
+      text: result.stdout,
+      raw: {
+        command: renderedCommand,
+        stderr: result.stderr,
+        exitCode: result.exitCode ?? 0,
+        cwd: this.config.cwd
+      }
+    };
+  }
+  async ensureHealthy(signal) {
+    if (!this.config.healthcheck) {
+      return;
+    }
+    if (!this.healthcheckPromise) {
+      this.healthcheckPromise = this.runHealthcheck(this.config.healthcheck, signal);
+    }
+    return this.healthcheckPromise;
+  }
+  async runHealthcheck(healthcheck, signal) {
+    if (!healthcheck) {
+      return;
+    }
+    const timeoutMs = healthcheck.timeoutMs ?? this.config.timeoutMs;
+    if (healthcheck.type === "http") {
+      const controller = new AbortController();
+      const timer = timeoutMs ? setTimeout(() => controller.abort(), timeoutMs) : void 0;
+      signal?.addEventListener("abort", () => controller.abort(), { once: true });
+      try {
+        const response = await fetch(healthcheck.url, { method: "GET", signal: controller.signal });
+        if (!response.ok) {
+          throw new Error(`HTTP ${response.status} ${response.statusText}`);
+        }
+      } catch (error) {
+        const reason = error instanceof Error ? error.message : String(error);
+        throw new Error(`CLI healthcheck failed for '${this.targetName}': ${reason}`);
+      } finally {
+        if (timer !== void 0) {
+          clearTimeout(timer);
+        }
+      }
+      return;
+    }
+    const renderedCommand = renderTemplate(
+      healthcheck.commandTemplate,
+      buildTemplateValues(
+        {
+          prompt: "",
+          guidelines: "",
+          inputFiles: [],
+          evalCaseId: "",
+          attempt: 0
+        },
+        this.config
+      )
+    );
+    const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
+    const result = await this.runCommand(renderedCommand, {
+      cwd: healthcheck.cwd ?? this.config.cwd,
+      env,
+      timeoutMs,
+      signal
+    });
+    if (result.failed || (result.exitCode ?? 0) !== 0) {
+      const codeText = result.exitCode !== null ? result.exitCode : "unknown";
+      const detail = result.stderr.trim() || result.stdout.trim();
+      const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
+      throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
+    }
+  }
+};
+function buildTemplateValues(request, config) {
+  const inputFiles = normalizeInputFiles(request.inputFiles);
+  return {
+    PROMPT: shellEscape(request.prompt ?? ""),
+    GUIDELINES: shellEscape(request.guidelines ?? ""),
+    EVAL_ID: shellEscape(request.evalCaseId ?? ""),
+    ATTEMPT: shellEscape(String(request.attempt ?? 0)),
+    FILES: formatFileList(inputFiles, config.filesFormat)
+  };
+}
+function normalizeInputFiles(inputFiles) {
+  if (!inputFiles || inputFiles.length === 0) {
+    return void 0;
+  }
+  const unique = /* @__PURE__ */ new Map();
+  for (const inputFile of inputFiles) {
+    const absolutePath = import_node_path3.default.resolve(inputFile);
+    if (!unique.has(absolutePath)) {
+      unique.set(absolutePath, absolutePath);
+    }
+  }
+  return Array.from(unique.values());
+}
+function formatFileList(files, template) {
+  if (!files || files.length === 0) {
+    return "";
+  }
+  const formatter = template ?? "{path}";
+  return files.map((filePath) => {
+    const escapedPath = shellEscape(filePath);
+    const escapedName = shellEscape(import_node_path3.default.basename(filePath));
+    return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
+  }).join(" ");
+}
+function renderTemplate(template, values) {
+  return template.replace(/\{([A-Z_]+)\}/g, (match, key) => {
+    const replacement = values[key];
+    return replacement !== void 0 ? replacement : match;
+  });
+}
+function shellEscape(value) {
+  if (value.length === 0) {
+    return "''";
+  }
+  if (process.platform === "win32") {
+    const escaped = value.replace(/"/g, '\\"');
+    return `"${escaped}"`;
+  }
+  return `'${value.replace(/'/g, `'"'"'`)}'`;
+}
+function formatTimeoutSuffix(timeoutMs) {
+  if (!timeoutMs || timeoutMs <= 0) {
+    return "";
+  }
+  const seconds = Math.ceil(timeoutMs / 1e3);
+  return ` after ${seconds}s`;
+}
 // src/evaluation/providers/mock.ts
 var DEFAULT_MOCK_RESPONSE = '{"answer":"Mock provider response. Configure targets.yaml to supply a custom value."}';
 var MockProvider = class {
@@ -878,6 +1155,7 @@ var MockProvider = class {
 // src/evaluation/providers/targets.ts
 var import_zod = require("zod");
+var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID", "ATTEMPT", "FILES"]);
 var BASE_TARGET_SCHEMA = import_zod.z.object({
   name: import_zod.z.string().min(1, "target name is required"),
   provider: import_zod.z.string().min(1, "provider is required"),
@@ -934,6 +1212,16 @@ function resolveTargetDefinition(definition, env = process.env) {
         providerBatching,
         config: resolveGeminiConfig(parsed, env)
       };
+    case "codex":
+    case "codex-cli":
+      return {
+        kind: "codex",
+        name: parsed.name,
+        judgeTarget: parsed.judge_target,
+        workers: parsed.workers,
+        providerBatching,
+        config: resolveCodexConfig(parsed, env)
+      };
     case "mock":
       return {
         kind: "mock",
@@ -953,6 +1241,15 @@ function resolveTargetDefinition(definition, env = process.env) {
         providerBatching,
         config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders")
       };
+    case "cli":
+      return {
+        kind: "cli",
+        name: parsed.name,
+        judgeTarget: parsed.judge_target,
+        workers: parsed.workers,
+        providerBatching,
+        config: resolveCliConfig(parsed, env)
+      };
     default:
       throw new Error(`Unsupported provider '${parsed.provider}' in target '${parsed.name}'`);
   }
@@ -1020,6 +1317,29 @@ function resolveGeminiConfig(target, env) {
     maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`)
   };
 }
+function resolveCodexConfig(target, env) {
+  const settings = target.settings ?? {};
+  const executableSource = settings.executable ?? settings.command ?? settings.binary;
+  const argsSource = settings.args ?? settings.arguments;
+  const cwdSource = settings.cwd;
+  const timeoutSource = settings.timeout_seconds ?? settings.timeoutSeconds;
+  const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
+    allowLiteral: true,
+    optionalEnv: true
+  }) ?? "codex";
+  const args = resolveOptionalStringArray(argsSource, env, `${target.name} codex args`);
+  const cwd = resolveOptionalString(cwdSource, env, `${target.name} codex cwd`, {
+    allowLiteral: true,
+    optionalEnv: true
+  });
+  const timeoutMs = resolveTimeoutMs(timeoutSource, `${target.name} codex timeout`);
+  return {
+    executable,
+    args,
+    cwd,
+    timeoutMs
+  };
+}
 function resolveMockConfig(target) {
   const settings = target.settings ?? {};
   const response = typeof settings.response === "string" ? settings.response : void 0;
@@ -1049,6 +1369,125 @@ function resolveVSCodeConfig(target, env, insiders) {
     workspaceTemplate
   };
 }
+function resolveCliConfig(target, env) {
+  const settings = target.settings ?? {};
+  const commandTemplateSource = settings.command_template ?? settings.commandTemplate;
+  const filesFormat = resolveOptionalLiteralString(
+    settings.files_format ?? settings.filesFormat ?? settings.attachments_format ?? settings.attachmentsFormat
+  );
+  const cwd = resolveOptionalString(settings.cwd, env, `${target.name} working directory`, {
+    allowLiteral: true,
+    optionalEnv: true
+  });
+  const envOverrides = resolveEnvOverrides(settings.env, env, target.name);
+  const timeoutMs = resolveTimeoutMs(settings.timeout_seconds ?? settings.timeoutSeconds, `${target.name} timeout`);
+  const healthcheck = resolveCliHealthcheck(settings.healthcheck, env, target.name);
+  const commandTemplate = resolveString(
+    commandTemplateSource,
+    env,
+    `${target.name} CLI command template`,
+    true
+  );
+  assertSupportedCliPlaceholders(commandTemplate, `${target.name} CLI command template`);
+  return {
+    commandTemplate,
+    filesFormat,
+    cwd,
+    env: envOverrides,
+    timeoutMs,
+    healthcheck
+  };
+}
+function resolveEnvOverrides(source, env, targetName) {
+  if (source === void 0 || source === null) {
+    return void 0;
+  }
+  if (typeof source !== "object" || Array.isArray(source)) {
+    throw new Error(`${targetName} env overrides must be an object map of strings`);
+  }
+  const entries = Object.entries(source);
+  const resolved = {};
+  for (const [key, value] of entries) {
+    if (typeof value !== "string") {
+      throw new Error(`${targetName} env override '${key}' must be a string`);
+    }
+    const resolvedValue = resolveString(value, env, `${targetName} env override '${key}'`);
+    resolved[key] = resolvedValue;
+  }
+  return Object.keys(resolved).length > 0 ? resolved : void 0;
+}
+function resolveTimeoutMs(source, description) {
+  const seconds = resolveOptionalNumber(source, `${description} (seconds)`);
+  if (seconds === void 0) {
+    return void 0;
+  }
+  if (seconds <= 0) {
+    throw new Error(`${description} must be greater than zero seconds`);
+  }
+  return Math.floor(seconds * 1e3);
+}
+function resolveCliHealthcheck(source, env, targetName) {
+  if (source === void 0 || source === null) {
+    return void 0;
+  }
+  if (typeof source !== "object" || Array.isArray(source)) {
+    throw new Error(`${targetName} healthcheck must be an object`);
+  }
+  const candidate = source;
+  const type = candidate.type;
+  const timeoutMs = resolveTimeoutMs(
+    candidate.timeout_seconds ?? candidate.timeoutSeconds,
+    `${targetName} healthcheck timeout`
+  );
+  if (type === "http") {
+    const url = resolveString(candidate.url, env, `${targetName} healthcheck URL`);
+    return {
+      type: "http",
+      url,
+      timeoutMs
+    };
+  }
+  if (type === "command") {
+    const commandTemplate = resolveString(
+      candidate.command_template ?? candidate.commandTemplate,
+      env,
+      `${targetName} healthcheck command template`,
+      true
+    );
+    assertSupportedCliPlaceholders(commandTemplate, `${targetName} healthcheck command template`);
+    const cwd = resolveOptionalString(candidate.cwd, env, `${targetName} healthcheck cwd`, {
+      allowLiteral: true,
+      optionalEnv: true
+    });
+    return {
+      type: "command",
+      commandTemplate,
+      timeoutMs,
+      cwd
+    };
+  }
+  throw new Error(`${targetName} healthcheck type must be 'http' or 'command'`);
+}
+function assertSupportedCliPlaceholders(template, description) {
+  const placeholders = extractCliPlaceholders(template);
+  for (const placeholder of placeholders) {
+    if (!CLI_PLACEHOLDERS.has(placeholder)) {
+      throw new Error(
+        `${description} includes unsupported placeholder '{${placeholder}}'. Supported placeholders: ${Array.from(CLI_PLACEHOLDERS).join(", ")}`
+      );
+    }
+  }
+}
+function extractCliPlaceholders(template) {
+  const matches = template.matchAll(/\{([A-Z_]+)\}/g);
+  const results = [];
+  for (const match of matches) {
+    if (match[1]) {
+      results.push(match[1]);
+    }
+  }
+  return results;
+}
 function resolveString(source, env, description, allowLiteral = false) {
   const value = resolveOptionalString(source, env, description, {
     allowLiteral,
@@ -1079,11 +1518,14 @@ function resolveOptionalString(source, env, description, options) {
   }
   const allowLiteral = options?.allowLiteral ?? false;
   const optionalEnv = options?.optionalEnv ?? false;
-  if (!allowLiteral && isLikelyEnvReference(trimmed)) {
+  const looksLikeEnv = isLikelyEnvReference(trimmed);
+  if (looksLikeEnv) {
     if (optionalEnv) {
       return void 0;
     }
-    throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
+    if (!allowLiteral) {
+      throw new Error(`Environment variable '${trimmed}' required for ${description} is not set`);
+    }
   }
   return trimmed;
 }
@@ -1133,10 +1575,42 @@ function resolveOptionalBoolean(source) {
 function isLikelyEnvReference(value) {
   return /^[A-Z0-9_]+$/.test(value);
 }
+function resolveOptionalStringArray(source, env, description) {
+  if (source === void 0 || source === null) {
+    return void 0;
+  }
+  if (!Array.isArray(source)) {
+    throw new Error(`${description} must be an array of strings`);
+  }
+  if (source.length === 0) {
+    return void 0;
+  }
+  const resolved = [];
+  for (let i = 0; i < source.length; i++) {
+    const item = source[i];
+    if (typeof item !== "string") {
+      throw new Error(`${description}[${i}] must be a string`);
+    }
+    const trimmed = item.trim();
+    if (trimmed.length === 0) {
+      throw new Error(`${description}[${i}] cannot be empty`);
+    }
+    const envValue = env[trimmed];
+    if (envValue !== void 0) {
+      if (envValue.trim().length === 0) {
+        throw new Error(`Environment variable '${trimmed}' for ${description}[${i}] is empty`);
+      }
+      resolved.push(envValue);
+    } else {
+      resolved.push(trimmed);
+    }
+  }
+  return resolved.length > 0 ? resolved : void 0;
+}
 // src/evaluation/providers/vscode.ts
 var import_promises3 = require("fs/promises");
-var import_node_path3 = __toESM(require("path"), 1);
+var import_node_path4 = __toESM(require("path"), 1);
 var import_subagent = require("subagent");
 var VSCodeProvider = class {
   id;
@@ -1154,12 +1628,11 @@ var VSCodeProvider = class {
     if (request.signal?.aborted) {
       throw new Error("VS Code provider request was aborted before dispatch");
     }
-    const attachments = normalizeAttachments(request.attachments);
-    const promptContent = buildPromptDocument(request, attachments, request.guideline_patterns);
+    const inputFiles = normalizeAttachments(request.inputFiles);
+    const promptContent = buildPromptDocument(request, inputFiles, request.guideline_patterns);
     const session = await (0, import_subagent.dispatchAgentSession)({
       userQuery: promptContent,
-      // Use full prompt content instead of just request.prompt
-      extraAttachments: attachments,
+      extraAttachments: inputFiles,
       wait: this.config.waitForResponse,
       dryRun: this.config.dryRun,
       vscodeCmd: this.config.command,
@@ -1176,7 +1649,7 @@ var VSCodeProvider = class {
         text: "",
         raw: {
           session,
-          attachments
+          inputFiles
         }
       };
     }
@@ -1185,7 +1658,7 @@ var VSCodeProvider = class {
       text: responseText,
       raw: {
         session,
-        attachments
+        inputFiles
       }
     };
   }
@@ -1195,17 +1668,17 @@ var VSCodeProvider = class {
     }
     const normalizedRequests = requests.map((req) => ({
       request: req,
-      attachments: normalizeAttachments(req.attachments)
+      inputFiles: normalizeAttachments(req.inputFiles)
     }));
-    const combinedAttachments = mergeAttachments(
-      normalizedRequests.map(({ attachments }) => attachments)
+    const combinedInputFiles = mergeAttachments(
+      normalizedRequests.map(({ inputFiles }) => inputFiles)
     );
     const userQueries = normalizedRequests.map(
-      ({ request, attachments }) => buildPromptDocument(request, attachments, request.guideline_patterns)
+      ({ request, inputFiles }) => buildPromptDocument(request, inputFiles, request.guideline_patterns)
     );
     const session = await (0, import_subagent.dispatchBatchAgent)({
       userQueries,
-      extraAttachments: combinedAttachments,
+      extraAttachments: combinedInputFiles,
       wait: this.config.waitForResponse,
       dryRun: this.config.dryRun,
       vscodeCmd: this.config.command,
@@ -1218,12 +1691,12 @@ var VSCodeProvider = class {
       throw new Error(failure);
     }
     if (this.config.dryRun) {
-      return normalizedRequests.map(({ attachments }) => ({
+      return normalizedRequests.map(({ inputFiles }) => ({
         text: "",
         raw: {
           session,
-          attachments,
-          allAttachments: combinedAttachments
+          inputFiles,
+          allInputFiles: combinedInputFiles
         }
       }));
     }
@@ -1239,8 +1712,8 @@ var VSCodeProvider = class {
         text: responseText,
         raw: {
           session,
-          attachments: normalizedRequests[index]?.attachments,
-          allAttachments: combinedAttachments,
+          inputFiles: normalizedRequests[index]?.inputFiles,
+          allInputFiles: combinedInputFiles,
           responseFile
         }
       });
@@ -1267,7 +1740,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, attachmentFiles) {
     return "";
   }
   const buildList = (files) => files.map((absolutePath) => {
-    const fileName = import_node_path3.default.basename(absolutePath);
+    const fileName = import_node_path4.default.basename(absolutePath);
     const fileUri = pathToFileUri(absolutePath);
     return `* [${fileName}](${fileUri})`;
   });
@@ -1292,8 +1765,8 @@ function collectGuidelineFiles(attachments, guidelinePatterns) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = import_node_path3.default.resolve(attachment);
-    const normalized = absolutePath.split(import_node_path3.default.sep).join("/");
+    const absolutePath = import_node_path4.default.resolve(attachment);
+    const normalized = absolutePath.split(import_node_path4.default.sep).join("/");
     if (isGuidelineFile(normalized, guidelinePatterns)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
@@ -1308,84 +1781,660 @@ function collectAttachmentFiles(attachments) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = import_node_path3.default.resolve(attachment);
+    const absolutePath = import_node_path4.default.resolve(attachment);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
   }
-  return Array.from(unique.values());
+  return Array.from(unique.values());
+}
+function pathToFileUri(filePath) {
+  const absolutePath = import_node_path4.default.isAbsolute(filePath) ? filePath : import_node_path4.default.resolve(filePath);
+  const normalizedPath = absolutePath.replace(/\\/g, "/");
+  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
+    return `file:///${normalizedPath}`;
+  }
+  return `file://${normalizedPath}`;
+}
+function normalizeAttachments(attachments) {
+  if (!attachments || attachments.length === 0) {
+    return void 0;
+  }
+  const deduped = /* @__PURE__ */ new Set();
+  for (const attachment of attachments) {
+    deduped.add(import_node_path4.default.resolve(attachment));
+  }
+  return Array.from(deduped);
+}
+function mergeAttachments(all) {
+  const deduped = /* @__PURE__ */ new Set();
+  for (const list of all) {
+    if (!list) continue;
+    for (const inputFile of list) {
+      deduped.add(import_node_path4.default.resolve(inputFile));
+    }
+  }
+  return deduped.size > 0 ? Array.from(deduped) : void 0;
+}
+async function ensureVSCodeSubagents(options) {
+  const { kind, count, verbose = false } = options;
+  const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
+  const subagentRoot = (0, import_subagent.getSubagentRoot)(vscodeCmd);
+  try {
+    if (verbose) {
+      console.log(`Provisioning ${count} subagent(s) via: subagent ${vscodeCmd} provision`);
+    }
+    const result = await (0, import_subagent.provisionSubagents)({
+      targetRoot: subagentRoot,
+      subagents: count,
+      dryRun: false
+    });
+    if (verbose) {
+      if (result.created.length > 0) {
+        console.log(`Created ${result.created.length} new subagent(s)`);
+      }
+      if (result.skippedExisting.length > 0) {
+        console.log(`Reusing ${result.skippedExisting.length} existing unlocked subagent(s)`);
+      }
+      console.log(`
+total unlocked subagents available: ${result.created.length + result.skippedExisting.length}`);
+    }
+    return {
+      provisioned: true,
+      message: `Provisioned ${count} subagent(s): ${result.created.length} created, ${result.skippedExisting.length} reused`
+    };
+  } catch (error) {
+    const errorMessage = error instanceof Error ? error.message : String(error);
+    if (verbose) {
+      console.warn(`Provisioning failed (continuing anyway): ${errorMessage}`);
+    }
+    return {
+      provisioned: false,
+      message: `Provisioning failed: ${errorMessage}`
+    };
+  }
+}
+// src/evaluation/providers/codex.ts
+var import_node_child_process2 = require("child_process");
+var import_node_fs3 = require("fs");
+var import_promises4 = require("fs/promises");
+var import_node_os = require("os");
+var import_node_path6 = __toESM(require("path"), 1);
+var import_node_util2 = require("util");
+// src/evaluation/providers/preread.ts
+var import_node_path5 = __toESM(require("path"), 1);
+function buildPromptDocument2(request, inputFiles, options) {
+  const parts = [];
+  const guidelineFiles = collectGuidelineFiles2(
+    inputFiles,
+    options?.guidelinePatterns ?? request.guideline_patterns,
+    options?.guidelineOverrides
+  );
+  const inputFilesList = collectInputFiles(inputFiles);
+  const nonGuidelineInputFiles = inputFilesList.filter(
+    (file) => !guidelineFiles.includes(file)
+  );
+  const prereadBlock = buildMandatoryPrereadBlock2(guidelineFiles, nonGuidelineInputFiles);
+  if (prereadBlock.length > 0) {
+    parts.push("\n", prereadBlock);
+  }
+  parts.push("\n[[ ## user_query ## ]]\n", request.prompt.trim());
+  return parts.join("\n").trim();
+}
+function normalizeInputFiles2(inputFiles) {
+  if (!inputFiles || inputFiles.length === 0) {
+    return void 0;
+  }
+  const deduped = /* @__PURE__ */ new Map();
+  for (const inputFile of inputFiles) {
+    const absolutePath = import_node_path5.default.resolve(inputFile);
+    if (!deduped.has(absolutePath)) {
+      deduped.set(absolutePath, absolutePath);
+    }
+  }
+  return Array.from(deduped.values());
+}
+function collectGuidelineFiles2(inputFiles, guidelinePatterns, overrides) {
+  if (!inputFiles || inputFiles.length === 0) {
+    return [];
+  }
+  const unique = /* @__PURE__ */ new Map();
+  for (const inputFile of inputFiles) {
+    const absolutePath = import_node_path5.default.resolve(inputFile);
+    if (overrides?.has(absolutePath)) {
+      if (!unique.has(absolutePath)) {
+        unique.set(absolutePath, absolutePath);
+      }
+      continue;
+    }
+    const normalized = absolutePath.split(import_node_path5.default.sep).join("/");
+    if (isGuidelineFile(normalized, guidelinePatterns)) {
+      if (!unique.has(absolutePath)) {
+        unique.set(absolutePath, absolutePath);
+      }
+    }
+  }
+  return Array.from(unique.values());
+}
+function collectInputFiles(inputFiles) {
+  if (!inputFiles || inputFiles.length === 0) {
+    return [];
+  }
+  const unique = /* @__PURE__ */ new Map();
+  for (const inputFile of inputFiles) {
+    const absolutePath = import_node_path5.default.resolve(inputFile);
+    if (!unique.has(absolutePath)) {
+      unique.set(absolutePath, absolutePath);
+    }
+  }
+  return Array.from(unique.values());
+}
+function buildMandatoryPrereadBlock2(guidelineFiles, inputFiles) {
+  if (guidelineFiles.length === 0 && inputFiles.length === 0) {
+    return "";
+  }
+  const buildList = (files) => files.map((absolutePath) => {
+    const fileName = import_node_path5.default.basename(absolutePath);
+    const fileUri = pathToFileUri2(absolutePath);
+    return `* [${fileName}](${fileUri})`;
+  });
+  const sections = [];
+  if (guidelineFiles.length > 0) {
+    sections.push(`Read all guideline files:
+${buildList(guidelineFiles).join("\n")}.`);
+  }
+  if (inputFiles.length > 0) {
+    sections.push(`Read all input files:
+${buildList(inputFiles).join("\n")}.`);
+  }
+  sections.push(
+    "If any file is missing, fail with ERROR: missing-file <filename> and stop.",
+    "Then apply system_instructions on the user query below."
+  );
+  return sections.join("\n");
+}
+function pathToFileUri2(filePath) {
+  const absolutePath = import_node_path5.default.isAbsolute(filePath) ? filePath : import_node_path5.default.resolve(filePath);
+  const normalizedPath = absolutePath.replace(/\\/g, "/");
+  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
+    return `file:///${normalizedPath}`;
+  }
+  return `file://${normalizedPath}`;
+}
+// src/evaluation/providers/codex.ts
+var execAsync2 = (0, import_node_util2.promisify)(import_node_child_process2.exec);
+var WORKSPACE_PREFIX = "agentv-codex-";
+var PROMPT_FILENAME = "prompt.md";
+var FILES_DIR = "files";
+var JSONL_TYPE_ITEM_COMPLETED = "item.completed";
+var CodexProvider = class {
+  id;
+  kind = "codex";
+  targetName;
+  supportsBatch = false;
+  config;
+  runCodex;
+  environmentCheck;
+  resolvedExecutable;
+  constructor(targetName, config, runner = defaultCodexRunner) {
+    this.id = `codex:${targetName}`;
+    this.targetName = targetName;
+    this.config = config;
+    this.runCodex = runner;
+  }
+  async invoke(request) {
+    if (request.signal?.aborted) {
+      throw new Error("Codex provider request was aborted before execution");
+    }
+    await this.ensureEnvironmentReady();
+    const inputFiles = normalizeInputFiles2(request.inputFiles);
+    const originalGuidelines = new Set(
+      collectGuidelineFiles2(inputFiles, request.guideline_patterns).map((file) => import_node_path6.default.resolve(file))
+    );
+    const workspaceRoot = await this.createWorkspace();
+    try {
+      const { mirroredInputFiles, guidelineMirrors } = await this.mirrorInputFiles(
+        inputFiles,
+        workspaceRoot,
+        originalGuidelines
+      );
+      const promptContent = buildPromptDocument2(request, mirroredInputFiles, {
+        guidelinePatterns: request.guideline_patterns,
+        guidelineOverrides: guidelineMirrors
+      });
+      const promptFile = import_node_path6.default.join(workspaceRoot, PROMPT_FILENAME);
+      await (0, import_promises4.writeFile)(promptFile, promptContent, "utf8");
+      const args = this.buildCodexArgs();
+      const cwd = this.resolveCwd(workspaceRoot);
+      const result = await this.executeCodex(args, cwd, promptContent, request.signal);
+      if (result.timedOut) {
+        throw new Error(
+          `Codex CLI timed out${formatTimeoutSuffix2(this.config.timeoutMs ?? void 0)}`
+        );
+      }
+      if (result.exitCode !== 0) {
+        const detail = pickDetail(result.stderr, result.stdout);
+        const prefix = `Codex CLI exited with code ${result.exitCode}`;
+        throw new Error(detail ? `${prefix}: ${detail}` : prefix);
+      }
+      const parsed = parseCodexJson(result.stdout);
+      const assistantText = extractAssistantText(parsed);
+      return {
+        text: assistantText,
+        raw: {
+          response: parsed,
+          stdout: result.stdout,
+          stderr: result.stderr,
+          exitCode: result.exitCode,
+          args,
+          executable: this.resolvedExecutable ?? this.config.executable,
+          promptFile,
+          workspace: workspaceRoot,
+          inputFiles: mirroredInputFiles
+        }
+      };
+    } finally {
+      await this.cleanupWorkspace(workspaceRoot);
+    }
+  }
+  async ensureEnvironmentReady() {
+    if (!this.environmentCheck) {
+      this.environmentCheck = this.validateEnvironment();
+    }
+    await this.environmentCheck;
+  }
+  async validateEnvironment() {
+    this.resolvedExecutable = await locateExecutable(this.config.executable);
+  }
+  resolveCwd(workspaceRoot) {
+    if (!this.config.cwd) {
+      return workspaceRoot;
+    }
+    return import_node_path6.default.resolve(this.config.cwd);
+  }
+  buildCodexArgs() {
+    const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
+    if (this.config.args && this.config.args.length > 0) {
+      args.push(...this.config.args);
+    }
+    args.push("-");
+    return args;
+  }
+  async executeCodex(args, cwd, promptContent, signal) {
+    try {
+      return await this.runCodex({
+        executable: this.resolvedExecutable ?? this.config.executable,
+        args,
+        cwd,
+        prompt: promptContent,
+        timeoutMs: this.config.timeoutMs,
+        env: process.env,
+        signal
+      });
+    } catch (error) {
+      const err = error;
+      if (err.code === "ENOENT") {
+        throw new Error(
+          `Codex executable '${this.config.executable}' was not found. Update the target settings.executable or add it to PATH.`
+        );
+      }
+      throw error;
+    }
+  }
+  async mirrorInputFiles(inputFiles, workspaceRoot, guidelineOriginals) {
+    if (!inputFiles || inputFiles.length === 0) {
+      return {
+        mirroredInputFiles: void 0,
+        guidelineMirrors: /* @__PURE__ */ new Set()
+      };
+    }
+    const filesRoot = import_node_path6.default.join(workspaceRoot, FILES_DIR);
+    await (0, import_promises4.mkdir)(filesRoot, { recursive: true });
+    const mirrored = [];
+    const guidelineMirrors = /* @__PURE__ */ new Set();
+    const nameCounts = /* @__PURE__ */ new Map();
+    for (const inputFile of inputFiles) {
+      const absoluteSource = import_node_path6.default.resolve(inputFile);
+      const baseName = import_node_path6.default.basename(absoluteSource);
+      const count = nameCounts.get(baseName) ?? 0;
+      nameCounts.set(baseName, count + 1);
+      const finalName = count === 0 ? baseName : `${baseName}.${count}`;
+      const destination = import_node_path6.default.join(filesRoot, finalName);
+      await (0, import_promises4.copyFile)(absoluteSource, destination);
+      const resolvedDestination = import_node_path6.default.resolve(destination);
+      mirrored.push(resolvedDestination);
+      if (guidelineOriginals.has(absoluteSource)) {
+        guidelineMirrors.add(resolvedDestination);
+      }
+    }
+    return {
+      mirroredInputFiles: mirrored,
+      guidelineMirrors
+    };
+  }
+  async createWorkspace() {
+    return await (0, import_promises4.mkdtemp)(import_node_path6.default.join((0, import_node_os.tmpdir)(), WORKSPACE_PREFIX));
+  }
+  async cleanupWorkspace(workspaceRoot) {
+    try {
+      await (0, import_promises4.rm)(workspaceRoot, { recursive: true, force: true });
+    } catch {
+    }
+  }
+};
+async function locateExecutable(candidate) {
+  const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
+  if (includesPathSeparator) {
+    const resolved = import_node_path6.default.isAbsolute(candidate) ? candidate : import_node_path6.default.resolve(candidate);
+    const executablePath = await ensureWindowsExecutableVariant(resolved);
+    await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
+    return executablePath;
+  }
+  const locator = process.platform === "win32" ? "where" : "which";
+  try {
+    const { stdout } = await execAsync2(`${locator} ${candidate}`);
+    const lines = stdout.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
+    const preferred = selectExecutableCandidate(lines);
+    if (preferred) {
+      const executablePath = await ensureWindowsExecutableVariant(preferred);
+      await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
+      return executablePath;
+    }
+  } catch {
+  }
+  throw new Error(`Codex executable '${candidate}' was not found on PATH`);
+}
+function selectExecutableCandidate(candidates) {
+  if (candidates.length === 0) {
+    return void 0;
+  }
+  if (process.platform !== "win32") {
+    return candidates[0];
+  }
+  const extensions = getWindowsExecutableExtensions();
+  for (const ext of extensions) {
+    const match = candidates.find((candidate) => candidate.toLowerCase().endsWith(ext));
+    if (match) {
+      return match;
+    }
+  }
+  return candidates[0];
+}
+async function ensureWindowsExecutableVariant(candidate) {
+  if (process.platform !== "win32") {
+    return candidate;
+  }
+  if (hasExecutableExtension(candidate)) {
+    return candidate;
+  }
+  const extensions = getWindowsExecutableExtensions();
+  for (const ext of extensions) {
+    const withExtension = `${candidate}${ext}`;
+    try {
+      await (0, import_promises4.access)(withExtension, import_node_fs3.constants.F_OK);
+      return withExtension;
+    } catch {
+    }
+  }
+  return candidate;
+}
+function hasExecutableExtension(candidate) {
+  const lower = candidate.toLowerCase();
+  return getWindowsExecutableExtensions().some((ext) => lower.endsWith(ext));
+}
+var DEFAULT_WINDOWS_EXTENSIONS = [".com", ".exe", ".bat", ".cmd", ".ps1"];
+function getWindowsExecutableExtensions() {
+  if (process.platform !== "win32") {
+    return [];
+  }
+  const fromEnv = process.env.PATHEXT?.split(";").map((ext) => ext.trim().toLowerCase()).filter((ext) => ext.length > 0);
+  return fromEnv && fromEnv.length > 0 ? fromEnv : DEFAULT_WINDOWS_EXTENSIONS;
+}
+function parseCodexJson(output) {
+  const trimmed = output.trim();
+  if (trimmed.length === 0) {
+    throw new Error("Codex CLI produced no output in --json mode");
+  }
+  try {
+    return JSON.parse(trimmed);
+  } catch {
+    const lineObjects = parseJsonLines(trimmed);
+    if (lineObjects) {
+      return lineObjects;
+    }
+    const lastBrace = trimmed.lastIndexOf("{");
+    if (lastBrace >= 0) {
+      const candidate = trimmed.slice(lastBrace);
+      try {
+        return JSON.parse(candidate);
+      } catch {
+      }
+    }
+    const preview = trimmed.slice(0, 200);
+    throw new Error(`Codex CLI emitted invalid JSON: ${preview}${trimmed.length > 200 ? "\u2026" : ""}`);
+  }
+}
+function extractAssistantText(parsed) {
+  if (Array.isArray(parsed)) {
+    const text = extractFromEventStream(parsed);
+    if (text) {
+      return text;
+    }
+  }
+  if (!parsed || typeof parsed !== "object") {
+    throw new Error("Codex CLI JSON response did not include an assistant message");
+  }
+  const record = parsed;
+  const eventText = extractFromEvent(record);
+  if (eventText) {
+    return eventText;
+  }
+  const messages = Array.isArray(record.messages) ? record.messages : void 0;
+  if (messages) {
+    for (let index = messages.length - 1; index >= 0; index -= 1) {
+      const entry = messages[index];
+      if (!entry || typeof entry !== "object") {
+        continue;
+      }
+      const role = entry.role;
+      if (role !== "assistant") {
+        continue;
+      }
+      const content = entry.content;
+      const flattened = flattenContent(content);
+      if (flattened) {
+        return flattened;
+      }
+    }
+  }
+  const response = record.response;
+  if (response && typeof response === "object") {
+    const content = response.content;
+    const flattened = flattenContent(content);
+    if (flattened) {
+      return flattened;
+    }
+  }
+  const output = record.output;
+  const flattenedOutput = flattenContent(output);
+  if (flattenedOutput) {
+    return flattenedOutput;
+  }
+  throw new Error("Codex CLI JSON response did not include an assistant message");
+}
+function extractFromEventStream(events) {
+  for (let index = events.length - 1; index >= 0; index -= 1) {
+    const candidate = events[index];
+    const text = extractFromEvent(candidate);
+    if (text) {
+      return text;
+    }
+  }
+  return void 0;
+}
+function extractFromEvent(event) {
+  if (!event || typeof event !== "object") {
+    return void 0;
+  }
+  const record = event;
+  const type = typeof record.type === "string" ? record.type : void 0;
+  if (type === JSONL_TYPE_ITEM_COMPLETED) {
+    const item = record.item;
+    const text = extractFromItem(item);
+    if (text) {
+      return text;
+    }
+  }
+  const output = record.output ?? record.content;
+  const flattened = flattenContent(output);
+  if (flattened) {
+    return flattened;
+  }
+  return void 0;
+}
+function extractFromItem(item) {
+  if (!item || typeof item !== "object") {
+    return void 0;
+  }
+  const record = item;
+  const itemType = typeof record.type === "string" ? record.type : void 0;
+  if (itemType === "agent_message" || itemType === "response" || itemType === "output") {
+    const text = flattenContent(record.text ?? record.content ?? record.output);
+    if (text) {
+      return text;
+    }
+  }
+  return void 0;
 }
-function pathToFileUri(filePath) {
-  const absolutePath = import_node_path3.default.isAbsolute(filePath) ? filePath : import_node_path3.default.resolve(filePath);
-  const normalizedPath = absolutePath.replace(/\\/g, "/");
-  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
-    return `file:///${normalizedPath}`;
+function flattenContent(value) {
+  if (typeof value === "string") {
+    return value;
   }
-  return `file://${normalizedPath}`;
+  if (Array.isArray(value)) {
+    const parts = value.map((segment) => {
+      if (typeof segment === "string") {
+        return segment;
+      }
+      if (segment && typeof segment === "object" && "text" in segment) {
+        const text = segment.text;
+        return typeof text === "string" ? text : void 0;
+      }
+      return void 0;
+    }).filter((part) => typeof part === "string" && part.length > 0);
+    return parts.length > 0 ? parts.join(" \n") : void 0;
+  }
+  if (value && typeof value === "object" && "text" in value) {
+    const text = value.text;
+    return typeof text === "string" ? text : void 0;
+  }
+  return void 0;
 }
-function normalizeAttachments(attachments) {
-  if (!attachments || attachments.length === 0) {
+function parseJsonLines(output) {
+  const lines = output.split(/\r?\n/).map((line) => line.trim()).filter((line) => line.length > 0);
+  if (lines.length <= 1) {
     return void 0;
   }
-  const deduped = /* @__PURE__ */ new Set();
-  for (const attachment of attachments) {
-    deduped.add(import_node_path3.default.resolve(attachment));
+  const parsed = [];
+  for (const line of lines) {
+    try {
+      parsed.push(JSON.parse(line));
+    } catch {
+      return void 0;
+    }
   }
-  return Array.from(deduped);
+  return parsed;
 }
-function mergeAttachments(all) {
-  const deduped = /* @__PURE__ */ new Set();
-  for (const list of all) {
-    if (!list) continue;
-    for (const attachment of list) {
-      deduped.add(import_node_path3.default.resolve(attachment));
-    }
+function pickDetail(stderr, stdout) {
+  const errorText = stderr.trim();
+  if (errorText.length > 0) {
+    return errorText;
   }
-  return deduped.size > 0 ? Array.from(deduped) : void 0;
+  const stdoutText = stdout.trim();
+  return stdoutText.length > 0 ? stdoutText : void 0;
 }
-async function ensureVSCodeSubagents(options) {
-  const { kind, count, verbose = false } = options;
-  const vscodeCmd = kind === "vscode-insiders" ? "code-insiders" : "code";
-  const subagentRoot = (0, import_subagent.getSubagentRoot)(vscodeCmd);
-  try {
-    if (verbose) {
-      console.log(`Provisioning ${count} subagent(s) via: subagent ${vscodeCmd} provision`);
+function formatTimeoutSuffix2(timeoutMs) {
+  if (!timeoutMs || timeoutMs <= 0) {
+    return "";
+  }
+  const seconds = Math.ceil(timeoutMs / 1e3);
+  return ` after ${seconds}s`;
+}
+async function defaultCodexRunner(options) {
+  return await new Promise((resolve, reject) => {
+    const child = (0, import_node_child_process2.spawn)(options.executable, options.args, {
+      cwd: options.cwd,
+      env: options.env,
+      stdio: ["pipe", "pipe", "pipe"],
+      shell: shouldShellExecute(options.executable)
+    });
+    let stdout = "";
+    let stderr = "";
+    let timedOut = false;
+    const onAbort = () => {
+      child.kill("SIGTERM");
+    };
+    if (options.signal) {
+      if (options.signal.aborted) {
+        onAbort();
+      } else {
+        options.signal.addEventListener("abort", onAbort, { once: true });
+      }
     }
-    const result = await (0, import_subagent.provisionSubagents)({
-      targetRoot: subagentRoot,
-      subagents: count,
-      dryRun: false
+    let timeoutHandle;
+    if (options.timeoutMs && options.timeoutMs > 0) {
+      timeoutHandle = setTimeout(() => {
+        timedOut = true;
+        child.kill("SIGTERM");
+      }, options.timeoutMs);
+      timeoutHandle.unref?.();
+    }
+    child.stdout.setEncoding("utf8");
+    child.stdout.on("data", (chunk) => {
+      stdout += chunk;
     });
-    if (verbose) {
-      if (result.created.length > 0) {
-        console.log(`Created ${result.created.length} new subagent(s)`);
+    child.stderr.setEncoding("utf8");
+    child.stderr.on("data", (chunk) => {
+      stderr += chunk;
+    });
+    child.stdin.end(options.prompt);
+    const cleanup = () => {
+      if (timeoutHandle) {
+        clearTimeout(timeoutHandle);
       }
-      if (result.skippedExisting.length > 0) {
-        console.log(`Reusing ${result.skippedExisting.length} existing unlocked subagent(s)`);
+      if (options.signal) {
+        options.signal.removeEventListener("abort", onAbort);
       }
-      console.log(`
-total unlocked subagents available: ${result.created.length + result.skippedExisting.length}`);
-    }
-    return {
-      provisioned: true,
-      message: `Provisioned ${count} subagent(s): ${result.created.length} created, ${result.skippedExisting.length} reused`
-    };
-  } catch (error) {
-    const errorMessage = error instanceof Error ? error.message : String(error);
-    if (verbose) {
-      console.warn(`Provisioning failed (continuing anyway): ${errorMessage}`);
-    }
-    return {
-      provisioned: false,
-      message: `Provisioning failed: ${errorMessage}`
     };
+    child.on("error", (error) => {
+      cleanup();
+      reject(error);
+    });
+    child.on("close", (code) => {
+      cleanup();
+      resolve({
+        stdout,
+        stderr,
+        exitCode: typeof code === "number" ? code : -1,
+        timedOut
+      });
+    });
+  });
+}
+function shouldShellExecute(executable) {
+  if (process.platform !== "win32") {
+    return false;
   }
+  const lower = executable.toLowerCase();
+  return lower.endsWith(".cmd") || lower.endsWith(".bat") || lower.endsWith(".ps1");
 }
 // src/evaluation/providers/targets-file.ts
-var import_node_fs3 = require("fs");
-var import_promises4 = require("fs/promises");
-var import_node_path4 = __toESM(require("path"), 1);
+var import_node_fs4 = require("fs");
+var import_promises5 = require("fs/promises");
+var import_node_path7 = __toESM(require("path"), 1);
 var import_yaml2 = require("yaml");
 // src/evaluation/providers/types.ts
@@ -1446,18 +2495,18 @@ function assertTargetDefinition(value, index, filePath) {
 }
 async function fileExists3(filePath) {
   try {
-    await (0, import_promises4.access)(filePath, import_node_fs3.constants.F_OK);
+    await (0, import_promises5.access)(filePath, import_node_fs4.constants.F_OK);
     return true;
   } catch {
     return false;
   }
 }
 async function readTargetDefinitions(filePath) {
-  const absolutePath = import_node_path4.default.resolve(filePath);
+  const absolutePath = import_node_path7.default.resolve(filePath);
   if (!await fileExists3(absolutePath)) {
     throw new Error(`targets.yaml not found at ${absolutePath}`);
   }
-  const raw = await (0, import_promises4.readFile)(absolutePath, "utf8");
+  const raw = await (0, import_promises5.readFile)(absolutePath, "utf8");
   const parsed = (0, import_yaml2.parse)(raw);
   if (!isRecord(parsed)) {
     throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
@@ -1480,6 +2529,10 @@ function createProvider(target) {
       return new AnthropicProvider(target.name, target.config);
     case "gemini":
       return new GeminiProvider(target.name, target.config);
+    case "cli":
+      return new CliProvider(target.name, target.config);
+    case "codex":
+      return new CodexProvider(target.name, target.config);
     case "mock":
       return new MockProvider(target.name, target.config);
     case "vscode":
@@ -1496,230 +2549,30 @@ function resolveAndCreateProvider(definition, env = process.env) {
   return createProvider(resolved);
 }
-// src/evaluation/scoring.ts
-var KEY_TERM_MATCH_THRESHOLD = 0.5;
-var ACTION_WORDS = /* @__PURE__ */ new Set([
-  "use",
-  "avoid",
-  "prefer",
-  "replace",
-  "consider",
-  "ensure",
-  "remove",
-  "add"
-]);
-var STOP_WORDS = /* @__PURE__ */ new Set([
-  "the",
-  "a",
-  "an",
-  "and",
-  "or",
-  "but",
-  "in",
-  "on",
-  "at",
-  "to",
-  "for",
-  "of",
-  "with",
-  "by",
-  "is",
-  "are",
-  "was",
-  "were",
-  "be",
-  "been",
-  "being",
-  "have",
-  "has",
-  "had",
-  "do",
-  "does",
-  "did",
-  "will",
-  "would",
-  "could",
-  "should"
-]);
-var ERROR_PREFIXES = [
-  "error:",
-  "err:",
-  "vs code command failed",
-  "exception",
-  "traceback",
-  "no response file was generated",
-  "timed out",
-  "cli not found"
-];
-function extractAspects(expectedResponse) {
-  const lines = expectedResponse.split(/\r?\n/).map((line) => line.trim());
-  const aspects = [];
-  for (const line of lines) {
-    if (line.length === 0) {
-      continue;
-    }
-    const bulletMatch = /^([-*•]|[0-9]+\.)\s*(.+)$/.exec(line);
-    if (bulletMatch) {
-      const normalized = normalizeAspect(bulletMatch[2]);
-      if (normalized.length > 0) {
-        aspects.push(normalized);
-      }
-      continue;
-    }
-    const lowered = line.toLowerCase();
-    if (Array.from(ACTION_WORDS).some((word) => lowered.startsWith(word))) {
-      const normalized = normalizeAspect(line);
-      if (normalized.length > 0) {
-        aspects.push(normalized);
-      }
-    }
-  }
-  return aspects;
-}
-function calculateHits(candidateResponse, expectedAspects) {
-  const { normalizedText, words } = normalizeCandidate(candidateResponse);
-  const hits = [];
-  for (const aspect of expectedAspects) {
-    if (matchesAspect(aspect, normalizedText, words)) {
-      hits.push(aspect);
-    }
-  }
-  return hits;
-}
-function calculateMisses(candidateResponse, expectedAspects, resolvedHits) {
-  const hits = new Set(resolvedHits ?? calculateHits(candidateResponse, expectedAspects));
-  return expectedAspects.filter((aspect) => !hits.has(aspect));
-}
-function scoreCandidateResponse(candidateResponse, expectedAspects) {
-  if (expectedAspects.length === 0) {
-    if (isErrorLike(candidateResponse)) {
-      return {
-        score: 0,
-        hits: [],
-        misses: ["Model produced an error instead of an answer."],
-        hitCount: 0,
-        totalAspects: 0,
-        rawAspects: []
-      };
-    }
-    return {
-      score: 1,
-      hits: [],
-      misses: [],
-      hitCount: 0,
-      totalAspects: 0,
-      rawAspects: []
-    };
-  }
-  const hits = calculateHits(candidateResponse, expectedAspects);
-  const misses = expectedAspects.filter((aspect) => !hits.includes(aspect));
-  const score = expectedAspects.length > 0 ? hits.length / expectedAspects.length : 0;
-  return {
-    score,
-    hits,
-    misses,
-    hitCount: hits.length,
-    totalAspects: expectedAspects.length,
-    rawAspects: expectedAspects
-  };
-}
-function isErrorLike(text) {
-  if (!text) {
-    return false;
-  }
-  const lowered = text.trim().toLowerCase();
-  return ERROR_PREFIXES.some((prefix) => lowered.startsWith(prefix));
-}
-function normalizeAspect(aspect) {
-  const sanitized = aspect.toLowerCase().replace(/[^\w\s]/g, " ").replace(/\s+/g, " ").trim();
-  return sanitized;
-}
-function normalizeCandidate(candidate) {
-  const lowered = candidate.toLowerCase();
-  const normalizedText = lowered.replace(/[^\w\s]/g, " ");
-  const words = new Set(normalizedText.split(/\s+/).filter((word) => word.length > 0));
-  return { normalizedText, words };
-}
-function matchesAspect(aspect, candidateNormalized, candidateWords) {
-  const keyTerms = extractKeyTerms(aspect);
-  if (keyTerms.length === 0) {
-    return false;
-  }
-  const matches = keyTerms.filter((term) => candidateWords.has(term)).length;
-  const ratio = matches / keyTerms.length;
-  if (ratio >= KEY_TERM_MATCH_THRESHOLD) {
-    return true;
-  }
-  const aspectWords = aspect.split(" ");
-  if (aspectWords.length >= 2) {
-    for (let index = 0; index < aspectWords.length - 1; index += 1) {
-      const phrase = `${aspectWords[index]} ${aspectWords[index + 1]}`;
-      if (candidateNormalized.includes(phrase)) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-function extractKeyTerms(aspect, maxTerms = 5) {
-  const terms = [];
-  const words = aspect.split(" ");
-  for (const word of words) {
-    if (word.length <= 2) {
-      continue;
-    }
-    if (STOP_WORDS.has(word)) {
-      continue;
-    }
-    terms.push(word);
-    if (terms.length >= maxTerms) {
-      break;
-    }
-  }
-  return terms;
-}
-// src/evaluation/grading.ts
+// src/evaluation/evaluators.ts
 var import_node_crypto = require("crypto");
-var HeuristicGrader = class {
-  kind = "heuristic";
-  grade(context) {
-    const expectedAspects = extractAspects(context.evalCase.expected_assistant_raw);
-    const result = scoreCandidateResponse(context.candidate, expectedAspects);
-    const misses = [...result.misses];
-    if (expectedAspects.length === 0 && isErrorLike(context.candidate)) {
-      const firstLine = context.candidate.split(/\r?\n/)[0]?.trim();
-      if (firstLine && !misses.includes(firstLine)) {
-        misses.unshift(firstLine);
-      }
-    }
-    return {
-      score: result.score,
-      hits: result.hits,
-      misses,
-      expectedAspectCount: result.totalAspects,
-      rawAspects: result.rawAspects
-    };
-  }
-};
-var QualityGrader = class {
+var LlmJudgeEvaluator = class {
   kind = "llm_judge";
   resolveJudgeProvider;
   maxOutputTokens;
   temperature;
+  customPrompt;
   constructor(options) {
     this.resolveJudgeProvider = options.resolveJudgeProvider;
     this.maxOutputTokens = options.maxOutputTokens;
     this.temperature = options.temperature;
+    this.customPrompt = options.customPrompt;
   }
-  async grade(context) {
+  async evaluate(context) {
     const judgeProvider = await this.resolveJudgeProvider(context);
     if (!judgeProvider) {
       throw new Error("No judge provider available for LLM grading");
     }
     const prompt = buildQualityPrompt(context.evalCase, context.candidate);
+    const systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
     const metadata = {
-      systemPrompt: QUALITY_SYSTEM_PROMPT
+      ...systemPrompt !== void 0 ? { systemPrompt } : {},
+      ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
     };
     const response = await judgeProvider.invoke({
       prompt,
@@ -1734,12 +2587,13 @@ var QualityGrader = class {
     const hits = Array.isArray(parsed.hits) ? parsed.hits.filter(isNonEmptyString).slice(0, 4) : [];
     const misses = Array.isArray(parsed.misses) ? parsed.misses.filter(isNonEmptyString).slice(0, 4) : [];
     const reasoning = parsed.reasoning ?? response.reasoning;
-    const graderRawRequest = {
+    const evaluatorRawRequest = {
       id: (0, import_node_crypto.randomUUID)(),
       provider: judgeProvider.id,
       prompt,
-      systemPrompt: QUALITY_SYSTEM_PROMPT,
-      target: context.target.name
+      target: context.target.name,
+      ...systemPrompt !== void 0 ? { systemPrompt } : {},
+      ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
     };
     return {
       score,
@@ -1747,7 +2601,7 @@ var QualityGrader = class {
       misses,
       expectedAspectCount: hits.length + misses.length || 1,
       reasoning,
-      graderRawRequest
+      evaluatorRawRequest
     };
   }
 };
@@ -1865,11 +2719,117 @@ function extractJsonBlob(text) {
 function isNonEmptyString(value) {
   return typeof value === "string" && value.trim().length > 0;
 }
+var CodeEvaluator = class {
+  kind = "code";
+  script;
+  cwd;
+  agentTimeoutMs;
+  constructor(options) {
+    this.script = options.script;
+    this.cwd = options.cwd;
+    this.agentTimeoutMs = options.agentTimeoutMs;
+  }
+  async evaluate(context) {
+    const inputPayload = JSON.stringify(
+      {
+        task: context.evalCase.task,
+        outcome: context.evalCase.outcome,
+        expected: context.evalCase.expected_assistant_raw,
+        output: context.candidate,
+        system_message: context.promptInputs.systemMessage ?? "",
+        guideline_paths: context.evalCase.guideline_paths,
+        attachments: context.evalCase.file_paths,
+        user_segments: context.evalCase.user_segments
+      },
+      null,
+      2
+    );
+    try {
+      const stdout = await executeScript(this.script, inputPayload, this.agentTimeoutMs, this.cwd);
+      const parsed = parseJsonSafe(stdout);
+      const score = clampScore(typeof parsed?.score === "number" ? parsed.score : 0);
+      const hits = Array.isArray(parsed?.hits) ? parsed.hits.filter(isNonEmptyString) : [];
+      const misses = Array.isArray(parsed?.misses) ? parsed.misses.filter(isNonEmptyString) : [];
+      const reasoning = typeof parsed?.reasoning === "string" ? parsed.reasoning : void 0;
+      return {
+        score,
+        hits,
+        misses,
+        expectedAspectCount: hits.length + misses.length || 1,
+        reasoning,
+        evaluatorRawRequest: {
+          script: this.script,
+          ...this.cwd ? { cwd: this.cwd } : {}
+        }
+      };
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      return {
+        score: 0,
+        hits: [],
+        misses: [`Code evaluator failed: ${message}`],
+        expectedAspectCount: 1,
+        reasoning: message,
+        evaluatorRawRequest: {
+          script: this.script,
+          ...this.cwd ? { cwd: this.cwd } : {},
+          error: message
+        }
+      };
+    }
+  }
+};
+async function executeScript(scriptPath, input, agentTimeoutMs, cwd) {
+  const { spawn: spawn2 } = await import("child_process");
+  return await new Promise((resolve, reject) => {
+    const child = spawn2(scriptPath, {
+      shell: true,
+      cwd
+    });
+    let stdout = "";
+    let stderr = "";
+    const timeout = agentTimeoutMs ? setTimeout(() => {
+      child.kill();
+      reject(new Error(`Code evaluator timed out after ${agentTimeoutMs}ms`));
+    }, agentTimeoutMs) : void 0;
+    child.stdout?.on("data", (data) => {
+      stdout += data.toString();
+    });
+    child.stderr?.on("data", (data) => {
+      stderr += data.toString();
+    });
+    child.on("error", (error) => {
+      if (timeout !== void 0) {
+        clearTimeout(timeout);
+      }
+      reject(error);
+    });
+    child.on("exit", (code) => {
+      if (timeout !== void 0) {
+        clearTimeout(timeout);
+      }
+      if (code && code !== 0 && stderr.length > 0) {
+        reject(new Error(`Code evaluator exited with code ${code}: ${stderr.trim()}`));
+        return;
+      }
+      resolve(stdout.trim());
+    });
+    child.stdin?.write(input);
+    child.stdin?.end();
+  });
+}
+function parseJsonSafe(payload) {
+  try {
+    return JSON.parse(payload);
+  } catch {
+    return void 0;
+  }
+}
 // src/evaluation/orchestrator.ts
 var import_node_crypto2 = require("crypto");
-var import_promises5 = require("fs/promises");
-var import_node_path5 = __toESM(require("path"), 1);
+var import_promises6 = require("fs/promises");
+var import_node_path8 = __toESM(require("path"), 1);
 // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
 var Node = class {
@@ -2016,7 +2976,7 @@ async function runEvaluation(options) {
     targets,
     env,
     providerFactory,
-    graders,
+    evaluators,
     maxRetries,
     agentTimeoutMs,
     promptDumpDir,
@@ -2075,7 +3035,7 @@ async function runEvaluation(options) {
     }
     return getOrCreateProvider(resolvedJudge);
   };
-  const graderRegistry = buildGraderRegistry(graders, resolveJudgeProvider);
+  const evaluatorRegistry = buildEvaluatorRegistry(evaluators, resolveJudgeProvider);
   const primaryProvider = getOrCreateProvider(target);
   const providerSupportsBatch = target.providerBatching === true && primaryProvider.supportsBatch === true && typeof primaryProvider.invokeBatch === "function";
   if (target.providerBatching && !providerSupportsBatch && verbose) {
@@ -2098,13 +3058,14 @@ async function runEvaluation(options) {
         evalCases: filteredEvalCases,
         provider: primaryProvider,
         target,
-        graderRegistry,
+        evaluatorRegistry,
         promptDumpDir,
         nowFn: now ?? (() => /* @__PURE__ */ new Date()),
         onProgress,
         onResult,
         verbose,
-        resolveJudgeProvider
+        resolveJudgeProvider,
+        agentTimeoutMs
       });
     } catch (error) {
       if (verbose) {
@@ -2135,7 +3096,7 @@ async function runEvaluation(options) {
           evalCase,
           provider: primaryProvider,
           target,
-          graders: graderRegistry,
+          evaluators: evaluatorRegistry,
           maxRetries,
           agentTimeoutMs,
           promptDumpDir,
@@ -2201,12 +3162,13 @@ async function runBatchEvaluation(options) {
     evalCases,
     provider,
     target,
-    graderRegistry,
+    evaluatorRegistry,
     promptDumpDir,
     nowFn,
     onProgress,
     onResult,
-    resolveJudgeProvider
+    resolveJudgeProvider,
+    agentTimeoutMs
   } = options;
   const promptInputsList = [];
   for (const evalCase of evalCases) {
@@ -2222,7 +3184,7 @@ async function runBatchEvaluation(options) {
       prompt: promptInputs.request,
       guidelines: promptInputs.guidelines,
       guideline_patterns: evalCase.guideline_patterns,
-      attachments: evalCase.file_paths,
+      inputFiles: evalCase.file_paths,
       evalCaseId: evalCase.id,
       metadata: {
         systemPrompt: promptInputs.systemMessage ?? ""
@@ -2254,23 +3216,19 @@ async function runBatchEvaluation(options) {
     const evalCase = evalCases[i];
     const promptInputs = promptInputsList[i];
     const providerResponse = batchResponse[i];
-    const now = nowFn();
-    const graderKind = evalCase.grader ?? "heuristic";
-    const activeGrader = graderRegistry[graderKind] ?? graderRegistry.heuristic;
-    if (!activeGrader) {
-      throw new Error(`No grader registered for kind '${graderKind}'`);
-    }
-    let grade;
+    let result;
     try {
-      grade = await activeGrader.grade({
+      result = await evaluateCandidate({
         evalCase,
         candidate: providerResponse.text ?? "",
         target,
         provider,
-        attempt: 0,
+        evaluators: evaluatorRegistry,
         promptInputs,
-        now,
-        judgeProvider: await resolveJudgeProvider(target)
+        nowFn,
+        attempt: 0,
+        judgeProvider: await resolveJudgeProvider(target),
+        agentTimeoutMs
       });
     } catch (error) {
       const errorResult = buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
@@ -2289,28 +3247,6 @@ async function runBatchEvaluation(options) {
       }
       continue;
     }
-    const completedAt = nowFn();
-    const rawRequest = {
-      request: promptInputs.request,
-      guidelines: promptInputs.guidelines,
-      guideline_paths: evalCase.guideline_paths,
-      system_message: promptInputs.systemMessage ?? ""
-    };
-    const result = {
-      eval_id: evalCase.id,
-      conversation_id: evalCase.conversation_id,
-      score: grade.score,
-      hits: grade.hits,
-      misses: grade.misses,
-      model_answer: providerResponse.text ?? "",
-      expected_aspect_count: grade.expectedAspectCount,
-      target: target.name,
-      timestamp: completedAt.toISOString(),
-      reasoning: grade.reasoning,
-      raw_aspects: grade.rawAspects,
-      raw_request: rawRequest,
-      grader_raw_request: grade.graderRawRequest
-    };
     results.push(result);
     if (onResult) {
       await onResult(result);
@@ -2332,7 +3268,7 @@ async function runEvalCase(options) {
     evalCase,
     provider,
     target,
-    graders,
+    evaluators,
     now,
     maxRetries,
     agentTimeoutMs,
@@ -2387,27 +3323,49 @@ async function runEvalCase(options) {
   if (cacheKey && cache && !cachedResponse) {
     await cache.set(cacheKey, providerResponse);
   }
-  const graderKind = evalCase.grader ?? "heuristic";
-  const activeGrader = graders[graderKind] ?? graders.heuristic;
-  if (!activeGrader) {
-    throw new Error(`No grader registered for kind '${graderKind}'`);
-  }
-  let grade;
   try {
-    const gradeTimestamp = nowFn();
-    grade = await activeGrader.grade({
+    return await evaluateCandidate({
       evalCase,
       candidate: providerResponse.text ?? "",
       target,
       provider,
-      attempt,
+      evaluators,
       promptInputs,
-      now: gradeTimestamp,
-      judgeProvider
+      nowFn,
+      attempt,
+      judgeProvider,
+      agentTimeoutMs
     });
   } catch (error) {
     return buildErrorResult(evalCase, target.name, nowFn(), error, promptInputs);
   }
+}
+async function evaluateCandidate(options) {
+  const {
+    evalCase,
+    candidate,
+    target,
+    provider,
+    evaluators,
+    promptInputs,
+    nowFn,
+    attempt,
+    judgeProvider,
+    agentTimeoutMs
+  } = options;
+  const gradeTimestamp = nowFn();
+  const { score, evaluatorResults } = await runEvaluatorsForCase({
+    evalCase,
+    candidate,
+    target,
+    provider,
+    evaluators,
+    attempt,
+    promptInputs,
+    now: gradeTimestamp,
+    judgeProvider,
+    agentTimeoutMs
+  });
   const completedAt = nowFn();
   const rawRequest = {
     request: promptInputs.request,
@@ -2418,28 +3376,200 @@ async function runEvalCase(options) {
   return {
     eval_id: evalCase.id,
     conversation_id: evalCase.conversation_id,
-    score: grade.score,
-    hits: grade.hits,
-    misses: grade.misses,
-    model_answer: providerResponse.text ?? "",
-    expected_aspect_count: grade.expectedAspectCount,
+    score: score.score,
+    hits: score.hits,
+    misses: score.misses,
+    model_answer: candidate,
+    expected_aspect_count: score.expectedAspectCount,
     target: target.name,
     timestamp: completedAt.toISOString(),
-    reasoning: grade.reasoning,
-    raw_aspects: grade.rawAspects,
+    reasoning: score.reasoning,
+    raw_aspects: score.rawAspects,
     raw_request: rawRequest,
-    grader_raw_request: grade.graderRawRequest
+    evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
+    evaluator_results: evaluatorResults
   };
 }
+async function runEvaluatorsForCase(options) {
+  const { evalCase, candidate, target, provider, evaluators, attempt, promptInputs, now, judgeProvider, agentTimeoutMs } = options;
+  if (evalCase.evaluators && evalCase.evaluators.length > 0) {
+    return runEvaluatorList({
+      evalCase,
+      evaluators: evalCase.evaluators,
+      candidate,
+      target,
+      provider,
+      evaluatorRegistry: evaluators,
+      attempt,
+      promptInputs,
+      now,
+      judgeProvider,
+      agentTimeoutMs
+    });
+  }
+  const evaluatorKind = evalCase.evaluator ?? "llm_judge";
+  const activeEvaluator = evaluators[evaluatorKind] ?? evaluators.llm_judge;
+  if (!activeEvaluator) {
+    throw new Error(`No evaluator registered for kind '${evaluatorKind}'`);
+  }
+  const score = await activeEvaluator.evaluate({
+    evalCase,
+    candidate,
+    target,
+    provider,
+    attempt,
+    promptInputs,
+    now,
+    judgeProvider
+  });
+  return { score };
+}
+async function runEvaluatorList(options) {
+  const {
+    evalCase,
+    evaluators,
+    candidate,
+    target,
+    provider,
+    evaluatorRegistry,
+    attempt,
+    promptInputs,
+    now,
+    judgeProvider,
+    agentTimeoutMs
+  } = options;
+  const scored = [];
+  const evaluatorResults = [];
+  for (const evaluator of evaluators ?? []) {
+    try {
+      if (evaluator.type === "llm_judge") {
+        const score2 = await runLlmJudgeEvaluator({
+          config: evaluator,
+          evalCase,
+          candidate,
+          target,
+          provider,
+          evaluatorRegistry,
+          attempt,
+          promptInputs,
+          now,
+          judgeProvider
+        });
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+        evaluatorResults.push({
+          name: evaluator.name,
+          type: evaluator.type,
+          score: score2.score,
+          hits: score2.hits,
+          misses: score2.misses,
+          reasoning: score2.reasoning,
+          evaluator_raw_request: score2.evaluatorRawRequest
+        });
+        continue;
+      }
+      if (evaluator.type === "code") {
+        const codeEvaluator = new CodeEvaluator({
+          script: evaluator.script,
+          cwd: evaluator.resolvedCwd ?? evaluator.cwd,
+          agentTimeoutMs
+        });
+        const score2 = await codeEvaluator.evaluate({
+          evalCase,
+          candidate,
+          target,
+          provider,
+          attempt,
+          promptInputs,
+          now
+        });
+        scored.push({ score: score2, name: evaluator.name, type: evaluator.type });
+        evaluatorResults.push({
+          name: evaluator.name,
+          type: evaluator.type,
+          score: score2.score,
+          hits: score2.hits,
+          misses: score2.misses,
+          reasoning: score2.reasoning,
+          evaluator_raw_request: score2.evaluatorRawRequest
+        });
+        continue;
+      }
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      const fallbackScore = {
+        score: 0,
+        hits: [],
+        misses: [`Evaluator '${evaluator.name}' failed: ${message}`],
+        expectedAspectCount: 1,
+        reasoning: message
+      };
+      scored.push({ score: fallbackScore, name: evaluator.name ?? "unknown", type: evaluator.type ?? "unknown" });
+      evaluatorResults.push({
+        name: evaluator.name ?? "unknown",
+        type: evaluator.type ?? "unknown",
+        score: 0,
+        hits: [],
+        misses: [`Evaluator '${evaluator.name ?? "unknown"}' failed: ${message}`],
+        reasoning: message
+      });
+    }
+  }
+  const aggregateScore = scored.length > 0 ? scored.reduce((total, entry) => total + entry.score.score, 0) / scored.length : 0;
+  const hits = scored.flatMap((entry) => entry.score.hits);
+  const misses = scored.flatMap((entry) => entry.score.misses);
+  const expectedAspectCount = scored.reduce((total, entry) => total + (entry.score.expectedAspectCount ?? 0), 0);
+  const rawAspects = scored.flatMap((entry) => entry.score.rawAspects ?? []);
+  const reasoningParts = scored.map((entry) => entry.score.reasoning ? `${entry.name}: ${entry.score.reasoning}` : void 0).filter(isNonEmptyString2);
+  const reasoning = reasoningParts.length > 0 ? reasoningParts.join(" | ") : void 0;
+  const score = {
+    score: aggregateScore,
+    hits,
+    misses,
+    expectedAspectCount,
+    reasoning,
+    rawAspects: rawAspects.length > 0 ? rawAspects : void 0
+  };
+  return { score, evaluatorResults };
+}
+async function runLlmJudgeEvaluator(options) {
+  const { config, evalCase, candidate, target, provider, evaluatorRegistry, attempt, promptInputs, now, judgeProvider } = options;
+  const customPrompt = await resolveCustomPrompt(config);
+  return evaluatorRegistry.llm_judge.evaluate({
+    evalCase,
+    candidate,
+    target,
+    provider,
+    attempt,
+    promptInputs,
+    now,
+    judgeProvider,
+    systemPrompt: customPrompt,
+    evaluator: config,
+    judgeModel: config.model
+  });
+}
+async function resolveCustomPrompt(config) {
+  if (config.promptPath) {
+    try {
+      return await (0, import_promises6.readFile)(config.promptPath, "utf8");
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
+    }
+  }
+  return config.prompt;
+}
+function isNonEmptyString2(value) {
+  return typeof value === "string" && value.trim().length > 0;
+}
 function filterEvalCases(evalCases, evalId) {
   if (!evalId) {
     return evalCases;
   }
   return evalCases.filter((evalCase) => evalCase.id === evalId);
 }
-function buildGraderRegistry(overrides, resolveJudgeProvider) {
-  const heuristic = overrides?.heuristic ?? new HeuristicGrader();
-  const llmJudge = overrides?.llm_judge ?? new QualityGrader({
+function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
+  const llmJudge = overrides?.llm_judge ?? new LlmJudgeEvaluator({
     resolveJudgeProvider: async (context) => {
       if (context.judgeProvider) {
         return context.judgeProvider;
@@ -2449,22 +3579,21 @@ function buildGraderRegistry(overrides, resolveJudgeProvider) {
   });
   return {
     ...overrides,
-    heuristic,
     llm_judge: llmJudge
   };
 }
 async function dumpPrompt(directory, evalCase, promptInputs) {
   const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
   const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
-  const filePath = import_node_path5.default.resolve(directory, filename);
-  await (0, import_promises5.mkdir)(import_node_path5.default.dirname(filePath), { recursive: true });
+  const filePath = import_node_path8.default.resolve(directory, filename);
+  await (0, import_promises6.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
   const payload = {
     eval_id: evalCase.id,
     request: promptInputs.request,
     guidelines: promptInputs.guidelines,
     guideline_paths: evalCase.guideline_paths
   };
-  await (0, import_promises5.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
+  await (0, import_promises6.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
 }
 function sanitizeFilename(value) {
   if (!value) {
@@ -2474,7 +3603,7 @@ function sanitizeFilename(value) {
   return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
 }
 async function invokeProvider(provider, options) {
-  const { evalCase, target, promptInputs, attempt, agentTimeoutMs, signal } = options;
+  const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
   const controller = new AbortController();
   const timeout = agentTimeoutMs ? setTimeout(() => controller.abort(), agentTimeoutMs) : void 0;
   if (signal) {
@@ -2485,7 +3614,7 @@ async function invokeProvider(provider, options) {
       prompt: promptInputs.request,
       guidelines: promptInputs.guidelines,
       guideline_patterns: evalCase.guideline_patterns,
-      attachments: evalCase.file_paths,
+      inputFiles: evalCase.file_paths,
       evalCaseId: evalCase.id,
       attempt,
       metadata: {
@@ -2554,25 +3683,20 @@ function createAgentKernel() {
 }
 // Annotate the CommonJS export names for ESM import in node:
 0 && (module.exports = {
-  GRADER_KINDS,
-  HeuristicGrader,
-  QualityGrader,
+  CodeEvaluator,
+  LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES,
   buildDirectoryChain,
   buildPromptInputs,
   buildSearchRoots,
-  calculateHits,
-  calculateMisses,
   createAgentKernel,
   createProvider,
   ensureVSCodeSubagents,
-  extractAspects,
   extractCodeBlocks,
   fileExists,
   findGitRoot,
   getHitCount,
-  isErrorLike,
-  isGraderKind,
+  isEvaluatorKind,
   isGuidelineFile,
   isJsonObject,
   isJsonValue,
@@ -2585,7 +3709,6 @@ function createAgentKernel() {
   resolveFileReference,
   resolveTargetDefinition,
   runEvalCase,
-  runEvaluation,
-  scoreCandidateResponse
+  runEvaluation
 });
 //# sourceMappingURL=index.cjs.map