npm - @agentv/core - Versions diffs - 0.11.0 → 0.14.2 - Mend

@agentv/core 0.11.0 → 0.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/README.md +1 -2
package/dist/{chunk-YQBJAT5I.js → chunk-IOCVST3R.js} +1 -1
package/dist/chunk-IOCVST3R.js.map +1 -0
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +1 -1
package/dist/index.cjs +912 -747
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +46 -34
package/dist/index.d.ts +46 -34
package/dist/index.js +875 -708
package/dist/index.js.map +1 -1
package/package.json +5 -2
package/dist/chunk-YQBJAT5I.js.map +0 -1

package/dist/index.cjs CHANGED

@@ -33,15 +33,15 @@ __export(index_exports, {
   CodeEvaluator: () => CodeEvaluator,
   LlmJudgeEvaluator: () => LlmJudgeEvaluator,
   TEST_MESSAGE_ROLES: () => TEST_MESSAGE_ROLES,
-  buildDirectoryChain: () => buildDirectoryChain,
+  buildDirectoryChain: () => buildDirectoryChain2,
   buildPromptInputs: () => buildPromptInputs,
-  buildSearchRoots: () => buildSearchRoots,
+  buildSearchRoots: () => buildSearchRoots2,
   consumeCodexLogEntries: () => consumeCodexLogEntries,
   createAgentKernel: () => createAgentKernel,
   createProvider: () => createProvider,
   ensureVSCodeSubagents: () => ensureVSCodeSubagents,
   extractCodeBlocks: () => extractCodeBlocks,
-  fileExists: () => fileExists,
+  fileExists: () => fileExists2,
   findGitRoot: () => findGitRoot,
   getHitCount: () => getHitCount,
   isEvaluatorKind: () => isEvaluatorKind,
@@ -57,7 +57,7 @@ __export(index_exports, {
   readTestSuiteMetadata: () => readTestSuiteMetadata,
   readTextFile: () => readTextFile,
   resolveAndCreateProvider: () => resolveAndCreateProvider,
-  resolveFileReference: () => resolveFileReference,
+  resolveFileReference: () => resolveFileReference2,
   resolveTargetDefinition: () => resolveTargetDefinition,
   runEvalCase: () => runEvalCase,
   runEvaluation: () => runEvaluation,
@@ -116,47 +116,112 @@ function getHitCount(result) {
 }
 // src/evaluation/yaml-parser.ts
+var import_promises5 = require("fs/promises");
+var import_node_path6 = __toESM(require("path"), 1);
+var import_yaml2 = require("yaml");
+// src/evaluation/formatting/segment-formatter.ts
+function extractCodeBlocks(segments) {
+  const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
+  const codeBlocks = [];
+  for (const segment of segments) {
+    const typeValue = segment["type"];
+    if (typeof typeValue !== "string" || typeValue !== "text") {
+      continue;
+    }
+    const textValue = segment["value"];
+    if (typeof textValue !== "string") {
+      continue;
+    }
+    const matches = textValue.match(CODE_BLOCK_PATTERN);
+    if (matches) {
+      codeBlocks.push(...matches);
+    }
+  }
+  return codeBlocks;
+}
+function formatFileContents(parts) {
+  const fileCount = parts.filter((p) => p.isFile).length;
+  if (fileCount > 0) {
+    return parts.map((part) => {
+      if (part.isFile && part.displayPath) {
+        return `<file path="${part.displayPath}">
+${part.content}
+</file>`;
+      }
+      return part.content;
+    }).join("\n\n");
+  }
+  return parts.map((p) => p.content).join(" ");
+}
+function formatSegment(segment) {
+  const type = asString(segment.type);
+  if (type === "text") {
+    return asString(segment.value);
+  }
+  if (type === "guideline_ref") {
+    const refPath = asString(segment.path);
+    return refPath ? `<Attached: ${refPath}>` : void 0;
+  }
+  if (type === "file") {
+    const text = asString(segment.text);
+    const filePath = asString(segment.path);
+    if (text && filePath) {
+      return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
+    }
+  }
+  return void 0;
+}
+function hasVisibleContent(segments) {
+  return segments.some((segment) => {
+    const type = asString(segment.type);
+    if (type === "text") {
+      const value = asString(segment.value);
+      return value !== void 0 && value.trim().length > 0;
+    }
+    if (type === "guideline_ref") {
+      return false;
+    }
+    if (type === "file") {
+      const text = asString(segment.text);
+      return text !== void 0 && text.trim().length > 0;
+    }
+    return false;
+  });
+}
+function asString(value) {
+  return typeof value === "string" ? value : void 0;
+}
+// src/evaluation/loaders/config-loader.ts
 var import_micromatch = __toESM(require("micromatch"), 1);
-var import_node_fs2 = require("fs");
 var import_promises2 = require("fs/promises");
 var import_node_path2 = __toESM(require("path"), 1);
-var import_node_url = require("url");
 var import_yaml = require("yaml");
-// src/evaluation/file-utils.ts
+// src/evaluation/loaders/file-resolver.ts
 var import_node_fs = require("fs");
 var import_promises = require("fs/promises");
 var import_node_path = __toESM(require("path"), 1);
-async function fileExists(filePath) {
+async function fileExists(absolutePath) {
   try {
-    await (0, import_promises.access)(filePath, import_node_fs.constants.F_OK);
+    await (0, import_promises.access)(absolutePath, import_node_fs.constants.F_OK);
     return true;
   } catch {
     return false;
   }
 }
-function normalizeLineEndings(content) {
-  return content.replace(/\r\n/g, "\n");
-}
-async function readTextFile(filePath) {
-  const content = await (0, import_promises.readFile)(filePath, "utf8");
-  return normalizeLineEndings(content);
-}
-async function findGitRoot(startPath) {
-  let currentDir = import_node_path.default.dirname(import_node_path.default.resolve(startPath));
-  const root = import_node_path.default.parse(currentDir).root;
-  while (currentDir !== root) {
-    const gitPath = import_node_path.default.join(currentDir, ".git");
-    if (await fileExists(gitPath)) {
-      return currentDir;
-    }
-    const parentDir = import_node_path.default.dirname(currentDir);
-    if (parentDir === currentDir) {
-      break;
+function resolveToAbsolutePath(candidate) {
+  if (candidate instanceof URL) {
+    return new URL(candidate).pathname;
+  }
+  if (typeof candidate === "string") {
+    if (candidate.startsWith("file://")) {
+      return new URL(candidate).pathname;
     }
-    currentDir = parentDir;
+    return import_node_path.default.resolve(candidate);
   }
-  return null;
+  throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
 }
 function buildDirectoryChain(filePath, repoRoot) {
   const directories = [];
@@ -234,44 +299,15 @@ async function resolveFileReference(rawValue, searchRoots) {
   return { displayPath, attempted };
 }
-// src/evaluation/yaml-parser.ts
-var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
+// src/evaluation/loaders/config-loader.ts
+var SCHEMA_CONFIG_V2 = "agentv-config-v2";
 var ANSI_YELLOW = "\x1B[33m";
 var ANSI_RESET = "\x1B[0m";
-var SCHEMA_EVAL_V2 = "agentv-eval-v2";
-var SCHEMA_CONFIG_V2 = "agentv-config-v2";
-async function readTestSuiteMetadata(testFilePath) {
-  try {
-    const absolutePath = import_node_path2.default.resolve(testFilePath);
-    const content = await (0, import_promises2.readFile)(absolutePath, "utf8");
-    const parsed = (0, import_yaml.parse)(content);
-    if (!isJsonObject(parsed)) {
-      return {};
-    }
-    return { target: extractTargetFromSuite(parsed) };
-  } catch {
-    return {};
-  }
-}
-function extractTargetFromSuite(suite) {
-  const execution = suite.execution;
-  if (execution && typeof execution === "object" && !Array.isArray(execution)) {
-    const executionTarget = execution.target;
-    if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
-      return executionTarget.trim();
-    }
-  }
-  const targetValue = suite.target;
-  if (typeof targetValue === "string" && targetValue.trim().length > 0) {
-    return targetValue.trim();
-  }
-  return void 0;
-}
 async function loadConfig(evalFilePath, repoRoot) {
   const directories = buildDirectoryChain(evalFilePath, repoRoot);
   for (const directory of directories) {
     const configPath = import_node_path2.default.join(directory, ".agentv", "config.yaml");
-    if (!await fileExists2(configPath)) {
+    if (!await fileExists(configPath)) {
       continue;
     }
     try {
@@ -313,24 +349,134 @@ function isGuidelineFile(filePath, patterns) {
   const patternsToUse = patterns ?? [];
   return import_micromatch.default.isMatch(normalized, patternsToUse);
 }
-function extractCodeBlocks(segments) {
-  const codeBlocks = [];
-  for (const segment of segments) {
-    const typeValue = segment["type"];
-    if (typeof typeValue !== "string" || typeValue !== "text") {
+function extractTargetFromSuite(suite) {
+  const execution = suite.execution;
+  if (execution && typeof execution === "object" && !Array.isArray(execution)) {
+    const executionTarget = execution.target;
+    if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
+      return executionTarget.trim();
+    }
+  }
+  const targetValue = suite.target;
+  if (typeof targetValue === "string" && targetValue.trim().length > 0) {
+    return targetValue.trim();
+  }
+  return void 0;
+}
+function logWarning(message) {
+  console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
+}
+// src/evaluation/loaders/evaluator-parser.ts
+var import_node_path3 = __toESM(require("path"), 1);
+var ANSI_YELLOW2 = "\x1B[33m";
+var ANSI_RESET2 = "\x1B[0m";
+async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
+  const execution = rawEvalCase.execution;
+  const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
+  if (candidateEvaluators === void 0) {
+    return void 0;
+  }
+  if (!Array.isArray(candidateEvaluators)) {
+    logWarning2(`Skipping evaluators for '${evalId}': expected array`);
+    return void 0;
+  }
+  const evaluators = [];
+  for (const rawEvaluator of candidateEvaluators) {
+    if (!isJsonObject2(rawEvaluator)) {
+      logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
       continue;
     }
-    const textValue = segment["value"];
-    if (typeof textValue !== "string") {
+    const name = asString2(rawEvaluator.name);
+    const typeValue = rawEvaluator.type;
+    if (!name || !isEvaluatorKind(typeValue)) {
+      logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
       continue;
     }
-    const matches = textValue.match(CODE_BLOCK_PATTERN);
-    if (matches) {
-      codeBlocks.push(...matches);
+    if (typeValue === "code") {
+      const script = asString2(rawEvaluator.script);
+      if (!script) {
+        logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
+        continue;
+      }
+      const cwd = asString2(rawEvaluator.cwd);
+      let resolvedCwd;
+      if (cwd) {
+        const resolved = await resolveFileReference(cwd, searchRoots);
+        if (resolved.resolvedPath) {
+          resolvedCwd = import_node_path3.default.resolve(resolved.resolvedPath);
+        } else {
+          logWarning2(
+            `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
+            resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => `  Tried: ${attempt}`) : void 0
+          );
+        }
+      } else {
+        resolvedCwd = searchRoots[0];
+      }
+      evaluators.push({
+        name,
+        type: "code",
+        script,
+        cwd,
+        resolvedCwd
+      });
+      continue;
+    }
+    const prompt = asString2(rawEvaluator.prompt);
+    let promptPath;
+    if (prompt) {
+      const resolved = await resolveFileReference(prompt, searchRoots);
+      if (resolved.resolvedPath) {
+        promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
+      } else {
+        logWarning2(
+          `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
+          resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => `  Tried: ${attempt}`) : void 0
+        );
+      }
     }
+    const _model = asString2(rawEvaluator.model);
+    evaluators.push({
+      name,
+      type: "llm_judge",
+      prompt,
+      promptPath
+    });
+  }
+  return evaluators.length > 0 ? evaluators : void 0;
+}
+function coerceEvaluator(candidate, contextId) {
+  if (typeof candidate !== "string") {
+    return void 0;
+  }
+  if (isEvaluatorKind(candidate)) {
+    return candidate;
+  }
+  logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
+  return void 0;
+}
+function asString2(value) {
+  return typeof value === "string" ? value : void 0;
+}
+function isJsonObject2(value) {
+  return typeof value === "object" && value !== null && !Array.isArray(value);
+}
+function logWarning2(message, details) {
+  if (details && details.length > 0) {
+    const detailBlock = details.join("\n");
+    console.warn(`${ANSI_YELLOW2}Warning: ${message}
+${detailBlock}${ANSI_RESET2}`);
+  } else {
+    console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
   }
-  return codeBlocks;
 }
+// src/evaluation/loaders/message-processor.ts
+var import_promises3 = require("fs/promises");
+var import_node_path4 = __toESM(require("path"), 1);
+var ANSI_YELLOW3 = "\x1B[33m";
+var ANSI_RESET3 = "\x1B[0m";
 async function processMessages(options) {
   const {
     messages,
@@ -356,9 +502,9 @@ async function processMessages(options) {
       if (!isJsonObject(rawSegment)) {
         continue;
       }
-      const segmentType = asString(rawSegment.type);
+      const segmentType = asString3(rawSegment.type);
       if (segmentType === "file") {
-        const rawValue = asString(rawSegment.value);
+        const rawValue = asString3(rawSegment.value);
         if (!rawValue) {
           continue;
         }
@@ -369,15 +515,15 @@ async function processMessages(options) {
         if (!resolvedPath) {
           const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
           const context = messageType === "input" ? "" : " in expected_messages";
-          logWarning(`File not found${context}: ${displayPath}`, attempts);
+          logWarning3(`File not found${context}: ${displayPath}`, attempts);
           continue;
         }
         try {
-          const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
+          const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
           if (messageType === "input" && guidelinePatterns && guidelinePaths) {
-            const relativeToRepo = import_node_path2.default.relative(repoRootPath, resolvedPath);
+            const relativeToRepo = import_node_path4.default.relative(repoRootPath, resolvedPath);
             if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
-              guidelinePaths.push(import_node_path2.default.resolve(resolvedPath));
+              guidelinePaths.push(import_node_path4.default.resolve(resolvedPath));
               if (verbose) {
                 console.log(`  [Guideline] Found: ${displayPath}`);
                 console.log(`    Resolved to: ${resolvedPath}`);
@@ -389,7 +535,7 @@ async function processMessages(options) {
             type: "file",
             path: displayPath,
             text: fileContent,
-            resolvedPath: import_node_path2.default.resolve(resolvedPath)
+            resolvedPath: import_node_path4.default.resolve(resolvedPath)
           });
           if (verbose) {
             const label = messageType === "input" ? "[File]" : "[Expected Output File]";
@@ -398,7 +544,7 @@ async function processMessages(options) {
           }
         } catch (error) {
           const context = messageType === "input" ? "" : " expected output";
-          logWarning(`Could not read${context} file ${resolvedPath}: ${error.message}`);
+          logWarning3(`Could not read${context} file ${resolvedPath}: ${error.message}`);
         }
         continue;
       }
@@ -412,201 +558,117 @@ async function processMessages(options) {
   }
   return segments;
 }
-async function loadEvalCases(evalFilePath, repoRoot, options) {
-  const verbose = options?.verbose ?? false;
-  const evalIdFilter = options?.evalId;
-  const absoluteTestPath = import_node_path2.default.resolve(evalFilePath);
-  if (!await fileExists2(absoluteTestPath)) {
-    throw new Error(`Test file not found: ${evalFilePath}`);
-  }
-  const repoRootPath = resolveToAbsolutePath(repoRoot);
-  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
-  const config = await loadConfig(absoluteTestPath, repoRootPath);
-  const guidelinePatterns = config?.guideline_patterns;
-  const rawFile = await (0, import_promises2.readFile)(absoluteTestPath, "utf8");
-  const parsed = (0, import_yaml.parse)(rawFile);
-  if (!isJsonObject(parsed)) {
-    throw new Error(`Invalid test file format: ${evalFilePath}`);
-  }
-  const suite = parsed;
-  const datasetNameFromSuite = asString(suite.dataset)?.trim();
-  const fallbackDataset = import_node_path2.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
-  const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
-  const schema = suite.$schema;
-  if (schema !== SCHEMA_EVAL_V2) {
-    const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
-Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
-    throw new Error(message);
+async function resolveAssistantContent(content, searchRoots, verbose) {
+  if (typeof content === "string") {
+    return content;
   }
-  const rawTestcases = suite.evalcases;
-  if (!Array.isArray(rawTestcases)) {
-    throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
+  if (!content) {
+    return "";
   }
-  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
-  const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
-  const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
-  const results = [];
-  for (const rawEvalcase of rawTestcases) {
-    if (!isJsonObject(rawEvalcase)) {
-      logWarning("Skipping invalid eval case entry (expected object)");
+  const parts = [];
+  for (const entry of content) {
+    if (typeof entry === "string") {
+      parts.push({ content: entry, isFile: false });
       continue;
     }
-    const evalcase = rawEvalcase;
-    const id = asString(evalcase.id);
-    if (evalIdFilter && id !== evalIdFilter) {
+    if (!isJsonObject(entry)) {
       continue;
     }
-    const conversationId = asString(evalcase.conversation_id);
-    const outcome = asString(evalcase.outcome);
-    const inputMessagesValue = evalcase.input_messages;
-    const expectedMessagesValue = evalcase.expected_messages;
-    if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
-      logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
+    const segmentType = asString3(entry.type);
+    if (segmentType === "file") {
+      const rawValue = asString3(entry.value);
+      if (!rawValue) {
+        continue;
+      }
+      const { displayPath, resolvedPath, attempted } = await resolveFileReference(
+        rawValue,
+        searchRoots
+      );
+      if (!resolvedPath) {
+        const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
+        logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
+        continue;
+      }
+      try {
+        const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
+        parts.push({ content: fileContent, isFile: true, displayPath });
+        if (verbose) {
+          console.log(`  [Expected Assistant File] Found: ${displayPath}`);
+          console.log(`    Resolved to: ${resolvedPath}`);
+        }
+      } catch (error) {
+        logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
+      }
       continue;
     }
-    const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
-    const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
-    const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
-    if (hasExpectedMessages && expectedMessages.length === 0) {
-      logWarning(`No valid expected message found for eval case: ${id}`);
+    const textValue = asString3(entry.text);
+    if (typeof textValue === "string") {
+      parts.push({ content: textValue, isFile: false });
       continue;
     }
-    if (expectedMessages.length > 1) {
-      logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
-    }
-    const guidelinePaths = [];
-    const inputTextParts = [];
-    const inputSegments = await processMessages({
-      messages: inputMessages,
-      searchRoots,
-      repoRootPath,
-      guidelinePatterns,
-      guidelinePaths,
-      textParts: inputTextParts,
-      messageType: "input",
-      verbose
-    });
-    const outputSegments = hasExpectedMessages ? await processMessages({
-      messages: expectedMessages,
-      searchRoots,
-      repoRootPath,
-      guidelinePatterns,
-      messageType: "output",
-      verbose
-    }) : [];
-    const codeSnippets = extractCodeBlocks(inputSegments);
-    const expectedContent = expectedMessages[0]?.content;
-    const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
-    const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
-    const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
-    const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
-    const userFilePaths = [];
-    for (const segment of inputSegments) {
-      if (segment.type === "file" && typeof segment.resolvedPath === "string") {
-        userFilePaths.push(segment.resolvedPath);
-      }
-    }
-    const allFilePaths = [
-      ...guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
-      ...userFilePaths
-    ];
-    const testCase = {
-      id,
-      dataset: datasetName,
-      conversation_id: conversationId,
-      question,
-      input_messages: inputMessages,
-      input_segments: inputSegments,
-      output_segments: outputSegments,
-      reference_answer: referenceAnswer,
-      guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path2.default.resolve(guidelinePath)),
-      guideline_patterns: guidelinePatterns,
-      file_paths: allFilePaths,
-      code_snippets: codeSnippets,
-      expected_outcome: outcome,
-      evaluator: evalCaseEvaluatorKind,
-      evaluators
-    };
-    if (verbose) {
-      console.log(`
-[Eval Case: ${id}]`);
-      if (testCase.guideline_paths.length > 0) {
-        console.log(`  Guidelines used: ${testCase.guideline_paths.length}`);
-        for (const guidelinePath of testCase.guideline_paths) {
-          console.log(`    - ${guidelinePath}`);
-        }
-      } else {
-        console.log("  No guidelines found");
-      }
+    const valueValue = asString3(entry.value);
+    if (typeof valueValue === "string") {
+      parts.push({ content: valueValue, isFile: false });
+      continue;
     }
-    results.push(testCase);
+    parts.push({ content: JSON.stringify(entry), isFile: false });
   }
-  return results;
+  return formatFileContents(parts);
 }
-function needsRoleMarkers(messages, processedSegmentsByMessage) {
-  if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
-    return true;
-  }
-  let messagesWithContent = 0;
-  for (const segments of processedSegmentsByMessage) {
-    if (hasVisibleContent(segments)) {
-      messagesWithContent++;
-    }
-  }
-  return messagesWithContent > 1;
+function asString3(value) {
+  return typeof value === "string" ? value : void 0;
 }
-function hasVisibleContent(segments) {
-  return segments.some((segment) => {
-    const type = asString(segment.type);
-    if (type === "text") {
-      const value = asString(segment.value);
-      return value !== void 0 && value.trim().length > 0;
-    }
-    if (type === "guideline_ref") {
-      return false;
-    }
-    if (type === "file") {
-      const text = asString(segment.text);
-      return text !== void 0 && text.trim().length > 0;
-    }
-    return false;
-  });
+function cloneJsonObject(source) {
+  const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
+  return Object.fromEntries(entries);
 }
-function formatSegment(segment) {
-  const type = asString(segment.type);
-  if (type === "text") {
-    return asString(segment.value);
+function cloneJsonValue(value) {
+  if (value === null) {
+    return null;
   }
-  if (type === "guideline_ref") {
-    const refPath = asString(segment.path);
-    return refPath ? `<Attached: ${refPath}>` : void 0;
+  if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
+    return value;
   }
-  if (type === "file") {
-    const text = asString(segment.text);
-    const filePath = asString(segment.path);
-    if (text && filePath) {
-      return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
-    }
+  if (Array.isArray(value)) {
+    return value.map((item) => cloneJsonValue(item));
+  }
+  if (typeof value === "object") {
+    return cloneJsonObject(value);
+  }
+  return value;
+}
+function logWarning3(message, details) {
+  if (details && details.length > 0) {
+    const detailBlock = details.join("\n");
+    console.warn(`${ANSI_YELLOW3}Warning: ${message}
+${detailBlock}${ANSI_RESET3}`);
+  } else {
+    console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
   }
-  return void 0;
 }
+// src/evaluation/formatting/prompt-builder.ts
+var import_promises4 = require("fs/promises");
+var import_node_path5 = __toESM(require("path"), 1);
+var ANSI_YELLOW4 = "\x1B[33m";
+var ANSI_RESET4 = "\x1B[0m";
 async function buildPromptInputs(testCase) {
   const guidelineParts = [];
   for (const rawPath of testCase.guideline_paths) {
-    const absolutePath = import_node_path2.default.resolve(rawPath);
-    if (!await fileExists2(absolutePath)) {
-      logWarning(`Could not read guideline file ${absolutePath}: file does not exist`);
+    const absolutePath = import_node_path5.default.resolve(rawPath);
+    if (!await fileExists(absolutePath)) {
+      logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
       continue;
     }
     try {
-      const content = (await (0, import_promises2.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
+      const content = (await (0, import_promises4.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
       guidelineParts.push({
         content,
         isFile: true,
-        displayPath: import_node_path2.default.basename(absolutePath)
+        displayPath: import_node_path5.default.basename(absolutePath)
       });
     } catch (error) {
-      logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
+      logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
     }
   }
   const guidelines = formatFileContents(guidelineParts);
@@ -630,9 +692,9 @@ async function buildPromptInputs(testCase) {
             messageSegments.push({ type: "text", value: segment });
           }
         } else if (isJsonObject(segment)) {
-          const type = asString(segment.type);
+          const type = asString4(segment.type);
           if (type === "file") {
-            const value = asString(segment.value);
+            const value = asString4(segment.value);
             if (!value) continue;
             if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
               messageSegments.push({ type: "guideline_ref", path: value });
@@ -643,7 +705,7 @@ async function buildPromptInputs(testCase) {
               messageSegments.push({ type: "file", text: fileText, path: value });
             }
           } else if (type === "text") {
-            const textValue = asString(segment.value);
+            const textValue = asString4(segment.value);
             if (textValue && textValue.trim().length > 0) {
               messageSegments.push({ type: "text", value: textValue });
             }
@@ -699,6 +761,18 @@ ${messageContent}`);
   }) : void 0;
   return { question, guidelines, chatPrompt };
 }
+function needsRoleMarkers(messages, processedSegmentsByMessage) {
+  if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
+    return true;
+  }
+  let messagesWithContent = 0;
+  for (const segments of processedSegmentsByMessage) {
+    if (hasVisibleContent(segments)) {
+      messagesWithContent++;
+    }
+  }
+  return messagesWithContent > 1;
+}
 function buildChatPromptFromSegments(options) {
   const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
   if (messages.length === 0) {
@@ -740,13 +814,12 @@ ${guidelineContent.trim()}`);
     const segments = segmentsByMessage[i];
     const contentParts = [];
     let role = message.role;
-    let name;
     if (role === "system") {
       role = "assistant";
       contentParts.push("@[System]:");
     } else if (role === "tool") {
-      role = "function";
-      name = "tool";
+      role = "assistant";
+      contentParts.push("@[Tool]:");
     }
     for (const segment of segments) {
       if (segment.type === "guideline_ref") {
@@ -764,282 +837,509 @@ ${guidelineContent.trim()}`);
     if (contentParts.length === 0) {
       continue;
     }
+    const content = contentParts.join("\n");
     chatPrompt.push({
       role,
-      content: contentParts.join("\n"),
-      ...name ? { name } : {}
+      content
     });
   }
   return chatPrompt.length > 0 ? chatPrompt : void 0;
 }
-async function fileExists2(absolutePath) {
+function asString4(value) {
+  return typeof value === "string" ? value : void 0;
+}
+function logWarning4(message) {
+  console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
+}
+// src/evaluation/yaml-parser.ts
+var ANSI_YELLOW5 = "\x1B[33m";
+var ANSI_RESET5 = "\x1B[0m";
+var SCHEMA_EVAL_V2 = "agentv-eval-v2";
+async function readTestSuiteMetadata(testFilePath) {
   try {
-    await (0, import_promises2.access)(absolutePath, import_node_fs2.constants.F_OK);
-    return true;
+    const absolutePath = import_node_path6.default.resolve(testFilePath);
+    const content = await (0, import_promises5.readFile)(absolutePath, "utf8");
+    const parsed = (0, import_yaml2.parse)(content);
+    if (!isJsonObject(parsed)) {
+      return {};
+    }
+    return { target: extractTargetFromSuite(parsed) };
   } catch {
-    return false;
+    return {};
   }
 }
-function resolveToAbsolutePath(candidate) {
-  if (candidate instanceof URL) {
-    return (0, import_node_url.fileURLToPath)(candidate);
+async function loadEvalCases(evalFilePath, repoRoot, options) {
+  const verbose = options?.verbose ?? false;
+  const evalIdFilter = options?.evalId;
+  const absoluteTestPath = import_node_path6.default.resolve(evalFilePath);
+  const repoRootPath = resolveToAbsolutePath(repoRoot);
+  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
+  const config = await loadConfig(absoluteTestPath, repoRootPath);
+  const guidelinePatterns = config?.guideline_patterns;
+  const rawFile = await (0, import_promises5.readFile)(absoluteTestPath, "utf8");
+  const parsed = (0, import_yaml2.parse)(rawFile);
+  if (!isJsonObject(parsed)) {
+    throw new Error(`Invalid test file format: ${evalFilePath}`);
   }
-  if (typeof candidate === "string") {
-    if (candidate.startsWith("file://")) {
-      return (0, import_node_url.fileURLToPath)(new URL(candidate));
+  const suite = parsed;
+  const datasetNameFromSuite = asString5(suite.dataset)?.trim();
+  const fallbackDataset = import_node_path6.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
+  const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
+  const schema = suite.$schema;
+  if (schema !== SCHEMA_EVAL_V2) {
+    const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
+Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
+    throw new Error(message);
+  }
+  const rawTestcases = suite.evalcases;
+  if (!Array.isArray(rawTestcases)) {
+    throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
+  }
+  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
+  const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
+  const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
+  const results = [];
+  for (const rawEvalcase of rawTestcases) {
+    if (!isJsonObject(rawEvalcase)) {
+      logWarning5("Skipping invalid eval case entry (expected object)");
+      continue;
     }
-    return import_node_path2.default.resolve(candidate);
+    const evalcase = rawEvalcase;
+    const id = asString5(evalcase.id);
+    if (evalIdFilter && id !== evalIdFilter) {
+      continue;
+    }
+    const conversationId = asString5(evalcase.conversation_id);
+    const outcome = asString5(evalcase.outcome);
+    const inputMessagesValue = evalcase.input_messages;
+    const expectedMessagesValue = evalcase.expected_messages;
+    if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
+      logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
+      continue;
+    }
+    const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
+    const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
+    const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
+    if (hasExpectedMessages && expectedMessages.length === 0) {
+      logWarning5(`No valid expected message found for eval case: ${id}`);
+      continue;
+    }
+    if (expectedMessages.length > 1) {
+      logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
+    }
+    const guidelinePaths = [];
+    const inputTextParts = [];
+    const inputSegments = await processMessages({
+      messages: inputMessages,
+      searchRoots,
+      repoRootPath,
+      guidelinePatterns,
+      guidelinePaths,
+      textParts: inputTextParts,
+      messageType: "input",
+      verbose
+    });
+    const outputSegments = hasExpectedMessages ? await processMessages({
+      messages: expectedMessages,
+      searchRoots,
+      repoRootPath,
+      guidelinePatterns,
+      messageType: "output",
+      verbose
+    }) : [];
+    const codeSnippets = extractCodeBlocks(inputSegments);
+    const expectedContent = expectedMessages[0]?.content;
+    const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
+    const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
+    const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
+    const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
+    const userFilePaths = [];
+    for (const segment of inputSegments) {
+      if (segment.type === "file" && typeof segment.resolvedPath === "string") {
+        userFilePaths.push(segment.resolvedPath);
+      }
+    }
+    const allFilePaths = [
+      ...guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
+      ...userFilePaths
+    ];
+    const testCase = {
+      id,
+      dataset: datasetName,
+      conversation_id: conversationId,
+      question,
+      input_messages: inputMessages,
+      input_segments: inputSegments,
+      output_segments: outputSegments,
+      reference_answer: referenceAnswer,
+      guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
+      guideline_patterns: guidelinePatterns,
+      file_paths: allFilePaths,
+      code_snippets: codeSnippets,
+      expected_outcome: outcome,
+      evaluator: evalCaseEvaluatorKind,
+      evaluators
+    };
+    if (verbose) {
+      console.log(`
+[Eval Case: ${id}]`);
+      if (testCase.guideline_paths.length > 0) {
+        console.log(`  Guidelines used: ${testCase.guideline_paths.length}`);
+        for (const guidelinePath of testCase.guideline_paths) {
+          console.log(`    - ${guidelinePath}`);
+        }
+      } else {
+        console.log("  No guidelines found");
+      }
+    }
+    results.push(testCase);
   }
-  throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
+  return results;
 }
-function asString(value) {
+function asString5(value) {
   return typeof value === "string" ? value : void 0;
 }
-function cloneJsonObject(source) {
-  const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
-  return Object.fromEntries(entries);
-}
-function cloneJsonValue(value) {
-  if (value === null) {
-    return null;
-  }
-  if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
-    return value;
+function logWarning5(message, details) {
+  if (details && details.length > 0) {
+    const detailBlock = details.join("\n");
+    console.warn(`${ANSI_YELLOW5}Warning: ${message}
+${detailBlock}${ANSI_RESET5}`);
+  } else {
+    console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
   }
-  if (Array.isArray(value)) {
-    return value.map((item) => cloneJsonValue(item));
+}
+// src/evaluation/file-utils.ts
+var import_node_fs2 = require("fs");
+var import_promises6 = require("fs/promises");
+var import_node_path7 = __toESM(require("path"), 1);
+async function fileExists2(filePath) {
+  try {
+    await (0, import_promises6.access)(filePath, import_node_fs2.constants.F_OK);
+    return true;
+  } catch {
+    return false;
   }
-  return cloneJsonObject(value);
 }
-function formatFileContents(parts) {
-  const fileCount = parts.filter((p) => p.isFile).length;
-  if (fileCount > 0) {
-    return parts.map((part) => {
-      if (part.isFile && part.displayPath) {
-        return `<file path="${part.displayPath}">
-${part.content}
-</file>`;
-      }
-      return part.content;
-    }).join("\n\n");
+function normalizeLineEndings(content) {
+  return content.replace(/\r\n/g, "\n");
+}
+async function readTextFile(filePath) {
+  const content = await (0, import_promises6.readFile)(filePath, "utf8");
+  return normalizeLineEndings(content);
+}
+async function findGitRoot(startPath) {
+  let currentDir = import_node_path7.default.dirname(import_node_path7.default.resolve(startPath));
+  const root = import_node_path7.default.parse(currentDir).root;
+  while (currentDir !== root) {
+    const gitPath = import_node_path7.default.join(currentDir, ".git");
+    if (await fileExists2(gitPath)) {
+      return currentDir;
+    }
+    const parentDir = import_node_path7.default.dirname(currentDir);
+    if (parentDir === currentDir) {
+      break;
+    }
+    currentDir = parentDir;
   }
-  return parts.map((p) => p.content).join(" ");
+  return null;
 }
-async function resolveAssistantContent(content, searchRoots, verbose) {
-  if (typeof content === "string") {
-    return content;
+function buildDirectoryChain2(filePath, repoRoot) {
+  const directories = [];
+  const seen = /* @__PURE__ */ new Set();
+  const boundary = import_node_path7.default.resolve(repoRoot);
+  let current = import_node_path7.default.resolve(import_node_path7.default.dirname(filePath));
+  while (current !== void 0) {
+    if (!seen.has(current)) {
+      directories.push(current);
+      seen.add(current);
+    }
+    if (current === boundary) {
+      break;
+    }
+    const parent = import_node_path7.default.dirname(current);
+    if (parent === current) {
+      break;
+    }
+    current = parent;
   }
-  if (!content) {
-    return "";
+  if (!seen.has(boundary)) {
+    directories.push(boundary);
   }
-  const parts = [];
-  for (const entry of content) {
-    if (typeof entry === "string") {
-      parts.push({ content: entry, isFile: false });
-      continue;
-    }
-    if (!isJsonObject(entry)) {
-      continue;
+  return directories;
+}
+function buildSearchRoots2(evalPath, repoRoot) {
+  const uniqueRoots = [];
+  const addRoot = (root) => {
+    const normalized = import_node_path7.default.resolve(root);
+    if (!uniqueRoots.includes(normalized)) {
+      uniqueRoots.push(normalized);
     }
-    const segmentType = asString(entry.type);
-    if (segmentType === "file") {
-      const rawValue = asString(entry.value);
-      if (!rawValue) {
-        continue;
-      }
-      const { displayPath, resolvedPath, attempted } = await resolveFileReference(
-        rawValue,
-        searchRoots
-      );
-      if (!resolvedPath) {
-        const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
-        logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
-        continue;
-      }
-      try {
-        const fileContent = (await (0, import_promises2.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
-        parts.push({ content: fileContent, isFile: true, displayPath });
-        if (verbose) {
-          console.log(`  [Expected Assistant File] Found: ${displayPath}`);
-          console.log(`    Resolved to: ${resolvedPath}`);
-        }
-      } catch (error) {
-        logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
-      }
-      continue;
+  };
+  let currentDir = import_node_path7.default.dirname(evalPath);
+  let reachedBoundary = false;
+  while (!reachedBoundary) {
+    addRoot(currentDir);
+    const parentDir = import_node_path7.default.dirname(currentDir);
+    if (currentDir === repoRoot || parentDir === currentDir) {
+      reachedBoundary = true;
+    } else {
+      currentDir = parentDir;
     }
-    const textValue = asString(entry.text);
-    if (typeof textValue === "string") {
-      parts.push({ content: textValue, isFile: false });
+  }
+  addRoot(repoRoot);
+  addRoot(process.cwd());
+  return uniqueRoots;
+}
+function trimLeadingSeparators2(value) {
+  const trimmed = value.replace(/^[/\\]+/, "");
+  return trimmed.length > 0 ? trimmed : value;
+}
+async function resolveFileReference2(rawValue, searchRoots) {
+  const displayPath = trimLeadingSeparators2(rawValue);
+  const potentialPaths = [];
+  if (import_node_path7.default.isAbsolute(rawValue)) {
+    potentialPaths.push(import_node_path7.default.normalize(rawValue));
+  }
+  for (const base of searchRoots) {
+    potentialPaths.push(import_node_path7.default.resolve(base, displayPath));
+  }
+  const attempted = [];
+  const seen = /* @__PURE__ */ new Set();
+  for (const candidate of potentialPaths) {
+    const absoluteCandidate = import_node_path7.default.resolve(candidate);
+    if (seen.has(absoluteCandidate)) {
       continue;
     }
-    const valueValue = asString(entry.value);
-    if (typeof valueValue === "string") {
-      parts.push({ content: valueValue, isFile: false });
-      continue;
+    seen.add(absoluteCandidate);
+    attempted.push(absoluteCandidate);
+    if (await fileExists2(absoluteCandidate)) {
+      return { displayPath, resolvedPath: absoluteCandidate, attempted };
     }
-    parts.push({ content: JSON.stringify(entry), isFile: false });
   }
-  return formatFileContents(parts);
+  return { displayPath, attempted };
 }
-async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
-  const execution = rawEvalCase.execution;
-  const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
-  if (candidateEvaluators === void 0) {
-    return void 0;
+// src/evaluation/providers/ai-sdk.ts
+var import_anthropic = require("@ai-sdk/anthropic");
+var import_azure = require("@ai-sdk/azure");
+var import_google = require("@ai-sdk/google");
+var import_ai = require("ai");
+var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
+var AzureProvider = class {
+  constructor(targetName, config) {
+    this.config = config;
+    this.id = `azure:${targetName}`;
+    this.targetName = targetName;
+    this.defaults = {
+      temperature: config.temperature,
+      maxOutputTokens: config.maxOutputTokens
+    };
+    this.retryConfig = config.retry;
+    const azure = (0, import_azure.createAzure)(buildAzureOptions(config));
+    this.model = azure(config.deploymentName);
   }
-  if (!Array.isArray(candidateEvaluators)) {
-    logWarning(`Skipping evaluators for '${evalId}': expected array`);
-    return void 0;
+  id;
+  kind = "azure";
+  targetName;
+  model;
+  defaults;
+  retryConfig;
+  async invoke(request) {
+    return invokeModel({
+      model: this.model,
+      request,
+      defaults: this.defaults,
+      retryConfig: this.retryConfig
+    });
+  }
+};
+var AnthropicProvider = class {
+  constructor(targetName, config) {
+    this.config = config;
+    this.id = `anthropic:${targetName}`;
+    this.targetName = targetName;
+    this.defaults = {
+      temperature: config.temperature,
+      maxOutputTokens: config.maxOutputTokens,
+      thinkingBudget: config.thinkingBudget
+    };
+    this.retryConfig = config.retry;
+    const anthropic = (0, import_anthropic.createAnthropic)({
+      apiKey: config.apiKey
+    });
+    this.model = anthropic(config.model);
+  }
+  id;
+  kind = "anthropic";
+  targetName;
+  model;
+  defaults;
+  retryConfig;
+  async invoke(request) {
+    const providerOptions = buildAnthropicProviderOptions(this.defaults);
+    return invokeModel({
+      model: this.model,
+      request,
+      defaults: this.defaults,
+      retryConfig: this.retryConfig,
+      providerOptions
+    });
   }
-  const evaluators = [];
-  for (const rawEvaluator of candidateEvaluators) {
-    if (!isJsonObject(rawEvaluator)) {
-      logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
-      continue;
-    }
-    const name = asString(rawEvaluator.name);
-    const typeValue = rawEvaluator.type;
-    if (!name || !isEvaluatorKind(typeValue)) {
-      logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
-      continue;
-    }
-    if (typeValue === "code") {
-      const script = asString(rawEvaluator.script);
-      if (!script) {
-        logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
-        continue;
-      }
-      const cwd = asString(rawEvaluator.cwd);
-      let resolvedCwd;
-      if (cwd) {
-        const resolved = await resolveFileReference(cwd, searchRoots);
-        if (resolved.resolvedPath) {
-          resolvedCwd = import_node_path2.default.resolve(resolved.resolvedPath);
-        } else {
-          logWarning(
-            `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
-            resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => `  Tried: ${attempt}`) : void 0
-          );
-        }
-      } else {
-        resolvedCwd = searchRoots[0];
-      }
-      evaluators.push({
-        name,
-        type: "code",
-        script,
-        cwd,
-        resolvedCwd
-      });
-      continue;
-    }
-    const prompt = asString(rawEvaluator.prompt);
-    let promptPath;
-    if (prompt) {
-      const resolved = await resolveFileReference(prompt, searchRoots);
-      if (resolved.resolvedPath) {
-        promptPath = import_node_path2.default.resolve(resolved.resolvedPath);
-      } else {
-        logWarning(
-          `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
-          resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => `  Tried: ${attempt}`) : void 0
-        );
-      }
-    }
-    const model = asString(rawEvaluator.model);
-    evaluators.push({
-      name,
-      type: "llm_judge",
-      prompt,
-      promptPath
+};
+var GeminiProvider = class {
+  constructor(targetName, config) {
+    this.config = config;
+    this.id = `gemini:${targetName}`;
+    this.targetName = targetName;
+    this.defaults = {
+      temperature: config.temperature,
+      maxOutputTokens: config.maxOutputTokens
+    };
+    this.retryConfig = config.retry;
+    const google = (0, import_google.createGoogleGenerativeAI)({
+      apiKey: config.apiKey
     });
+    this.model = google(config.model);
   }
-  return evaluators.length > 0 ? evaluators : void 0;
+  id;
+  kind = "gemini";
+  targetName;
+  model;
+  defaults;
+  retryConfig;
+  async invoke(request) {
+    return invokeModel({
+      model: this.model,
+      request,
+      defaults: this.defaults,
+      retryConfig: this.retryConfig
+    });
+  }
+};
+function buildAzureOptions(config) {
+  const options = {
+    apiKey: config.apiKey,
+    apiVersion: config.version,
+    useDeploymentBasedUrls: true
+  };
+  const baseURL = normalizeAzureBaseUrl(config.resourceName);
+  if (baseURL) {
+    options.baseURL = baseURL;
+  } else {
+    options.resourceName = config.resourceName;
+  }
+  return options;
 }
-function coerceEvaluator(candidate, contextId) {
-  if (typeof candidate !== "string") {
+function normalizeAzureBaseUrl(resourceName) {
+  const trimmed = resourceName.trim();
+  if (!/^https?:\/\//i.test(trimmed)) {
     return void 0;
   }
-  if (isEvaluatorKind(candidate)) {
-    return candidate;
-  }
-  logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
-  return void 0;
+  const withoutSlash = trimmed.replace(/\/+$/, "");
+  const normalized = withoutSlash.endsWith("/openai") ? withoutSlash : `${withoutSlash}/openai`;
+  return normalized;
 }
-function logWarning(message, details) {
-  if (details && details.length > 0) {
-    const detailBlock = details.join("\n");
-    console.warn(`${ANSI_YELLOW}Warning: ${message}
-${detailBlock}${ANSI_RESET}`);
-  } else {
-    console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
+function buildAnthropicProviderOptions(defaults) {
+  if (defaults.thinkingBudget === void 0) {
+    return void 0;
   }
+  return {
+    anthropic: {
+      thinking: {
+        type: "enabled",
+        budgetTokens: defaults.thinkingBudget
+      }
+    }
+  };
 }
-// src/evaluation/providers/ax.ts
-var import_ax = require("@ax-llm/ax");
-var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
 function buildChatPrompt(request) {
-  if (request.chatPrompt) {
-    const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
+  const provided = request.chatPrompt?.length ? request.chatPrompt : void 0;
+  if (provided) {
+    const hasSystemMessage = provided.some((message) => message.role === "system");
     if (hasSystemMessage) {
-      return request.chatPrompt;
+      return provided;
     }
-    const systemContent2 = resolveSystemContent(request);
-    return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
+    const systemContent2 = resolveSystemContent(request, false);
+    return [{ role: "system", content: systemContent2 }, ...provided];
   }
-  const systemContent = resolveSystemContent(request);
+  const systemContent = resolveSystemContent(request, true);
   const userContent = request.question.trim();
   const prompt = [
-    {
-      role: "system",
-      content: systemContent
-    },
-    {
-      role: "user",
-      content: userContent
-    }
+    { role: "system", content: systemContent },
+    { role: "user", content: userContent }
   ];
   return prompt;
 }
-function resolveSystemContent(request) {
+function resolveSystemContent(request, includeGuidelines) {
   const systemSegments = [];
-  const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
-  if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
-    systemSegments.push(metadataSystemPrompt.trim());
+  if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
+    systemSegments.push(request.systemPrompt.trim());
   } else {
     systemSegments.push(DEFAULT_SYSTEM_PROMPT);
   }
-  if (request.guidelines && request.guidelines.trim().length > 0) {
+  if (includeGuidelines && request.guidelines && request.guidelines.trim().length > 0) {
     systemSegments.push(`[[ ## Guidelines ## ]]
 ${request.guidelines.trim()}`);
   }
   return systemSegments.join("\n\n");
 }
-function extractModelConfig(request, defaults) {
+function toModelMessages(chatPrompt) {
+  return chatPrompt.map((message) => {
+    if (message.role === "tool" || message.role === "function") {
+      const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
+      return {
+        role: "assistant",
+        content: `${prefix}${message.content}`
+      };
+    }
+    if (message.role === "assistant" || message.role === "system" || message.role === "user") {
+      return {
+        role: message.role,
+        content: message.content
+      };
+    }
+    return {
+      role: "user",
+      content: message.content
+    };
+  });
+}
+function resolveModelSettings(request, defaults) {
   const temperature = request.temperature ?? defaults.temperature;
-  const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
-  const config = {};
-  if (temperature !== void 0) {
-    config.temperature = temperature;
-  }
-  if (maxTokens !== void 0) {
-    config.maxTokens = maxTokens;
-  }
-  return Object.keys(config).length > 0 ? config : void 0;
+  const maxOutputTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
+  return {
+    temperature,
+    maxOutputTokens
+  };
+}
+async function invokeModel(options) {
+  const { model, request, defaults, retryConfig, providerOptions } = options;
+  const chatPrompt = buildChatPrompt(request);
+  const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
+  const result = await withRetry(
+    () => (0, import_ai.generateText)({
+      model,
+      messages: toModelMessages(chatPrompt),
+      temperature,
+      maxOutputTokens,
+      maxRetries: 0,
+      abortSignal: request.signal,
+      ...providerOptions ? { providerOptions } : {}
+    }),
+    retryConfig,
+    request.signal
+  );
+  return mapResponse(result);
 }
-function mapResponse(response) {
-  const primary = response.results[0];
-  const text = typeof primary?.content === "string" ? primary.content : "";
-  const reasoning = primary?.thought ?? primary?.thoughtBlock?.data;
-  const usage = toJsonObject(response.modelUsage);
+function mapResponse(result) {
   return {
-    text,
-    reasoning,
-    raw: response,
-    usage
+    text: result.text ?? "",
+    reasoning: result.reasoningText ?? void 0,
+    raw: result,
+    usage: toJsonObject(result.totalUsage ?? result.usage)
   };
 }
 function toJsonObject(value) {
@@ -1052,34 +1352,59 @@ function toJsonObject(value) {
     return void 0;
   }
 }
-function ensureChatResponse(result) {
-  if (typeof ReadableStream !== "undefined" && result instanceof ReadableStream) {
-    throw new Error("Streaming responses are not supported for this provider");
+function extractStatus(error) {
+  if (!error || typeof error !== "object") {
+    return void 0;
+  }
+  const candidate = error;
+  const directStatus = candidate.status ?? candidate.statusCode;
+  if (typeof directStatus === "number" && Number.isFinite(directStatus)) {
+    return directStatus;
   }
-  if (!result || typeof result !== "object" || !("results" in result)) {
-    throw new Error("Unexpected response type from AxAI provider");
+  const responseStatus = typeof candidate.response === "object" && candidate.response ? candidate.response.status : void 0;
+  if (typeof responseStatus === "number" && Number.isFinite(responseStatus)) {
+    return responseStatus;
+  }
+  const message = typeof candidate.message === "string" ? candidate.message : void 0;
+  if (message) {
+    const match = message.match(/HTTP\s+(\d{3})/i);
+    if (match) {
+      const parsed = Number.parseInt(match[1], 10);
+      if (Number.isFinite(parsed)) {
+        return parsed;
+      }
+    }
   }
-  return result;
+  return void 0;
 }
-function isRetryableError(error, retryableStatusCodes) {
+function isNetworkError(error) {
   if (!error || typeof error !== "object") {
     return false;
   }
-  if ("status" in error && typeof error.status === "number") {
-    return retryableStatusCodes.includes(error.status);
+  const candidate = error;
+  if (candidate.name === "AbortError") {
+    return false;
   }
-  if ("message" in error && typeof error.message === "string") {
-    const match = error.message.match(/HTTP (\d{3})/);
-    if (match) {
-      const status = Number.parseInt(match[1], 10);
-      return retryableStatusCodes.includes(status);
-    }
+  const code = candidate.code;
+  if (typeof code === "string" && /^E(AI|CONN|HOST|NET|PIPE|TIME|REFUSED|RESET)/i.test(code)) {
+    return true;
   }
-  if ("name" in error && error.name === "AxAIServiceNetworkError") {
+  const message = typeof candidate.message === "string" ? candidate.message : void 0;
+  if (message && /(network|fetch failed|ECONNRESET|ENOTFOUND|EAI_AGAIN|ETIMEDOUT|ECONNREFUSED)/i.test(message)) {
     return true;
   }
   return false;
 }
+function isRetryableError(error, retryableStatusCodes) {
+  const status = extractStatus(error);
+  if (status === 401 || status === 403) {
+    return false;
+  }
+  if (typeof status === "number") {
+    return retryableStatusCodes.includes(status);
+  }
+  return isNetworkError(error);
+}
 function calculateRetryDelay(attempt, config) {
   const delay = Math.min(
     config.maxDelayMs,
@@ -1115,152 +1440,16 @@ async function withRetry(fn, retryConfig, signal) {
       }
       const delay = calculateRetryDelay(attempt, config);
       await sleep(delay);
-      if (signal?.aborted) {
-        throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
-      }
     }
   }
   throw lastError;
 }
-var AzureProvider = class {
-  constructor(targetName, config) {
-    this.config = config;
-    this.id = `azure:${targetName}`;
-    this.targetName = targetName;
-    this.defaults = {
-      temperature: config.temperature,
-      maxOutputTokens: config.maxOutputTokens
-    };
-    this.retryConfig = config.retry;
-    this.ai = import_ax.AxAI.create({
-      name: "azure-openai",
-      apiKey: config.apiKey,
-      resourceName: config.resourceName,
-      deploymentName: config.deploymentName,
-      version: config.version,
-      config: {
-        stream: false
-      }
-    });
-  }
-  id;
-  kind = "azure";
-  targetName;
-  ai;
-  defaults;
-  retryConfig;
-  async invoke(request) {
-    const chatPrompt = buildChatPrompt(request);
-    const modelConfig = extractModelConfig(request, this.defaults);
-    const response = await withRetry(
-      async () => await this.ai.chat(
-        {
-          chatPrompt,
-          model: this.config.deploymentName,
-          ...modelConfig ? { modelConfig } : {}
-        },
-        request.signal ? { abortSignal: request.signal } : void 0
-      ),
-      this.retryConfig,
-      request.signal
-    );
-    return mapResponse(ensureChatResponse(response));
-  }
-  getAxAI() {
-    return this.ai;
-  }
-};
-var AnthropicProvider = class {
-  constructor(targetName, config) {
-    this.config = config;
-    this.id = `anthropic:${targetName}`;
-    this.targetName = targetName;
-    this.defaults = {
-      temperature: config.temperature,
-      maxOutputTokens: config.maxOutputTokens,
-      thinkingBudget: config.thinkingBudget
-    };
-    this.retryConfig = config.retry;
-    this.ai = import_ax.AxAI.create({
-      name: "anthropic",
-      apiKey: config.apiKey
-    });
-  }
-  id;
-  kind = "anthropic";
-  targetName;
-  ai;
-  defaults;
-  retryConfig;
-  async invoke(request) {
-    const chatPrompt = buildChatPrompt(request);
-    const modelConfig = extractModelConfig(request, this.defaults);
-    const response = await withRetry(
-      async () => await this.ai.chat(
-        {
-          chatPrompt,
-          model: this.config.model,
-          ...modelConfig ? { modelConfig } : {}
-        },
-        request.signal ? { abortSignal: request.signal } : void 0
-      ),
-      this.retryConfig,
-      request.signal
-    );
-    return mapResponse(ensureChatResponse(response));
-  }
-  getAxAI() {
-    return this.ai;
-  }
-};
-var GeminiProvider = class {
-  constructor(targetName, config) {
-    this.config = config;
-    this.id = `gemini:${targetName}`;
-    this.targetName = targetName;
-    this.defaults = {
-      temperature: config.temperature,
-      maxOutputTokens: config.maxOutputTokens
-    };
-    this.retryConfig = config.retry;
-    this.ai = import_ax.AxAI.create({
-      name: "google-gemini",
-      apiKey: config.apiKey
-    });
-  }
-  id;
-  kind = "gemini";
-  targetName;
-  ai;
-  defaults;
-  retryConfig;
-  async invoke(request) {
-    const chatPrompt = buildChatPrompt(request);
-    const modelConfig = extractModelConfig(request, this.defaults);
-    const response = await withRetry(
-      async () => await this.ai.chat(
-        {
-          chatPrompt,
-          model: this.config.model,
-          ...modelConfig ? { modelConfig } : {}
-        },
-        request.signal ? { abortSignal: request.signal } : void 0
-      ),
-      this.retryConfig,
-      request.signal
-    );
-    return mapResponse(ensureChatResponse(response));
-  }
-  getAxAI() {
-    return this.ai;
-  }
-};
 // src/evaluation/providers/cli.ts
 var import_node_child_process = require("child_process");
-var import_promises3 = __toESM(require("fs/promises"), 1);
+var import_promises7 = __toESM(require("fs/promises"), 1);
 var import_node_os = __toESM(require("os"), 1);
-var import_node_path3 = __toESM(require("path"), 1);
+var import_node_path8 = __toESM(require("path"), 1);
 var import_node_util = require("util");
 var execAsync = (0, import_node_util.promisify)(import_node_child_process.exec);
 var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
@@ -1302,12 +1491,14 @@ var CliProvider = class {
   supportsBatch = false;
   config;
   runCommand;
+  verbose;
   healthcheckPromise;
   constructor(targetName, config, runner = defaultCommandRunner) {
     this.targetName = targetName;
     this.id = `cli:${targetName}`;
     this.config = config;
     this.runCommand = runner;
+    this.verbose = config.verbose ?? false;
   }
   async invoke(request) {
     if (request.signal?.aborted) {
@@ -1357,7 +1548,7 @@ var CliProvider = class {
       const errorMsg = error instanceof Error ? error.message : String(error);
       throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
     } finally {
-      await import_promises3.default.unlink(filePath).catch(() => {
+      await import_promises7.default.unlink(filePath).catch(() => {
       });
     }
   }
@@ -1408,6 +1599,11 @@ var CliProvider = class {
         generateOutputFilePath("healthcheck")
       )
     );
+    if (this.verbose) {
+      console.log(
+        `[cli-provider:${this.targetName}] (healthcheck) CLI_EVALS_DIR=${process.env.CLI_EVALS_DIR ?? ""} cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
+      );
+    }
     const result = await this.runCommand(renderedCommand, {
       cwd: healthcheck.cwd ?? this.config.cwd,
       env: process.env,
@@ -1439,7 +1635,7 @@ function normalizeInputFiles(inputFiles) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = import_node_path3.default.resolve(inputFile);
+    const absolutePath = import_node_path8.default.resolve(inputFile);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -1453,7 +1649,7 @@ function formatFileList(files, template) {
   const formatter = template ?? "{path}";
   return files.map((filePath) => {
     const escapedPath = shellEscape(filePath);
-    const escapedName = shellEscape(import_node_path3.default.basename(filePath));
+    const escapedName = shellEscape(import_node_path8.default.basename(filePath));
     return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
   }).join(" ");
 }
@@ -1477,7 +1673,7 @@ function generateOutputFilePath(evalCaseId) {
   const safeEvalId = evalCaseId || "unknown";
   const timestamp = Date.now();
   const random = Math.random().toString(36).substring(2, 9);
-  return import_node_path3.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
+  return import_node_path8.default.join(import_node_os.default.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
 }
 function formatTimeoutSuffix(timeoutMs) {
   if (!timeoutMs || timeoutMs <= 0) {
@@ -1491,9 +1687,9 @@ function formatTimeoutSuffix(timeoutMs) {
 var import_node_child_process2 = require("child_process");
 var import_node_crypto = require("crypto");
 var import_node_fs3 = require("fs");
-var import_promises4 = require("fs/promises");
+var import_promises8 = require("fs/promises");
 var import_node_os2 = require("os");
-var import_node_path5 = __toESM(require("path"), 1);
+var import_node_path10 = __toESM(require("path"), 1);
 var import_node_util2 = require("util");
 // src/evaluation/providers/codex-log-tracker.ts
@@ -1550,7 +1746,7 @@ function subscribeToCodexLogEntries(listener) {
 }
 // src/evaluation/providers/preread.ts
-var import_node_path4 = __toESM(require("path"), 1);
+var import_node_path9 = __toESM(require("path"), 1);
 function buildPromptDocument(request, inputFiles, options) {
   const parts = [];
   const guidelineFiles = collectGuidelineFiles(
@@ -1575,7 +1771,7 @@ function normalizeInputFiles2(inputFiles) {
   }
   const deduped = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = import_node_path4.default.resolve(inputFile);
+    const absolutePath = import_node_path9.default.resolve(inputFile);
     if (!deduped.has(absolutePath)) {
       deduped.set(absolutePath, absolutePath);
     }
@@ -1588,14 +1784,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = import_node_path4.default.resolve(inputFile);
+    const absolutePath = import_node_path9.default.resolve(inputFile);
     if (overrides?.has(absolutePath)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
       }
       continue;
     }
-    const normalized = absolutePath.split(import_node_path4.default.sep).join("/");
+    const normalized = absolutePath.split(import_node_path9.default.sep).join("/");
     if (isGuidelineFile(normalized, guidelinePatterns)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
@@ -1610,7 +1806,7 @@ function collectInputFiles(inputFiles) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = import_node_path4.default.resolve(inputFile);
+    const absolutePath = import_node_path9.default.resolve(inputFile);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -1622,7 +1818,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
     return "";
   }
   const buildList = (files) => files.map((absolutePath) => {
-    const fileName = import_node_path4.default.basename(absolutePath);
+    const fileName = import_node_path9.default.basename(absolutePath);
     const fileUri = pathToFileUri(absolutePath);
     return `* [${fileName}](${fileUri})`;
   });
@@ -1642,7 +1838,7 @@ ${buildList(inputFiles).join("\n")}.`);
   return sections.join("\n");
 }
 function pathToFileUri(filePath) {
-  const absolutePath = import_node_path4.default.isAbsolute(filePath) ? filePath : import_node_path4.default.resolve(filePath);
+  const absolutePath = import_node_path9.default.isAbsolute(filePath) ? filePath : import_node_path9.default.resolve(filePath);
   const normalizedPath = absolutePath.replace(/\\/g, "/");
   if (/^[a-zA-Z]:\//.test(normalizedPath)) {
     return `file:///${normalizedPath}`;
@@ -1680,8 +1876,8 @@ var CodexProvider = class {
     const logger = await this.createStreamLogger(request).catch(() => void 0);
     try {
       const promptContent = buildPromptDocument(request, inputFiles);
-      const promptFile = import_node_path5.default.join(workspaceRoot, PROMPT_FILENAME);
-      await (0, import_promises4.writeFile)(promptFile, promptContent, "utf8");
+      const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
+      await (0, import_promises8.writeFile)(promptFile, promptContent, "utf8");
       const args = this.buildCodexArgs();
       const cwd = this.resolveCwd(workspaceRoot);
       const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
@@ -1730,7 +1926,7 @@ var CodexProvider = class {
     if (!this.config.cwd) {
       return workspaceRoot;
     }
-    return import_node_path5.default.resolve(this.config.cwd);
+    return import_node_path10.default.resolve(this.config.cwd);
   }
   buildCodexArgs() {
     const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
@@ -1764,11 +1960,11 @@ var CodexProvider = class {
     }
   }
   async createWorkspace() {
-    return await (0, import_promises4.mkdtemp)(import_node_path5.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
+    return await (0, import_promises8.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
   }
   async cleanupWorkspace(workspaceRoot) {
     try {
-      await (0, import_promises4.rm)(workspaceRoot, { recursive: true, force: true });
+      await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
     } catch {
     }
   }
@@ -1778,9 +1974,9 @@ var CodexProvider = class {
       return void 0;
     }
     if (this.config.logDir) {
-      return import_node_path5.default.resolve(this.config.logDir);
+      return import_node_path10.default.resolve(this.config.logDir);
     }
-    return import_node_path5.default.join(process.cwd(), ".agentv", "logs", "codex");
+    return import_node_path10.default.join(process.cwd(), ".agentv", "logs", "codex");
   }
   async createStreamLogger(request) {
     const logDir = this.resolveLogDirectory();
@@ -1788,13 +1984,13 @@ var CodexProvider = class {
       return void 0;
     }
     try {
-      await (0, import_promises4.mkdir)(logDir, { recursive: true });
+      await (0, import_promises8.mkdir)(logDir, { recursive: true });
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
       return void 0;
     }
-    const filePath = import_node_path5.default.join(logDir, buildLogFilename(request, this.targetName));
+    const filePath = import_node_path10.default.join(logDir, buildLogFilename(request, this.targetName));
     try {
       const logger = await CodexStreamLogger.create({
         filePath,
@@ -2009,9 +2205,9 @@ function tryParseJsonValue(rawLine) {
 async function locateExecutable(candidate) {
   const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
   if (includesPathSeparator) {
-    const resolved = import_node_path5.default.isAbsolute(candidate) ? candidate : import_node_path5.default.resolve(candidate);
+    const resolved = import_node_path10.default.isAbsolute(candidate) ? candidate : import_node_path10.default.resolve(candidate);
     const executablePath = await ensureWindowsExecutableVariant(resolved);
-    await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
+    await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
     return executablePath;
   }
   const locator = process.platform === "win32" ? "where" : "which";
@@ -2021,7 +2217,7 @@ async function locateExecutable(candidate) {
     const preferred = selectExecutableCandidate(lines);
     if (preferred) {
       const executablePath = await ensureWindowsExecutableVariant(preferred);
-      await (0, import_promises4.access)(executablePath, import_node_fs3.constants.F_OK);
+      await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
       return executablePath;
     }
   } catch {
@@ -2055,7 +2251,7 @@ async function ensureWindowsExecutableVariant(candidate) {
   for (const ext of extensions) {
     const withExtension = `${candidate}${ext}`;
     try {
-      await (0, import_promises4.access)(withExtension, import_node_fs3.constants.F_OK);
+      await (0, import_promises8.access)(withExtension, import_node_fs3.constants.F_OK);
       return withExtension;
     } catch {
     }
@@ -2867,7 +3063,7 @@ function resolveOptionalNumberArray(source, description) {
 }
 // src/evaluation/providers/vscode.ts
-var import_node_path6 = __toESM(require("path"), 1);
+var import_node_path11 = __toESM(require("path"), 1);
 var import_subagent = require("subagent");
 var VSCodeProvider = class {
   id;
@@ -2980,6 +3176,9 @@ var VSCodeProvider = class {
 };
 function buildPromptDocument2(request, attachments, guidelinePatterns) {
   const parts = [];
+  if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
+    parts.push(request.systemPrompt.trim());
+  }
   const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
   const attachmentFiles = collectAttachmentFiles(attachments);
   const nonGuidelineAttachments = attachmentFiles.filter(
@@ -2997,7 +3196,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
     return "";
   }
   const buildList = (files) => files.map((absolutePath) => {
-    const fileName = import_node_path6.default.basename(absolutePath);
+    const fileName = import_node_path11.default.basename(absolutePath);
     const fileUri = pathToFileUri2(absolutePath);
     return `* [${fileName}](${fileUri})`;
   });
@@ -3022,8 +3221,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = import_node_path6.default.resolve(attachment);
-    const normalized = absolutePath.split(import_node_path6.default.sep).join("/");
+    const absolutePath = import_node_path11.default.resolve(attachment);
+    const normalized = absolutePath.split(import_node_path11.default.sep).join("/");
     if (isGuidelineFile(normalized, guidelinePatterns)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
@@ -3038,7 +3237,7 @@ function collectAttachmentFiles(attachments) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = import_node_path6.default.resolve(attachment);
+    const absolutePath = import_node_path11.default.resolve(attachment);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -3046,7 +3245,7 @@ function collectAttachmentFiles(attachments) {
   return Array.from(unique.values());
 }
 function pathToFileUri2(filePath) {
-  const absolutePath = import_node_path6.default.isAbsolute(filePath) ? filePath : import_node_path6.default.resolve(filePath);
+  const absolutePath = import_node_path11.default.isAbsolute(filePath) ? filePath : import_node_path11.default.resolve(filePath);
   const normalizedPath = absolutePath.replace(/\\/g, "/");
   if (/^[a-zA-Z]:\//.test(normalizedPath)) {
     return `file:///${normalizedPath}`;
@@ -3059,7 +3258,7 @@ function normalizeAttachments(attachments) {
   }
   const deduped = /* @__PURE__ */ new Set();
   for (const attachment of attachments) {
-    deduped.add(import_node_path6.default.resolve(attachment));
+    deduped.add(import_node_path11.default.resolve(attachment));
   }
   return Array.from(deduped);
 }
@@ -3068,7 +3267,7 @@ function mergeAttachments(all) {
   for (const list of all) {
     if (!list) continue;
     for (const inputFile of list) {
-      deduped.add(import_node_path6.default.resolve(inputFile));
+      deduped.add(import_node_path11.default.resolve(inputFile));
     }
   }
   return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -3114,9 +3313,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
 // src/evaluation/providers/targets-file.ts
 var import_node_fs4 = require("fs");
-var import_promises5 = require("fs/promises");
-var import_node_path7 = __toESM(require("path"), 1);
-var import_yaml2 = require("yaml");
+var import_promises9 = require("fs/promises");
+var import_node_path12 = __toESM(require("path"), 1);
+var import_yaml3 = require("yaml");
 // src/evaluation/providers/types.ts
 var AGENT_PROVIDER_KINDS = [
@@ -3177,19 +3376,19 @@ function assertTargetDefinition(value, index, filePath) {
 }
 async function fileExists3(filePath) {
   try {
-    await (0, import_promises5.access)(filePath, import_node_fs4.constants.F_OK);
+    await (0, import_promises9.access)(filePath, import_node_fs4.constants.F_OK);
     return true;
   } catch {
     return false;
   }
 }
 async function readTargetDefinitions(filePath) {
-  const absolutePath = import_node_path7.default.resolve(filePath);
+  const absolutePath = import_node_path12.default.resolve(filePath);
   if (!await fileExists3(absolutePath)) {
     throw new Error(`targets.yaml not found at ${absolutePath}`);
   }
-  const raw = await (0, import_promises5.readFile)(absolutePath, "utf8");
-  const parsed = (0, import_yaml2.parse)(raw);
+  const raw = await (0, import_promises9.readFile)(absolutePath, "utf8");
+  const parsed = (0, import_yaml3.parse)(raw);
   if (!isRecord(parsed)) {
     throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
   }
@@ -3232,18 +3431,34 @@ function resolveAndCreateProvider(definition, env = process.env) {
 }
 // src/evaluation/evaluators.ts
-var import_node_crypto2 = require("crypto");
+var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
+Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
+Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
+[[ ## expected_outcome ## ]]
+{{expected_outcome}}
+[[ ## question ## ]]
+{{question}}
+[[ ## reference_answer ## ]]
+{{reference_answer}}
+[[ ## candidate_answer ## ]]
+{{candidate_answer}}`;
 var LlmJudgeEvaluator = class {
   kind = "llm_judge";
   resolveJudgeProvider;
   maxOutputTokens;
   temperature;
-  customPrompt;
+  evaluatorTemplate;
   constructor(options) {
     this.resolveJudgeProvider = options.resolveJudgeProvider;
     this.maxOutputTokens = options.maxOutputTokens;
     this.temperature = options.temperature;
-    this.customPrompt = options.customPrompt;
+    this.evaluatorTemplate = options.evaluatorTemplate;
   }
   async evaluate(context) {
     const judgeProvider = await this.resolveJudgeProvider(context);
@@ -3253,26 +3468,21 @@ var LlmJudgeEvaluator = class {
     return this.evaluateWithPrompt(context, judgeProvider);
   }
   async evaluateWithPrompt(context, judgeProvider) {
-    const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
     const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
-    let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
-    let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
-    if (systemPrompt && hasTemplateVariables(systemPrompt)) {
-      const variables = {
-        input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
-        output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
-        candidate_answer: context.candidate,
-        reference_answer: context.evalCase.reference_answer ?? "",
-        expected_outcome: context.evalCase.expected_outcome,
-        question: formattedQuestion
-      };
-      prompt = substituteVariables(systemPrompt, variables);
-      systemPrompt = buildSystemPrompt(hasReferenceAnswer);
-    }
-    const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
+    const variables = {
+      input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
+      output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
+      candidate_answer: context.candidate.trim(),
+      reference_answer: (context.evalCase.reference_answer ?? "").trim(),
+      expected_outcome: context.evalCase.expected_outcome.trim(),
+      question: formattedQuestion.trim()
+    };
+    const systemPrompt = buildOutputSchema();
+    const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
+    const userPrompt = substituteVariables(evaluatorTemplate, variables);
     const response = await judgeProvider.invoke({
-      question: prompt,
-      metadata,
+      question: userPrompt,
+      systemPrompt,
       evalCaseId: context.evalCase.id,
       attempt: context.attempt,
       maxOutputTokens: this.maxOutputTokens,
@@ -3285,11 +3495,9 @@ var LlmJudgeEvaluator = class {
     const reasoning = parsed.reasoning ?? response.reasoning;
     const expectedAspectCount = Math.max(hits.length + misses.length, 1);
     const evaluatorRawRequest = {
-      id: (0, import_node_crypto2.randomUUID)(),
-      provider: judgeProvider.id,
-      prompt,
-      target: context.target.name,
-      ...systemPrompt !== void 0 && { systemPrompt }
+      userPrompt,
+      systemPrompt,
+      target: judgeProvider.targetName
     };
     return {
       score,
@@ -3301,20 +3509,8 @@ var LlmJudgeEvaluator = class {
     };
   }
 };
-function buildSystemPrompt(hasReferenceAnswer) {
-  const basePrompt = [
-    "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
-    ""
-  ];
-  if (hasReferenceAnswer) {
-    basePrompt.push(
-      "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
-      ""
-    );
-  }
-  basePrompt.push(
-    "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
-    "",
+function buildOutputSchema() {
+  return [
     "You must respond with a single JSON object matching this schema:",
     "",
     "{",
@@ -3323,30 +3519,7 @@ function buildSystemPrompt(hasReferenceAnswer) {
     '  "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
     '  "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
     "}"
-  );
-  return basePrompt.join("\n");
-}
-function buildQualityPrompt(evalCase, candidate, question) {
-  const parts = [
-    "[[ ## expected_outcome ## ]]",
-    evalCase.expected_outcome.trim(),
-    "",
-    "[[ ## question ## ]]",
-    question.trim(),
-    ""
-  ];
-  if (hasNonEmptyReferenceAnswer(evalCase)) {
-    parts.push(
-      "[[ ## reference_answer ## ]]",
-      evalCase.reference_answer.trim(),
-      ""
-    );
-  }
-  parts.push(
-    "[[ ## candidate_answer ## ]]",
-    candidate.trim()
-  );
-  return parts.join("\n");
+  ].join("\n");
 }
 function clampScore(value) {
   if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -3428,9 +3601,6 @@ function extractJsonBlob(text) {
 function isNonEmptyString(value) {
   return typeof value === "string" && value.trim().length > 0;
 }
-function hasNonEmptyReferenceAnswer(evalCase) {
-  return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
-}
 var CodeEvaluator = class {
   kind = "code";
   script;
@@ -3536,19 +3706,16 @@ function parseJsonSafe(payload) {
     return void 0;
   }
 }
-function hasTemplateVariables(text) {
-  return /\$\{[a-zA-Z0-9_]+\}/.test(text);
-}
 function substituteVariables(template, variables) {
-  return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
+  return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
     return variables[varName] ?? match;
   });
 }
 // src/evaluation/orchestrator.ts
-var import_node_crypto3 = require("crypto");
-var import_promises6 = require("fs/promises");
-var import_node_path8 = __toESM(require("path"), 1);
+var import_node_crypto2 = require("crypto");
+var import_promises10 = require("fs/promises");
+var import_node_path13 = __toESM(require("path"), 1);
 // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
 var Node = class {
@@ -4111,6 +4278,7 @@ async function evaluateCandidate(options) {
     }
   }
   return {
+    timestamp: completedAt.toISOString(),
     eval_id: evalCase.id,
     dataset: evalCase.dataset,
     conversation_id: evalCase.conversation_id,
@@ -4118,14 +4286,12 @@ async function evaluateCandidate(options) {
     hits: score.hits,
     misses: score.misses,
     candidate_answer: candidate,
-    expected_aspect_count: score.expectedAspectCount,
     target: target.name,
-    timestamp: completedAt.toISOString(),
     reasoning: score.reasoning,
     raw_aspects: score.rawAspects,
     agent_provider_request: agentProviderRequest,
     lm_provider_request: lmProviderRequest,
-    evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
+    evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
     evaluator_results: evaluatorResults
   };
 }
@@ -4202,7 +4368,7 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluator_raw_request: score2.evaluatorRawRequest
+          evaluator_provider_request: score2.evaluatorRawRequest
         });
         continue;
       }
@@ -4229,7 +4395,7 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluator_raw_request: score2.evaluatorRawRequest
+          evaluator_provider_request: score2.evaluatorRawRequest
         });
         continue;
       }
@@ -4282,7 +4448,7 @@ async function runLlmJudgeEvaluator(options) {
     promptInputs,
     now,
     judgeProvider,
-    systemPrompt: customPrompt,
+    evaluatorTemplateOverride: customPrompt,
     evaluator: config
   });
 }
@@ -4323,22 +4489,22 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
 async function dumpPrompt(directory, evalCase, promptInputs) {
   const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
   const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
-  const filePath = import_node_path8.default.resolve(directory, filename);
-  await (0, import_promises6.mkdir)(import_node_path8.default.dirname(filePath), { recursive: true });
+  const filePath = import_node_path13.default.resolve(directory, filename);
+  await (0, import_promises10.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
   const payload = {
     eval_id: evalCase.id,
     question: promptInputs.question,
     guidelines: promptInputs.guidelines,
     guideline_paths: evalCase.guideline_paths
   };
-  await (0, import_promises6.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
+  await (0, import_promises10.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
 }
 function sanitizeFilename(value) {
   if (!value) {
     return "prompt";
   }
   const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
-  return sanitized.length > 0 ? sanitized : (0, import_node_crypto3.randomUUID)();
+  return sanitized.length > 0 ? sanitized : (0, import_node_crypto2.randomUUID)();
 }
 async function invokeProvider(provider, options) {
   const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
@@ -4394,6 +4560,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
     }
   }
   return {
+    timestamp: timestamp.toISOString(),
     eval_id: evalCase.id,
     dataset: evalCase.dataset,
     conversation_id: evalCase.conversation_id,
@@ -4401,9 +4568,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
     hits: [],
     misses: [`Error: ${message}`],
     candidate_answer: `Error occurred: ${message}`,
-    expected_aspect_count: 0,
     target: targetName,
-    timestamp: timestamp.toISOString(),
     raw_aspects: [],
     agent_provider_request: agentProviderRequest,
     lm_provider_request: lmProviderRequest,
@@ -4411,7 +4576,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
   };
 }
 function createCacheKey(provider, target, evalCase, promptInputs) {
-  const hash = (0, import_node_crypto3.createHash)("sha256");
+  const hash = (0, import_node_crypto2.createHash)("sha256");
   hash.update(provider.id);
   hash.update(target.name);
   hash.update(evalCase.id);