npm - @agentv/core - Version diff - 0.11.0 → 0.14.2 - Mend

@agentv/core 0.11.0 → 0.14.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

package/README.md +1 -2
package/dist/{chunk-YQBJAT5I.js → chunk-IOCVST3R.js} +1 -1
package/dist/chunk-IOCVST3R.js.map +1 -0
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +1 -1
package/dist/index.cjs +912 -747
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +46 -34
package/dist/index.d.ts +46 -34
package/dist/index.js +875 -708
package/dist/index.js.map +1 -1
package/package.json +5 -2
package/dist/chunk-YQBJAT5I.js.map +0 -1

package/dist/index.js CHANGED Viewed

@@ -9,7 +9,7 @@ import {
   readTextFile,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-YQBJAT5I.js";
+} from "./chunk-IOCVST3R.js";
 // src/evaluation/types.ts
 var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -62,48 +62,197 @@ function getHitCount(result) {
 }
 // src/evaluation/yaml-parser.ts
+import { readFile as readFile4 } from "node:fs/promises";
+import path6 from "node:path";
+import { parse as parse2 } from "yaml";
+// src/evaluation/formatting/segment-formatter.ts
+function extractCodeBlocks(segments) {
+  const CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
+  const codeBlocks = [];
+  for (const segment of segments) {
+    const typeValue = segment["type"];
+    if (typeof typeValue !== "string" || typeValue !== "text") {
+      continue;
+    }
+    const textValue = segment["value"];
+    if (typeof textValue !== "string") {
+      continue;
+    }
+    const matches = textValue.match(CODE_BLOCK_PATTERN);
+    if (matches) {
+      codeBlocks.push(...matches);
+    }
+  }
+  return codeBlocks;
+}
+function formatFileContents(parts) {
+  const fileCount = parts.filter((p) => p.isFile).length;
+  if (fileCount > 0) {
+    return parts.map((part) => {
+      if (part.isFile && part.displayPath) {
+        return `<file path="${part.displayPath}">
+${part.content}
+</file>`;
+      }
+      return part.content;
+    }).join("\n\n");
+  }
+  return parts.map((p) => p.content).join(" ");
+}
+function formatSegment(segment) {
+  const type = asString(segment.type);
+  if (type === "text") {
+    return asString(segment.value);
+  }
+  if (type === "guideline_ref") {
+    const refPath = asString(segment.path);
+    return refPath ? `<Attached: ${refPath}>` : void 0;
+  }
+  if (type === "file") {
+    const text = asString(segment.text);
+    const filePath = asString(segment.path);
+    if (text && filePath) {
+      return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
+    }
+  }
+  return void 0;
+}
+function hasVisibleContent(segments) {
+  return segments.some((segment) => {
+    const type = asString(segment.type);
+    if (type === "text") {
+      const value = asString(segment.value);
+      return value !== void 0 && value.trim().length > 0;
+    }
+    if (type === "guideline_ref") {
+      return false;
+    }
+    if (type === "file") {
+      const text = asString(segment.text);
+      return text !== void 0 && text.trim().length > 0;
+    }
+    return false;
+  });
+}
+function asString(value) {
+  return typeof value === "string" ? value : void 0;
+}
+// src/evaluation/loaders/config-loader.ts
 import micromatch from "micromatch";
+import { readFile } from "node:fs/promises";
+import path2 from "node:path";
+import { parse } from "yaml";
+// src/evaluation/loaders/file-resolver.ts
 import { constants } from "node:fs";
-import { access, readFile } from "node:fs/promises";
+import { access } from "node:fs/promises";
 import path from "node:path";
-import { fileURLToPath } from "node:url";
-import { parse } from "yaml";
-var CODE_BLOCK_PATTERN = /```[\s\S]*?```/g;
-var ANSI_YELLOW = "\x1B[33m";
-var ANSI_RESET = "\x1B[0m";
-var SCHEMA_EVAL_V2 = "agentv-eval-v2";
-var SCHEMA_CONFIG_V2 = "agentv-config-v2";
-async function readTestSuiteMetadata(testFilePath) {
+async function fileExists2(absolutePath) {
   try {
-    const absolutePath = path.resolve(testFilePath);
-    const content = await readFile(absolutePath, "utf8");
-    const parsed = parse(content);
-    if (!isJsonObject(parsed)) {
-      return {};
-    }
-    return { target: extractTargetFromSuite(parsed) };
+    await access(absolutePath, constants.F_OK);
+    return true;
   } catch {
-    return {};
+    return false;
   }
 }
-function extractTargetFromSuite(suite) {
-  const execution = suite.execution;
-  if (execution && typeof execution === "object" && !Array.isArray(execution)) {
-    const executionTarget = execution.target;
-    if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
-      return executionTarget.trim();
+function resolveToAbsolutePath(candidate) {
+  if (candidate instanceof URL) {
+    return new URL(candidate).pathname;
+  }
+  if (typeof candidate === "string") {
+    if (candidate.startsWith("file://")) {
+      return new URL(candidate).pathname;
     }
+    return path.resolve(candidate);
   }
-  const targetValue = suite.target;
-  if (typeof targetValue === "string" && targetValue.trim().length > 0) {
-    return targetValue.trim();
+  throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
+}
+function buildDirectoryChain2(filePath, repoRoot) {
+  const directories = [];
+  const seen = /* @__PURE__ */ new Set();
+  const boundary = path.resolve(repoRoot);
+  let current = path.resolve(path.dirname(filePath));
+  while (current !== void 0) {
+    if (!seen.has(current)) {
+      directories.push(current);
+      seen.add(current);
+    }
+    if (current === boundary) {
+      break;
+    }
+    const parent = path.dirname(current);
+    if (parent === current) {
+      break;
+    }
+    current = parent;
   }
-  return void 0;
+  if (!seen.has(boundary)) {
+    directories.push(boundary);
+  }
+  return directories;
+}
+function buildSearchRoots2(evalPath, repoRoot) {
+  const uniqueRoots = [];
+  const addRoot = (root) => {
+    const normalized = path.resolve(root);
+    if (!uniqueRoots.includes(normalized)) {
+      uniqueRoots.push(normalized);
+    }
+  };
+  let currentDir = path.dirname(evalPath);
+  let reachedBoundary = false;
+  while (!reachedBoundary) {
+    addRoot(currentDir);
+    const parentDir = path.dirname(currentDir);
+    if (currentDir === repoRoot || parentDir === currentDir) {
+      reachedBoundary = true;
+    } else {
+      currentDir = parentDir;
+    }
+  }
+  addRoot(repoRoot);
+  addRoot(process.cwd());
+  return uniqueRoots;
 }
+function trimLeadingSeparators(value) {
+  const trimmed = value.replace(/^[/\\]+/, "");
+  return trimmed.length > 0 ? trimmed : value;
+}
+async function resolveFileReference2(rawValue, searchRoots) {
+  const displayPath = trimLeadingSeparators(rawValue);
+  const potentialPaths = [];
+  if (path.isAbsolute(rawValue)) {
+    potentialPaths.push(path.normalize(rawValue));
+  }
+  for (const base of searchRoots) {
+    potentialPaths.push(path.resolve(base, displayPath));
+  }
+  const attempted = [];
+  const seen = /* @__PURE__ */ new Set();
+  for (const candidate of potentialPaths) {
+    const absoluteCandidate = path.resolve(candidate);
+    if (seen.has(absoluteCandidate)) {
+      continue;
+    }
+    seen.add(absoluteCandidate);
+    attempted.push(absoluteCandidate);
+    if (await fileExists2(absoluteCandidate)) {
+      return { displayPath, resolvedPath: absoluteCandidate, attempted };
+    }
+  }
+  return { displayPath, attempted };
+}
+// src/evaluation/loaders/config-loader.ts
+var SCHEMA_CONFIG_V2 = "agentv-config-v2";
+var ANSI_YELLOW = "\x1B[33m";
+var ANSI_RESET = "\x1B[0m";
 async function loadConfig(evalFilePath, repoRoot) {
-  const directories = buildDirectoryChain(evalFilePath, repoRoot);
+  const directories = buildDirectoryChain2(evalFilePath, repoRoot);
   for (const directory of directories) {
-    const configPath = path.join(directory, ".agentv", "config.yaml");
+    const configPath = path2.join(directory, ".agentv", "config.yaml");
     if (!await fileExists2(configPath)) {
       continue;
     }
@@ -146,24 +295,134 @@ function isGuidelineFile(filePath, patterns) {
   const patternsToUse = patterns ?? [];
   return micromatch.isMatch(normalized, patternsToUse);
 }
-function extractCodeBlocks(segments) {
-  const codeBlocks = [];
-  for (const segment of segments) {
-    const typeValue = segment["type"];
-    if (typeof typeValue !== "string" || typeValue !== "text") {
+function extractTargetFromSuite(suite) {
+  const execution = suite.execution;
+  if (execution && typeof execution === "object" && !Array.isArray(execution)) {
+    const executionTarget = execution.target;
+    if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
+      return executionTarget.trim();
+    }
+  }
+  const targetValue = suite.target;
+  if (typeof targetValue === "string" && targetValue.trim().length > 0) {
+    return targetValue.trim();
+  }
+  return void 0;
+}
+function logWarning(message) {
+  console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
+}
+// src/evaluation/loaders/evaluator-parser.ts
+import path3 from "node:path";
+var ANSI_YELLOW2 = "\x1B[33m";
+var ANSI_RESET2 = "\x1B[0m";
+async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
+  const execution = rawEvalCase.execution;
+  const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
+  if (candidateEvaluators === void 0) {
+    return void 0;
+  }
+  if (!Array.isArray(candidateEvaluators)) {
+    logWarning2(`Skipping evaluators for '${evalId}': expected array`);
+    return void 0;
+  }
+  const evaluators = [];
+  for (const rawEvaluator of candidateEvaluators) {
+    if (!isJsonObject2(rawEvaluator)) {
+      logWarning2(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
       continue;
     }
-    const textValue = segment["value"];
-    if (typeof textValue !== "string") {
+    const name = asString2(rawEvaluator.name);
+    const typeValue = rawEvaluator.type;
+    if (!name || !isEvaluatorKind(typeValue)) {
+      logWarning2(`Skipping evaluator with invalid name/type in '${evalId}'`);
       continue;
     }
-    const matches = textValue.match(CODE_BLOCK_PATTERN);
-    if (matches) {
-      codeBlocks.push(...matches);
+    if (typeValue === "code") {
+      const script = asString2(rawEvaluator.script);
+      if (!script) {
+        logWarning2(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
+        continue;
+      }
+      const cwd = asString2(rawEvaluator.cwd);
+      let resolvedCwd;
+      if (cwd) {
+        const resolved = await resolveFileReference2(cwd, searchRoots);
+        if (resolved.resolvedPath) {
+          resolvedCwd = path3.resolve(resolved.resolvedPath);
+        } else {
+          logWarning2(
+            `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
+            resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => `  Tried: ${attempt}`) : void 0
+          );
+        }
+      } else {
+        resolvedCwd = searchRoots[0];
+      }
+      evaluators.push({
+        name,
+        type: "code",
+        script,
+        cwd,
+        resolvedCwd
+      });
+      continue;
+    }
+    const prompt = asString2(rawEvaluator.prompt);
+    let promptPath;
+    if (prompt) {
+      const resolved = await resolveFileReference2(prompt, searchRoots);
+      if (resolved.resolvedPath) {
+        promptPath = path3.resolve(resolved.resolvedPath);
+      } else {
+        logWarning2(
+          `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
+          resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => `  Tried: ${attempt}`) : void 0
+        );
+      }
     }
+    const _model = asString2(rawEvaluator.model);
+    evaluators.push({
+      name,
+      type: "llm_judge",
+      prompt,
+      promptPath
+    });
   }
-  return codeBlocks;
+  return evaluators.length > 0 ? evaluators : void 0;
 }
+function coerceEvaluator(candidate, contextId) {
+  if (typeof candidate !== "string") {
+    return void 0;
+  }
+  if (isEvaluatorKind(candidate)) {
+    return candidate;
+  }
+  logWarning2(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
+  return void 0;
+}
+function asString2(value) {
+  return typeof value === "string" ? value : void 0;
+}
+function isJsonObject2(value) {
+  return typeof value === "object" && value !== null && !Array.isArray(value);
+}
+function logWarning2(message, details) {
+  if (details && details.length > 0) {
+    const detailBlock = details.join("\n");
+    console.warn(`${ANSI_YELLOW2}Warning: ${message}
+${detailBlock}${ANSI_RESET2}`);
+  } else {
+    console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
+  }
+}
+// src/evaluation/loaders/message-processor.ts
+import { readFile as readFile2 } from "node:fs/promises";
+import path4 from "node:path";
+var ANSI_YELLOW3 = "\x1B[33m";
+var ANSI_RESET3 = "\x1B[0m";
 async function processMessages(options) {
   const {
     messages,
@@ -189,28 +448,28 @@ async function processMessages(options) {
       if (!isJsonObject(rawSegment)) {
         continue;
       }
-      const segmentType = asString(rawSegment.type);
+      const segmentType = asString3(rawSegment.type);
       if (segmentType === "file") {
-        const rawValue = asString(rawSegment.value);
+        const rawValue = asString3(rawSegment.value);
         if (!rawValue) {
           continue;
         }
-        const { displayPath, resolvedPath, attempted } = await resolveFileReference(
+        const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
           rawValue,
           searchRoots
         );
         if (!resolvedPath) {
           const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
           const context = messageType === "input" ? "" : " in expected_messages";
-          logWarning(`File not found${context}: ${displayPath}`, attempts);
+          logWarning3(`File not found${context}: ${displayPath}`, attempts);
           continue;
         }
         try {
-          const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
+          const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
           if (messageType === "input" && guidelinePatterns && guidelinePaths) {
-            const relativeToRepo = path.relative(repoRootPath, resolvedPath);
+            const relativeToRepo = path4.relative(repoRootPath, resolvedPath);
             if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
-              guidelinePaths.push(path.resolve(resolvedPath));
+              guidelinePaths.push(path4.resolve(resolvedPath));
               if (verbose) {
                 console.log(`  [Guideline] Found: ${displayPath}`);
                 console.log(`    Resolved to: ${resolvedPath}`);
@@ -222,7 +481,7 @@ async function processMessages(options) {
             type: "file",
             path: displayPath,
             text: fileContent,
-            resolvedPath: path.resolve(resolvedPath)
+            resolvedPath: path4.resolve(resolvedPath)
           });
           if (verbose) {
             const label = messageType === "input" ? "[File]" : "[Expected Output File]";
@@ -231,7 +490,7 @@ async function processMessages(options) {
           }
         } catch (error) {
           const context = messageType === "input" ? "" : " expected output";
-          logWarning(`Could not read${context} file ${resolvedPath}: ${error.message}`);
+          logWarning3(`Could not read${context} file ${resolvedPath}: ${error.message}`);
         }
         continue;
       }
@@ -245,201 +504,117 @@ async function processMessages(options) {
   }
   return segments;
 }
-async function loadEvalCases(evalFilePath, repoRoot, options) {
-  const verbose = options?.verbose ?? false;
-  const evalIdFilter = options?.evalId;
-  const absoluteTestPath = path.resolve(evalFilePath);
-  if (!await fileExists2(absoluteTestPath)) {
-    throw new Error(`Test file not found: ${evalFilePath}`);
-  }
-  const repoRootPath = resolveToAbsolutePath(repoRoot);
-  const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
-  const config = await loadConfig(absoluteTestPath, repoRootPath);
-  const guidelinePatterns = config?.guideline_patterns;
-  const rawFile = await readFile(absoluteTestPath, "utf8");
-  const parsed = parse(rawFile);
-  if (!isJsonObject(parsed)) {
-    throw new Error(`Invalid test file format: ${evalFilePath}`);
-  }
-  const suite = parsed;
-  const datasetNameFromSuite = asString(suite.dataset)?.trim();
-  const fallbackDataset = path.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
-  const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
-  const schema = suite.$schema;
-  if (schema !== SCHEMA_EVAL_V2) {
-    const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
-Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
-    throw new Error(message);
+async function resolveAssistantContent(content, searchRoots, verbose) {
+  if (typeof content === "string") {
+    return content;
   }
-  const rawTestcases = suite.evalcases;
-  if (!Array.isArray(rawTestcases)) {
-    throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
+  if (!content) {
+    return "";
   }
-  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
-  const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
-  const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
-  const results = [];
-  for (const rawEvalcase of rawTestcases) {
-    if (!isJsonObject(rawEvalcase)) {
-      logWarning("Skipping invalid eval case entry (expected object)");
+  const parts = [];
+  for (const entry of content) {
+    if (typeof entry === "string") {
+      parts.push({ content: entry, isFile: false });
       continue;
     }
-    const evalcase = rawEvalcase;
-    const id = asString(evalcase.id);
-    if (evalIdFilter && id !== evalIdFilter) {
+    if (!isJsonObject(entry)) {
       continue;
     }
-    const conversationId = asString(evalcase.conversation_id);
-    const outcome = asString(evalcase.outcome);
-    const inputMessagesValue = evalcase.input_messages;
-    const expectedMessagesValue = evalcase.expected_messages;
-    if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
-      logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
+    const segmentType = asString3(entry.type);
+    if (segmentType === "file") {
+      const rawValue = asString3(entry.value);
+      if (!rawValue) {
+        continue;
+      }
+      const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
+        rawValue,
+        searchRoots
+      );
+      if (!resolvedPath) {
+        const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
+        logWarning3(`File not found in expected_messages: ${displayPath}`, attempts);
+        continue;
+      }
+      try {
+        const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
+        parts.push({ content: fileContent, isFile: true, displayPath });
+        if (verbose) {
+          console.log(`  [Expected Assistant File] Found: ${displayPath}`);
+          console.log(`    Resolved to: ${resolvedPath}`);
+        }
+      } catch (error) {
+        logWarning3(`Could not read file ${resolvedPath}: ${error.message}`);
+      }
       continue;
     }
-    const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
-    const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
-    const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
-    if (hasExpectedMessages && expectedMessages.length === 0) {
-      logWarning(`No valid expected message found for eval case: ${id}`);
+    const textValue = asString3(entry.text);
+    if (typeof textValue === "string") {
+      parts.push({ content: textValue, isFile: false });
       continue;
     }
-    if (expectedMessages.length > 1) {
-      logWarning(`Multiple expected messages found for eval case: ${id}, using first`);
-    }
-    const guidelinePaths = [];
-    const inputTextParts = [];
-    const inputSegments = await processMessages({
-      messages: inputMessages,
-      searchRoots,
-      repoRootPath,
-      guidelinePatterns,
-      guidelinePaths,
-      textParts: inputTextParts,
-      messageType: "input",
-      verbose
-    });
-    const outputSegments = hasExpectedMessages ? await processMessages({
-      messages: expectedMessages,
-      searchRoots,
-      repoRootPath,
-      guidelinePatterns,
-      messageType: "output",
-      verbose
-    }) : [];
-    const codeSnippets = extractCodeBlocks(inputSegments);
-    const expectedContent = expectedMessages[0]?.content;
-    const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
-    const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
-    const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
-    const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
-    const userFilePaths = [];
-    for (const segment of inputSegments) {
-      if (segment.type === "file" && typeof segment.resolvedPath === "string") {
-        userFilePaths.push(segment.resolvedPath);
-      }
-    }
-    const allFilePaths = [
-      ...guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
-      ...userFilePaths
-    ];
-    const testCase = {
-      id,
-      dataset: datasetName,
-      conversation_id: conversationId,
-      question,
-      input_messages: inputMessages,
-      input_segments: inputSegments,
-      output_segments: outputSegments,
-      reference_answer: referenceAnswer,
-      guideline_paths: guidelinePaths.map((guidelinePath) => path.resolve(guidelinePath)),
-      guideline_patterns: guidelinePatterns,
-      file_paths: allFilePaths,
-      code_snippets: codeSnippets,
-      expected_outcome: outcome,
-      evaluator: evalCaseEvaluatorKind,
-      evaluators
-    };
-    if (verbose) {
-      console.log(`
-[Eval Case: ${id}]`);
-      if (testCase.guideline_paths.length > 0) {
-        console.log(`  Guidelines used: ${testCase.guideline_paths.length}`);
-        for (const guidelinePath of testCase.guideline_paths) {
-          console.log(`    - ${guidelinePath}`);
-        }
-      } else {
-        console.log("  No guidelines found");
-      }
+    const valueValue = asString3(entry.value);
+    if (typeof valueValue === "string") {
+      parts.push({ content: valueValue, isFile: false });
+      continue;
     }
-    results.push(testCase);
+    parts.push({ content: JSON.stringify(entry), isFile: false });
   }
-  return results;
+  return formatFileContents(parts);
 }
-function needsRoleMarkers(messages, processedSegmentsByMessage) {
-  if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
-    return true;
-  }
-  let messagesWithContent = 0;
-  for (const segments of processedSegmentsByMessage) {
-    if (hasVisibleContent(segments)) {
-      messagesWithContent++;
-    }
-  }
-  return messagesWithContent > 1;
+function asString3(value) {
+  return typeof value === "string" ? value : void 0;
 }
-function hasVisibleContent(segments) {
-  return segments.some((segment) => {
-    const type = asString(segment.type);
-    if (type === "text") {
-      const value = asString(segment.value);
-      return value !== void 0 && value.trim().length > 0;
-    }
-    if (type === "guideline_ref") {
-      return false;
-    }
-    if (type === "file") {
-      const text = asString(segment.text);
-      return text !== void 0 && text.trim().length > 0;
-    }
-    return false;
-  });
+function cloneJsonObject(source) {
+  const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
+  return Object.fromEntries(entries);
 }
-function formatSegment(segment) {
-  const type = asString(segment.type);
-  if (type === "text") {
-    return asString(segment.value);
+function cloneJsonValue(value) {
+  if (value === null) {
+    return null;
   }
-  if (type === "guideline_ref") {
-    const refPath = asString(segment.path);
-    return refPath ? `<Attached: ${refPath}>` : void 0;
+  if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
+    return value;
   }
-  if (type === "file") {
-    const text = asString(segment.text);
-    const filePath = asString(segment.path);
-    if (text && filePath) {
-      return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
-    }
+  if (Array.isArray(value)) {
+    return value.map((item) => cloneJsonValue(item));
+  }
+  if (typeof value === "object") {
+    return cloneJsonObject(value);
+  }
+  return value;
+}
+function logWarning3(message, details) {
+  if (details && details.length > 0) {
+    const detailBlock = details.join("\n");
+    console.warn(`${ANSI_YELLOW3}Warning: ${message}
+${detailBlock}${ANSI_RESET3}`);
+  } else {
+    console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
   }
-  return void 0;
 }
+// src/evaluation/formatting/prompt-builder.ts
+import { readFile as readFile3 } from "node:fs/promises";
+import path5 from "node:path";
+var ANSI_YELLOW4 = "\x1B[33m";
+var ANSI_RESET4 = "\x1B[0m";
 async function buildPromptInputs(testCase) {
   const guidelineParts = [];
   for (const rawPath of testCase.guideline_paths) {
-    const absolutePath = path.resolve(rawPath);
+    const absolutePath = path5.resolve(rawPath);
     if (!await fileExists2(absolutePath)) {
-      logWarning(`Could not read guideline file ${absolutePath}: file does not exist`);
+      logWarning4(`Could not read guideline file ${absolutePath}: file does not exist`);
       continue;
     }
     try {
-      const content = (await readFile(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
+      const content = (await readFile3(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
       guidelineParts.push({
         content,
         isFile: true,
-        displayPath: path.basename(absolutePath)
+        displayPath: path5.basename(absolutePath)
       });
     } catch (error) {
-      logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
+      logWarning4(`Could not read guideline file ${absolutePath}: ${error.message}`);
     }
   }
   const guidelines = formatFileContents(guidelineParts);
@@ -463,9 +638,9 @@ async function buildPromptInputs(testCase) {
             messageSegments.push({ type: "text", value: segment });
           }
         } else if (isJsonObject(segment)) {
-          const type = asString(segment.type);
+          const type = asString4(segment.type);
           if (type === "file") {
-            const value = asString(segment.value);
+            const value = asString4(segment.value);
             if (!value) continue;
             if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
               messageSegments.push({ type: "guideline_ref", path: value });
@@ -476,7 +651,7 @@ async function buildPromptInputs(testCase) {
               messageSegments.push({ type: "file", text: fileText, path: value });
             }
           } else if (type === "text") {
-            const textValue = asString(segment.value);
+            const textValue = asString4(segment.value);
             if (textValue && textValue.trim().length > 0) {
               messageSegments.push({ type: "text", value: textValue });
             }
@@ -532,6 +707,18 @@ ${messageContent}`);
   }) : void 0;
   return { question, guidelines, chatPrompt };
 }
+function needsRoleMarkers(messages, processedSegmentsByMessage) {
+  if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
+    return true;
+  }
+  let messagesWithContent = 0;
+  for (const segments of processedSegmentsByMessage) {
+    if (hasVisibleContent(segments)) {
+      messagesWithContent++;
+    }
+  }
+  return messagesWithContent > 1;
+}
 function buildChatPromptFromSegments(options) {
   const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
   if (messages.length === 0) {
@@ -573,13 +760,12 @@ ${guidelineContent.trim()}`);
     const segments = segmentsByMessage[i];
     const contentParts = [];
     let role = message.role;
-    let name;
     if (role === "system") {
       role = "assistant";
       contentParts.push("@[System]:");
     } else if (role === "tool") {
-      role = "function";
-      name = "tool";
+      role = "assistant";
+      contentParts.push("@[Tool]:");
     }
     for (const segment of segments) {
       if (segment.type === "guideline_ref") {
@@ -597,282 +783,398 @@ ${guidelineContent.trim()}`);
     if (contentParts.length === 0) {
       continue;
     }
+    const content = contentParts.join("\n");
     chatPrompt.push({
       role,
-      content: contentParts.join("\n"),
-      ...name ? { name } : {}
+      content
     });
   }
   return chatPrompt.length > 0 ? chatPrompt : void 0;
 }
-async function fileExists2(absolutePath) {
-  try {
-    await access(absolutePath, constants.F_OK);
-    return true;
-  } catch {
-    return false;
-  }
-}
-function resolveToAbsolutePath(candidate) {
-  if (candidate instanceof URL) {
-    return fileURLToPath(candidate);
-  }
-  if (typeof candidate === "string") {
-    if (candidate.startsWith("file://")) {
-      return fileURLToPath(new URL(candidate));
-    }
-    return path.resolve(candidate);
-  }
-  throw new TypeError("Unsupported repoRoot value. Expected string or URL.");
-}
-function asString(value) {
+function asString4(value) {
   return typeof value === "string" ? value : void 0;
 }
-function cloneJsonObject(source) {
-  const entries = Object.entries(source).map(([key, value]) => [key, cloneJsonValue(value)]);
-  return Object.fromEntries(entries);
+function logWarning4(message) {
+  console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
 }
-function cloneJsonValue(value) {
-  if (value === null) {
-    return null;
-  }
-  if (typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
-    return value;
-  }
-  if (Array.isArray(value)) {
-    return value.map((item) => cloneJsonValue(item));
+// src/evaluation/yaml-parser.ts
+var ANSI_YELLOW5 = "\x1B[33m";
+var ANSI_RESET5 = "\x1B[0m";
+var SCHEMA_EVAL_V2 = "agentv-eval-v2";
+async function readTestSuiteMetadata(testFilePath) {
+  try {
+    const absolutePath = path6.resolve(testFilePath);
+    const content = await readFile4(absolutePath, "utf8");
+    const parsed = parse2(content);
+    if (!isJsonObject(parsed)) {
+      return {};
+    }
+    return { target: extractTargetFromSuite(parsed) };
+  } catch {
+    return {};
   }
-  return cloneJsonObject(value);
 }
-function formatFileContents(parts) {
-  const fileCount = parts.filter((p) => p.isFile).length;
-  if (fileCount > 0) {
-    return parts.map((part) => {
-      if (part.isFile && part.displayPath) {
-        return `<file path="${part.displayPath}">
-${part.content}
-</file>`;
-      }
-      return part.content;
-    }).join("\n\n");
+async function loadEvalCases(evalFilePath, repoRoot, options) {
+  const verbose = options?.verbose ?? false;
+  const evalIdFilter = options?.evalId;
+  const absoluteTestPath = path6.resolve(evalFilePath);
+  const repoRootPath = resolveToAbsolutePath(repoRoot);
+  const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
+  const config = await loadConfig(absoluteTestPath, repoRootPath);
+  const guidelinePatterns = config?.guideline_patterns;
+  const rawFile = await readFile4(absoluteTestPath, "utf8");
+  const parsed = parse2(rawFile);
+  if (!isJsonObject(parsed)) {
+    throw new Error(`Invalid test file format: ${evalFilePath}`);
   }
-  return parts.map((p) => p.content).join(" ");
-}
-async function resolveAssistantContent(content, searchRoots, verbose) {
-  if (typeof content === "string") {
-    return content;
+  const suite = parsed;
+  const datasetNameFromSuite = asString5(suite.dataset)?.trim();
+  const fallbackDataset = path6.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
+  const datasetName = datasetNameFromSuite && datasetNameFromSuite.length > 0 ? datasetNameFromSuite : fallbackDataset;
+  const schema = suite.$schema;
+  if (schema !== SCHEMA_EVAL_V2) {
+    const message = typeof schema === "string" ? `Invalid $schema value '${schema}' in ${evalFilePath}. Expected '${SCHEMA_EVAL_V2}'` : `Missing required field '$schema' in ${evalFilePath}.
+Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
+    throw new Error(message);
   }
-  if (!content) {
-    return "";
+  const rawTestcases = suite.evalcases;
+  if (!Array.isArray(rawTestcases)) {
+    throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
   }
-  const parts = [];
-  for (const entry of content) {
-    if (typeof entry === "string") {
-      parts.push({ content: entry, isFile: false });
-      continue;
-    }
-    if (!isJsonObject(entry)) {
-      continue;
-    }
-    const segmentType = asString(entry.type);
-    if (segmentType === "file") {
-      const rawValue = asString(entry.value);
-      if (!rawValue) {
-        continue;
-      }
-      const { displayPath, resolvedPath, attempted } = await resolveFileReference(
-        rawValue,
-        searchRoots
-      );
-      if (!resolvedPath) {
-        const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
-        logWarning(`File not found in expected_messages: ${displayPath}`, attempts);
-        continue;
-      }
-      try {
-        const fileContent = (await readFile(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
-        parts.push({ content: fileContent, isFile: true, displayPath });
-        if (verbose) {
-          console.log(`  [Expected Assistant File] Found: ${displayPath}`);
-          console.log(`    Resolved to: ${resolvedPath}`);
-        }
-      } catch (error) {
-        logWarning(`Could not read file ${resolvedPath}: ${error.message}`);
-      }
-      continue;
-    }
-    const textValue = asString(entry.text);
-    if (typeof textValue === "string") {
-      parts.push({ content: textValue, isFile: false });
+  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
+  const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
+  const _globalTarget = asString5(globalExecution?.target) ?? asString5(suite.target);
+  const results = [];
+  for (const rawEvalcase of rawTestcases) {
+    if (!isJsonObject(rawEvalcase)) {
+      logWarning5("Skipping invalid eval case entry (expected object)");
       continue;
     }
-    const valueValue = asString(entry.value);
-    if (typeof valueValue === "string") {
-      parts.push({ content: valueValue, isFile: false });
+    const evalcase = rawEvalcase;
+    const id = asString5(evalcase.id);
+    if (evalIdFilter && id !== evalIdFilter) {
       continue;
     }
-    parts.push({ content: JSON.stringify(entry), isFile: false });
-  }
-  return formatFileContents(parts);
-}
-async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
-  const execution = rawEvalCase.execution;
-  const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
-  if (candidateEvaluators === void 0) {
-    return void 0;
-  }
-  if (!Array.isArray(candidateEvaluators)) {
-    logWarning(`Skipping evaluators for '${evalId}': expected array`);
-    return void 0;
-  }
-  const evaluators = [];
-  for (const rawEvaluator of candidateEvaluators) {
-    if (!isJsonObject(rawEvaluator)) {
-      logWarning(`Skipping invalid evaluator entry for '${evalId}' (expected object)`);
+    const conversationId = asString5(evalcase.conversation_id);
+    const outcome = asString5(evalcase.outcome);
+    const inputMessagesValue = evalcase.input_messages;
+    const expectedMessagesValue = evalcase.expected_messages;
+    if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
+      logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
       continue;
     }
-    const name = asString(rawEvaluator.name);
-    const typeValue = rawEvaluator.type;
-    if (!name || !isEvaluatorKind(typeValue)) {
-      logWarning(`Skipping evaluator with invalid name/type in '${evalId}'`);
+    const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
+    const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
+    const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
+    if (hasExpectedMessages && expectedMessages.length === 0) {
+      logWarning5(`No valid expected message found for eval case: ${id}`);
       continue;
     }
-    if (typeValue === "code") {
-      const script = asString(rawEvaluator.script);
-      if (!script) {
-        logWarning(`Skipping code evaluator '${name}' in '${evalId}': missing script`);
-        continue;
-      }
-      const cwd = asString(rawEvaluator.cwd);
-      let resolvedCwd;
-      if (cwd) {
-        const resolved = await resolveFileReference(cwd, searchRoots);
-        if (resolved.resolvedPath) {
-          resolvedCwd = path.resolve(resolved.resolvedPath);
-        } else {
-          logWarning(
-            `Code evaluator '${name}' in '${evalId}': cwd not found (${resolved.displayPath})`,
-            resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => `  Tried: ${attempt}`) : void 0
-          );
-        }
-      } else {
-        resolvedCwd = searchRoots[0];
-      }
-      evaluators.push({
-        name,
-        type: "code",
-        script,
-        cwd,
-        resolvedCwd
-      });
-      continue;
+    if (expectedMessages.length > 1) {
+      logWarning5(`Multiple expected messages found for eval case: ${id}, using first`);
     }
-    const prompt = asString(rawEvaluator.prompt);
-    let promptPath;
-    if (prompt) {
-      const resolved = await resolveFileReference(prompt, searchRoots);
-      if (resolved.resolvedPath) {
-        promptPath = path.resolve(resolved.resolvedPath);
-      } else {
-        logWarning(
-          `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
-          resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => `  Tried: ${attempt}`) : void 0
-        );
+    const guidelinePaths = [];
+    const inputTextParts = [];
+    const inputSegments = await processMessages({
+      messages: inputMessages,
+      searchRoots,
+      repoRootPath,
+      guidelinePatterns,
+      guidelinePaths,
+      textParts: inputTextParts,
+      messageType: "input",
+      verbose
+    });
+    const outputSegments = hasExpectedMessages ? await processMessages({
+      messages: expectedMessages,
+      searchRoots,
+      repoRootPath,
+      guidelinePatterns,
+      messageType: "output",
+      verbose
+    }) : [];
+    const codeSnippets = extractCodeBlocks(inputSegments);
+    const expectedContent = expectedMessages[0]?.content;
+    const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
+    const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
+    const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
+    const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
+    const userFilePaths = [];
+    for (const segment of inputSegments) {
+      if (segment.type === "file" && typeof segment.resolvedPath === "string") {
+        userFilePaths.push(segment.resolvedPath);
       }
     }
-    const model = asString(rawEvaluator.model);
-    evaluators.push({
-      name,
-      type: "llm_judge",
-      prompt,
-      promptPath
-    });
+    const allFilePaths = [
+      ...guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
+      ...userFilePaths
+    ];
+    const testCase = {
+      id,
+      dataset: datasetName,
+      conversation_id: conversationId,
+      question,
+      input_messages: inputMessages,
+      input_segments: inputSegments,
+      output_segments: outputSegments,
+      reference_answer: referenceAnswer,
+      guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
+      guideline_patterns: guidelinePatterns,
+      file_paths: allFilePaths,
+      code_snippets: codeSnippets,
+      expected_outcome: outcome,
+      evaluator: evalCaseEvaluatorKind,
+      evaluators
+    };
+    if (verbose) {
+      console.log(`
+[Eval Case: ${id}]`);
+      if (testCase.guideline_paths.length > 0) {
+        console.log(`  Guidelines used: ${testCase.guideline_paths.length}`);
+        for (const guidelinePath of testCase.guideline_paths) {
+          console.log(`    - ${guidelinePath}`);
+        }
+      } else {
+        console.log("  No guidelines found");
+      }
+    }
+    results.push(testCase);
   }
-  return evaluators.length > 0 ? evaluators : void 0;
+  return results;
 }
-function coerceEvaluator(candidate, contextId) {
-  if (typeof candidate !== "string") {
-    return void 0;
-  }
-  if (isEvaluatorKind(candidate)) {
-    return candidate;
-  }
-  logWarning(`Unknown evaluator '${candidate}' in ${contextId}, falling back to default`);
-  return void 0;
+function asString5(value) {
+  return typeof value === "string" ? value : void 0;
 }
-function logWarning(message, details) {
+function logWarning5(message, details) {
   if (details && details.length > 0) {
     const detailBlock = details.join("\n");
-    console.warn(`${ANSI_YELLOW}Warning: ${message}
-${detailBlock}${ANSI_RESET}`);
+    console.warn(`${ANSI_YELLOW5}Warning: ${message}
+${detailBlock}${ANSI_RESET5}`);
   } else {
-    console.warn(`${ANSI_YELLOW}Warning: ${message}${ANSI_RESET}`);
+    console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
   }
 }
-// src/evaluation/providers/ax.ts
-import { AxAI } from "@ax-llm/ax";
+// src/evaluation/providers/ai-sdk.ts
+import { createAnthropic } from "@ai-sdk/anthropic";
+import { createAzure } from "@ai-sdk/azure";
+import { createGoogleGenerativeAI } from "@ai-sdk/google";
+import { generateText } from "ai";
 var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
+var AzureProvider = class {
+  constructor(targetName, config) {
+    this.config = config;
+    this.id = `azure:${targetName}`;
+    this.targetName = targetName;
+    this.defaults = {
+      temperature: config.temperature,
+      maxOutputTokens: config.maxOutputTokens
+    };
+    this.retryConfig = config.retry;
+    const azure = createAzure(buildAzureOptions(config));
+    this.model = azure(config.deploymentName);
+  }
+  id;
+  kind = "azure";
+  targetName;
+  model;
+  defaults;
+  retryConfig;
+  async invoke(request) {
+    return invokeModel({
+      model: this.model,
+      request,
+      defaults: this.defaults,
+      retryConfig: this.retryConfig
+    });
+  }
+};
+var AnthropicProvider = class {
+  constructor(targetName, config) {
+    this.config = config;
+    this.id = `anthropic:${targetName}`;
+    this.targetName = targetName;
+    this.defaults = {
+      temperature: config.temperature,
+      maxOutputTokens: config.maxOutputTokens,
+      thinkingBudget: config.thinkingBudget
+    };
+    this.retryConfig = config.retry;
+    const anthropic = createAnthropic({
+      apiKey: config.apiKey
+    });
+    this.model = anthropic(config.model);
+  }
+  id;
+  kind = "anthropic";
+  targetName;
+  model;
+  defaults;
+  retryConfig;
+  async invoke(request) {
+    const providerOptions = buildAnthropicProviderOptions(this.defaults);
+    return invokeModel({
+      model: this.model,
+      request,
+      defaults: this.defaults,
+      retryConfig: this.retryConfig,
+      providerOptions
+    });
+  }
+};
+var GeminiProvider = class {
+  constructor(targetName, config) {
+    this.config = config;
+    this.id = `gemini:${targetName}`;
+    this.targetName = targetName;
+    this.defaults = {
+      temperature: config.temperature,
+      maxOutputTokens: config.maxOutputTokens
+    };
+    this.retryConfig = config.retry;
+    const google = createGoogleGenerativeAI({
+      apiKey: config.apiKey
+    });
+    this.model = google(config.model);
+  }
+  id;
+  kind = "gemini";
+  targetName;
+  model;
+  defaults;
+  retryConfig;
+  async invoke(request) {
+    return invokeModel({
+      model: this.model,
+      request,
+      defaults: this.defaults,
+      retryConfig: this.retryConfig
+    });
+  }
+};
+function buildAzureOptions(config) {
+  const options = {
+    apiKey: config.apiKey,
+    apiVersion: config.version,
+    useDeploymentBasedUrls: true
+  };
+  const baseURL = normalizeAzureBaseUrl(config.resourceName);
+  if (baseURL) {
+    options.baseURL = baseURL;
+  } else {
+    options.resourceName = config.resourceName;
+  }
+  return options;
+}
+function normalizeAzureBaseUrl(resourceName) {
+  const trimmed = resourceName.trim();
+  if (!/^https?:\/\//i.test(trimmed)) {
+    return void 0;
+  }
+  const withoutSlash = trimmed.replace(/\/+$/, "");
+  const normalized = withoutSlash.endsWith("/openai") ? withoutSlash : `${withoutSlash}/openai`;
+  return normalized;
+}
+function buildAnthropicProviderOptions(defaults) {
+  if (defaults.thinkingBudget === void 0) {
+    return void 0;
+  }
+  return {
+    anthropic: {
+      thinking: {
+        type: "enabled",
+        budgetTokens: defaults.thinkingBudget
+      }
+    }
+  };
+}
 function buildChatPrompt(request) {
-  if (request.chatPrompt) {
-    const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
+  const provided = request.chatPrompt?.length ? request.chatPrompt : void 0;
+  if (provided) {
+    const hasSystemMessage = provided.some((message) => message.role === "system");
     if (hasSystemMessage) {
-      return request.chatPrompt;
+      return provided;
     }
-    const systemContent2 = resolveSystemContent(request);
-    return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
+    const systemContent2 = resolveSystemContent(request, false);
+    return [{ role: "system", content: systemContent2 }, ...provided];
   }
-  const systemContent = resolveSystemContent(request);
+  const systemContent = resolveSystemContent(request, true);
   const userContent = request.question.trim();
   const prompt = [
-    {
-      role: "system",
-      content: systemContent
-    },
-    {
-      role: "user",
-      content: userContent
-    }
+    { role: "system", content: systemContent },
+    { role: "user", content: userContent }
   ];
   return prompt;
 }
-function resolveSystemContent(request) {
+function resolveSystemContent(request, includeGuidelines) {
   const systemSegments = [];
-  const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
-  if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
-    systemSegments.push(metadataSystemPrompt.trim());
+  if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
+    systemSegments.push(request.systemPrompt.trim());
   } else {
     systemSegments.push(DEFAULT_SYSTEM_PROMPT);
   }
-  if (request.guidelines && request.guidelines.trim().length > 0) {
+  if (includeGuidelines && request.guidelines && request.guidelines.trim().length > 0) {
     systemSegments.push(`[[ ## Guidelines ## ]]
 ${request.guidelines.trim()}`);
   }
   return systemSegments.join("\n\n");
 }
-function extractModelConfig(request, defaults) {
+function toModelMessages(chatPrompt) {
+  return chatPrompt.map((message) => {
+    if (message.role === "tool" || message.role === "function") {
+      const prefix = message.name ? `@[${message.name}]: ` : "@[Tool]: ";
+      return {
+        role: "assistant",
+        content: `${prefix}${message.content}`
+      };
+    }
+    if (message.role === "assistant" || message.role === "system" || message.role === "user") {
+      return {
+        role: message.role,
+        content: message.content
+      };
+    }
+    return {
+      role: "user",
+      content: message.content
+    };
+  });
+}
+function resolveModelSettings(request, defaults) {
   const temperature = request.temperature ?? defaults.temperature;
-  const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
-  const config = {};
-  if (temperature !== void 0) {
-    config.temperature = temperature;
-  }
-  if (maxTokens !== void 0) {
-    config.maxTokens = maxTokens;
-  }
-  return Object.keys(config).length > 0 ? config : void 0;
+  const maxOutputTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
+  return {
+    temperature,
+    maxOutputTokens
+  };
+}
+async function invokeModel(options) {
+  const { model, request, defaults, retryConfig, providerOptions } = options;
+  const chatPrompt = buildChatPrompt(request);
+  const { temperature, maxOutputTokens } = resolveModelSettings(request, defaults);
+  const result = await withRetry(
+    () => generateText({
+      model,
+      messages: toModelMessages(chatPrompt),
+      temperature,
+      maxOutputTokens,
+      maxRetries: 0,
+      abortSignal: request.signal,
+      ...providerOptions ? { providerOptions } : {}
+    }),
+    retryConfig,
+    request.signal
+  );
+  return mapResponse(result);
 }
-function mapResponse(response) {
-  const primary = response.results[0];
-  const text = typeof primary?.content === "string" ? primary.content : "";
-  const reasoning = primary?.thought ?? primary?.thoughtBlock?.data;
-  const usage = toJsonObject(response.modelUsage);
+function mapResponse(result) {
   return {
-    text,
-    reasoning,
-    raw: response,
-    usage
+    text: result.text ?? "",
+    reasoning: result.reasoningText ?? void 0,
+    raw: result,
+    usage: toJsonObject(result.totalUsage ?? result.usage)
   };
 }
 function toJsonObject(value) {
@@ -885,34 +1187,59 @@ function toJsonObject(value) {
     return void 0;
   }
 }
-function ensureChatResponse(result) {
-  if (typeof ReadableStream !== "undefined" && result instanceof ReadableStream) {
-    throw new Error("Streaming responses are not supported for this provider");
+function extractStatus(error) {
+  if (!error || typeof error !== "object") {
+    return void 0;
   }
-  if (!result || typeof result !== "object" || !("results" in result)) {
-    throw new Error("Unexpected response type from AxAI provider");
+  const candidate = error;
+  const directStatus = candidate.status ?? candidate.statusCode;
+  if (typeof directStatus === "number" && Number.isFinite(directStatus)) {
+    return directStatus;
   }
-  return result;
+  const responseStatus = typeof candidate.response === "object" && candidate.response ? candidate.response.status : void 0;
+  if (typeof responseStatus === "number" && Number.isFinite(responseStatus)) {
+    return responseStatus;
+  }
+  const message = typeof candidate.message === "string" ? candidate.message : void 0;
+  if (message) {
+    const match = message.match(/HTTP\s+(\d{3})/i);
+    if (match) {
+      const parsed = Number.parseInt(match[1], 10);
+      if (Number.isFinite(parsed)) {
+        return parsed;
+      }
+    }
+  }
+  return void 0;
 }
-function isRetryableError(error, retryableStatusCodes) {
+function isNetworkError(error) {
   if (!error || typeof error !== "object") {
     return false;
   }
-  if ("status" in error && typeof error.status === "number") {
-    return retryableStatusCodes.includes(error.status);
+  const candidate = error;
+  if (candidate.name === "AbortError") {
+    return false;
   }
-  if ("message" in error && typeof error.message === "string") {
-    const match = error.message.match(/HTTP (\d{3})/);
-    if (match) {
-      const status = Number.parseInt(match[1], 10);
-      return retryableStatusCodes.includes(status);
-    }
+  const code = candidate.code;
+  if (typeof code === "string" && /^E(AI|CONN|HOST|NET|PIPE|TIME|REFUSED|RESET)/i.test(code)) {
+    return true;
   }
-  if ("name" in error && error.name === "AxAIServiceNetworkError") {
+  const message = typeof candidate.message === "string" ? candidate.message : void 0;
+  if (message && /(network|fetch failed|ECONNRESET|ENOTFOUND|EAI_AGAIN|ETIMEDOUT|ECONNREFUSED)/i.test(message)) {
     return true;
   }
   return false;
 }
+function isRetryableError(error, retryableStatusCodes) {
+  const status = extractStatus(error);
+  if (status === 401 || status === 403) {
+    return false;
+  }
+  if (typeof status === "number") {
+    return retryableStatusCodes.includes(status);
+  }
+  return isNetworkError(error);
+}
 function calculateRetryDelay(attempt, config) {
   const delay = Math.min(
     config.maxDelayMs,
@@ -948,152 +1275,16 @@ async function withRetry(fn, retryConfig, signal) {
       }
       const delay = calculateRetryDelay(attempt, config);
       await sleep(delay);
-      if (signal?.aborted) {
-        throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
-      }
     }
   }
   throw lastError;
 }
-var AzureProvider = class {
-  constructor(targetName, config) {
-    this.config = config;
-    this.id = `azure:${targetName}`;
-    this.targetName = targetName;
-    this.defaults = {
-      temperature: config.temperature,
-      maxOutputTokens: config.maxOutputTokens
-    };
-    this.retryConfig = config.retry;
-    this.ai = AxAI.create({
-      name: "azure-openai",
-      apiKey: config.apiKey,
-      resourceName: config.resourceName,
-      deploymentName: config.deploymentName,
-      version: config.version,
-      config: {
-        stream: false
-      }
-    });
-  }
-  id;
-  kind = "azure";
-  targetName;
-  ai;
-  defaults;
-  retryConfig;
-  async invoke(request) {
-    const chatPrompt = buildChatPrompt(request);
-    const modelConfig = extractModelConfig(request, this.defaults);
-    const response = await withRetry(
-      async () => await this.ai.chat(
-        {
-          chatPrompt,
-          model: this.config.deploymentName,
-          ...modelConfig ? { modelConfig } : {}
-        },
-        request.signal ? { abortSignal: request.signal } : void 0
-      ),
-      this.retryConfig,
-      request.signal
-    );
-    return mapResponse(ensureChatResponse(response));
-  }
-  getAxAI() {
-    return this.ai;
-  }
-};
-var AnthropicProvider = class {
-  constructor(targetName, config) {
-    this.config = config;
-    this.id = `anthropic:${targetName}`;
-    this.targetName = targetName;
-    this.defaults = {
-      temperature: config.temperature,
-      maxOutputTokens: config.maxOutputTokens,
-      thinkingBudget: config.thinkingBudget
-    };
-    this.retryConfig = config.retry;
-    this.ai = AxAI.create({
-      name: "anthropic",
-      apiKey: config.apiKey
-    });
-  }
-  id;
-  kind = "anthropic";
-  targetName;
-  ai;
-  defaults;
-  retryConfig;
-  async invoke(request) {
-    const chatPrompt = buildChatPrompt(request);
-    const modelConfig = extractModelConfig(request, this.defaults);
-    const response = await withRetry(
-      async () => await this.ai.chat(
-        {
-          chatPrompt,
-          model: this.config.model,
-          ...modelConfig ? { modelConfig } : {}
-        },
-        request.signal ? { abortSignal: request.signal } : void 0
-      ),
-      this.retryConfig,
-      request.signal
-    );
-    return mapResponse(ensureChatResponse(response));
-  }
-  getAxAI() {
-    return this.ai;
-  }
-};
-var GeminiProvider = class {
-  constructor(targetName, config) {
-    this.config = config;
-    this.id = `gemini:${targetName}`;
-    this.targetName = targetName;
-    this.defaults = {
-      temperature: config.temperature,
-      maxOutputTokens: config.maxOutputTokens
-    };
-    this.retryConfig = config.retry;
-    this.ai = AxAI.create({
-      name: "google-gemini",
-      apiKey: config.apiKey
-    });
-  }
-  id;
-  kind = "gemini";
-  targetName;
-  ai;
-  defaults;
-  retryConfig;
-  async invoke(request) {
-    const chatPrompt = buildChatPrompt(request);
-    const modelConfig = extractModelConfig(request, this.defaults);
-    const response = await withRetry(
-      async () => await this.ai.chat(
-        {
-          chatPrompt,
-          model: this.config.model,
-          ...modelConfig ? { modelConfig } : {}
-        },
-        request.signal ? { abortSignal: request.signal } : void 0
-      ),
-      this.retryConfig,
-      request.signal
-    );
-    return mapResponse(ensureChatResponse(response));
-  }
-  getAxAI() {
-    return this.ai;
-  }
-};
 // src/evaluation/providers/cli.ts
 import { exec as execWithCallback } from "node:child_process";
 import fs from "node:fs/promises";
 import os from "node:os";
-import path2 from "node:path";
+import path7 from "node:path";
 import { promisify } from "node:util";
 var execAsync = promisify(execWithCallback);
 var DEFAULT_MAX_BUFFER = 10 * 1024 * 1024;
@@ -1135,12 +1326,14 @@ var CliProvider = class {
   supportsBatch = false;
   config;
   runCommand;
+  verbose;
   healthcheckPromise;
   constructor(targetName, config, runner = defaultCommandRunner) {
     this.targetName = targetName;
     this.id = `cli:${targetName}`;
     this.config = config;
     this.runCommand = runner;
+    this.verbose = config.verbose ?? false;
   }
   async invoke(request) {
     if (request.signal?.aborted) {
@@ -1241,6 +1434,11 @@ var CliProvider = class {
         generateOutputFilePath("healthcheck")
       )
     );
+    if (this.verbose) {
+      console.log(
+        `[cli-provider:${this.targetName}] (healthcheck) CLI_EVALS_DIR=${process.env.CLI_EVALS_DIR ?? ""} cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
+      );
+    }
     const result = await this.runCommand(renderedCommand, {
       cwd: healthcheck.cwd ?? this.config.cwd,
       env: process.env,
@@ -1272,7 +1470,7 @@ function normalizeInputFiles(inputFiles) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = path2.resolve(inputFile);
+    const absolutePath = path7.resolve(inputFile);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -1286,7 +1484,7 @@ function formatFileList(files, template) {
   const formatter = template ?? "{path}";
   return files.map((filePath) => {
     const escapedPath = shellEscape(filePath);
-    const escapedName = shellEscape(path2.basename(filePath));
+    const escapedName = shellEscape(path7.basename(filePath));
     return formatter.replaceAll("{path}", escapedPath).replaceAll("{basename}", escapedName);
   }).join(" ");
 }
@@ -1310,7 +1508,7 @@ function generateOutputFilePath(evalCaseId) {
   const safeEvalId = evalCaseId || "unknown";
   const timestamp = Date.now();
   const random = Math.random().toString(36).substring(2, 9);
-  return path2.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
+  return path7.join(os.tmpdir(), `agentv-${safeEvalId}-${timestamp}-${random}.json`);
 }
 function formatTimeoutSuffix(timeoutMs) {
   if (!timeoutMs || timeoutMs <= 0) {
@@ -1326,7 +1524,7 @@ import { randomUUID } from "node:crypto";
 import { constants as constants2, createWriteStream } from "node:fs";
 import { access as access2, mkdtemp, mkdir, rm, writeFile } from "node:fs/promises";
 import { tmpdir } from "node:os";
-import path4 from "node:path";
+import path9 from "node:path";
 import { promisify as promisify2 } from "node:util";
 // src/evaluation/providers/codex-log-tracker.ts
@@ -1383,7 +1581,7 @@ function subscribeToCodexLogEntries(listener) {
 }
 // src/evaluation/providers/preread.ts
-import path3 from "node:path";
+import path8 from "node:path";
 function buildPromptDocument(request, inputFiles, options) {
   const parts = [];
   const guidelineFiles = collectGuidelineFiles(
@@ -1408,7 +1606,7 @@ function normalizeInputFiles2(inputFiles) {
   }
   const deduped = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = path3.resolve(inputFile);
+    const absolutePath = path8.resolve(inputFile);
     if (!deduped.has(absolutePath)) {
       deduped.set(absolutePath, absolutePath);
     }
@@ -1421,14 +1619,14 @@ function collectGuidelineFiles(inputFiles, guidelinePatterns, overrides) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = path3.resolve(inputFile);
+    const absolutePath = path8.resolve(inputFile);
     if (overrides?.has(absolutePath)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
       }
       continue;
     }
-    const normalized = absolutePath.split(path3.sep).join("/");
+    const normalized = absolutePath.split(path8.sep).join("/");
     if (isGuidelineFile(normalized, guidelinePatterns)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
@@ -1443,7 +1641,7 @@ function collectInputFiles(inputFiles) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const inputFile of inputFiles) {
-    const absolutePath = path3.resolve(inputFile);
+    const absolutePath = path8.resolve(inputFile);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -1455,7 +1653,7 @@ function buildMandatoryPrereadBlock(guidelineFiles, inputFiles) {
     return "";
   }
   const buildList = (files) => files.map((absolutePath) => {
-    const fileName = path3.basename(absolutePath);
+    const fileName = path8.basename(absolutePath);
     const fileUri = pathToFileUri(absolutePath);
     return `* [${fileName}](${fileUri})`;
   });
@@ -1475,7 +1673,7 @@ ${buildList(inputFiles).join("\n")}.`);
   return sections.join("\n");
 }
 function pathToFileUri(filePath) {
-  const absolutePath = path3.isAbsolute(filePath) ? filePath : path3.resolve(filePath);
+  const absolutePath = path8.isAbsolute(filePath) ? filePath : path8.resolve(filePath);
   const normalizedPath = absolutePath.replace(/\\/g, "/");
   if (/^[a-zA-Z]:\//.test(normalizedPath)) {
     return `file:///${normalizedPath}`;
@@ -1513,7 +1711,7 @@ var CodexProvider = class {
     const logger = await this.createStreamLogger(request).catch(() => void 0);
     try {
       const promptContent = buildPromptDocument(request, inputFiles);
-      const promptFile = path4.join(workspaceRoot, PROMPT_FILENAME);
+      const promptFile = path9.join(workspaceRoot, PROMPT_FILENAME);
       await writeFile(promptFile, promptContent, "utf8");
       const args = this.buildCodexArgs();
       const cwd = this.resolveCwd(workspaceRoot);
@@ -1563,7 +1761,7 @@ var CodexProvider = class {
     if (!this.config.cwd) {
       return workspaceRoot;
     }
-    return path4.resolve(this.config.cwd);
+    return path9.resolve(this.config.cwd);
   }
   buildCodexArgs() {
     const args = ["--ask-for-approval", "never", "exec", "--json", "--color", "never", "--skip-git-repo-check"];
@@ -1597,7 +1795,7 @@ var CodexProvider = class {
     }
   }
   async createWorkspace() {
-    return await mkdtemp(path4.join(tmpdir(), WORKSPACE_PREFIX));
+    return await mkdtemp(path9.join(tmpdir(), WORKSPACE_PREFIX));
   }
   async cleanupWorkspace(workspaceRoot) {
     try {
@@ -1611,9 +1809,9 @@ var CodexProvider = class {
       return void 0;
     }
     if (this.config.logDir) {
-      return path4.resolve(this.config.logDir);
+      return path9.resolve(this.config.logDir);
     }
-    return path4.join(process.cwd(), ".agentv", "logs", "codex");
+    return path9.join(process.cwd(), ".agentv", "logs", "codex");
   }
   async createStreamLogger(request) {
     const logDir = this.resolveLogDirectory();
@@ -1627,7 +1825,7 @@ var CodexProvider = class {
       console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
       return void 0;
     }
-    const filePath = path4.join(logDir, buildLogFilename(request, this.targetName));
+    const filePath = path9.join(logDir, buildLogFilename(request, this.targetName));
     try {
       const logger = await CodexStreamLogger.create({
         filePath,
@@ -1842,7 +2040,7 @@ function tryParseJsonValue(rawLine) {
 async function locateExecutable(candidate) {
   const includesPathSeparator = candidate.includes("/") || candidate.includes("\\");
   if (includesPathSeparator) {
-    const resolved = path4.isAbsolute(candidate) ? candidate : path4.resolve(candidate);
+    const resolved = path9.isAbsolute(candidate) ? candidate : path9.resolve(candidate);
     const executablePath = await ensureWindowsExecutableVariant(resolved);
     await access2(executablePath, constants2.F_OK);
     return executablePath;
@@ -2189,7 +2387,7 @@ var MockProvider = class {
 };
 // src/evaluation/providers/vscode.ts
-import path5 from "node:path";
+import path10 from "node:path";
 import { dispatchAgentSession, dispatchBatchAgent, getSubagentRoot, provisionSubagents } from "subagent";
 var VSCodeProvider = class {
   id;
@@ -2302,6 +2500,9 @@ var VSCodeProvider = class {
 };
 function buildPromptDocument2(request, attachments, guidelinePatterns) {
   const parts = [];
+  if (request.systemPrompt && request.systemPrompt.trim().length > 0) {
+    parts.push(request.systemPrompt.trim());
+  }
   const guidelineFiles = collectGuidelineFiles2(attachments, guidelinePatterns);
   const attachmentFiles = collectAttachmentFiles(attachments);
   const nonGuidelineAttachments = attachmentFiles.filter(
@@ -2319,7 +2520,7 @@ function buildMandatoryPrereadBlock2(guidelineFiles, attachmentFiles) {
     return "";
   }
   const buildList = (files) => files.map((absolutePath) => {
-    const fileName = path5.basename(absolutePath);
+    const fileName = path10.basename(absolutePath);
     const fileUri = pathToFileUri2(absolutePath);
     return `* [${fileName}](${fileUri})`;
   });
@@ -2344,8 +2545,8 @@ function collectGuidelineFiles2(attachments, guidelinePatterns) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = path5.resolve(attachment);
-    const normalized = absolutePath.split(path5.sep).join("/");
+    const absolutePath = path10.resolve(attachment);
+    const normalized = absolutePath.split(path10.sep).join("/");
     if (isGuidelineFile(normalized, guidelinePatterns)) {
       if (!unique.has(absolutePath)) {
         unique.set(absolutePath, absolutePath);
@@ -2360,7 +2561,7 @@ function collectAttachmentFiles(attachments) {
   }
   const unique = /* @__PURE__ */ new Map();
   for (const attachment of attachments) {
-    const absolutePath = path5.resolve(attachment);
+    const absolutePath = path10.resolve(attachment);
     if (!unique.has(absolutePath)) {
       unique.set(absolutePath, absolutePath);
     }
@@ -2368,7 +2569,7 @@ function collectAttachmentFiles(attachments) {
   return Array.from(unique.values());
 }
 function pathToFileUri2(filePath) {
-  const absolutePath = path5.isAbsolute(filePath) ? filePath : path5.resolve(filePath);
+  const absolutePath = path10.isAbsolute(filePath) ? filePath : path10.resolve(filePath);
   const normalizedPath = absolutePath.replace(/\\/g, "/");
   if (/^[a-zA-Z]:\//.test(normalizedPath)) {
     return `file:///${normalizedPath}`;
@@ -2381,7 +2582,7 @@ function normalizeAttachments(attachments) {
   }
   const deduped = /* @__PURE__ */ new Set();
   for (const attachment of attachments) {
-    deduped.add(path5.resolve(attachment));
+    deduped.add(path10.resolve(attachment));
   }
   return Array.from(deduped);
 }
@@ -2390,7 +2591,7 @@ function mergeAttachments(all) {
   for (const list of all) {
     if (!list) continue;
     for (const inputFile of list) {
-      deduped.add(path5.resolve(inputFile));
+      deduped.add(path10.resolve(inputFile));
     }
   }
   return deduped.size > 0 ? Array.from(deduped) : void 0;
@@ -2436,9 +2637,9 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
 // src/evaluation/providers/targets-file.ts
 import { constants as constants3 } from "node:fs";
-import { access as access3, readFile as readFile2 } from "node:fs/promises";
-import path6 from "node:path";
-import { parse as parse2 } from "yaml";
+import { access as access3, readFile as readFile5 } from "node:fs/promises";
+import path11 from "node:path";
+import { parse as parse3 } from "yaml";
 /**
  * Tells whether `value` is a plain "record-like" object.
  *
  * @param {unknown} value - Any value to inspect.
  * @returns {boolean} `true` when `value` has type "object", is not `null`
  *   and is not an array; `false` for everything else.
  */
 function isRecord(value) {
   // `typeof null` and `typeof []` are both "object", so rule them out first.
   if (value === null || Array.isArray(value)) {
     return false;
   }
   return typeof value === "object";
 }
@@ -2493,12 +2694,12 @@ async function fileExists3(filePath) {
   }
 }
 async function readTargetDefinitions(filePath) {
-  const absolutePath = path6.resolve(filePath);
+  const absolutePath = path11.resolve(filePath);
   if (!await fileExists3(absolutePath)) {
     throw new Error(`targets.yaml not found at ${absolutePath}`);
   }
-  const raw = await readFile2(absolutePath, "utf8");
-  const parsed = parse2(raw);
+  const raw = await readFile5(absolutePath, "utf8");
+  const parsed = parse3(raw);
   if (!isRecord(parsed)) {
     throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
   }
@@ -2541,18 +2742,34 @@ function resolveAndCreateProvider(definition, env = process.env) {
 }
 // src/evaluation/evaluators.ts
-import { randomUUID as randomUUID2 } from "node:crypto";
+var DEFAULT_EVALUATOR_TEMPLATE = `You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.
+Use the reference_answer as a gold standard for a high-quality response (if provided). The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.
+Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
+[[ ## expected_outcome ## ]]
+{{expected_outcome}}
+[[ ## question ## ]]
+{{question}}
+[[ ## reference_answer ## ]]
+{{reference_answer}}
+[[ ## candidate_answer ## ]]
+{{candidate_answer}}`;
 var LlmJudgeEvaluator = class {
   kind = "llm_judge";
   resolveJudgeProvider;
   maxOutputTokens;
   temperature;
-  customPrompt;
+  evaluatorTemplate;
   constructor(options) {
     this.resolveJudgeProvider = options.resolveJudgeProvider;
     this.maxOutputTokens = options.maxOutputTokens;
     this.temperature = options.temperature;
-    this.customPrompt = options.customPrompt;
+    this.evaluatorTemplate = options.evaluatorTemplate;
   }
   async evaluate(context) {
     const judgeProvider = await this.resolveJudgeProvider(context);
@@ -2562,26 +2779,21 @@ var LlmJudgeEvaluator = class {
     return this.evaluateWithPrompt(context, judgeProvider);
   }
   async evaluateWithPrompt(context, judgeProvider) {
-    const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
     const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
-    let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
-    let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
-    if (systemPrompt && hasTemplateVariables(systemPrompt)) {
-      const variables = {
-        input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
-        output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
-        candidate_answer: context.candidate,
-        reference_answer: context.evalCase.reference_answer ?? "",
-        expected_outcome: context.evalCase.expected_outcome,
-        question: formattedQuestion
-      };
-      prompt = substituteVariables(systemPrompt, variables);
-      systemPrompt = buildSystemPrompt(hasReferenceAnswer);
-    }
-    const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
+    const variables = {
+      input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
+      output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
+      candidate_answer: context.candidate.trim(),
+      reference_answer: (context.evalCase.reference_answer ?? "").trim(),
+      expected_outcome: context.evalCase.expected_outcome.trim(),
+      question: formattedQuestion.trim()
+    };
+    const systemPrompt = buildOutputSchema();
+    const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
+    const userPrompt = substituteVariables(evaluatorTemplate, variables);
     const response = await judgeProvider.invoke({
-      question: prompt,
-      metadata,
+      question: userPrompt,
+      systemPrompt,
       evalCaseId: context.evalCase.id,
       attempt: context.attempt,
       maxOutputTokens: this.maxOutputTokens,
@@ -2594,11 +2806,9 @@ var LlmJudgeEvaluator = class {
     const reasoning = parsed.reasoning ?? response.reasoning;
     const expectedAspectCount = Math.max(hits.length + misses.length, 1);
     const evaluatorRawRequest = {
-      id: randomUUID2(),
-      provider: judgeProvider.id,
-      prompt,
-      target: context.target.name,
-      ...systemPrompt !== void 0 && { systemPrompt }
+      userPrompt,
+      systemPrompt,
+      target: judgeProvider.targetName
     };
     return {
       score,
@@ -2610,20 +2820,8 @@ var LlmJudgeEvaluator = class {
     };
   }
 };
-function buildSystemPrompt(hasReferenceAnswer) {
-  const basePrompt = [
-    "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
-    ""
-  ];
-  if (hasReferenceAnswer) {
-    basePrompt.push(
-      "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
-      ""
-    );
-  }
-  basePrompt.push(
-    "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
-    "",
+function buildOutputSchema() {
+  return [
     "You must respond with a single JSON object matching this schema:",
     "",
     "{",
@@ -2632,30 +2830,7 @@ function buildSystemPrompt(hasReferenceAnswer) {
     '  "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
     '  "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
     "}"
-  );
-  return basePrompt.join("\n");
-}
-function buildQualityPrompt(evalCase, candidate, question) {
-  const parts = [
-    "[[ ## expected_outcome ## ]]",
-    evalCase.expected_outcome.trim(),
-    "",
-    "[[ ## question ## ]]",
-    question.trim(),
-    ""
-  ];
-  if (hasNonEmptyReferenceAnswer(evalCase)) {
-    parts.push(
-      "[[ ## reference_answer ## ]]",
-      evalCase.reference_answer.trim(),
-      ""
-    );
-  }
-  parts.push(
-    "[[ ## candidate_answer ## ]]",
-    candidate.trim()
-  );
-  return parts.join("\n");
+  ].join("\n");
 }
 function clampScore(value) {
   if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -2737,9 +2912,6 @@ function extractJsonBlob(text) {
 /**
  * Tells whether `value` is a string with at least one non-whitespace character.
  *
  * @param {unknown} value - Any value to inspect.
  * @returns {boolean} `true` only for strings that are non-empty after trimming.
  */
 function isNonEmptyString(value) {
   if (typeof value !== "string") {
     return false;
   }
   // A string that trims down to "" contained nothing but whitespace.
   return value.trim() !== "";
 }
-function hasNonEmptyReferenceAnswer(evalCase) {
-  return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
-}
 var CodeEvaluator = class {
   kind = "code";
   script;
@@ -2845,19 +3017,16 @@ function parseJsonSafe(payload) {
     return void 0;
   }
 }
-function hasTemplateVariables(text) {
-  return /\$\{[a-zA-Z0-9_]+\}/.test(text);
-}
 function substituteVariables(template, variables) {
-  return template.replace(/\$\{([a-zA-Z0-9_]+)\}/g, (match, varName) => {
+  return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
     return variables[varName] ?? match;
   });
 }
 // src/evaluation/orchestrator.ts
-import { createHash, randomUUID as randomUUID3 } from "node:crypto";
+import { createHash, randomUUID as randomUUID2 } from "node:crypto";
 import { mkdir as mkdir2, writeFile as writeFile2 } from "node:fs/promises";
-import path7 from "node:path";
+import path12 from "node:path";
 // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
 var Node = class {
@@ -3420,6 +3589,7 @@ async function evaluateCandidate(options) {
     }
   }
   return {
+    timestamp: completedAt.toISOString(),
     eval_id: evalCase.id,
     dataset: evalCase.dataset,
     conversation_id: evalCase.conversation_id,
@@ -3427,14 +3597,12 @@ async function evaluateCandidate(options) {
     hits: score.hits,
     misses: score.misses,
     candidate_answer: candidate,
-    expected_aspect_count: score.expectedAspectCount,
     target: target.name,
-    timestamp: completedAt.toISOString(),
     reasoning: score.reasoning,
     raw_aspects: score.rawAspects,
     agent_provider_request: agentProviderRequest,
     lm_provider_request: lmProviderRequest,
-    evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
+    evaluator_provider_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
     evaluator_results: evaluatorResults
   };
 }
@@ -3511,7 +3679,7 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluator_raw_request: score2.evaluatorRawRequest
+          evaluator_provider_request: score2.evaluatorRawRequest
         });
         continue;
       }
@@ -3538,7 +3706,7 @@ async function runEvaluatorList(options) {
           hits: score2.hits,
           misses: score2.misses,
           reasoning: score2.reasoning,
-          evaluator_raw_request: score2.evaluatorRawRequest
+          evaluator_provider_request: score2.evaluatorRawRequest
         });
         continue;
       }
@@ -3591,7 +3759,7 @@ async function runLlmJudgeEvaluator(options) {
     promptInputs,
     now,
     judgeProvider,
-    systemPrompt: customPrompt,
+    evaluatorTemplateOverride: customPrompt,
     evaluator: config
   });
 }
@@ -3632,8 +3800,8 @@ function buildEvaluatorRegistry(overrides, resolveJudgeProvider) {
 async function dumpPrompt(directory, evalCase, promptInputs) {
   const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
   const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
-  const filePath = path7.resolve(directory, filename);
-  await mkdir2(path7.dirname(filePath), { recursive: true });
+  const filePath = path12.resolve(directory, filename);
+  await mkdir2(path12.dirname(filePath), { recursive: true });
   const payload = {
     eval_id: evalCase.id,
     question: promptInputs.question,
@@ -3647,7 +3815,7 @@ function sanitizeFilename(value) {
     return "prompt";
   }
   const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, "_");
-  return sanitized.length > 0 ? sanitized : randomUUID3();
+  return sanitized.length > 0 ? sanitized : randomUUID2();
 }
 async function invokeProvider(provider, options) {
   const { evalCase, promptInputs, attempt, agentTimeoutMs, signal } = options;
@@ -3703,6 +3871,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
     }
   }
   return {
+    timestamp: timestamp.toISOString(),
     eval_id: evalCase.id,
     dataset: evalCase.dataset,
     conversation_id: evalCase.conversation_id,
@@ -3710,9 +3879,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
     hits: [],
     misses: [`Error: ${message}`],
     candidate_answer: `Error occurred: ${message}`,
-    expected_aspect_count: 0,
     target: targetName,
-    timestamp: timestamp.toISOString(),
     raw_aspects: [],
     agent_provider_request: agentProviderRequest,
     lm_provider_request: lmProviderRequest,