npm - @agentv/core - Versions diffs - 0.14.2 → 0.16.0 - Mend

@agentv/core 0.14.2 → 0.16.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/dist/index.cjs CHANGED Viewed

@@ -116,7 +116,7 @@ function getHitCount(result) {
 }
 // src/evaluation/yaml-parser.ts
-var import_promises5 = require("fs/promises");
+var import_promises6 = require("fs/promises");
 var import_node_path6 = __toESM(require("path"), 1);
 var import_yaml2 = require("yaml");
@@ -154,7 +154,7 @@ ${part.content}
   }
   return parts.map((p) => p.content).join(" ");
 }
-function formatSegment(segment) {
+function formatSegment(segment, mode = "lm") {
   const type = asString(segment.type);
   if (type === "text") {
     return asString(segment.value);
@@ -164,8 +164,14 @@ function formatSegment(segment) {
     return refPath ? `<Attached: ${refPath}>` : void 0;
   }
   if (type === "file") {
-    const text = asString(segment.text);
     const filePath = asString(segment.path);
+    if (!filePath) {
+      return void 0;
+    }
+    if (mode === "agent") {
+      return `<file: path="${filePath}">`;
+    }
+    const text = asString(segment.text);
     if (text && filePath) {
       return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
     }
@@ -369,8 +375,67 @@ function logWarning(message) {
 // src/evaluation/loaders/evaluator-parser.ts
 var import_node_path3 = __toESM(require("path"), 1);
+// src/evaluation/validation/prompt-validator.ts
+var import_promises3 = require("fs/promises");
+// src/evaluation/template-variables.ts
+var TEMPLATE_VARIABLES = {
+  CANDIDATE_ANSWER: "candidate_answer",
+  EXPECTED_MESSAGES: "expected_messages",
+  QUESTION: "question",
+  EXPECTED_OUTCOME: "expected_outcome",
+  REFERENCE_ANSWER: "reference_answer",
+  INPUT_MESSAGES: "input_messages"
+};
+var VALID_TEMPLATE_VARIABLES = new Set(
+  Object.values(TEMPLATE_VARIABLES)
+);
+var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
+  TEMPLATE_VARIABLES.CANDIDATE_ANSWER,
+  TEMPLATE_VARIABLES.EXPECTED_MESSAGES
+]);
+// src/evaluation/validation/prompt-validator.ts
 var ANSI_YELLOW2 = "\x1B[33m";
 var ANSI_RESET2 = "\x1B[0m";
+async function validateCustomPromptContent(promptPath) {
+  const content = await (0, import_promises3.readFile)(promptPath, "utf8");
+  validateTemplateVariables(content, promptPath);
+}
+function validateTemplateVariables(content, source) {
+  const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
+  const foundVariables = /* @__PURE__ */ new Set();
+  const invalidVariables = [];
+  let match;
+  while ((match = variablePattern.exec(content)) !== null) {
+    const varName = match[1];
+    foundVariables.add(varName);
+    if (!VALID_TEMPLATE_VARIABLES.has(varName)) {
+      invalidVariables.push(varName);
+    }
+  }
+  const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.CANDIDATE_ANSWER);
+  const hasExpectedMessages = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_MESSAGES);
+  const hasRequiredFields = hasCandidateAnswer || hasExpectedMessages;
+  if (!hasRequiredFields) {
+    throw new Error(
+      `Missing required fields. Must include at least one of:
+  - {{ ${TEMPLATE_VARIABLES.CANDIDATE_ANSWER} }}
+  - {{ ${TEMPLATE_VARIABLES.EXPECTED_MESSAGES} }}`
+    );
+  }
+  if (invalidVariables.length > 0) {
+    const warningMessage = `${ANSI_YELLOW2}Warning: Custom evaluator template at ${source}
+  Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
+  Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET2}`;
+    console.warn(warningMessage);
+  }
+}
+// src/evaluation/loaders/evaluator-parser.ts
+var ANSI_YELLOW3 = "\x1B[33m";
+var ANSI_RESET3 = "\x1B[0m";
 async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
   const execution = rawEvalCase.execution;
   const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
@@ -429,6 +494,12 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       const resolved = await resolveFileReference(prompt, searchRoots);
       if (resolved.resolvedPath) {
         promptPath = import_node_path3.default.resolve(resolved.resolvedPath);
+        try {
+          await validateCustomPromptContent(promptPath);
+        } catch (error) {
+          const message = error instanceof Error ? error.message : String(error);
+          throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`);
+        }
       } else {
         logWarning2(
           `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
@@ -465,18 +536,18 @@ function isJsonObject2(value) {
 function logWarning2(message, details) {
   if (details && details.length > 0) {
     const detailBlock = details.join("\n");
-    console.warn(`${ANSI_YELLOW2}Warning: ${message}
-${detailBlock}${ANSI_RESET2}`);
+    console.warn(`${ANSI_YELLOW3}Warning: ${message}
+${detailBlock}${ANSI_RESET3}`);
   } else {
-    console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
+    console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
   }
 }
 // src/evaluation/loaders/message-processor.ts
-var import_promises3 = require("fs/promises");
+var import_promises4 = require("fs/promises");
 var import_node_path4 = __toESM(require("path"), 1);
-var ANSI_YELLOW3 = "\x1B[33m";
-var ANSI_RESET3 = "\x1B[0m";
+var ANSI_YELLOW4 = "\x1B[33m";
+var ANSI_RESET4 = "\x1B[0m";
 async function processMessages(options) {
   const {
     messages,
@@ -519,7 +590,7 @@ async function processMessages(options) {
           continue;
         }
         try {
-          const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
+          const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
           if (messageType === "input" && guidelinePatterns && guidelinePaths) {
             const relativeToRepo = import_node_path4.default.relative(repoRootPath, resolvedPath);
             if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
@@ -590,7 +661,7 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
         continue;
       }
       try {
-        const fileContent = (await (0, import_promises3.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
+        const fileContent = (await (0, import_promises4.readFile)(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
         parts.push({ content: fileContent, isFile: true, displayPath });
         if (verbose) {
           console.log(`  [Expected Assistant File] Found: ${displayPath}`);
@@ -640,19 +711,19 @@ function cloneJsonValue(value) {
 function logWarning3(message, details) {
   if (details && details.length > 0) {
     const detailBlock = details.join("\n");
-    console.warn(`${ANSI_YELLOW3}Warning: ${message}
-${detailBlock}${ANSI_RESET3}`);
+    console.warn(`${ANSI_YELLOW4}Warning: ${message}
+${detailBlock}${ANSI_RESET4}`);
   } else {
-    console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
+    console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
   }
 }
 // src/evaluation/formatting/prompt-builder.ts
-var import_promises4 = require("fs/promises");
+var import_promises5 = require("fs/promises");
 var import_node_path5 = __toESM(require("path"), 1);
-var ANSI_YELLOW4 = "\x1B[33m";
-var ANSI_RESET4 = "\x1B[0m";
-async function buildPromptInputs(testCase) {
+var ANSI_YELLOW5 = "\x1B[33m";
+var ANSI_RESET5 = "\x1B[0m";
+async function buildPromptInputs(testCase, mode = "lm") {
   const guidelineParts = [];
   for (const rawPath of testCase.guideline_paths) {
     const absolutePath = import_node_path5.default.resolve(rawPath);
@@ -661,7 +732,7 @@ async function buildPromptInputs(testCase) {
       continue;
     }
     try {
-      const content = (await (0, import_promises4.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
+      const content = (await (0, import_promises5.readFile)(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
       guidelineParts.push({
         content,
         isFile: true,
@@ -728,7 +799,7 @@ async function buildPromptInputs(testCase) {
       const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
       const contentParts = [];
       for (const segment of segments) {
-        const formattedContent = formatSegment(segment);
+        const formattedContent = formatSegment(segment, mode);
         if (formattedContent) {
           contentParts.push(formattedContent);
         }
@@ -743,7 +814,11 @@ ${messageContent}`);
   } else {
     const questionParts = [];
     for (const segment of testCase.input_segments) {
-      const formattedContent = formatSegment(segment);
+      if (segment.type === "file" && typeof segment.path === "string" && testCase.guideline_patterns && isGuidelineFile(segment.path, testCase.guideline_patterns)) {
+        questionParts.push(`<Attached: ${segment.path}>`);
+        continue;
+      }
+      const formattedContent = formatSegment(segment, mode);
       if (formattedContent) {
         questionParts.push(formattedContent);
       }
@@ -757,7 +832,8 @@ ${messageContent}`);
     messages: testCase.input_messages,
     segmentsByMessage,
     guidelinePatterns: testCase.guideline_patterns,
-    guidelineContent: guidelines
+    guidelineContent: guidelines,
+    mode
   }) : void 0;
   return { question, guidelines, chatPrompt };
 }
@@ -774,7 +850,7 @@ function needsRoleMarkers(messages, processedSegmentsByMessage) {
   return messagesWithContent > 1;
 }
 function buildChatPromptFromSegments(options) {
-  const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
+  const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt, mode = "lm" } = options;
   if (messages.length === 0) {
     return void 0;
   }
@@ -792,7 +868,7 @@ ${guidelineContent.trim()}`);
     const segments = segmentsByMessage[startIndex];
     const contentParts = [];
     for (const segment of segments) {
-      const formatted = formatSegment(segment);
+      const formatted = formatSegment(segment, mode);
       if (formatted) {
         contentParts.push(formatted);
       }
@@ -825,7 +901,7 @@ ${guidelineContent.trim()}`);
       if (segment.type === "guideline_ref") {
         continue;
       }
-      const formatted = formatSegment(segment);
+      const formatted = formatSegment(segment, mode);
       if (formatted) {
         const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
         if (isGuidelineRef) {
@@ -849,17 +925,18 @@ function asString4(value) {
   return typeof value === "string" ? value : void 0;
 }
 function logWarning4(message) {
-  console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
+  console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
 }
 // src/evaluation/yaml-parser.ts
-var ANSI_YELLOW5 = "\x1B[33m";
-var ANSI_RESET5 = "\x1B[0m";
+var ANSI_YELLOW6 = "\x1B[33m";
+var ANSI_RED = "\x1B[31m";
+var ANSI_RESET6 = "\x1B[0m";
 var SCHEMA_EVAL_V2 = "agentv-eval-v2";
 async function readTestSuiteMetadata(testFilePath) {
   try {
     const absolutePath = import_node_path6.default.resolve(testFilePath);
-    const content = await (0, import_promises5.readFile)(absolutePath, "utf8");
+    const content = await (0, import_promises6.readFile)(absolutePath, "utf8");
     const parsed = (0, import_yaml2.parse)(content);
     if (!isJsonObject(parsed)) {
       return {};
@@ -877,7 +954,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
   const searchRoots = buildSearchRoots(absoluteTestPath, repoRootPath);
   const config = await loadConfig(absoluteTestPath, repoRootPath);
   const guidelinePatterns = config?.guideline_patterns;
-  const rawFile = await (0, import_promises5.readFile)(absoluteTestPath, "utf8");
+  const rawFile = await (0, import_promises6.readFile)(absoluteTestPath, "utf8");
   const parsed = (0, import_yaml2.parse)(rawFile);
   if (!isJsonObject(parsed)) {
     throw new Error(`Invalid test file format: ${evalFilePath}`);
@@ -915,14 +992,14 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
     const inputMessagesValue = evalcase.input_messages;
     const expectedMessagesValue = evalcase.expected_messages;
     if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
-      logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
+      logError(`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`);
       continue;
     }
     const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
     const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
     const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
     if (hasExpectedMessages && expectedMessages.length === 0) {
-      logWarning5(`No valid expected message found for eval case: ${id}`);
+      logError(`No valid expected message found for eval case: ${id}`);
       continue;
     }
     if (expectedMessages.length > 1) {
@@ -953,7 +1030,14 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
     const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
     const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
     const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
-    const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
+    let evaluators;
+    try {
+      evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      logError(`Skipping eval case '${id}': ${message}`);
+      continue;
+    }
     const userFilePaths = [];
     for (const segment of inputSegments) {
       if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -971,7 +1055,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
       question,
       input_messages: inputMessages,
       input_segments: inputSegments,
-      output_segments: outputSegments,
+      expected_segments: outputSegments,
       reference_answer: referenceAnswer,
       guideline_paths: guidelinePaths.map((guidelinePath) => import_node_path6.default.resolve(guidelinePath)),
       guideline_patterns: guidelinePatterns,
@@ -1003,20 +1087,29 @@ function asString5(value) {
 function logWarning5(message, details) {
   if (details && details.length > 0) {
     const detailBlock = details.join("\n");
-    console.warn(`${ANSI_YELLOW5}Warning: ${message}
-${detailBlock}${ANSI_RESET5}`);
+    console.warn(`${ANSI_YELLOW6}Warning: ${message}
+${detailBlock}${ANSI_RESET6}`);
   } else {
-    console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
+    console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
+  }
+}
+function logError(message, details) {
+  if (details && details.length > 0) {
+    const detailBlock = details.join("\n");
+    console.error(`${ANSI_RED}Error: ${message}
+${detailBlock}${ANSI_RESET6}`);
+  } else {
+    console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET6}`);
   }
 }
 // src/evaluation/file-utils.ts
 var import_node_fs2 = require("fs");
-var import_promises6 = require("fs/promises");
+var import_promises7 = require("fs/promises");
 var import_node_path7 = __toESM(require("path"), 1);
 async function fileExists2(filePath) {
   try {
-    await (0, import_promises6.access)(filePath, import_node_fs2.constants.F_OK);
+    await (0, import_promises7.access)(filePath, import_node_fs2.constants.F_OK);
     return true;
   } catch {
     return false;
@@ -1026,7 +1119,7 @@ function normalizeLineEndings(content) {
   return content.replace(/\r\n/g, "\n");
 }
 async function readTextFile(filePath) {
-  const content = await (0, import_promises6.readFile)(filePath, "utf8");
+  const content = await (0, import_promises7.readFile)(filePath, "utf8");
   return normalizeLineEndings(content);
 }
 async function findGitRoot(startPath) {
@@ -1447,7 +1540,7 @@ async function withRetry(fn, retryConfig, signal) {
 // src/evaluation/providers/cli.ts
 var import_node_child_process = require("child_process");
-var import_promises7 = __toESM(require("fs/promises"), 1);
+var import_promises8 = __toESM(require("fs/promises"), 1);
 var import_node_os = __toESM(require("os"), 1);
 var import_node_path8 = __toESM(require("path"), 1);
 var import_node_util = require("util");
@@ -1548,7 +1641,7 @@ var CliProvider = class {
       const errorMsg = error instanceof Error ? error.message : String(error);
       throw new Error(`Failed to read output file '${filePath}': ${errorMsg}`);
     } finally {
-      await import_promises7.default.unlink(filePath).catch(() => {
+      await import_promises8.default.unlink(filePath).catch(() => {
       });
     }
   }
@@ -1687,7 +1780,7 @@ function formatTimeoutSuffix(timeoutMs) {
 var import_node_child_process2 = require("child_process");
 var import_node_crypto = require("crypto");
 var import_node_fs3 = require("fs");
-var import_promises8 = require("fs/promises");
+var import_promises9 = require("fs/promises");
 var import_node_os2 = require("os");
 var import_node_path10 = __toESM(require("path"), 1);
 var import_node_util2 = require("util");
@@ -1877,7 +1970,7 @@ var CodexProvider = class {
     try {
       const promptContent = buildPromptDocument(request, inputFiles);
       const promptFile = import_node_path10.default.join(workspaceRoot, PROMPT_FILENAME);
-      await (0, import_promises8.writeFile)(promptFile, promptContent, "utf8");
+      await (0, import_promises9.writeFile)(promptFile, promptContent, "utf8");
       const args = this.buildCodexArgs();
       const cwd = this.resolveCwd(workspaceRoot);
       const result = await this.executeCodex(args, cwd, promptContent, request.signal, logger);
@@ -1960,11 +2053,11 @@ var CodexProvider = class {
     }
   }
   async createWorkspace() {
-    return await (0, import_promises8.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
+    return await (0, import_promises9.mkdtemp)(import_node_path10.default.join((0, import_node_os2.tmpdir)(), WORKSPACE_PREFIX));
   }
   async cleanupWorkspace(workspaceRoot) {
     try {
-      await (0, import_promises8.rm)(workspaceRoot, { recursive: true, force: true });
+      await (0, import_promises9.rm)(workspaceRoot, { recursive: true, force: true });
     } catch {
     }
   }
@@ -1984,7 +2077,7 @@ var CodexProvider = class {
       return void 0;
     }
     try {
-      await (0, import_promises8.mkdir)(logDir, { recursive: true });
+      await (0, import_promises9.mkdir)(logDir, { recursive: true });
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       console.warn(`Skipping Codex stream logging (could not create ${logDir}): ${message}`);
@@ -2207,7 +2300,7 @@ async function locateExecutable(candidate) {
   if (includesPathSeparator) {
     const resolved = import_node_path10.default.isAbsolute(candidate) ? candidate : import_node_path10.default.resolve(candidate);
     const executablePath = await ensureWindowsExecutableVariant(resolved);
-    await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
+    await (0, import_promises9.access)(executablePath, import_node_fs3.constants.F_OK);
     return executablePath;
   }
   const locator = process.platform === "win32" ? "where" : "which";
@@ -2217,7 +2310,7 @@ async function locateExecutable(candidate) {
     const preferred = selectExecutableCandidate(lines);
     if (preferred) {
       const executablePath = await ensureWindowsExecutableVariant(preferred);
-      await (0, import_promises8.access)(executablePath, import_node_fs3.constants.F_OK);
+      await (0, import_promises9.access)(executablePath, import_node_fs3.constants.F_OK);
       return executablePath;
     }
   } catch {
@@ -2251,7 +2344,7 @@ async function ensureWindowsExecutableVariant(candidate) {
   for (const ext of extensions) {
     const withExtension = `${candidate}${ext}`;
     try {
-      await (0, import_promises8.access)(withExtension, import_node_fs3.constants.F_OK);
+      await (0, import_promises9.access)(withExtension, import_node_fs3.constants.F_OK);
       return withExtension;
     } catch {
     }
@@ -3313,7 +3406,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
 // src/evaluation/providers/targets-file.ts
 var import_node_fs4 = require("fs");
-var import_promises9 = require("fs/promises");
+var import_promises10 = require("fs/promises");
 var import_node_path12 = __toESM(require("path"), 1);
 var import_yaml3 = require("yaml");
@@ -3376,7 +3469,7 @@ function assertTargetDefinition(value, index, filePath) {
 }
 async function fileExists3(filePath) {
   try {
-    await (0, import_promises9.access)(filePath, import_node_fs4.constants.F_OK);
+    await (0, import_promises10.access)(filePath, import_node_fs4.constants.F_OK);
     return true;
   } catch {
     return false;
@@ -3387,7 +3480,7 @@ async function readTargetDefinitions(filePath) {
   if (!await fileExists3(absolutePath)) {
     throw new Error(`targets.yaml not found at ${absolutePath}`);
   }
-  const raw = await (0, import_promises9.readFile)(absolutePath, "utf8");
+  const raw = await (0, import_promises10.readFile)(absolutePath, "utf8");
   const parsed = (0, import_yaml3.parse)(raw);
   if (!isRecord(parsed)) {
     throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
@@ -3438,16 +3531,16 @@ Use the reference_answer as a gold standard for a high-quality response (if prov
 Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
 [[ ## expected_outcome ## ]]
-{{expected_outcome}}
+{{${TEMPLATE_VARIABLES.EXPECTED_OUTCOME}}}
 [[ ## question ## ]]
-{{question}}
+{{${TEMPLATE_VARIABLES.QUESTION}}}
 [[ ## reference_answer ## ]]
-{{reference_answer}}
+{{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
 [[ ## candidate_answer ## ]]
-{{candidate_answer}}`;
+{{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
 var LlmJudgeEvaluator = class {
   kind = "llm_judge";
   resolveJudgeProvider;
@@ -3470,12 +3563,12 @@ var LlmJudgeEvaluator = class {
   async evaluateWithPrompt(context, judgeProvider) {
     const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
     const variables = {
-      input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
-      output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
-      candidate_answer: context.candidate.trim(),
-      reference_answer: (context.evalCase.reference_answer ?? "").trim(),
-      expected_outcome: context.evalCase.expected_outcome.trim(),
-      question: formattedQuestion.trim()
+      [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
+      [TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(context.evalCase.expected_segments, null, 2),
+      [TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
+      [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
+      [TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
+      [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim()
     };
     const systemPrompt = buildOutputSchema();
     const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
@@ -3707,14 +3800,14 @@ function parseJsonSafe(payload) {
   }
 }
 function substituteVariables(template, variables) {
-  return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
+  return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
     return variables[varName] ?? match;
   });
 }
 // src/evaluation/orchestrator.ts
 var import_node_crypto2 = require("crypto");
-var import_promises10 = require("fs/promises");
+var import_promises11 = require("fs/promises");
 var import_node_path13 = __toESM(require("path"), 1);
 // ../../node_modules/.pnpm/yocto-queue@1.2.1/node_modules/yocto-queue/index.js
@@ -3871,11 +3964,11 @@ async function runEvaluation(options) {
     now,
     evalId,
     verbose,
+    evalCases: preloadedEvalCases,
     onResult,
     onProgress
   } = options;
-  const load = loadEvalCases;
-  const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
+  const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
   const filteredEvalCases = filterEvalCases(evalCases, evalId);
   if (filteredEvalCases.length === 0) {
     if (evalId) {
@@ -4059,8 +4152,9 @@ async function runBatchEvaluation(options) {
     agentTimeoutMs
   } = options;
   const promptInputsList = [];
+  const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
   for (const evalCase of evalCases) {
-    const promptInputs = await buildPromptInputs(evalCase);
+    const promptInputs = await buildPromptInputs(evalCase, formattingMode);
     if (promptDumpDir) {
       await dumpPrompt(promptDumpDir, evalCase, promptInputs);
     }
@@ -4166,7 +4260,8 @@ async function runEvalCase(options) {
     signal,
     judgeProvider
   } = options;
-  const promptInputs = await buildPromptInputs(evalCase);
+  const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
+  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
   if (promptDumpDir) {
     await dumpPrompt(promptDumpDir, evalCase, promptInputs);
   }
@@ -4455,7 +4550,8 @@ async function runLlmJudgeEvaluator(options) {
 async function resolveCustomPrompt(config) {
   if (config.promptPath) {
     try {
-      return await readTextFile(config.promptPath);
+      const content = await readTextFile(config.promptPath);
+      return content;
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);
@@ -4490,14 +4586,14 @@ async function dumpPrompt(directory, evalCase, promptInputs) {
   const timestamp = (/* @__PURE__ */ new Date()).toISOString().replace(/[:.]/g, "-");
   const filename = `${timestamp}_${sanitizeFilename(evalCase.id)}.json`;
   const filePath = import_node_path13.default.resolve(directory, filename);
-  await (0, import_promises10.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
+  await (0, import_promises11.mkdir)(import_node_path13.default.dirname(filePath), { recursive: true });
   const payload = {
     eval_id: evalCase.id,
     question: promptInputs.question,
     guidelines: promptInputs.guidelines,
     guideline_paths: evalCase.guideline_paths
   };
-  await (0, import_promises10.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
+  await (0, import_promises11.writeFile)(filePath, JSON.stringify(payload, null, 2), "utf8");
 }
 function sanitizeFilename(value) {
   if (!value) {