npm - @agentv/core - Versions diffs - 0.15.0 → 0.16.0 - Mend

@agentv/core 0.15.0 → 0.16.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (7)

package/dist/index.d.cts CHANGED Viewed

@@ -101,7 +101,7 @@ interface EvalCase {
     readonly question: string;
     readonly input_messages: readonly TestMessage[];
     readonly input_segments: readonly JsonObject[];
-    readonly output_segments: readonly JsonObject[];
+    readonly expected_segments: readonly JsonObject[];
     readonly reference_answer?: string;
     readonly guideline_paths: readonly string[];
     readonly guideline_patterns?: readonly string[];
@@ -147,6 +147,17 @@ interface EvaluatorResult {
  */
 declare function getHitCount(result: Pick<EvaluationResult, "hits">): number;
+/**
+ * Formatting mode for segment content.
+ * - 'agent': File references only (for providers with filesystem access)
+ * - 'lm': Embedded file content with XML tags (for language model providers)
+ */
+type FormattingMode = 'agent' | 'lm';
+/**
+ * Extract fenced code blocks from AgentV user segments.
+ */
+declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
 type ChatMessageRole = "system" | "user" | "assistant" | "tool" | "function";
 interface ChatMessage {
     readonly role: ChatMessageRole;
@@ -271,12 +282,13 @@ interface PromptInputs {
     readonly chatPrompt?: ChatPrompt;
     readonly systemMessage?: string;
 }
-declare function buildPromptInputs(testCase: EvalCase): Promise<PromptInputs>;
 /**
- * Extract fenced code blocks from AgentV user segments.
+ * Build prompt inputs by consolidating user request context and guideline content.
+ *
+ * @param testCase - The evaluation test case
+ * @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
  */
-declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
+declare function buildPromptInputs(testCase: EvalCase, mode?: FormattingMode): Promise<PromptInputs>;
 /**
  * Determine whether a path references guideline content (instructions or prompts).
@@ -605,6 +617,7 @@ interface RunEvaluationOptions {
     readonly evalId?: string;
     readonly verbose?: boolean;
     readonly maxConcurrency?: number;
+    readonly evalCases?: readonly EvalCase[];
     readonly onResult?: (result: EvaluationResult) => MaybePromise<void>;
     readonly onProgress?: (event: ProgressEvent) => MaybePromise<void>;
 }

package/dist/index.d.ts CHANGED Viewed

@@ -101,7 +101,7 @@ interface EvalCase {
     readonly question: string;
     readonly input_messages: readonly TestMessage[];
     readonly input_segments: readonly JsonObject[];
-    readonly output_segments: readonly JsonObject[];
+    readonly expected_segments: readonly JsonObject[];
     readonly reference_answer?: string;
     readonly guideline_paths: readonly string[];
     readonly guideline_patterns?: readonly string[];
@@ -147,6 +147,17 @@ interface EvaluatorResult {
  */
 declare function getHitCount(result: Pick<EvaluationResult, "hits">): number;
+/**
+ * Formatting mode for segment content.
+ * - 'agent': File references only (for providers with filesystem access)
+ * - 'lm': Embedded file content with XML tags (for language model providers)
+ */
+type FormattingMode = 'agent' | 'lm';
+/**
+ * Extract fenced code blocks from AgentV user segments.
+ */
+declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
 type ChatMessageRole = "system" | "user" | "assistant" | "tool" | "function";
 interface ChatMessage {
     readonly role: ChatMessageRole;
@@ -271,12 +282,13 @@ interface PromptInputs {
     readonly chatPrompt?: ChatPrompt;
     readonly systemMessage?: string;
 }
-declare function buildPromptInputs(testCase: EvalCase): Promise<PromptInputs>;
 /**
- * Extract fenced code blocks from AgentV user segments.
+ * Build prompt inputs by consolidating user request context and guideline content.
+ *
+ * @param testCase - The evaluation test case
+ * @param mode - Formatting mode: 'agent' for file references, 'lm' for embedded content (default: 'lm')
  */
-declare function extractCodeBlocks(segments: readonly JsonObject[]): readonly string[];
+declare function buildPromptInputs(testCase: EvalCase, mode?: FormattingMode): Promise<PromptInputs>;
 /**
  * Determine whether a path references guideline content (instructions or prompts).
@@ -605,6 +617,7 @@ interface RunEvaluationOptions {
     readonly evalId?: string;
     readonly verbose?: boolean;
     readonly maxConcurrency?: number;
+    readonly evalCases?: readonly EvalCase[];
     readonly onResult?: (result: EvaluationResult) => MaybePromise<void>;
     readonly onProgress?: (event: ProgressEvent) => MaybePromise<void>;
 }

package/dist/index.js CHANGED Viewed

@@ -62,7 +62,7 @@ function getHitCount(result) {
 }
 // src/evaluation/yaml-parser.ts
-import { readFile as readFile4 } from "node:fs/promises";
+import { readFile as readFile5 } from "node:fs/promises";
 import path6 from "node:path";
 import { parse as parse2 } from "yaml";
@@ -100,7 +100,7 @@ ${part.content}
   }
   return parts.map((p) => p.content).join(" ");
 }
-function formatSegment(segment) {
+function formatSegment(segment, mode = "lm") {
   const type = asString(segment.type);
   if (type === "text") {
     return asString(segment.value);
@@ -110,8 +110,14 @@ function formatSegment(segment) {
     return refPath ? `<Attached: ${refPath}>` : void 0;
   }
   if (type === "file") {
-    const text = asString(segment.text);
     const filePath = asString(segment.path);
+    if (!filePath) {
+      return void 0;
+    }
+    if (mode === "agent") {
+      return `<file: path="${filePath}">`;
+    }
+    const text = asString(segment.text);
     if (text && filePath) {
       return formatFileContents([{ content: text.trim(), isFile: true, displayPath: filePath }]);
     }
@@ -315,8 +321,67 @@ function logWarning(message) {
 // src/evaluation/loaders/evaluator-parser.ts
 import path3 from "node:path";
+// src/evaluation/validation/prompt-validator.ts
+import { readFile as readFile2 } from "node:fs/promises";
+// src/evaluation/template-variables.ts
+var TEMPLATE_VARIABLES = {
+  CANDIDATE_ANSWER: "candidate_answer",
+  EXPECTED_MESSAGES: "expected_messages",
+  QUESTION: "question",
+  EXPECTED_OUTCOME: "expected_outcome",
+  REFERENCE_ANSWER: "reference_answer",
+  INPUT_MESSAGES: "input_messages"
+};
+var VALID_TEMPLATE_VARIABLES = new Set(
+  Object.values(TEMPLATE_VARIABLES)
+);
+var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
+  TEMPLATE_VARIABLES.CANDIDATE_ANSWER,
+  TEMPLATE_VARIABLES.EXPECTED_MESSAGES
+]);
+// src/evaluation/validation/prompt-validator.ts
 var ANSI_YELLOW2 = "\x1B[33m";
 var ANSI_RESET2 = "\x1B[0m";
+async function validateCustomPromptContent(promptPath) {
+  const content = await readFile2(promptPath, "utf8");
+  validateTemplateVariables(content, promptPath);
+}
+function validateTemplateVariables(content, source) {
+  const variablePattern = /\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g;
+  const foundVariables = /* @__PURE__ */ new Set();
+  const invalidVariables = [];
+  let match;
+  while ((match = variablePattern.exec(content)) !== null) {
+    const varName = match[1];
+    foundVariables.add(varName);
+    if (!VALID_TEMPLATE_VARIABLES.has(varName)) {
+      invalidVariables.push(varName);
+    }
+  }
+  const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.CANDIDATE_ANSWER);
+  const hasExpectedMessages = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_MESSAGES);
+  const hasRequiredFields = hasCandidateAnswer || hasExpectedMessages;
+  if (!hasRequiredFields) {
+    throw new Error(
+      `Missing required fields. Must include at least one of:
+  - {{ ${TEMPLATE_VARIABLES.CANDIDATE_ANSWER} }}
+  - {{ ${TEMPLATE_VARIABLES.EXPECTED_MESSAGES} }}`
+    );
+  }
+  if (invalidVariables.length > 0) {
+    const warningMessage = `${ANSI_YELLOW2}Warning: Custom evaluator template at ${source}
+  Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
+  Valid variables: ${Array.from(VALID_TEMPLATE_VARIABLES).map((v) => `{{ ${v} }}`).join(", ")}${ANSI_RESET2}`;
+    console.warn(warningMessage);
+  }
+}
+// src/evaluation/loaders/evaluator-parser.ts
+var ANSI_YELLOW3 = "\x1B[33m";
+var ANSI_RESET3 = "\x1B[0m";
 async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
   const execution = rawEvalCase.execution;
   const candidateEvaluators = isJsonObject2(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
@@ -375,6 +440,12 @@ async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId
       const resolved = await resolveFileReference2(prompt, searchRoots);
       if (resolved.resolvedPath) {
         promptPath = path3.resolve(resolved.resolvedPath);
+        try {
+          await validateCustomPromptContent(promptPath);
+        } catch (error) {
+          const message = error instanceof Error ? error.message : String(error);
+          throw new Error(`Evaluator '${name}' template (${promptPath}): ${message}`);
+        }
       } else {
         logWarning2(
           `Inline prompt used for evaluator '${name}' in '${evalId}' (file not found: ${resolved.displayPath})`,
@@ -411,18 +482,18 @@ function isJsonObject2(value) {
 function logWarning2(message, details) {
   if (details && details.length > 0) {
     const detailBlock = details.join("\n");
-    console.warn(`${ANSI_YELLOW2}Warning: ${message}
-${detailBlock}${ANSI_RESET2}`);
+    console.warn(`${ANSI_YELLOW3}Warning: ${message}
+${detailBlock}${ANSI_RESET3}`);
   } else {
-    console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
+    console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
   }
 }
 // src/evaluation/loaders/message-processor.ts
-import { readFile as readFile2 } from "node:fs/promises";
+import { readFile as readFile3 } from "node:fs/promises";
 import path4 from "node:path";
-var ANSI_YELLOW3 = "\x1B[33m";
-var ANSI_RESET3 = "\x1B[0m";
+var ANSI_YELLOW4 = "\x1B[33m";
+var ANSI_RESET4 = "\x1B[0m";
 async function processMessages(options) {
   const {
     messages,
@@ -465,7 +536,7 @@ async function processMessages(options) {
           continue;
         }
         try {
-          const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
+          const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n");
           if (messageType === "input" && guidelinePatterns && guidelinePaths) {
             const relativeToRepo = path4.relative(repoRootPath, resolvedPath);
             if (isGuidelineFile(relativeToRepo, guidelinePatterns)) {
@@ -536,7 +607,7 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
         continue;
       }
       try {
-        const fileContent = (await readFile2(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
+        const fileContent = (await readFile3(resolvedPath, "utf8")).replace(/\r\n/g, "\n").trim();
         parts.push({ content: fileContent, isFile: true, displayPath });
         if (verbose) {
           console.log(`  [Expected Assistant File] Found: ${displayPath}`);
@@ -586,19 +657,19 @@ function cloneJsonValue(value) {
 function logWarning3(message, details) {
   if (details && details.length > 0) {
     const detailBlock = details.join("\n");
-    console.warn(`${ANSI_YELLOW3}Warning: ${message}
-${detailBlock}${ANSI_RESET3}`);
+    console.warn(`${ANSI_YELLOW4}Warning: ${message}
+${detailBlock}${ANSI_RESET4}`);
   } else {
-    console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET3}`);
+    console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
   }
 }
 // src/evaluation/formatting/prompt-builder.ts
-import { readFile as readFile3 } from "node:fs/promises";
+import { readFile as readFile4 } from "node:fs/promises";
 import path5 from "node:path";
-var ANSI_YELLOW4 = "\x1B[33m";
-var ANSI_RESET4 = "\x1B[0m";
-async function buildPromptInputs(testCase) {
+var ANSI_YELLOW5 = "\x1B[33m";
+var ANSI_RESET5 = "\x1B[0m";
+async function buildPromptInputs(testCase, mode = "lm") {
   const guidelineParts = [];
   for (const rawPath of testCase.guideline_paths) {
     const absolutePath = path5.resolve(rawPath);
@@ -607,7 +678,7 @@ async function buildPromptInputs(testCase) {
       continue;
     }
     try {
-      const content = (await readFile3(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
+      const content = (await readFile4(absolutePath, "utf8")).replace(/\r\n/g, "\n").trim();
       guidelineParts.push({
         content,
         isFile: true,
@@ -674,7 +745,7 @@ async function buildPromptInputs(testCase) {
       const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
       const contentParts = [];
       for (const segment of segments) {
-        const formattedContent = formatSegment(segment);
+        const formattedContent = formatSegment(segment, mode);
         if (formattedContent) {
           contentParts.push(formattedContent);
         }
@@ -689,7 +760,11 @@ ${messageContent}`);
   } else {
     const questionParts = [];
     for (const segment of testCase.input_segments) {
-      const formattedContent = formatSegment(segment);
+      if (segment.type === "file" && typeof segment.path === "string" && testCase.guideline_patterns && isGuidelineFile(segment.path, testCase.guideline_patterns)) {
+        questionParts.push(`<Attached: ${segment.path}>`);
+        continue;
+      }
+      const formattedContent = formatSegment(segment, mode);
       if (formattedContent) {
         questionParts.push(formattedContent);
       }
@@ -703,7 +778,8 @@ ${messageContent}`);
     messages: testCase.input_messages,
     segmentsByMessage,
     guidelinePatterns: testCase.guideline_patterns,
-    guidelineContent: guidelines
+    guidelineContent: guidelines,
+    mode
   }) : void 0;
   return { question, guidelines, chatPrompt };
 }
@@ -720,7 +796,7 @@ function needsRoleMarkers(messages, processedSegmentsByMessage) {
   return messagesWithContent > 1;
 }
 function buildChatPromptFromSegments(options) {
-  const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
+  const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt, mode = "lm" } = options;
   if (messages.length === 0) {
     return void 0;
   }
@@ -738,7 +814,7 @@ ${guidelineContent.trim()}`);
     const segments = segmentsByMessage[startIndex];
     const contentParts = [];
     for (const segment of segments) {
-      const formatted = formatSegment(segment);
+      const formatted = formatSegment(segment, mode);
       if (formatted) {
         contentParts.push(formatted);
       }
@@ -771,7 +847,7 @@ ${guidelineContent.trim()}`);
       if (segment.type === "guideline_ref") {
         continue;
       }
-      const formatted = formatSegment(segment);
+      const formatted = formatSegment(segment, mode);
       if (formatted) {
         const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
         if (isGuidelineRef) {
@@ -795,17 +871,18 @@ function asString4(value) {
   return typeof value === "string" ? value : void 0;
 }
 function logWarning4(message) {
-  console.warn(`${ANSI_YELLOW4}Warning: ${message}${ANSI_RESET4}`);
+  console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
 }
 // src/evaluation/yaml-parser.ts
-var ANSI_YELLOW5 = "\x1B[33m";
-var ANSI_RESET5 = "\x1B[0m";
+var ANSI_YELLOW6 = "\x1B[33m";
+var ANSI_RED = "\x1B[31m";
+var ANSI_RESET6 = "\x1B[0m";
 var SCHEMA_EVAL_V2 = "agentv-eval-v2";
 async function readTestSuiteMetadata(testFilePath) {
   try {
     const absolutePath = path6.resolve(testFilePath);
-    const content = await readFile4(absolutePath, "utf8");
+    const content = await readFile5(absolutePath, "utf8");
     const parsed = parse2(content);
     if (!isJsonObject(parsed)) {
       return {};
@@ -823,7 +900,7 @@ async function loadEvalCases(evalFilePath, repoRoot, options) {
   const searchRoots = buildSearchRoots2(absoluteTestPath, repoRootPath);
   const config = await loadConfig(absoluteTestPath, repoRootPath);
   const guidelinePatterns = config?.guideline_patterns;
-  const rawFile = await readFile4(absoluteTestPath, "utf8");
+  const rawFile = await readFile5(absoluteTestPath, "utf8");
   const parsed = parse2(rawFile);
   if (!isJsonObject(parsed)) {
     throw new Error(`Invalid test file format: ${evalFilePath}`);
@@ -861,14 +938,14 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
     const inputMessagesValue = evalcase.input_messages;
     const expectedMessagesValue = evalcase.expected_messages;
     if (!id || !outcome || !Array.isArray(inputMessagesValue)) {
-      logWarning5(`Skipping incomplete eval case: ${id ?? "unknown"}`);
+      logError(`Skipping incomplete eval case: ${id ?? "unknown"}. Missing required fields: id, outcome, and/or input_messages`);
       continue;
     }
     const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
     const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
     const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
     if (hasExpectedMessages && expectedMessages.length === 0) {
-      logWarning5(`No valid expected message found for eval case: ${id}`);
+      logError(`No valid expected message found for eval case: ${id}`);
       continue;
     }
     if (expectedMessages.length > 1) {
@@ -899,7 +976,14 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
     const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
     const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
     const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
-    const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
+    let evaluators;
+    try {
+      evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
+    } catch (error) {
+      const message = error instanceof Error ? error.message : String(error);
+      logError(`Skipping eval case '${id}': ${message}`);
+      continue;
+    }
     const userFilePaths = [];
     for (const segment of inputSegments) {
       if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -917,7 +1001,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
       question,
       input_messages: inputMessages,
       input_segments: inputSegments,
-      output_segments: outputSegments,
+      expected_segments: outputSegments,
       reference_answer: referenceAnswer,
       guideline_paths: guidelinePaths.map((guidelinePath) => path6.resolve(guidelinePath)),
       guideline_patterns: guidelinePatterns,
@@ -949,10 +1033,19 @@ function asString5(value) {
 function logWarning5(message, details) {
   if (details && details.length > 0) {
     const detailBlock = details.join("\n");
-    console.warn(`${ANSI_YELLOW5}Warning: ${message}
-${detailBlock}${ANSI_RESET5}`);
+    console.warn(`${ANSI_YELLOW6}Warning: ${message}
+${detailBlock}${ANSI_RESET6}`);
   } else {
-    console.warn(`${ANSI_YELLOW5}Warning: ${message}${ANSI_RESET5}`);
+    console.warn(`${ANSI_YELLOW6}Warning: ${message}${ANSI_RESET6}`);
+  }
+}
+function logError(message, details) {
+  if (details && details.length > 0) {
+    const detailBlock = details.join("\n");
+    console.error(`${ANSI_RED}Error: ${message}
+${detailBlock}${ANSI_RESET6}`);
+  } else {
+    console.error(`${ANSI_RED}Error: ${message}${ANSI_RESET6}`);
   }
 }
@@ -2637,7 +2730,7 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
 // src/evaluation/providers/targets-file.ts
 import { constants as constants3 } from "node:fs";
-import { access as access3, readFile as readFile5 } from "node:fs/promises";
+import { access as access3, readFile as readFile6 } from "node:fs/promises";
 import path11 from "node:path";
 import { parse as parse3 } from "yaml";
 function isRecord(value) {
@@ -2698,7 +2791,7 @@ async function readTargetDefinitions(filePath) {
   if (!await fileExists3(absolutePath)) {
     throw new Error(`targets.yaml not found at ${absolutePath}`);
   }
-  const raw = await readFile5(absolutePath, "utf8");
+  const raw = await readFile6(absolutePath, "utf8");
   const parsed = parse3(raw);
   if (!isRecord(parsed)) {
     throw new Error(`targets.yaml at ${absolutePath} must be a YAML object with '$schema' and 'targets' fields`);
@@ -2749,16 +2842,16 @@ Use the reference_answer as a gold standard for a high-quality response (if prov
 Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.
 [[ ## expected_outcome ## ]]
-{{expected_outcome}}
+{{${TEMPLATE_VARIABLES.EXPECTED_OUTCOME}}}
 [[ ## question ## ]]
-{{question}}
+{{${TEMPLATE_VARIABLES.QUESTION}}}
 [[ ## reference_answer ## ]]
-{{reference_answer}}
+{{${TEMPLATE_VARIABLES.REFERENCE_ANSWER}}}
 [[ ## candidate_answer ## ]]
-{{candidate_answer}}`;
+{{${TEMPLATE_VARIABLES.CANDIDATE_ANSWER}}}`;
 var LlmJudgeEvaluator = class {
   kind = "llm_judge";
   resolveJudgeProvider;
@@ -2781,12 +2874,12 @@ var LlmJudgeEvaluator = class {
   async evaluateWithPrompt(context, judgeProvider) {
     const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
     const variables = {
-      input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
-      output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
-      candidate_answer: context.candidate.trim(),
-      reference_answer: (context.evalCase.reference_answer ?? "").trim(),
-      expected_outcome: context.evalCase.expected_outcome.trim(),
-      question: formattedQuestion.trim()
+      [TEMPLATE_VARIABLES.INPUT_MESSAGES]: JSON.stringify(context.evalCase.input_segments, null, 2),
+      [TEMPLATE_VARIABLES.EXPECTED_MESSAGES]: JSON.stringify(context.evalCase.expected_segments, null, 2),
+      [TEMPLATE_VARIABLES.CANDIDATE_ANSWER]: context.candidate.trim(),
+      [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? "").trim(),
+      [TEMPLATE_VARIABLES.EXPECTED_OUTCOME]: context.evalCase.expected_outcome.trim(),
+      [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim()
     };
     const systemPrompt = buildOutputSchema();
     const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
@@ -3018,7 +3111,7 @@ function parseJsonSafe(payload) {
   }
 }
 function substituteVariables(template, variables) {
-  return template.replace(/\{\{([a-zA-Z0-9_]+)\}\}/g, (match, varName) => {
+  return template.replace(/\{\{\s*([a-zA-Z0-9_]+)\s*\}\}/g, (match, varName) => {
     return variables[varName] ?? match;
   });
 }
@@ -3182,11 +3275,11 @@ async function runEvaluation(options) {
     now,
     evalId,
     verbose,
+    evalCases: preloadedEvalCases,
     onResult,
     onProgress
   } = options;
-  const load = loadEvalCases;
-  const evalCases = await load(evalFilePath, repoRoot, { verbose, evalId });
+  const evalCases = preloadedEvalCases ?? await loadEvalCases(evalFilePath, repoRoot, { verbose, evalId });
   const filteredEvalCases = filterEvalCases(evalCases, evalId);
   if (filteredEvalCases.length === 0) {
     if (evalId) {
@@ -3370,8 +3463,9 @@ async function runBatchEvaluation(options) {
     agentTimeoutMs
   } = options;
   const promptInputsList = [];
+  const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
   for (const evalCase of evalCases) {
-    const promptInputs = await buildPromptInputs(evalCase);
+    const promptInputs = await buildPromptInputs(evalCase, formattingMode);
     if (promptDumpDir) {
       await dumpPrompt(promptDumpDir, evalCase, promptInputs);
     }
@@ -3477,7 +3571,8 @@ async function runEvalCase(options) {
     signal,
     judgeProvider
   } = options;
-  const promptInputs = await buildPromptInputs(evalCase);
+  const formattingMode = isAgentProvider(provider) ? "agent" : "lm";
+  const promptInputs = await buildPromptInputs(evalCase, formattingMode);
   if (promptDumpDir) {
     await dumpPrompt(promptDumpDir, evalCase, promptInputs);
   }
@@ -3766,7 +3861,8 @@ async function runLlmJudgeEvaluator(options) {
 async function resolveCustomPrompt(config) {
   if (config.promptPath) {
     try {
-      return await readTextFile(config.promptPath);
+      const content = await readTextFile(config.promptPath);
+      return content;
     } catch (error) {
       const message = error instanceof Error ? error.message : String(error);
       console.warn(`Could not read custom prompt at ${config.promptPath}: ${message}`);