npm - @agentv/core - Versions diffs - 0.9.0 → 0.10.1 - Mend

@agentv/core 0.9.0 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13) hide show

package/dist/{chunk-SNTZFB24.js → chunk-YQBJAT5I.js} +1 -1
package/dist/{chunk-SNTZFB24.js.map → chunk-YQBJAT5I.js.map} +1 -1
package/dist/evaluation/validation/index.cjs +30 -13
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +21 -4
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +375 -104
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +74 -64
package/dist/index.d.ts +74 -64
package/dist/index.js +375 -105
package/dist/index.js.map +1 -1
package/package.json +1 -1

package/dist/index.cjs CHANGED Viewed

@@ -54,6 +54,7 @@ __export(index_exports, {
   loadEvalCases: () => loadEvalCases,
   normalizeLineEndings: () => normalizeLineEndings,
   readTargetDefinitions: () => readTargetDefinitions,
+  readTestSuiteMetadata: () => readTestSuiteMetadata,
   readTextFile: () => readTextFile,
   resolveAndCreateProvider: () => resolveAndCreateProvider,
   resolveFileReference: () => resolveFileReference,
@@ -239,6 +240,33 @@ var ANSI_YELLOW = "\x1B[33m";
 var ANSI_RESET = "\x1B[0m";
 var SCHEMA_EVAL_V2 = "agentv-eval-v2";
 var SCHEMA_CONFIG_V2 = "agentv-config-v2";
+async function readTestSuiteMetadata(testFilePath) {
+  try {
+    const absolutePath = import_node_path2.default.resolve(testFilePath);
+    const content = await (0, import_promises2.readFile)(absolutePath, "utf8");
+    const parsed = (0, import_yaml.parse)(content);
+    if (!isJsonObject(parsed)) {
+      return {};
+    }
+    return { target: extractTargetFromSuite(parsed) };
+  } catch {
+    return {};
+  }
+}
+function extractTargetFromSuite(suite) {
+  const execution = suite.execution;
+  if (execution && typeof execution === "object" && !Array.isArray(execution)) {
+    const executionTarget = execution.target;
+    if (typeof executionTarget === "string" && executionTarget.trim().length > 0) {
+      return executionTarget.trim();
+    }
+  }
+  const targetValue = suite.target;
+  if (typeof targetValue === "string" && targetValue.trim().length > 0) {
+    return targetValue.trim();
+  }
+  return void 0;
+}
 async function loadConfig(evalFilePath, repoRoot) {
   const directories = buildDirectoryChain(evalFilePath, repoRoot);
   for (const directory of directories) {
@@ -415,6 +443,8 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
     throw new Error(`Invalid test file format: ${evalFilePath} - missing 'evalcases' field`);
   }
   const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm_judge";
+  const globalExecution = isJsonObject(suite.execution) ? suite.execution : void 0;
+  const globalTarget = asString(globalExecution?.target) ?? asString(suite.target);
   const results = [];
   for (const rawEvalcase of rawTestcases) {
     if (!isJsonObject(rawEvalcase)) {
@@ -434,14 +464,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
       logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
       continue;
     }
-    if (!Array.isArray(expectedMessagesValue)) {
-      logWarning(`Eval case '${id}' missing expected_messages array`);
-      continue;
-    }
+    const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
     const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
-    const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
-    if (expectedMessages.length === 0) {
-      logWarning(`No expected message found for eval case: ${id}`);
+    const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
+    if (hasExpectedMessages && expectedMessages.length === 0) {
+      logWarning(`No valid expected message found for eval case: ${id}`);
       continue;
     }
     if (expectedMessages.length > 1) {
@@ -459,20 +486,20 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
       messageType: "input",
       verbose
     });
-    const outputSegments = await processMessages({
+    const outputSegments = hasExpectedMessages ? await processMessages({
       messages: expectedMessages,
       searchRoots,
       repoRootPath,
       guidelinePatterns,
       messageType: "output",
       verbose
-    });
+    }) : [];
     const codeSnippets = extractCodeBlocks(inputSegments);
     const expectedContent = expectedMessages[0]?.content;
-    const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
+    const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
     const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
     const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
-    const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
+    const evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
     const userFilePaths = [];
     for (const segment of inputSegments) {
       if (segment.type === "file" && typeof segment.resolvedPath === "string") {
@@ -488,6 +515,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
       dataset: datasetName,
       conversation_id: conversationId,
       question,
+      input_messages: inputMessages,
       input_segments: inputSegments,
       output_segments: outputSegments,
       reference_answer: referenceAnswer,
@@ -515,6 +543,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
   }
   return results;
 }
+function needsRoleMarkers(messages, processedSegmentsByMessage) {
+  if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
+    return true;
+  }
+  let messagesWithContent = 0;
+  for (const segments of processedSegmentsByMessage) {
+    if (hasVisibleContent(segments)) {
+      messagesWithContent++;
+    }
+  }
+  return messagesWithContent > 1;
+}
+function hasVisibleContent(segments) {
+  return segments.some((segment) => {
+    const type = asString(segment.type);
+    if (type === "text") {
+      const value = asString(segment.value);
+      return value !== void 0 && value.trim().length > 0;
+    }
+    if (type === "guideline_ref") {
+      return false;
+    }
+    if (type === "file") {
+      const text = asString(segment.text);
+      return text !== void 0 && text.trim().length > 0;
+    }
+    return false;
+  });
+}
+function formatSegment(segment) {
+  const type = asString(segment.type);
+  if (type === "text") {
+    return asString(segment.value);
+  }
+  if (type === "guideline_ref") {
+    const refPath = asString(segment.path);
+    return refPath ? `<Attached: ${refPath}>` : void 0;
+  }
+  if (type === "file") {
+    const text = asString(segment.text);
+    const filePath = asString(segment.path);
+    if (text && filePath) {
+      return `=== ${filePath} ===
+${text}`;
+    }
+  }
+  return void 0;
+}
 async function buildPromptInputs(testCase) {
   const guidelineContents = [];
   for (const rawPath of testCase.guideline_paths) {
@@ -531,36 +607,168 @@ ${content}`);
       logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
     }
   }
-  const questionParts = [];
+  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
+  const segmentsByMessage = [];
+  const fileContentsByPath = /* @__PURE__ */ new Map();
   for (const segment of testCase.input_segments) {
-    const typeValue = segment.type;
-    if (typeof typeValue === "string" && typeValue === "file") {
-      const pathValue = segment.path;
-      const textValue = segment.text;
-      const label = typeof pathValue === "string" ? pathValue : "file";
-      const body = typeof textValue === "string" ? textValue : "";
-      questionParts.push(`=== ${label} ===
-${body}`);
-      continue;
+    if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
+      fileContentsByPath.set(segment.path, segment.text);
     }
-    if (typeof typeValue === "string" && typeValue === "text") {
-      const value = segment.value;
-      if (typeof value === "string") {
-        questionParts.push(value);
+  }
+  for (const message of testCase.input_messages) {
+    const messageSegments = [];
+    if (typeof message.content === "string") {
+      if (message.content.trim().length > 0) {
+        messageSegments.push({ type: "text", value: message.content });
+      }
+    } else if (Array.isArray(message.content)) {
+      for (const segment of message.content) {
+        if (typeof segment === "string") {
+          if (segment.trim().length > 0) {
+            messageSegments.push({ type: "text", value: segment });
+          }
+        } else if (isJsonObject(segment)) {
+          const type = asString(segment.type);
+          if (type === "file") {
+            const value = asString(segment.value);
+            if (!value) continue;
+            if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
+              messageSegments.push({ type: "guideline_ref", path: value });
+              continue;
+            }
+            const fileText = fileContentsByPath.get(value);
+            if (fileText !== void 0) {
+              messageSegments.push({ type: "file", text: fileText, path: value });
+            }
+          } else if (type === "text") {
+            const textValue = asString(segment.value);
+            if (textValue && textValue.trim().length > 0) {
+              messageSegments.push({ type: "text", value: textValue });
+            }
+          }
+        }
+      }
+    }
+    segmentsByMessage.push(messageSegments);
+  }
+  const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
+  let question;
+  if (useRoleMarkers) {
+    const messageParts = [];
+    for (let i = 0; i < testCase.input_messages.length; i++) {
+      const message = testCase.input_messages[i];
+      const segments = segmentsByMessage[i];
+      if (!hasVisibleContent(segments)) {
+        continue;
+      }
+      const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
+      const contentParts = [];
+      for (const segment of segments) {
+        const formattedContent = formatSegment(segment);
+        if (formattedContent) {
+          contentParts.push(formattedContent);
+        }
+      }
+      if (contentParts.length > 0) {
+        const messageContent = contentParts.join("\n");
+        messageParts.push(`@[${roleLabel}]:
+${messageContent}`);
+      }
+    }
+    question = messageParts.join("\n\n");
+  } else {
+    const questionParts = [];
+    for (const segment of testCase.input_segments) {
+      const formattedContent = formatSegment(segment);
+      if (formattedContent) {
+        questionParts.push(formattedContent);
+      }
+    }
+    if (testCase.code_snippets.length > 0) {
+      questionParts.push(testCase.code_snippets.join("\n"));
+    }
+    question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
+  }
+  const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
+    messages: testCase.input_messages,
+    segmentsByMessage,
+    guidelinePatterns: testCase.guideline_patterns,
+    guidelineContent: guidelines
+  }) : void 0;
+  return { question, guidelines, chatPrompt };
+}
+function buildChatPromptFromSegments(options) {
+  const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
+  if (messages.length === 0) {
+    return void 0;
+  }
+  const systemSegments = [];
+  if (systemPrompt && systemPrompt.trim().length > 0) {
+    systemSegments.push(systemPrompt.trim());
+  }
+  if (guidelineContent && guidelineContent.trim().length > 0) {
+    systemSegments.push(`[[ ## Guidelines ## ]]
+${guidelineContent.trim()}`);
+  }
+  let startIndex = 0;
+  while (startIndex < messages.length && messages[startIndex].role === "system") {
+    const segments = segmentsByMessage[startIndex];
+    const contentParts = [];
+    for (const segment of segments) {
+      const formatted = formatSegment(segment);
+      if (formatted) {
+        contentParts.push(formatted);
       }
-      continue;
     }
-    const genericValue = segment.value;
-    if (typeof genericValue === "string") {
-      questionParts.push(genericValue);
+    if (contentParts.length > 0) {
+      systemSegments.push(contentParts.join("\n"));
     }
+    startIndex += 1;
   }
-  if (testCase.code_snippets.length > 0) {
-    questionParts.push(testCase.code_snippets.join("\n"));
+  const chatPrompt = [];
+  if (systemSegments.length > 0) {
+    chatPrompt.push({
+      role: "system",
+      content: systemSegments.join("\n\n")
+    });
   }
-  const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
-  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
-  return { question, guidelines };
+  for (let i = startIndex; i < messages.length; i++) {
+    const message = messages[i];
+    const segments = segmentsByMessage[i];
+    const contentParts = [];
+    let role = message.role;
+    let name;
+    if (role === "system") {
+      role = "assistant";
+      contentParts.push("@[System]:");
+    } else if (role === "tool") {
+      role = "function";
+      name = "tool";
+    }
+    for (const segment of segments) {
+      if (segment.type === "guideline_ref") {
+        continue;
+      }
+      const formatted = formatSegment(segment);
+      if (formatted) {
+        const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
+        if (isGuidelineRef) {
+          continue;
+        }
+        contentParts.push(formatted);
+      }
+    }
+    if (contentParts.length === 0) {
+      continue;
+    }
+    chatPrompt.push({
+      role,
+      content: contentParts.join("\n"),
+      ...name ? { name } : {}
+    });
+  }
+  return chatPrompt.length > 0 ? chatPrompt : void 0;
 }
 async function fileExists2(absolutePath) {
   try {
@@ -658,9 +866,9 @@ async function resolveAssistantContent(content, searchRoots, verbose) {
   }
   return parts.join(" ");
 }
-async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
+async function parseEvaluators(rawEvalCase, globalExecution, searchRoots, evalId) {
   const execution = rawEvalCase.execution;
-  const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators;
+  const candidateEvaluators = isJsonObject(execution) ? execution.evaluators ?? rawEvalCase.evaluators : rawEvalCase.evaluators ?? globalExecution?.evaluators;
   if (candidateEvaluators === void 0) {
     return void 0;
   }
@@ -698,6 +906,8 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
             resolved.attempted.length > 0 ? resolved.attempted.map((attempt) => `  Tried: ${attempt}`) : void 0
           );
         }
+      } else {
+        resolvedCwd = searchRoots[0];
       }
       evaluators.push({
         name,
@@ -726,8 +936,7 @@ async function parseEvaluators(rawEvalCase, searchRoots, evalId) {
       name,
       type: "llm_judge",
       prompt,
-      promptPath,
-      model
+      promptPath
     });
   }
   return evaluators.length > 0 ? evaluators : void 0;
@@ -757,21 +966,14 @@ var import_ax = require("@ax-llm/ax");
 var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
 function buildChatPrompt(request) {
   if (request.chatPrompt) {
-    return request.chatPrompt;
-  }
-  const systemSegments = [];
-  const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
-  if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
-    systemSegments.push(metadataSystemPrompt.trim());
-  } else {
-    systemSegments.push(DEFAULT_SYSTEM_PROMPT);
-  }
-  if (request.guidelines && request.guidelines.trim().length > 0) {
-    systemSegments.push(`[[ ## Guidelines ## ]]
-${request.guidelines.trim()}`);
+    const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
+    if (hasSystemMessage) {
+      return request.chatPrompt;
+    }
+    const systemContent2 = resolveSystemContent(request);
+    return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
   }
-  const systemContent = systemSegments.join("\n\n");
+  const systemContent = resolveSystemContent(request);
   const userContent = request.question.trim();
   const prompt = [
     {
@@ -785,6 +987,21 @@ ${request.guidelines.trim()}`);
   ];
   return prompt;
 }
+function resolveSystemContent(request) {
+  const systemSegments = [];
+  const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
+  if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
+    systemSegments.push(metadataSystemPrompt.trim());
+  } else {
+    systemSegments.push(DEFAULT_SYSTEM_PROMPT);
+  }
+  if (request.guidelines && request.guidelines.trim().length > 0) {
+    systemSegments.push(`[[ ## Guidelines ## ]]
+${request.guidelines.trim()}`);
+  }
+  return systemSegments.join("\n\n");
+}
 function extractModelConfig(request, defaults) {
   const temperature = request.temperature ?? defaults.temperature;
   const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
@@ -3020,24 +3237,23 @@ var LlmJudgeEvaluator = class {
     return this.evaluateWithPrompt(context, judgeProvider);
   }
   async evaluateWithPrompt(context, judgeProvider) {
-    let prompt = buildQualityPrompt(context.evalCase, context.candidate);
-    let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
+    const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
+    const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
+    let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
+    let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
     if (systemPrompt && hasTemplateVariables(systemPrompt)) {
       const variables = {
         input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
         output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
         candidate_answer: context.candidate,
-        reference_answer: context.evalCase.reference_answer,
+        reference_answer: context.evalCase.reference_answer ?? "",
         expected_outcome: context.evalCase.expected_outcome,
-        question: context.evalCase.question
+        question: formattedQuestion
       };
       prompt = substituteVariables(systemPrompt, variables);
-      systemPrompt = QUALITY_SYSTEM_PROMPT;
+      systemPrompt = buildSystemPrompt(hasReferenceAnswer);
     }
-    const metadata = {
-      ...systemPrompt !== void 0 ? { systemPrompt } : {},
-      ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
-    };
+    const metadata = systemPrompt !== void 0 ? { systemPrompt } : {};
     const response = await judgeProvider.invoke({
       question: prompt,
       metadata,
@@ -3057,8 +3273,7 @@ var LlmJudgeEvaluator = class {
       provider: judgeProvider.id,
       prompt,
       target: context.target.name,
-      ...systemPrompt !== void 0 ? { systemPrompt } : {},
-      ...context.judgeModel !== void 0 ? { model: context.judgeModel } : {}
+      ...systemPrompt !== void 0 && { systemPrompt }
     };
     return {
       score,
@@ -3070,38 +3285,51 @@ var LlmJudgeEvaluator = class {
     };
   }
 };
-var QUALITY_SYSTEM_PROMPT = [
-  "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
-  "",
-  "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
-  "",
-  "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
-  "",
-  "You must respond with a single JSON object matching this schema:",
-  "",
-  "{",
-  '  "score": <number between 0.0 and 1.0>,',
-  '  "hits": [<array of strings, max 4 items, brief specific achievements>],',
-  '  "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
-  '  "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
-  "}"
-].join("\n");
-function buildQualityPrompt(evalCase, candidate) {
+function buildSystemPrompt(hasReferenceAnswer) {
+  const basePrompt = [
+    "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
+    ""
+  ];
+  if (hasReferenceAnswer) {
+    basePrompt.push(
+      "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
+      ""
+    );
+  }
+  basePrompt.push(
+    "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
+    "",
+    "You must respond with a single JSON object matching this schema:",
+    "",
+    "{",
+    '  "score": <number between 0.0 and 1.0>,',
+    '  "hits": [<array of strings, max 4 items, brief specific achievements>],',
+    '  "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
+    '  "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
+    "}"
+  );
+  return basePrompt.join("\n");
+}
+function buildQualityPrompt(evalCase, candidate, question) {
   const parts = [
     "[[ ## expected_outcome ## ]]",
     evalCase.expected_outcome.trim(),
     "",
     "[[ ## question ## ]]",
-    evalCase.question.trim(),
-    "",
-    "[[ ## reference_answer ## ]]",
-    evalCase.reference_answer.trim(),
-    "",
-    "[[ ## candidate_answer ## ]]",
-    candidate.trim(),
-    "",
-    "Respond with a single JSON object matching the schema described in the system prompt."
+    question.trim(),
+    ""
   ];
+  if (hasNonEmptyReferenceAnswer(evalCase)) {
+    parts.push(
+      "[[ ## reference_answer ## ]]",
+      evalCase.reference_answer.trim(),
+      ""
+    );
+  }
+  parts.push(
+    "[[ ## candidate_answer ## ]]",
+    candidate.trim()
+  );
   return parts.join("\n");
 }
 function clampScore(value) {
@@ -3184,6 +3412,9 @@ function extractJsonBlob(text) {
 function isNonEmptyString(value) {
   return typeof value === "string" && value.trim().length > 0;
 }
+function hasNonEmptyReferenceAnswer(evalCase) {
+  return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
+}
 var CodeEvaluator = class {
   kind = "code";
   script;
@@ -3842,11 +4073,27 @@ async function evaluateCandidate(options) {
     agentTimeoutMs
   });
   const completedAt = nowFn();
-  const rawRequest = {
-    question: promptInputs.question,
-    ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
-    guideline_paths: evalCase.guideline_paths
-  };
+  let agentProviderRequest;
+  let lmProviderRequest;
+  if (isAgentProvider(provider)) {
+    agentProviderRequest = {
+      question: promptInputs.question,
+      guideline_paths: evalCase.guideline_paths
+    };
+  } else {
+    if (promptInputs.chatPrompt) {
+      lmProviderRequest = {
+        chat_prompt: promptInputs.chatPrompt,
+        guideline_paths: evalCase.guideline_paths
+      };
+    } else {
+      lmProviderRequest = {
+        question: promptInputs.question,
+        guidelines: promptInputs.guidelines,
+        guideline_paths: evalCase.guideline_paths
+      };
+    }
+  }
   return {
     eval_id: evalCase.id,
     dataset: evalCase.dataset,
@@ -3860,7 +4107,8 @@ async function evaluateCandidate(options) {
     timestamp: completedAt.toISOString(),
     reasoning: score.reasoning,
     raw_aspects: score.rawAspects,
-    raw_request: rawRequest,
+    agent_provider_request: agentProviderRequest,
+    lm_provider_request: lmProviderRequest,
     evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
     evaluator_results: evaluatorResults
   };
@@ -4019,8 +4267,7 @@ async function runLlmJudgeEvaluator(options) {
     now,
     judgeProvider,
     systemPrompt: customPrompt,
-    evaluator: config,
-    judgeModel: config.model
+    evaluator: config
   });
 }
 async function resolveCustomPrompt(config) {
@@ -4089,6 +4336,7 @@ async function invokeProvider(provider, options) {
       question: promptInputs.question,
       guidelines: promptInputs.guidelines,
       guideline_patterns: evalCase.guideline_patterns,
+      chatPrompt: promptInputs.chatPrompt,
       inputFiles: evalCase.file_paths,
       evalCaseId: evalCase.id,
       attempt,
@@ -4105,12 +4353,30 @@ async function invokeProvider(provider, options) {
 }
 function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
   const message = error instanceof Error ? error.message : String(error);
-  const rawRequest = {
-    question: promptInputs.question,
-    ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
-    guideline_paths: evalCase.guideline_paths,
-    error: message
-  };
+  let agentProviderRequest;
+  let lmProviderRequest;
+  if (isAgentProvider(provider)) {
+    agentProviderRequest = {
+      question: promptInputs.question,
+      guideline_paths: evalCase.guideline_paths,
+      error: message
+    };
+  } else {
+    if (promptInputs.chatPrompt) {
+      lmProviderRequest = {
+        chat_prompt: promptInputs.chatPrompt,
+        guideline_paths: evalCase.guideline_paths,
+        error: message
+      };
+    } else {
+      lmProviderRequest = {
+        question: promptInputs.question,
+        guidelines: promptInputs.guidelines,
+        guideline_paths: evalCase.guideline_paths,
+        error: message
+      };
+    }
+  }
   return {
     eval_id: evalCase.id,
     dataset: evalCase.dataset,
@@ -4123,7 +4389,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
     target: targetName,
     timestamp: timestamp.toISOString(),
     raw_aspects: [],
-    raw_request: rawRequest,
+    agent_provider_request: agentProviderRequest,
+    lm_provider_request: lmProviderRequest,
     error: message
   };
 }
@@ -4135,6 +4402,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
   hash.update(promptInputs.question);
   hash.update(promptInputs.guidelines);
   hash.update(promptInputs.systemMessage ?? "");
+  if (promptInputs.chatPrompt) {
+    hash.update(JSON.stringify(promptInputs.chatPrompt));
+  }
   return hash.digest("hex");
 }
 function isTimeoutLike(error) {
@@ -4183,6 +4453,7 @@ function createAgentKernel() {
   loadEvalCases,
   normalizeLineEndings,
   readTargetDefinitions,
+  readTestSuiteMetadata,
   readTextFile,
   resolveAndCreateProvider,
   resolveFileReference,