npm - @agentv/core - Versions diffs - 0.7.5 → 0.10.0 - Mend

@agentv/core 0.7.5 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (14)

package/dist/{chunk-7XM7HYRS.js → chunk-YQBJAT5I.js} +97 -67
package/dist/chunk-YQBJAT5I.js.map +1 -0
package/dist/evaluation/validation/index.cjs +61 -69
package/dist/evaluation/validation/index.cjs.map +1 -1
package/dist/evaluation/validation/index.js +51 -58
package/dist/evaluation/validation/index.js.map +1 -1
package/dist/index.cjs +538 -192
package/dist/index.cjs.map +1 -1
package/dist/index.d.cts +136 -58
package/dist/index.d.ts +136 -58
package/dist/index.js +443 -127
package/dist/index.js.map +1 -1
package/package.json +1 -2
package/dist/chunk-7XM7HYRS.js.map +0 -1

package/dist/index.js CHANGED

@@ -9,7 +9,7 @@ import {
   readTextFile,
   resolveFileReference,
   resolveTargetDefinition
-} from "./chunk-7XM7HYRS.js";
+} from "./chunk-YQBJAT5I.js";
 // src/evaluation/types.ts
 var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
@@ -268,14 +268,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
       logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
       continue;
     }
-    if (!Array.isArray(expectedMessagesValue)) {
-      logWarning(`Eval case '${id}' missing expected_messages array`);
-      continue;
-    }
+    const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
     const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
-    const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
-    if (expectedMessages.length === 0) {
-      logWarning(`No expected message found for eval case: ${id}`);
+    const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
+    if (hasExpectedMessages && expectedMessages.length === 0) {
+      logWarning(`No valid expected message found for eval case: ${id}`);
       continue;
     }
     if (expectedMessages.length > 1) {
@@ -293,17 +290,17 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
       messageType: "input",
       verbose
     });
-    const outputSegments = await processMessages({
+    const outputSegments = hasExpectedMessages ? await processMessages({
       messages: expectedMessages,
       searchRoots,
       repoRootPath,
       guidelinePatterns,
       messageType: "output",
       verbose
-    });
+    }) : [];
     const codeSnippets = extractCodeBlocks(inputSegments);
     const expectedContent = expectedMessages[0]?.content;
-    const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
+    const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
     const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
     const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
     const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
@@ -322,6 +319,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
       dataset: datasetName,
       conversation_id: conversationId,
       question,
+      input_messages: inputMessages,
       input_segments: inputSegments,
       output_segments: outputSegments,
       reference_answer: referenceAnswer,
@@ -349,6 +347,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
   }
   return results;
 }
+function needsRoleMarkers(messages, processedSegmentsByMessage) {
+  if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
+    return true;
+  }
+  let messagesWithContent = 0;
+  for (const segments of processedSegmentsByMessage) {
+    if (hasVisibleContent(segments)) {
+      messagesWithContent++;
+    }
+  }
+  return messagesWithContent > 1;
+}
+function hasVisibleContent(segments) {
+  return segments.some((segment) => {
+    const type = asString(segment.type);
+    if (type === "text") {
+      const value = asString(segment.value);
+      return value !== void 0 && value.trim().length > 0;
+    }
+    if (type === "guideline_ref") {
+      return false;
+    }
+    if (type === "file") {
+      const text = asString(segment.text);
+      return text !== void 0 && text.trim().length > 0;
+    }
+    return false;
+  });
+}
+function formatSegment(segment) {
+  const type = asString(segment.type);
+  if (type === "text") {
+    return asString(segment.value);
+  }
+  if (type === "guideline_ref") {
+    const refPath = asString(segment.path);
+    return refPath ? `<Attached: ${refPath}>` : void 0;
+  }
+  if (type === "file") {
+    const text = asString(segment.text);
+    const filePath = asString(segment.path);
+    if (text && filePath) {
+      return `=== ${filePath} ===
+${text}`;
+    }
+  }
+  return void 0;
+}
 async function buildPromptInputs(testCase) {
   const guidelineContents = [];
   for (const rawPath of testCase.guideline_paths) {
@@ -365,36 +411,168 @@ ${content}`);
       logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
     }
   }
-  const questionParts = [];
+  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
+  const segmentsByMessage = [];
+  const fileContentsByPath = /* @__PURE__ */ new Map();
   for (const segment of testCase.input_segments) {
-    const typeValue = segment.type;
-    if (typeof typeValue === "string" && typeValue === "file") {
-      const pathValue = segment.path;
-      const textValue = segment.text;
-      const label = typeof pathValue === "string" ? pathValue : "file";
-      const body = typeof textValue === "string" ? textValue : "";
-      questionParts.push(`=== ${label} ===
-${body}`);
-      continue;
+    if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
+      fileContentsByPath.set(segment.path, segment.text);
     }
-    if (typeof typeValue === "string" && typeValue === "text") {
-      const value = segment.value;
-      if (typeof value === "string") {
-        questionParts.push(value);
+  }
+  for (const message of testCase.input_messages) {
+    const messageSegments = [];
+    if (typeof message.content === "string") {
+      if (message.content.trim().length > 0) {
+        messageSegments.push({ type: "text", value: message.content });
+      }
+    } else if (Array.isArray(message.content)) {
+      for (const segment of message.content) {
+        if (typeof segment === "string") {
+          if (segment.trim().length > 0) {
+            messageSegments.push({ type: "text", value: segment });
+          }
+        } else if (isJsonObject(segment)) {
+          const type = asString(segment.type);
+          if (type === "file") {
+            const value = asString(segment.value);
+            if (!value) continue;
+            if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
+              messageSegments.push({ type: "guideline_ref", path: value });
+              continue;
+            }
+            const fileText = fileContentsByPath.get(value);
+            if (fileText !== void 0) {
+              messageSegments.push({ type: "file", text: fileText, path: value });
+            }
+          } else if (type === "text") {
+            const textValue = asString(segment.value);
+            if (textValue && textValue.trim().length > 0) {
+              messageSegments.push({ type: "text", value: textValue });
+            }
+          }
+        }
+      }
+    }
+    segmentsByMessage.push(messageSegments);
+  }
+  const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
+  let question;
+  if (useRoleMarkers) {
+    const messageParts = [];
+    for (let i = 0; i < testCase.input_messages.length; i++) {
+      const message = testCase.input_messages[i];
+      const segments = segmentsByMessage[i];
+      if (!hasVisibleContent(segments)) {
+        continue;
+      }
+      const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
+      const contentParts = [];
+      for (const segment of segments) {
+        const formattedContent = formatSegment(segment);
+        if (formattedContent) {
+          contentParts.push(formattedContent);
+        }
+      }
+      if (contentParts.length > 0) {
+        const messageContent = contentParts.join("\n");
+        messageParts.push(`@[${roleLabel}]:
+${messageContent}`);
+      }
+    }
+    question = messageParts.join("\n\n");
+  } else {
+    const questionParts = [];
+    for (const segment of testCase.input_segments) {
+      const formattedContent = formatSegment(segment);
+      if (formattedContent) {
+        questionParts.push(formattedContent);
+      }
+    }
+    if (testCase.code_snippets.length > 0) {
+      questionParts.push(testCase.code_snippets.join("\n"));
+    }
+    question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
+  }
+  const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
+    messages: testCase.input_messages,
+    segmentsByMessage,
+    guidelinePatterns: testCase.guideline_patterns,
+    guidelineContent: guidelines
+  }) : void 0;
+  return { question, guidelines, chatPrompt };
+}
+function buildChatPromptFromSegments(options) {
+  const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
+  if (messages.length === 0) {
+    return void 0;
+  }
+  const systemSegments = [];
+  if (systemPrompt && systemPrompt.trim().length > 0) {
+    systemSegments.push(systemPrompt.trim());
+  }
+  if (guidelineContent && guidelineContent.trim().length > 0) {
+    systemSegments.push(`[[ ## Guidelines ## ]]
+${guidelineContent.trim()}`);
+  }
+  let startIndex = 0;
+  while (startIndex < messages.length && messages[startIndex].role === "system") {
+    const segments = segmentsByMessage[startIndex];
+    const contentParts = [];
+    for (const segment of segments) {
+      const formatted = formatSegment(segment);
+      if (formatted) {
+        contentParts.push(formatted);
       }
-      continue;
     }
-    const genericValue = segment.value;
-    if (typeof genericValue === "string") {
-      questionParts.push(genericValue);
+    if (contentParts.length > 0) {
+      systemSegments.push(contentParts.join("\n"));
     }
+    startIndex += 1;
   }
-  if (testCase.code_snippets.length > 0) {
-    questionParts.push(testCase.code_snippets.join("\n"));
+  const chatPrompt = [];
+  if (systemSegments.length > 0) {
+    chatPrompt.push({
+      role: "system",
+      content: systemSegments.join("\n\n")
+    });
   }
-  const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
-  const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
-  return { question, guidelines };
+  for (let i = startIndex; i < messages.length; i++) {
+    const message = messages[i];
+    const segments = segmentsByMessage[i];
+    const contentParts = [];
+    let role = message.role;
+    let name;
+    if (role === "system") {
+      role = "assistant";
+      contentParts.push("@[System]:");
+    } else if (role === "tool") {
+      role = "function";
+      name = "tool";
+    }
+    for (const segment of segments) {
+      if (segment.type === "guideline_ref") {
+        continue;
+      }
+      const formatted = formatSegment(segment);
+      if (formatted) {
+        const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
+        if (isGuidelineRef) {
+          continue;
+        }
+        contentParts.push(formatted);
+      }
+    }
+    if (contentParts.length === 0) {
+      continue;
+    }
+    chatPrompt.push({
+      role,
+      content: contentParts.join("\n"),
+      ...name ? { name } : {}
+    });
+  }
+  return chatPrompt.length > 0 ? chatPrompt : void 0;
 }
 async function fileExists2(absolutePath) {
   try {
@@ -591,21 +769,14 @@ import { AxAI } from "@ax-llm/ax";
 var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
 function buildChatPrompt(request) {
   if (request.chatPrompt) {
-    return request.chatPrompt;
-  }
-  const systemSegments = [];
-  const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
-  if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
-    systemSegments.push(metadataSystemPrompt.trim());
-  } else {
-    systemSegments.push(DEFAULT_SYSTEM_PROMPT);
-  }
-  if (request.guidelines && request.guidelines.trim().length > 0) {
-    systemSegments.push(`[[ ## Guidelines ## ]]
-${request.guidelines.trim()}`);
+    const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
+    if (hasSystemMessage) {
+      return request.chatPrompt;
+    }
+    const systemContent2 = resolveSystemContent(request);
+    return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
   }
-  const systemContent = systemSegments.join("\n\n");
+  const systemContent = resolveSystemContent(request);
   const userContent = request.question.trim();
   const prompt = [
     {
@@ -619,6 +790,21 @@ ${request.guidelines.trim()}`);
   ];
   return prompt;
 }
+function resolveSystemContent(request) {
+  const systemSegments = [];
+  const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
+  if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
+    systemSegments.push(metadataSystemPrompt.trim());
+  } else {
+    systemSegments.push(DEFAULT_SYSTEM_PROMPT);
+  }
+  if (request.guidelines && request.guidelines.trim().length > 0) {
+    systemSegments.push(`[[ ## Guidelines ## ]]
+${request.guidelines.trim()}`);
+  }
+  return systemSegments.join("\n\n");
+}
 function extractModelConfig(request, defaults) {
   const temperature = request.temperature ?? defaults.temperature;
   const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
@@ -662,6 +848,67 @@ function ensureChatResponse(result) {
   }
   return result;
 }
+function isRetryableError(error, retryableStatusCodes) {
+  if (!error || typeof error !== "object") {
+    return false;
+  }
+  if ("status" in error && typeof error.status === "number") {
+    return retryableStatusCodes.includes(error.status);
+  }
+  if ("message" in error && typeof error.message === "string") {
+    const match = error.message.match(/HTTP (\d{3})/);
+    if (match) {
+      const status = Number.parseInt(match[1], 10);
+      return retryableStatusCodes.includes(status);
+    }
+  }
+  if ("name" in error && error.name === "AxAIServiceNetworkError") {
+    return true;
+  }
+  return false;
+}
+function calculateRetryDelay(attempt, config) {
+  const delay = Math.min(
+    config.maxDelayMs,
+    config.initialDelayMs * config.backoffFactor ** attempt
+  );
+  return delay * (0.75 + Math.random() * 0.5);
+}
+async function sleep(ms) {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+async function withRetry(fn, retryConfig, signal) {
+  const config = {
+    maxRetries: retryConfig?.maxRetries ?? 3,
+    initialDelayMs: retryConfig?.initialDelayMs ?? 1e3,
+    maxDelayMs: retryConfig?.maxDelayMs ?? 6e4,
+    backoffFactor: retryConfig?.backoffFactor ?? 2,
+    retryableStatusCodes: retryConfig?.retryableStatusCodes ?? [500, 408, 429, 502, 503, 504]
+  };
+  let lastError;
+  for (let attempt = 0; attempt <= config.maxRetries; attempt++) {
+    if (signal?.aborted) {
+      throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
+    }
+    try {
+      return await fn();
+    } catch (error) {
+      lastError = error;
+      if (attempt >= config.maxRetries) {
+        break;
+      }
+      if (!isRetryableError(error, config.retryableStatusCodes)) {
+        throw error;
+      }
+      const delay = calculateRetryDelay(attempt, config);
+      await sleep(delay);
+      if (signal?.aborted) {
+        throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
+      }
+    }
+  }
+  throw lastError;
+}
 var AzureProvider = class {
   constructor(targetName, config) {
     this.config = config;
@@ -671,6 +918,7 @@ var AzureProvider = class {
       temperature: config.temperature,
       maxOutputTokens: config.maxOutputTokens
     };
+    this.retryConfig = config.retry;
     this.ai = AxAI.create({
       name: "azure-openai",
       apiKey: config.apiKey,
@@ -687,16 +935,21 @@ var AzureProvider = class {
   targetName;
   ai;
   defaults;
+  retryConfig;
   async invoke(request) {
     const chatPrompt = buildChatPrompt(request);
     const modelConfig = extractModelConfig(request, this.defaults);
-    const response = await this.ai.chat(
-      {
-        chatPrompt,
-        model: this.config.deploymentName,
-        ...modelConfig ? { modelConfig } : {}
-      },
-      request.signal ? { abortSignal: request.signal } : void 0
+    const response = await withRetry(
+      async () => await this.ai.chat(
+        {
+          chatPrompt,
+          model: this.config.deploymentName,
+          ...modelConfig ? { modelConfig } : {}
+        },
+        request.signal ? { abortSignal: request.signal } : void 0
+      ),
+      this.retryConfig,
+      request.signal
     );
     return mapResponse(ensureChatResponse(response));
   }
@@ -714,6 +967,7 @@ var AnthropicProvider = class {
       maxOutputTokens: config.maxOutputTokens,
       thinkingBudget: config.thinkingBudget
     };
+    this.retryConfig = config.retry;
     this.ai = AxAI.create({
       name: "anthropic",
       apiKey: config.apiKey
@@ -724,16 +978,21 @@ var AnthropicProvider = class {
   targetName;
   ai;
   defaults;
+  retryConfig;
   async invoke(request) {
     const chatPrompt = buildChatPrompt(request);
     const modelConfig = extractModelConfig(request, this.defaults);
-    const response = await this.ai.chat(
-      {
-        chatPrompt,
-        model: this.config.model,
-        ...modelConfig ? { modelConfig } : {}
-      },
-      request.signal ? { abortSignal: request.signal } : void 0
+    const response = await withRetry(
+      async () => await this.ai.chat(
+        {
+          chatPrompt,
+          model: this.config.model,
+          ...modelConfig ? { modelConfig } : {}
+        },
+        request.signal ? { abortSignal: request.signal } : void 0
+      ),
+      this.retryConfig,
+      request.signal
     );
     return mapResponse(ensureChatResponse(response));
   }
@@ -750,6 +1009,7 @@ var GeminiProvider = class {
       temperature: config.temperature,
       maxOutputTokens: config.maxOutputTokens
     };
+    this.retryConfig = config.retry;
     this.ai = AxAI.create({
       name: "google-gemini",
       apiKey: config.apiKey
@@ -760,16 +1020,21 @@ var GeminiProvider = class {
   targetName;
   ai;
   defaults;
+  retryConfig;
   async invoke(request) {
     const chatPrompt = buildChatPrompt(request);
     const modelConfig = extractModelConfig(request, this.defaults);
-    const response = await this.ai.chat(
-      {
-        chatPrompt,
-        model: this.config.model,
-        ...modelConfig ? { modelConfig } : {}
-      },
-      request.signal ? { abortSignal: request.signal } : void 0
+    const response = await withRetry(
+      async () => await this.ai.chat(
+        {
+          chatPrompt,
+          model: this.config.model,
+          ...modelConfig ? { modelConfig } : {}
+        },
+        request.signal ? { abortSignal: request.signal } : void 0
+      ),
+      this.retryConfig,
+      request.signal
     );
     return mapResponse(ensureChatResponse(response));
   }
@@ -839,10 +1104,9 @@ var CliProvider = class {
     const outputFilePath = generateOutputFilePath(request.evalCaseId);
     const templateValues = buildTemplateValues(request, this.config, outputFilePath);
     const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
-    const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
     const result = await this.runCommand(renderedCommand, {
       cwd: this.config.cwd,
-      env,
+      env: process.env,
       timeoutMs: this.config.timeoutMs,
       signal: request.signal
     });
@@ -931,10 +1195,9 @@ var CliProvider = class {
         generateOutputFilePath("healthcheck")
       )
     );
-    const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
     const result = await this.runCommand(renderedCommand, {
       cwd: healthcheck.cwd ?? this.config.cwd,
-      env,
+      env: process.env,
       timeoutMs,
       signal
     });
@@ -2167,20 +2430,13 @@ function assertTargetDefinition(value, index, filePath) {
   }
   const name = value.name;
   const provider = value.provider;
-  const settings = value.settings;
-  const judgeTarget = value.judge_target;
   if (typeof name !== "string" || name.trim().length === 0) {
     throw new Error(`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`);
   }
   if (typeof provider !== "string" || provider.trim().length === 0) {
     throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`);
   }
-  return {
-    name,
-    provider,
-    settings: isRecord(settings) ? settings : void 0,
-    judge_target: typeof judgeTarget === "string" ? judgeTarget : void 0
-  };
+  return value;
 }
 async function fileExists3(filePath) {
   try {
@@ -2260,19 +2516,21 @@ var LlmJudgeEvaluator = class {
     return this.evaluateWithPrompt(context, judgeProvider);
   }
   async evaluateWithPrompt(context, judgeProvider) {
-    let prompt = buildQualityPrompt(context.evalCase, context.candidate);
-    let systemPrompt = context.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
+    const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context.evalCase);
+    const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
+    let prompt = buildQualityPrompt(context.evalCase, context.candidate, formattedQuestion);
+    let systemPrompt = context.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
     if (systemPrompt && hasTemplateVariables(systemPrompt)) {
       const variables = {
         input_messages: JSON.stringify(context.evalCase.input_segments, null, 2),
         output_messages: JSON.stringify(context.evalCase.output_segments, null, 2),
         candidate_answer: context.candidate,
-        reference_answer: context.evalCase.reference_answer,
+        reference_answer: context.evalCase.reference_answer ?? "",
         expected_outcome: context.evalCase.expected_outcome,
-        question: context.evalCase.question
+        question: formattedQuestion
       };
       prompt = substituteVariables(systemPrompt, variables);
-      systemPrompt = QUALITY_SYSTEM_PROMPT;
+      systemPrompt = buildSystemPrompt(hasReferenceAnswer);
     }
     const metadata = {
       ...systemPrompt !== void 0 ? { systemPrompt } : {},
@@ -2310,38 +2568,51 @@ var LlmJudgeEvaluator = class {
     };
   }
 };
-var QUALITY_SYSTEM_PROMPT = [
-  "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
-  "",
-  "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
-  "",
-  "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
-  "",
-  "You must respond with a single JSON object matching this schema:",
-  "",
-  "{",
-  '  "score": <number between 0.0 and 1.0>,',
-  '  "hits": [<array of strings, max 4 items, brief specific achievements>],',
-  '  "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
-  '  "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
-  "}"
-].join("\n");
-function buildQualityPrompt(evalCase, candidate) {
+function buildSystemPrompt(hasReferenceAnswer) {
+  const basePrompt = [
+    "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
+    ""
+  ];
+  if (hasReferenceAnswer) {
+    basePrompt.push(
+      "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
+      ""
+    );
+  }
+  basePrompt.push(
+    "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
+    "",
+    "You must respond with a single JSON object matching this schema:",
+    "",
+    "{",
+    '  "score": <number between 0.0 and 1.0>,',
+    '  "hits": [<array of strings, max 4 items, brief specific achievements>],',
+    '  "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
+    '  "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
+    "}"
+  );
+  return basePrompt.join("\n");
+}
+function buildQualityPrompt(evalCase, candidate, question) {
   const parts = [
     "[[ ## expected_outcome ## ]]",
     evalCase.expected_outcome.trim(),
     "",
     "[[ ## question ## ]]",
-    evalCase.question.trim(),
-    "",
-    "[[ ## reference_answer ## ]]",
-    evalCase.reference_answer.trim(),
-    "",
-    "[[ ## candidate_answer ## ]]",
-    candidate.trim(),
-    "",
-    "Respond with a single JSON object matching the schema described in the system prompt."
+    question.trim(),
+    ""
   ];
+  if (hasNonEmptyReferenceAnswer(evalCase)) {
+    parts.push(
+      "[[ ## reference_answer ## ]]",
+      evalCase.reference_answer.trim(),
+      ""
+    );
+  }
+  parts.push(
+    "[[ ## candidate_answer ## ]]",
+    candidate.trim()
+  );
   return parts.join("\n");
 }
 function clampScore(value) {
@@ -2424,6 +2695,9 @@ function extractJsonBlob(text) {
 function isNonEmptyString(value) {
   return typeof value === "string" && value.trim().length > 0;
 }
+function hasNonEmptyReferenceAnswer(evalCase) {
+  return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
+}
 var CodeEvaluator = class {
   kind = "code";
   script;
@@ -2821,10 +3095,11 @@ async function runEvaluation(options) {
           await onProgress({
             workerId,
             evalId: evalCase.id,
-            status: "completed",
+            status: result.error ? "failed" : "completed",
             startedAt: 0,
             // Not used for completed status
-            completedAt: Date.now()
+            completedAt: Date.now(),
+            error: result.error
           });
         }
         if (onResult) {
@@ -3081,11 +3356,27 @@ async function evaluateCandidate(options) {
     agentTimeoutMs
   });
   const completedAt = nowFn();
-  const rawRequest = {
-    question: promptInputs.question,
-    ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
-    guideline_paths: evalCase.guideline_paths
-  };
+  let agentProviderRequest;
+  let lmProviderRequest;
+  if (isAgentProvider(provider)) {
+    agentProviderRequest = {
+      question: promptInputs.question,
+      guideline_paths: evalCase.guideline_paths
+    };
+  } else {
+    if (promptInputs.chatPrompt) {
+      lmProviderRequest = {
+        chat_prompt: promptInputs.chatPrompt,
+        guideline_paths: evalCase.guideline_paths
+      };
+    } else {
+      lmProviderRequest = {
+        question: promptInputs.question,
+        guidelines: promptInputs.guidelines,
+        guideline_paths: evalCase.guideline_paths
+      };
+    }
+  }
   return {
     eval_id: evalCase.id,
     dataset: evalCase.dataset,
@@ -3099,7 +3390,8 @@ async function evaluateCandidate(options) {
     timestamp: completedAt.toISOString(),
     reasoning: score.reasoning,
     raw_aspects: score.rawAspects,
-    raw_request: rawRequest,
+    agent_provider_request: agentProviderRequest,
+    lm_provider_request: lmProviderRequest,
     evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
     evaluator_results: evaluatorResults
   };
@@ -3328,6 +3620,7 @@ async function invokeProvider(provider, options) {
       question: promptInputs.question,
       guidelines: promptInputs.guidelines,
       guideline_patterns: evalCase.guideline_patterns,
+      chatPrompt: promptInputs.chatPrompt,
       inputFiles: evalCase.file_paths,
       evalCaseId: evalCase.id,
       attempt,
@@ -3344,12 +3637,30 @@ async function invokeProvider(provider, options) {
 }
 function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
   const message = error instanceof Error ? error.message : String(error);
-  const rawRequest = {
-    question: promptInputs.question,
-    ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
-    guideline_paths: evalCase.guideline_paths,
-    error: message
-  };
+  let agentProviderRequest;
+  let lmProviderRequest;
+  if (isAgentProvider(provider)) {
+    agentProviderRequest = {
+      question: promptInputs.question,
+      guideline_paths: evalCase.guideline_paths,
+      error: message
+    };
+  } else {
+    if (promptInputs.chatPrompt) {
+      lmProviderRequest = {
+        chat_prompt: promptInputs.chatPrompt,
+        guideline_paths: evalCase.guideline_paths,
+        error: message
+      };
+    } else {
+      lmProviderRequest = {
+        question: promptInputs.question,
+        guidelines: promptInputs.guidelines,
+        guideline_paths: evalCase.guideline_paths,
+        error: message
+      };
+    }
+  }
   return {
     eval_id: evalCase.id,
     dataset: evalCase.dataset,
@@ -3362,7 +3673,9 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
     target: targetName,
     timestamp: timestamp.toISOString(),
     raw_aspects: [],
-    raw_request: rawRequest
+    agent_provider_request: agentProviderRequest,
+    lm_provider_request: lmProviderRequest,
+    error: message
   };
 }
 function createCacheKey(provider, target, evalCase, promptInputs) {
@@ -3373,6 +3686,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
   hash.update(promptInputs.question);
   hash.update(promptInputs.guidelines);
   hash.update(promptInputs.systemMessage ?? "");
+  if (promptInputs.chatPrompt) {
+    hash.update(JSON.stringify(promptInputs.chatPrompt));
+  }
   return hash.digest("hex");
 }
 function isTimeoutLike(error) {