npm - agentv - Versions diffs - 3.14.6 → 4.1.0 - Mend

agentv 3.14.6 → 4.1.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (22)

package/README.md +59 -533
package/dist/{chunk-CQRWNXVG.js → chunk-2W5JKKXC.js} +537 -727
package/dist/chunk-2W5JKKXC.js.map +1 -0
package/dist/{chunk-Y25VL7PX.js → chunk-4Z326WWF.js} +40 -17
package/dist/chunk-4Z326WWF.js.map +1 -0
package/dist/{chunk-ELQEFMGO.js → chunk-XEAW7OQT.js} +594 -296
package/dist/chunk-XEAW7OQT.js.map +1 -0
package/dist/cli.js +3 -3
package/dist/{dist-5EEXTTC3.js → dist-2JUUJ6PT.js} +18 -2
package/dist/index.js +3 -3
package/dist/{interactive-5ESM5DWV.js → interactive-7ZYS6IOC.js} +4 -11
package/dist/interactive-7ZYS6IOC.js.map +1 -0
package/dist/studio/assets/index-CDGReinH.js +71 -0
package/dist/studio/assets/index-DofvSOmX.js +11 -0
package/dist/studio/assets/index-izxfmBKC.css +1 -0
package/dist/studio/index.html +13 -0
package/package.json +1 -1
package/dist/chunk-CQRWNXVG.js.map +0 -1
package/dist/chunk-ELQEFMGO.js.map +0 -1
package/dist/chunk-Y25VL7PX.js.map +0 -1
package/dist/interactive-5ESM5DWV.js.map +0 -1
package/dist/{dist-5EEXTTC3.js.map → dist-2JUUJ6PT.js.map} +0 -0

package/dist/{chunk-ELQEFMGO.js → chunk-XEAW7OQT.js} RENAMED Viewed

@@ -301,7 +301,7 @@ var require_dist = __commonJS({
   }
 });
-// ../../packages/core/dist/chunk-HP5PFOVK.js
+// ../../packages/core/dist/chunk-PXYYRDHH.js
 import { constants } from "node:fs";
 import { access, readFile } from "node:fs/promises";
 import path from "node:path";
@@ -419,11 +419,32 @@ __export(external_exports2, {
   void: () => voidType
 });
-// ../../packages/core/dist/chunk-HP5PFOVK.js
+// ../../packages/core/dist/chunk-PXYYRDHH.js
 import { readFile as readFile2 } from "node:fs/promises";
 import path3 from "node:path";
 import fg from "fast-glob";
 import { parse as parseYaml } from "yaml";
+var CONTENT_TYPES = /* @__PURE__ */ new Set(["text", "image", "file"]);
+function isContent(value) {
+  if (!value || typeof value !== "object") return false;
+  const v = value;
+  return typeof v.type === "string" && CONTENT_TYPES.has(v.type);
+}
+function isContentArray(value) {
+  return Array.isArray(value) && value.length > 0 && value.every(isContent);
+}
+function getTextContent(content) {
+  if (content == null) return "";
+  if (typeof content === "string") return content;
+  if (!Array.isArray(content)) return "";
+  const parts = [];
+  for (const block of content) {
+    if (block.type === "text") {
+      parts.push(block.text);
+    }
+  }
+  return parts.join("\n");
+}
 var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
 var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
 var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
@@ -776,6 +797,12 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
   "FILES",
   "OUTPUT_FILE"
 ]);
+var COMMON_TARGET_SETTINGS = [
+  "provider_batching",
+  "providerBatching",
+  "subagent_mode_allowed",
+  "subagentModeAllowed"
+];
 var BASE_TARGET_SCHEMA = external_exports2.object({
   name: external_exports2.string().min(1, "target name is required"),
   provider: external_exports2.string().min(1, "provider is required"),
@@ -784,7 +811,8 @@ var BASE_TARGET_SCHEMA = external_exports2.object({
   // backward compat
   workers: external_exports2.number().int().min(1).optional(),
   workspace_template: external_exports2.string().optional(),
-  workspaceTemplate: external_exports2.string().optional()
+  workspaceTemplate: external_exports2.string().optional(),
+  subagent_mode_allowed: external_exports2.boolean().optional()
 }).passthrough();
 var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
 var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
@@ -847,42 +875,40 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
   const providerBatching = resolveOptionalBoolean(
     parsed.provider_batching ?? parsed.providerBatching
   );
+  const subagentModeAllowed = resolveOptionalBoolean(
+    parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
+  );
+  const base = {
+    name: parsed.name,
+    graderTarget: parsed.grader_target ?? parsed.judge_target,
+    workers: parsed.workers,
+    providerBatching,
+    subagentModeAllowed
+  };
   switch (provider) {
     case "openai":
       return {
         kind: "openai",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolveOpenAIConfig(parsed, env)
       };
     case "openrouter":
       return {
         kind: "openrouter",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolveOpenRouterConfig(parsed, env)
       };
     case "azure":
     case "azure-openai":
       return {
         kind: "azure",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolveAzureConfig(parsed, env)
       };
     case "anthropic":
       return {
         kind: "anthropic",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolveAnthropicConfig(parsed, env)
       };
     case "gemini":
@@ -890,68 +916,47 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
     case "google-gemini":
       return {
         kind: "gemini",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolveGeminiConfig(parsed, env)
       };
     case "codex":
     case "codex-cli":
       return {
         kind: "codex",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolveCodexConfig(parsed, env, evalFilePath)
       };
     case "copilot-sdk":
     case "copilot_sdk":
       return {
         kind: "copilot-sdk",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolveCopilotSdkConfig(parsed, env, evalFilePath)
       };
     case "copilot":
     case "copilot-cli":
       return {
         kind: "copilot-cli",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolveCopilotCliConfig(parsed, env, evalFilePath)
       };
     case "copilot-log":
       return {
         kind: "copilot-log",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolveCopilotLogConfig(parsed, env)
       };
     case "pi":
     case "pi-coding-agent":
       return {
         kind: "pi-coding-agent",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolvePiCodingAgentConfig(parsed, env, evalFilePath)
       };
     case "pi-cli":
       return {
         kind: "pi-cli",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolvePiCliConfig(parsed, env, evalFilePath)
       };
     case "claude":
@@ -959,38 +964,26 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
     case "claude-cli":
       return {
         kind: "claude-cli",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolveClaudeConfig(parsed, env, evalFilePath)
       };
     case "claude-sdk":
       return {
         kind: "claude-sdk",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolveClaudeConfig(parsed, env, evalFilePath)
       };
     case "mock":
       return {
         kind: "mock",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolveMockConfig(parsed)
       };
     case "vscode":
     case "vscode-insiders":
       return {
         kind: provider,
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders", evalFilePath)
       };
     case "agentv": {
@@ -1003,29 +996,21 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
       const temperature = typeof parsed.temperature === "number" ? parsed.temperature : 0;
       return {
         kind: "agentv",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
+        ...base,
         workers: typeof parsed.workers === "number" ? parsed.workers : void 0,
-        providerBatching,
         config: { model, temperature }
       };
     }
     case "cli":
       return {
         kind: "cli",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolveCliConfig(parsed, env, evalFilePath)
       };
     default:
       return {
         kind: "cli",
-        name: parsed.name,
-        graderTarget: parsed.grader_target ?? parsed.judge_target,
-        workers: parsed.workers,
-        providerBatching,
+        ...base,
         config: resolveDiscoveredProviderConfig(parsed, provider, env, evalFilePath)
       };
   }
@@ -1653,8 +1638,8 @@ function resolveCliConfig(target, env, evalFilePath) {
   const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
   if (!parseResult.success) {
     const firstError = parseResult.error.errors[0];
-    const path47 = firstError?.path.join(".") || "";
-    const prefix = path47 ? `${target.name} ${path47}: ` : `${target.name}: `;
+    const path48 = firstError?.path.join(".") || "";
+    const prefix = path48 ? `${target.name} ${path48}: ` : `${target.name}: `;
     throw new Error(`${prefix}${firstError?.message}`);
   }
   const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -1897,6 +1882,82 @@ function resolveOptionalNumberArray(source, description) {
   }
   return resolved.length > 0 ? resolved : void 0;
 }
+var AGENT_PROVIDER_KINDS = [
+  "codex",
+  "copilot-sdk",
+  "copilot-cli",
+  "pi-coding-agent",
+  "pi-cli",
+  "claude",
+  "claude-cli",
+  "claude-sdk",
+  "vscode",
+  "vscode-insiders"
+];
+var KNOWN_PROVIDERS = [
+  "openai",
+  "openrouter",
+  "azure",
+  "anthropic",
+  "gemini",
+  "codex",
+  "copilot-sdk",
+  "copilot-cli",
+  "copilot-log",
+  "pi-coding-agent",
+  "pi-cli",
+  "claude",
+  "claude-cli",
+  "claude-sdk",
+  "cli",
+  "mock",
+  "vscode",
+  "vscode-insiders",
+  "agentv"
+];
+var PROVIDER_ALIASES = [
+  "azure-openai",
+  // alias for "azure"
+  "google",
+  // alias for "gemini"
+  "google-gemini",
+  // alias for "gemini"
+  "codex-cli",
+  // alias for "codex"
+  "copilot",
+  // alias for "copilot-cli" (default copilot experience)
+  "copilot_sdk",
+  // alias for "copilot-sdk" (underscore variant)
+  "pi",
+  // alias for "pi-coding-agent"
+  "claude-code",
+  // alias for "claude" (legacy)
+  "bedrock",
+  // legacy/future support
+  "vertex"
+  // legacy/future support
+];
+function extractLastAssistantContent(messages) {
+  if (!messages || messages.length === 0) {
+    return "";
+  }
+  for (let i = messages.length - 1; i >= 0; i--) {
+    const msg = messages[i];
+    if (msg.role === "assistant" && msg.content !== void 0) {
+      if (typeof msg.content === "string") {
+        return msg.content;
+      }
+      if (isContentArray(msg.content)) {
+        return getTextContent(msg.content);
+      }
+      return JSON.stringify(msg.content);
+    }
+  }
+  return "";
+}
+function isAgentProvider(provider) {
+  return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
+}
 var ENV_VAR_PATTERN = /\$\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*\}\}/g;
 function interpolateEnv(value, env) {
   if (typeof value === "string") {
@@ -2026,79 +2087,6 @@ async function expandFileReferences(tests, evalFileDir) {
   }
   return expanded;
 }
-var AGENT_PROVIDER_KINDS = [
-  "codex",
-  "copilot-sdk",
-  "copilot-cli",
-  "pi-coding-agent",
-  "pi-cli",
-  "claude",
-  "claude-cli",
-  "claude-sdk",
-  "vscode",
-  "vscode-insiders"
-];
-var KNOWN_PROVIDERS = [
-  "openai",
-  "openrouter",
-  "azure",
-  "anthropic",
-  "gemini",
-  "codex",
-  "copilot-sdk",
-  "copilot-cli",
-  "copilot-log",
-  "pi-coding-agent",
-  "pi-cli",
-  "claude",
-  "claude-cli",
-  "claude-sdk",
-  "cli",
-  "mock",
-  "vscode",
-  "vscode-insiders",
-  "agentv"
-];
-var PROVIDER_ALIASES = [
-  "azure-openai",
-  // alias for "azure"
-  "google",
-  // alias for "gemini"
-  "google-gemini",
-  // alias for "gemini"
-  "codex-cli",
-  // alias for "codex"
-  "copilot",
-  // alias for "copilot-cli" (default copilot experience)
-  "copilot_sdk",
-  // alias for "copilot-sdk" (underscore variant)
-  "pi",
-  // alias for "pi-coding-agent"
-  "claude-code",
-  // alias for "claude" (legacy)
-  "bedrock",
-  // legacy/future support
-  "vertex"
-  // legacy/future support
-];
-function extractLastAssistantContent(messages) {
-  if (!messages || messages.length === 0) {
-    return "";
-  }
-  for (let i = messages.length - 1; i >= 0; i--) {
-    const msg = messages[i];
-    if (msg.role === "assistant" && msg.content !== void 0) {
-      if (typeof msg.content === "string") {
-        return msg.content;
-      }
-      return JSON.stringify(msg.content);
-    }
-  }
-  return "";
-}
-function isAgentProvider(provider) {
-  return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
-}
 // ../../packages/core/dist/index.js
 import { readFile as readFile6 } from "node:fs/promises";
@@ -6734,7 +6722,7 @@ function createOpenRouter(options = {}) {
   );
   const createChatModel = (modelId, settings = {}) => new OpenRouterChatLanguageModel(modelId, settings, {
     provider: "openrouter.chat",
-    url: ({ path: path47 }) => `${baseURL}${path47}`,
+    url: ({ path: path48 }) => `${baseURL}${path48}`,
     headers: getHeaders,
     compatibility,
     fetch: options.fetch,
@@ -6742,7 +6730,7 @@ function createOpenRouter(options = {}) {
   });
   const createCompletionModel = (modelId, settings = {}) => new OpenRouterCompletionLanguageModel(modelId, settings, {
     provider: "openrouter.completion",
-    url: ({ path: path47 }) => `${baseURL}${path47}`,
+    url: ({ path: path48 }) => `${baseURL}${path48}`,
     headers: getHeaders,
     compatibility,
     fetch: options.fetch,
@@ -6750,14 +6738,14 @@ function createOpenRouter(options = {}) {
   });
   const createEmbeddingModel = (modelId, settings = {}) => new OpenRouterEmbeddingModel(modelId, settings, {
     provider: "openrouter.embedding",
-    url: ({ path: path47 }) => `${baseURL}${path47}`,
+    url: ({ path: path48 }) => `${baseURL}${path48}`,
     headers: getHeaders,
     fetch: options.fetch,
     extraBody: options.extraBody
   });
   const createImageModel = (modelId, settings = {}) => new OpenRouterImageModel(modelId, settings, {
     provider: "openrouter.image",
-    url: ({ path: path47 }) => `${baseURL}${path47}`,
+    url: ({ path: path48 }) => `${baseURL}${path48}`,
     headers: getHeaders,
     fetch: options.fetch,
     extraBody: options.extraBody
@@ -14350,6 +14338,7 @@ import { existsSync as existsSync4 } from "node:fs";
 import path45 from "node:path";
 import { mkdir as mkdir15, readFile as readFile13, writeFile as writeFile8 } from "node:fs/promises";
 import path46 from "node:path";
+import path47 from "node:path";
 function computeTraceSummary(messages) {
   const toolCallCounts = {};
   const toolDurations = {};
@@ -14979,15 +14968,23 @@ var TEMPLATE_VARIABLES = {
   INPUT: "input",
   OUTPUT: "output",
   FILE_CHANGES: "file_changes",
+  /** @deprecated Use INPUT instead — resolves to the same text value. */
   INPUT_TEXT: "input_text",
+  /** @deprecated Use OUTPUT instead — resolves to the same text value. */
   OUTPUT_TEXT: "output_text",
+  /** @deprecated Use EXPECTED_OUTPUT instead — resolves to the same text value. */
   EXPECTED_OUTPUT_TEXT: "expected_output_text"
 };
 var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
 var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
-  TEMPLATE_VARIABLES.OUTPUT_TEXT,
+  TEMPLATE_VARIABLES.OUTPUT,
   TEMPLATE_VARIABLES.EXPECTED_OUTPUT
 ]);
+var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
+  [TEMPLATE_VARIABLES.INPUT_TEXT, TEMPLATE_VARIABLES.INPUT],
+  [TEMPLATE_VARIABLES.OUTPUT_TEXT, TEMPLATE_VARIABLES.OUTPUT],
+  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
+]);
 var ANSI_YELLOW22 = "\x1B[33m";
 var ANSI_RESET3 = "\x1B[0m";
 async function validateCustomPromptContent(promptPath) {
@@ -15007,16 +15004,29 @@ function validateTemplateVariables(content, source) {
     }
     match = variablePattern.exec(content);
   }
-  const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
-  const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
+  const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
+  const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT);
   const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
   if (!hasRequiredFields) {
     throw new Error(
       `Missing required fields. Must include at least one of:
-  - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
+  - {{ ${TEMPLATE_VARIABLES.OUTPUT} }}
   - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
     );
   }
+  const deprecatedUsed = [];
+  for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
+    if (foundVariables.has(deprecated)) {
+      deprecatedUsed.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
+    }
+  }
+  if (deprecatedUsed.length > 0) {
+    console.warn(
+      `${ANSI_YELLOW22}Warning: Template at ${source} uses deprecated variable names:
+  ${deprecatedUsed.join("\n  ")}
+  These still work but will be removed in a future version.${ANSI_RESET3}`
+    );
+  }
   if (invalidVariables.length > 0) {
     const warningMessage = `${ANSI_YELLOW22}Warning: Custom evaluator template at ${source}
   Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
@@ -16418,6 +16428,19 @@ function hasVisibleContent(segments) {
 function asString2(value) {
   return typeof value === "string" ? value : void 0;
 }
+var IMAGE_MEDIA_TYPES = {
+  ".png": "image/png",
+  ".jpg": "image/jpeg",
+  ".jpeg": "image/jpeg",
+  ".gif": "image/gif",
+  ".webp": "image/webp",
+  ".svg": "image/svg+xml",
+  ".bmp": "image/bmp"
+};
+function detectImageMediaType(filePath) {
+  const ext = path5.extname(filePath).toLowerCase();
+  return IMAGE_MEDIA_TYPES[ext];
+}
 var ANSI_YELLOW4 = "\x1B[33m";
 var ANSI_RESET5 = "\x1B[0m";
 async function processMessages(options) {
@@ -16483,6 +16506,47 @@ async function processMessages(options) {
         }
         continue;
       }
+      if (segmentType === "image") {
+        const rawValue = asString3(rawSegment.value);
+        if (!rawValue) {
+          continue;
+        }
+        const { displayPath, resolvedPath, attempted } = await resolveFileReference22(
+          rawValue,
+          searchRoots
+        );
+        if (!resolvedPath) {
+          const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
+          const context2 = messageType === "input" ? "" : " in expected_output";
+          logWarning3(`Image file not found${context2}: ${displayPath}`, attempts);
+          continue;
+        }
+        const mediaType = detectImageMediaType(resolvedPath);
+        if (!mediaType) {
+          logWarning3(
+            `Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
+          );
+          continue;
+        }
+        try {
+          const imageBuffer = await readFile4(resolvedPath);
+          const base64 = imageBuffer.toString("base64");
+          processedContent.push({
+            type: "image",
+            media_type: mediaType,
+            source: `data:${mediaType};base64,${base64}`
+          });
+          if (verbose) {
+            const label = messageType === "input" ? "[Image]" : "[Expected Output Image]";
+            console.log(`  ${label} Found: ${displayPath}`);
+            console.log(`    Resolved to: ${resolvedPath} (${mediaType})`);
+          }
+        } catch (error) {
+          const context2 = messageType === "input" ? "" : " expected output";
+          logWarning3(`Could not read${context2} image ${resolvedPath}: ${error.message}`);
+        }
+        continue;
+      }
       const clonedSegment = cloneJsonObject(rawSegment);
       processedContent.push(clonedSegment);
       const inlineValue = clonedSegment.value;
@@ -16560,6 +16624,46 @@ async function processExpectedMessages(options) {
           }
           continue;
         }
+        if (segmentType === "image") {
+          const rawValue = asString3(rawSegment.value);
+          if (!rawValue) {
+            continue;
+          }
+          const { displayPath, resolvedPath, attempted } = await resolveFileReference22(
+            rawValue,
+            searchRoots
+          );
+          if (!resolvedPath) {
+            const attempts = attempted.length ? ["  Tried:", ...attempted.map((candidate) => `    ${candidate}`)] : void 0;
+            logWarning3(`Image file not found in expected_output: ${displayPath}`, attempts);
+            continue;
+          }
+          const mediaType = detectImageMediaType(resolvedPath);
+          if (!mediaType) {
+            logWarning3(
+              `Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
+            );
+            continue;
+          }
+          try {
+            const imageBuffer = await readFile4(resolvedPath);
+            const base64 = imageBuffer.toString("base64");
+            processedContent.push({
+              type: "image",
+              media_type: mediaType,
+              source: `data:${mediaType};base64,${base64}`
+            });
+            if (verbose) {
+              console.log(`  [Expected Output Image] Found: ${displayPath}`);
+              console.log(`    Resolved to: ${resolvedPath} (${mediaType})`);
+            }
+          } catch (error) {
+            logWarning3(
+              `Could not read expected output image ${resolvedPath}: ${error.message}`
+            );
+          }
+          continue;
+        }
         processedContent.push(cloneJsonObject(rawSegment));
       }
       segment.content = processedContent;
@@ -16802,7 +16906,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
     const userFilePaths = collectResolvedInputFilePaths(inputMessages);
     const testCase = {
       id,
-      eval_set: evalSetName,
+      dataset: evalSetName,
       conversation_id: conversationId,
       question,
       input: inputMessages,
@@ -17066,7 +17170,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
   }
   const suite = interpolated;
   const evalSetNameFromSuite = asString5(suite.name)?.trim();
-  const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
+  const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
   const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
   const rawTestcases = resolveTests(suite);
   const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
@@ -17187,7 +17291,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
     const caseTargets = extractTargetsFromTestCase(evalcase);
     const testCase = {
       id,
-      eval_set: evalSetName,
+      dataset: evalSetName,
+      category: options?.category,
       conversation_id: conversationId,
       question,
       input: inputMessages,
@@ -18090,6 +18195,47 @@ async function withRetry(fn, retryConfig, signal) {
   }
   throw lastError;
 }
+function toContentArray(content) {
+  if (!Array.isArray(content)) return void 0;
+  let hasNonText = false;
+  const blocks = [];
+  for (const part of content) {
+    if (!part || typeof part !== "object") continue;
+    const p = part;
+    if (p.type === "text" && typeof p.text === "string") {
+      blocks.push({ type: "text", text: p.text });
+    } else if (p.type === "image" && typeof p.source === "object" && p.source !== null) {
+      const src = p.source;
+      const mediaType = typeof p.media_type === "string" ? p.media_type : typeof src.media_type === "string" ? src.media_type : "application/octet-stream";
+      const data = typeof src.data === "string" && src.data !== "" ? `data:${mediaType};base64,${src.data}` : typeof p.url === "string" && p.url !== "" ? p.url : "";
+      if (!data) continue;
+      blocks.push({ type: "image", media_type: mediaType, source: data });
+      hasNonText = true;
+    } else if (p.type === "tool_use") {
+    } else if (p.type === "tool_result") {
+    }
+  }
+  return hasNonText && blocks.length > 0 ? blocks : void 0;
+}
+function extractTextContent2(content) {
+  if (typeof content === "string") {
+    return content;
+  }
+  if (!Array.isArray(content)) {
+    return void 0;
+  }
+  const textParts = [];
+  for (const part of content) {
+    if (!part || typeof part !== "object") {
+      continue;
+    }
+    const p = part;
+    if (p.type === "text" && typeof p.text === "string") {
+      textParts.push(p.text);
+    }
+  }
+  return textParts.length > 0 ? textParts.join("\n") : void 0;
+}
 var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeLogs");
 var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeLogSubscribers");
 function getClaudeLogStore() {
@@ -18249,11 +18395,12 @@ var ClaudeCliProvider = class {
             if (betaMessage && typeof betaMessage === "object") {
               const msg = betaMessage;
               const content = msg.content;
+              const structuredContent = toContentArray(content);
               const textContent = extractTextContent2(content);
               const toolCalls = extractToolCalls(content);
               const outputMsg = {
                 role: "assistant",
-                content: textContent,
+                content: structuredContent ?? textContent,
                 toolCalls: toolCalls.length > 0 ? toolCalls : void 0
               };
               output.push(outputMsg);
@@ -18592,25 +18739,6 @@ function summarizeEvent(event) {
       return void 0;
   }
 }
-function extractTextContent2(content) {
-  if (typeof content === "string") {
-    return content;
-  }
-  if (!Array.isArray(content)) {
-    return void 0;
-  }
-  const textParts = [];
-  for (const part of content) {
-    if (!part || typeof part !== "object") {
-      continue;
-    }
-    const p = part;
-    if (p.type === "text" && typeof p.text === "string") {
-      textParts.push(p.text);
-    }
-  }
-  return textParts.length > 0 ? textParts.join("\n") : void 0;
-}
 function extractToolCalls(content) {
   if (!Array.isArray(content)) {
     return [];
@@ -18777,11 +18905,12 @@ var ClaudeSdkProvider = class {
             if (betaMessage && typeof betaMessage === "object") {
               const msg = betaMessage;
               const content = msg.content;
-              const textContent = extractTextContent22(content);
+              const structuredContent = toContentArray(content);
+              const textContent = extractTextContent2(content);
               const toolCalls = extractToolCalls2(content);
               const outputMsg = {
                 role: "assistant",
-                content: textContent,
+                content: structuredContent ?? textContent,
                 toolCalls: toolCalls.length > 0 ? toolCalls : void 0
               };
               output.push(outputMsg);
@@ -18899,25 +19028,6 @@ var ClaudeSdkProvider = class {
     }
   }
 };
-function extractTextContent22(content) {
-  if (typeof content === "string") {
-    return content;
-  }
-  if (!Array.isArray(content)) {
-    return void 0;
-  }
-  const textParts = [];
-  for (const part of content) {
-    if (!part || typeof part !== "object") {
-      continue;
-    }
-    const p = part;
-    if (p.type === "text" && typeof p.text === "string") {
-      textParts.push(p.text);
-    }
-  }
-  return textParts.length > 0 ? textParts.join("\n") : void 0;
-}
 function extractToolCalls2(content) {
   if (!Array.isArray(content)) {
     return [];
@@ -19133,7 +19243,7 @@ function convertMessages(messages) {
   return messages.map((msg) => ({
     role: msg.role,
     name: msg.name,
-    content: msg.content,
+    content: isContentArray(msg.content) ? msg.content : typeof msg.content === "string" ? msg.content : void 0,
     toolCalls: msg.tool_calls?.map((tc) => ({
       tool: tc.tool,
       input: tc.input,
@@ -21319,6 +21429,35 @@ function extractPiTextContent(content) {
   }
   return textParts.length > 0 ? textParts.join("\n") : void 0;
 }
+function toPiContentArray(content) {
+  if (!Array.isArray(content)) return void 0;
+  let hasNonText = false;
+  const blocks = [];
+  for (const part of content) {
+    if (!part || typeof part !== "object") continue;
+    const p = part;
+    if (p.type === "text" && typeof p.text === "string") {
+      blocks.push({ type: "text", text: p.text });
+    } else if (p.type === "image") {
+      const mediaType = typeof p.media_type === "string" ? p.media_type : "application/octet-stream";
+      let source = "";
+      if (typeof p.source === "object" && p.source !== null) {
+        const src = p.source;
+        const srcMediaType = typeof src.media_type === "string" ? src.media_type : mediaType;
+        source = typeof src.data === "string" ? `data:${srcMediaType};base64,${src.data}` : "";
+      }
+      if (!source && typeof p.url === "string") {
+        source = p.url;
+      }
+      if (source) {
+        blocks.push({ type: "image", media_type: mediaType, source });
+        hasNonText = true;
+      }
+    } else if (p.type === "tool_use" || p.type === "tool_result") {
+    }
+  }
+  return hasNonText && blocks.length > 0 ? blocks : void 0;
+}
 function toFiniteNumber(value) {
   if (typeof value === "number" && Number.isFinite(value)) return value;
   return void 0;
@@ -22478,7 +22617,8 @@ function convertAgentMessage(message, toolTrackers, completedToolResults) {
   }
   const msg = message;
   const role = typeof msg.role === "string" ? msg.role : "unknown";
-  const content = extractPiTextContent(msg.content);
+  const structuredContent = toPiContentArray(msg.content);
+  const content = structuredContent ?? extractPiTextContent(msg.content);
   const toolCalls = extractToolCalls4(msg.content, toolTrackers, completedToolResults);
   const startTimeVal = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
   let msgTokenUsage;
@@ -24233,13 +24373,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
 async function execShellWithStdin(command, stdinPayload, options = {}) {
   const { mkdir: mkdir16, readFile: readFile14, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
   const { tmpdir: tmpdir3 } = await import("node:os");
-  const path47 = await import("node:path");
+  const path48 = await import("node:path");
   const { randomUUID: randomUUID10 } = await import("node:crypto");
-  const dir = path47.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
+  const dir = path48.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
   await mkdir16(dir, { recursive: true });
-  const stdinPath = path47.join(dir, "stdin.txt");
-  const stdoutPath = path47.join(dir, "stdout.txt");
-  const stderrPath = path47.join(dir, "stderr.txt");
+  const stdinPath = path48.join(dir, "stdin.txt");
+  const stdoutPath = path48.join(dir, "stdout.txt");
+  const stderrPath = path48.join(dir, "stderr.txt");
   await writeFile9(stdinPath, stdinPayload, "utf8");
   const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
   const { spawn: spawn5 } = await import("node:child_process");
@@ -24547,6 +24687,56 @@ function toCamelCaseDeep(obj) {
   return obj;
 }
 var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
+var DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
+async function materializeContentForGrader(messages, getWorkDir) {
+  if (!messages || messages.length === 0) return messages ?? null;
+  let hasAnyImage = false;
+  for (const msg of messages) {
+    if (isContentArray(msg.content)) {
+      for (const block of msg.content) {
+        if (block.type === "image") {
+          hasAnyImage = true;
+          break;
+        }
+      }
+    }
+    if (hasAnyImage) break;
+  }
+  if (!hasAnyImage) return messages;
+  let counter = 0;
+  const result = [];
+  for (const msg of messages) {
+    if (!isContentArray(msg.content)) {
+      result.push(msg);
+      continue;
+    }
+    if (!msg.content.some((b) => b.type === "image")) {
+      result.push(msg);
+      continue;
+    }
+    const blocks = [];
+    for (const block of msg.content) {
+      if (block.type !== "image") {
+        blocks.push({ ...block });
+        continue;
+      }
+      const img = block;
+      const match = DATA_URI_RE.exec(img.source);
+      if (match) {
+        const [, mediaType, base64Data] = match;
+        const ext = mediaType.split("/")[1] === "jpeg" ? "jpg" : mediaType.split("/")[1] ?? "bin";
+        const dir = await getWorkDir();
+        const filePath = join(dir, `img-${counter++}.${ext}`);
+        await writeFile6(filePath, Buffer.from(base64Data, "base64"));
+        blocks.push({ type: "image", media_type: img.media_type, path: filePath });
+      } else {
+        blocks.push({ type: "image", media_type: img.media_type, path: img.source });
+      }
+    }
+    result.push({ ...msg, content: blocks });
+  }
+  return result;
+}
 var CodeEvaluator = class {
   kind = "code-grader";
   command;
@@ -24562,7 +24752,18 @@ var CodeEvaluator = class {
     this.target = options.target;
   }
   async evaluate(context2) {
-    let outputForPayload = context2.output ?? null;
+    let imageTmpDir;
+    const getImageDir = async () => {
+      if (!imageTmpDir) {
+        imageTmpDir = await mkdtemp2(join(tmpdir2(), "agentv-img-"));
+      }
+      return imageTmpDir;
+    };
+    const materializedOutput = await materializeContentForGrader(
+      context2.output,
+      getImageDir
+    );
+    let outputForPayload = materializedOutput;
     let outputPath;
     if (outputForPayload) {
       const serialized = JSON.stringify(outputForPayload);
@@ -24575,12 +24776,17 @@ var CodeEvaluator = class {
     }
     const payload = {
       criteria: context2.evalCase.criteria,
-      expectedOutput: context2.evalCase.expected_output,
-      outputText: context2.candidate,
+      expectedOutput: await materializeContentForGrader(
+        context2.evalCase.expected_output,
+        getImageDir
+      ),
       output: outputForPayload,
       outputPath,
       inputFiles: context2.evalCase.file_paths,
-      input: context2.evalCase.input,
+      input: await materializeContentForGrader(
+        context2.evalCase.input,
+        getImageDir
+      ),
       trace: context2.trace ?? null,
       tokenUsage: context2.tokenUsage ?? null,
       costUsd: context2.costUsd ?? null,
@@ -24589,9 +24795,7 @@ var CodeEvaluator = class {
       endTime: context2.endTime ?? null,
       fileChanges: context2.fileChanges ?? null,
       workspacePath: context2.workspacePath ?? null,
-      config: this.config ?? null,
-      inputText: context2.evalCase.question,
-      expectedOutputText: context2.evalCase.reference_answer ?? ""
+      config: this.config ?? null
     };
     const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
     let proxyEnv;
@@ -24681,6 +24885,10 @@ var CodeEvaluator = class {
         await rm3(dirname(outputPath), { recursive: true, force: true }).catch(() => {
         });
       }
+      if (imageTmpDir) {
+        await rm3(imageTmpDir, { recursive: true, force: true }).catch(() => {
+        });
+      }
     }
   }
 };
@@ -24749,13 +24957,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
 {{${TEMPLATE_VARIABLES.CRITERIA}}}
 [[ ## question ## ]]
-{{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
+{{${TEMPLATE_VARIABLES.INPUT}}}
 [[ ## reference_answer ## ]]
-{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
+{{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT}}}
 [[ ## answer ## ]]
-{{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
+{{${TEMPLATE_VARIABLES.OUTPUT}}}`;
 var freeformEvaluationSchema = external_exports2.object({
   score: external_exports2.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
   assertions: external_exports2.array(
@@ -24827,21 +25035,19 @@ var LlmGraderEvaluator = class {
   async evaluateFreeform(context2, graderProvider) {
     const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
     const variables = {
-      [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context2.evalCase.input, null, 2),
-      [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(
-        context2.evalCase.expected_output,
-        null,
-        2
-      ),
-      [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
+      [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
+      [TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
+      [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
       [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
       [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
+      // Deprecated aliases — same values as the primary variables above
       [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
       [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
       [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
     };
     const systemPrompt = buildOutputSchema();
     const evaluatorTemplate = context2.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
+    warnDeprecatedTemplateVars(evaluatorTemplate);
     let userPrompt = substituteVariables(evaluatorTemplate, variables);
     if (context2.fileChanges && !context2.evaluatorTemplateOverride && !this.evaluatorTemplate) {
       userPrompt += `
@@ -24853,13 +25059,15 @@ ${context2.fileChanges}`;
       userPrompt,
       systemPrompt
     };
+    const images = context2.output ? extractImageBlocks(context2.output) : [];
     try {
       const { data, tokenUsage } = await this.runWithRetry({
         context: context2,
         graderProvider,
         systemPrompt,
         userPrompt,
-        schema: freeformEvaluationSchema
+        schema: freeformEvaluationSchema,
+        images
       });
       const score = clampScore(data.score);
       const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
@@ -24903,13 +25111,15 @@ ${context2.fileChanges}`;
       userPrompt: prompt,
       systemPrompt
     };
+    const images = context2.output ? extractImageBlocks(context2.output) : [];
     try {
       const { data, tokenUsage } = await this.runWithRetry({
         context: context2,
         graderProvider,
         systemPrompt,
         userPrompt: prompt,
-        schema: rubricEvaluationSchema
+        schema: rubricEvaluationSchema,
+        images
       });
       const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
       return {
@@ -24946,13 +25156,15 @@ ${context2.fileChanges}`;
       userPrompt: prompt,
       systemPrompt
     };
+    const images = context2.output ? extractImageBlocks(context2.output) : [];
     try {
       const { data, tokenUsage } = await this.runWithRetry({
         context: context2,
         graderProvider,
         systemPrompt,
         userPrompt: prompt,
-        schema: scoreRangeEvaluationSchema
+        schema: scoreRangeEvaluationSchema,
+        images
       });
       const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
       return {
@@ -25159,12 +25371,17 @@ ${context2.fileChanges}`;
     const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
     const variables = {
       [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
+      [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
+      [TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
+      [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
+      [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
+      // Deprecated aliases
       [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
       [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
-      [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
-      [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
+      [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
     };
     if (this.evaluatorTemplate) {
+      warnDeprecatedTemplateVars(this.evaluatorTemplate);
       return substituteVariables(this.evaluatorTemplate, variables);
     }
     const config = context2.evaluator;
@@ -25215,11 +25432,16 @@ ${context2.fileChanges}`;
     if (this.evaluatorTemplate) {
       const variables = {
         [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
+        [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
+        [TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
+        [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
+        [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
+        // Deprecated aliases
         [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
         [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
-        [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
-        [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
+        [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
       };
+      warnDeprecatedTemplateVars(this.evaluatorTemplate);
       const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
       const outputSchema2 = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
       return `${customPrompt}
@@ -25390,18 +25612,35 @@ ${outputSchema2}`;
   // LLM mode retry logic
   // ---------------------------------------------------------------------------
   async runWithRetry(options) {
-    const { context: context2, graderProvider, systemPrompt, userPrompt, schema } = options;
+    const { context: context2, graderProvider, systemPrompt, userPrompt, schema, images } = options;
     let lastError;
     for (let attempt = 1; attempt <= 3; attempt++) {
       try {
         const model = graderProvider.asLanguageModel?.();
         if (model) {
-          const result = await generateText({
+          const modelOptions = {
+            ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
+            ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
+          };
+          const hasImages = images && images.length > 0;
+          const result = hasImages ? await generateText({
+            model,
+            system: systemPrompt,
+            messages: [
+              {
+                role: "user",
+                content: [
+                  { type: "text", text: userPrompt },
+                  ...toAiSdkImageParts(images)
+                ]
+              }
+            ],
+            ...modelOptions
+          }) : await generateText({
             model,
             system: systemPrompt,
             prompt: userPrompt,
-            ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
-            ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
+            ...modelOptions
           });
           const data2 = schema.parse(parseJsonFromText(result.text));
           const rawUsage = result.usage;
@@ -25461,6 +25700,26 @@ function substituteVariables(template, variables) {
     return variables[varName] ?? match;
   });
 }
+var ANSI_YELLOW7 = "\x1B[33m";
+var ANSI_RESET8 = "\x1B[0m";
+var warnedTemplateStrings = /* @__PURE__ */ new Set();
+function warnDeprecatedTemplateVars(template) {
+  if (warnedTemplateStrings.has(template)) return;
+  const used = [];
+  for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
+    if (new RegExp(`\\{\\{\\s*${deprecated}\\s*\\}\\}`).test(template)) {
+      used.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
+    }
+  }
+  if (used.length > 0) {
+    warnedTemplateStrings.add(template);
+    console.warn(
+      `${ANSI_YELLOW7}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
+  ${used.join("\n  ")}
+  Update your custom evaluator template to use the new names.${ANSI_RESET8}`
+    );
+  }
+}
 function calculateRubricScore(result, rubrics) {
   const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
   const assertions = [];
@@ -25555,6 +25814,26 @@ function calculateScoreRangeResult(result, rubrics) {
     }
   };
 }
+function extractImageBlocks(messages) {
+  const images = [];
+  for (const msg of messages) {
+    if (msg.role !== "assistant") continue;
+    if (!isContentArray(msg.content)) continue;
+    for (const block of msg.content) {
+      if (block.type === "image") {
+        images.push(block);
+      }
+    }
+  }
+  return images;
+}
+function toAiSdkImageParts(images) {
+  return images.map((img) => ({
+    type: "image",
+    image: img.source,
+    mediaType: img.media_type || void 0
+  }));
+}
 function resolveSandboxed(basePath, relativePath) {
   const resolved = path35.resolve(basePath, relativePath);
   if (!resolved.startsWith(basePath + path35.sep) && resolved !== basePath) {
@@ -26288,115 +26567,115 @@ var FieldAccuracyEvaluator = class {
    * Evaluate a single field against the expected value.
    */
   evaluateField(fieldConfig, candidateData, expectedData) {
-    const { path: path47, match, required = true, weight = 1 } = fieldConfig;
-    const candidateValue = resolvePath(candidateData, path47);
-    const expectedValue = resolvePath(expectedData, path47);
+    const { path: path48, match, required = true, weight = 1 } = fieldConfig;
+    const candidateValue = resolvePath(candidateData, path48);
+    const expectedValue = resolvePath(expectedData, path48);
     if (expectedValue === void 0) {
       return {
-        path: path47,
+        path: path48,
         score: 1,
         // No expected value means no comparison needed
         weight,
         hit: true,
-        message: `${path47}: no expected value`
+        message: `${path48}: no expected value`
       };
     }
     if (candidateValue === void 0) {
       if (required) {
         return {
-          path: path47,
+          path: path48,
           score: 0,
           weight,
           hit: false,
-          message: `${path47} (required, missing)`
+          message: `${path48} (required, missing)`
         };
       }
       return {
-        path: path47,
+        path: path48,
         score: 1,
         // Don't penalize missing optional fields
         weight: 0,
         // Zero weight means it won't affect the score
         hit: true,
-        message: `${path47}: optional field missing`
+        message: `${path48}: optional field missing`
       };
     }
     switch (match) {
       case "exact":
-        return this.compareExact(path47, candidateValue, expectedValue, weight);
+        return this.compareExact(path48, candidateValue, expectedValue, weight);
       case "numeric_tolerance":
         return this.compareNumericTolerance(
-          path47,
+          path48,
           candidateValue,
           expectedValue,
           fieldConfig,
           weight
         );
       case "date":
-        return this.compareDate(path47, candidateValue, expectedValue, fieldConfig, weight);
+        return this.compareDate(path48, candidateValue, expectedValue, fieldConfig, weight);
       default:
         return {
-          path: path47,
+          path: path48,
           score: 0,
           weight,
           hit: false,
-          message: `${path47}: unknown match type "${match}"`
+          message: `${path48}: unknown match type "${match}"`
         };
     }
   }
   /**
    * Exact equality comparison.
    */
-  compareExact(path47, candidateValue, expectedValue, weight) {
+  compareExact(path48, candidateValue, expectedValue, weight) {
     if (deepEqual(candidateValue, expectedValue)) {
       return {
-        path: path47,
+        path: path48,
         score: 1,
         weight,
         hit: true,
-        message: path47
+        message: path48
       };
     }
     if (typeof candidateValue !== typeof expectedValue) {
       return {
-        path: path47,
+        path: path48,
         score: 0,
         weight,
         hit: false,
-        message: `${path47} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
+        message: `${path48} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
       };
     }
     return {
-      path: path47,
+      path: path48,
       score: 0,
       weight,
       hit: false,
-      message: `${path47} (value mismatch)`
+      message: `${path48} (value mismatch)`
     };
   }
   /**
    * Numeric comparison with absolute or relative tolerance.
    */
-  compareNumericTolerance(path47, candidateValue, expectedValue, fieldConfig, weight) {
+  compareNumericTolerance(path48, candidateValue, expectedValue, fieldConfig, weight) {
     const { tolerance = 0, relative = false } = fieldConfig;
     const candidateNum = toNumber(candidateValue);
     const expectedNum = toNumber(expectedValue);
     if (candidateNum === null || expectedNum === null) {
       return {
-        path: path47,
+        path: path48,
         score: 0,
         weight,
         hit: false,
-        message: `${path47} (non-numeric value)`
+        message: `${path48} (non-numeric value)`
       };
     }
     if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
       return {
-        path: path47,
+        path: path48,
         score: 0,
         weight,
         hit: false,
-        message: `${path47} (invalid numeric value)`
+        message: `${path48} (invalid numeric value)`
       };
     }
     const diff = Math.abs(candidateNum - expectedNum);
@@ -26409,61 +26688,61 @@ var FieldAccuracyEvaluator = class {
     }
     if (withinTolerance) {
       return {
-        path: path47,
+        path: path48,
         score: 1,
         weight,
         hit: true,
-        message: `${path47} (within tolerance: diff=${diff.toFixed(2)})`
+        message: `${path48} (within tolerance: diff=${diff.toFixed(2)})`
       };
     }
     return {
-      path: path47,
+      path: path48,
       score: 0,
       weight,
       hit: false,
-      message: `${path47} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
+      message: `${path48} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
     };
   }
   /**
    * Date comparison with format normalization.
    */
-  compareDate(path47, candidateValue, expectedValue, fieldConfig, weight) {
+  compareDate(path48, candidateValue, expectedValue, fieldConfig, weight) {
     const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
     const candidateDate = parseDate(String(candidateValue), formats);
     const expectedDate = parseDate(String(expectedValue), formats);
     if (candidateDate === null) {
       return {
-        path: path47,
+        path: path48,
         score: 0,
         weight,
         hit: false,
-        message: `${path47} (unparseable candidate date)`
+        message: `${path48} (unparseable candidate date)`
       };
     }
     if (expectedDate === null) {
       return {
-        path: path47,
+        path: path48,
         score: 0,
         weight,
         hit: false,
-        message: `${path47} (unparseable expected date)`
+        message: `${path48} (unparseable expected date)`
       };
     }
     if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
       return {
-        path: path47,
+        path: path48,
         score: 1,
         weight,
         hit: true,
-        message: path47
+        message: path48
       };
     }
     return {
-      path: path47,
+      path: path48,
       score: 0,
       weight,
       hit: false,
-      message: `${path47} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
+      message: `${path48} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
     };
   }
   /**
@@ -26496,11 +26775,11 @@ var FieldAccuracyEvaluator = class {
     };
   }
 };
-function resolvePath(obj, path47) {
-  if (!path47 || !obj) {
+function resolvePath(obj, path48) {
+  if (!path48 || !obj) {
     return void 0;
   }
-  const parts = path47.split(/\.|\[|\]/).filter((p) => p.length > 0);
+  const parts = path48.split(/\.|\[|\]/).filter((p) => p.length > 0);
   let current = obj;
   for (const part of parts) {
     if (current === null || current === void 0) {
@@ -26786,11 +27065,12 @@ function assembleLlmGraderPrompt(input) {
 function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
   const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
   const variables = {
-    [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2),
-    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
-    [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
+    [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
+    [TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(),
+    [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? "").trim(),
     [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
     [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
+    // Deprecated aliases
     [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
     [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
     [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -26973,8 +27253,8 @@ var TokenUsageEvaluator = class {
     };
   }
 };
-function getNestedValue(obj, path47) {
-  const parts = path47.split(".");
+function getNestedValue(obj, path48) {
+  const parts = path48.split(".");
   let current = obj;
   for (const part of parts) {
     if (current === null || current === void 0 || typeof current !== "object") {
@@ -27824,16 +28104,13 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
   const payload = {
     criteria: context2.evalCase.criteria,
     expectedOutput: context2.evalCase.expected_output,
-    outputText: context2.candidate,
     output: context2.output ?? null,
     inputFiles: context2.evalCase.file_paths,
     input: context2.evalCase.input,
     trace: context2.trace ?? null,
     fileChanges: context2.fileChanges ?? null,
     workspacePath: context2.workspacePath ?? null,
-    config: config ?? context2.config ?? null,
-    inputText: context2.evalCase.question,
-    expectedOutputText: context2.evalCase.reference_answer ?? ""
+    config: config ?? context2.config ?? null
   };
   const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
   const scriptPath = script[script.length - 1];
@@ -29469,7 +29746,8 @@ async function runEvaluation(options) {
           const budgetResult = {
             timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
             testId: evalCase.id,
-            eval_set: evalCase.eval_set,
+            dataset: evalCase.dataset,
+            category: evalCase.category,
             score: 0,
             assertions: [],
             output: [],
@@ -29505,7 +29783,8 @@ async function runEvaluation(options) {
           const haltResult = {
             timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
             testId: evalCase.id,
-            eval_set: evalCase.eval_set,
+            dataset: evalCase.dataset,
+            category: evalCase.category,
             score: 0,
             assertions: [],
             output: [],
@@ -30504,7 +30783,8 @@ async function evaluateCandidate(options) {
   return {
     timestamp: completedAt.toISOString(),
     testId: evalCase.id,
-    eval_set: evalCase.eval_set,
+    dataset: evalCase.dataset,
+    category: evalCase.category,
     conversationId: evalCase.conversation_id,
     score: score.score,
     assertions: score.assertions,
@@ -30854,7 +31134,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
   return {
     timestamp: timestamp.toISOString(),
     testId: evalCase.id,
-    eval_set: evalCase.eval_set,
+    dataset: evalCase.dataset,
+    category: evalCase.category,
     conversationId: evalCase.conversation_id,
     score: 0,
     assertions: [{ text: `Error: ${message}`, passed: false }],
@@ -31405,6 +31686,15 @@ function trimBaselineResult(result) {
   }
   return trimmed;
 }
+var DEFAULT_CATEGORY = "Uncategorized";
+function deriveCategory(relativePath) {
+  const parts = relativePath.split(path47.sep);
+  if (parts.length <= 1) {
+    return DEFAULT_CATEGORY;
+  }
+  const dirs = parts.slice(0, -1).filter((d) => d !== "evals");
+  return dirs.length > 0 ? dirs.join("/") : DEFAULT_CATEGORY;
+}
 var OTEL_BACKEND_PRESETS = {
   langfuse: {
     name: "langfuse",
@@ -31527,7 +31817,7 @@ var OtelTraceExporter = class {
         rootSpan.setAttribute("gen_ai.system", "agentv");
         rootSpan.setAttribute("agentv.test_id", result.testId);
         rootSpan.setAttribute("agentv.target", result.target);
-        if (result.eval_set) rootSpan.setAttribute("agentv.eval_set", result.eval_set);
+        if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
         rootSpan.setAttribute("agentv.score", result.score);
         if (captureContent && result.output.length > 0) {
           const lastMsg = result.output[result.output.length - 1];
@@ -31736,7 +32026,7 @@ var OtelStreamingObserver = class {
     this.rootSpan.setAttribute("gen_ai.system", "agentv");
     this.rootSpan.setAttribute("agentv.test_id", testId);
     this.rootSpan.setAttribute("agentv.target", target);
-    if (evalSet) this.rootSpan.setAttribute("agentv.eval_set", evalSet);
+    if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
     this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
   }
   /** Create and immediately export a tool span */
@@ -31907,6 +32197,9 @@ function createAgentKernel() {
 }
 export {
+  isContent,
+  isContentArray,
+  getTextContent,
   TEST_MESSAGE_ROLES,
   isTestMessageRole,
   isJsonObject,
@@ -31922,11 +32215,13 @@ export {
   buildSearchRoots,
   resolveFileReference,
   CLI_PLACEHOLDERS,
+  COMMON_TARGET_SETTINGS,
   resolveTargetDefinition,
-  interpolateEnv,
-  loadCasesFromFile,
   KNOWN_PROVIDERS,
   PROVIDER_ALIASES,
+  extractLastAssistantContent,
+  interpolateEnv,
+  loadCasesFromFile,
   computeTraceSummary,
   DEFAULT_EXPLORATION_TOOLS,
   explorationRatio,
@@ -32002,6 +32297,7 @@ export {
   substituteVariables,
   calculateRubricScore,
   buildScoreRangeOutputSchema,
+  extractImageBlocks,
   CompositeEvaluator,
   CostEvaluator,
   ExecutionMetricsEvaluator,
@@ -32051,9 +32347,11 @@ export {
   shouldEnableCache,
   shouldSkipCacheForTemperature,
   trimBaselineResult,
+  DEFAULT_CATEGORY,
+  deriveCategory,
   OTEL_BACKEND_PRESETS,
   OtelTraceExporter,
   OtelStreamingObserver,
   createAgentKernel
 };
-//# sourceMappingURL=chunk-ELQEFMGO.js.map
+//# sourceMappingURL=chunk-XEAW7OQT.js.map