@agentv/core 3.14.6 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,4 +1,5 @@
1
1
  import {
2
+ COMMON_TARGET_SETTINGS,
2
3
  TEST_MESSAGE_ROLES,
3
4
  buildDirectoryChain,
4
5
  buildSearchRoots,
@@ -6,8 +7,11 @@ import {
6
7
  extractLastAssistantContent,
7
8
  fileExists,
8
9
  findGitRoot,
10
+ getTextContent,
9
11
  interpolateEnv,
10
12
  isAgentProvider,
13
+ isContent,
14
+ isContentArray,
11
15
  isEvaluatorKind,
12
16
  isJsonObject,
13
17
  isJsonValue,
@@ -19,7 +23,7 @@ import {
19
23
  readTextFile,
20
24
  resolveFileReference,
21
25
  resolveTargetDefinition
22
- } from "./chunk-HP5PFOVK.js";
26
+ } from "./chunk-PXYYRDHH.js";
23
27
  import {
24
28
  AgentvProvider
25
29
  } from "./chunk-W5YDZWT4.js";
@@ -690,15 +694,23 @@ var TEMPLATE_VARIABLES = {
690
694
  INPUT: "input",
691
695
  OUTPUT: "output",
692
696
  FILE_CHANGES: "file_changes",
697
+ /** @deprecated Use INPUT instead — resolves to the same text value. */
693
698
  INPUT_TEXT: "input_text",
699
+ /** @deprecated Use OUTPUT instead — resolves to the same text value. */
694
700
  OUTPUT_TEXT: "output_text",
701
+ /** @deprecated Use EXPECTED_OUTPUT instead — resolves to the same text value. */
695
702
  EXPECTED_OUTPUT_TEXT: "expected_output_text"
696
703
  };
697
704
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
698
705
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
699
- TEMPLATE_VARIABLES.OUTPUT_TEXT,
706
+ TEMPLATE_VARIABLES.OUTPUT,
700
707
  TEMPLATE_VARIABLES.EXPECTED_OUTPUT
701
708
  ]);
709
+ var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
710
+ [TEMPLATE_VARIABLES.INPUT_TEXT, TEMPLATE_VARIABLES.INPUT],
711
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT, TEMPLATE_VARIABLES.OUTPUT],
712
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
713
+ ]);
702
714
 
703
715
  // src/evaluation/validation/prompt-validator.ts
704
716
  var ANSI_YELLOW2 = "\x1B[33m";
@@ -720,16 +732,29 @@ function validateTemplateVariables(content, source) {
720
732
  }
721
733
  match = variablePattern.exec(content);
722
734
  }
723
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
724
- const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
735
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
736
+ const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT);
725
737
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
726
738
  if (!hasRequiredFields) {
727
739
  throw new Error(
728
740
  `Missing required fields. Must include at least one of:
729
- - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
741
+ - {{ ${TEMPLATE_VARIABLES.OUTPUT} }}
730
742
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
731
743
  );
732
744
  }
745
+ const deprecatedUsed = [];
746
+ for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
747
+ if (foundVariables.has(deprecated)) {
748
+ deprecatedUsed.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
749
+ }
750
+ }
751
+ if (deprecatedUsed.length > 0) {
752
+ console.warn(
753
+ `${ANSI_YELLOW2}Warning: Template at ${source} uses deprecated variable names:
754
+ ${deprecatedUsed.join("\n ")}
755
+ These still work but will be removed in a future version.${ANSI_RESET3}`
756
+ );
757
+ }
733
758
  if (invalidVariables.length > 0) {
734
759
  const warningMessage = `${ANSI_YELLOW2}Warning: Custom evaluator template at ${source}
735
760
  Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
@@ -2147,6 +2172,19 @@ function asString2(value) {
2147
2172
  }
2148
2173
 
2149
2174
  // src/evaluation/loaders/message-processor.ts
2175
+ var IMAGE_MEDIA_TYPES = {
2176
+ ".png": "image/png",
2177
+ ".jpg": "image/jpeg",
2178
+ ".jpeg": "image/jpeg",
2179
+ ".gif": "image/gif",
2180
+ ".webp": "image/webp",
2181
+ ".svg": "image/svg+xml",
2182
+ ".bmp": "image/bmp"
2183
+ };
2184
+ function detectImageMediaType(filePath) {
2185
+ const ext = path5.extname(filePath).toLowerCase();
2186
+ return IMAGE_MEDIA_TYPES[ext];
2187
+ }
2150
2188
  var ANSI_YELLOW4 = "\x1B[33m";
2151
2189
  var ANSI_RESET5 = "\x1B[0m";
2152
2190
  async function processMessages(options) {
@@ -2212,6 +2250,47 @@ async function processMessages(options) {
2212
2250
  }
2213
2251
  continue;
2214
2252
  }
2253
+ if (segmentType === "image") {
2254
+ const rawValue = asString3(rawSegment.value);
2255
+ if (!rawValue) {
2256
+ continue;
2257
+ }
2258
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
2259
+ rawValue,
2260
+ searchRoots
2261
+ );
2262
+ if (!resolvedPath) {
2263
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
2264
+ const context = messageType === "input" ? "" : " in expected_output";
2265
+ logWarning3(`Image file not found${context}: ${displayPath}`, attempts);
2266
+ continue;
2267
+ }
2268
+ const mediaType = detectImageMediaType(resolvedPath);
2269
+ if (!mediaType) {
2270
+ logWarning3(
2271
+ `Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
2272
+ );
2273
+ continue;
2274
+ }
2275
+ try {
2276
+ const imageBuffer = await readFile4(resolvedPath);
2277
+ const base64 = imageBuffer.toString("base64");
2278
+ processedContent.push({
2279
+ type: "image",
2280
+ media_type: mediaType,
2281
+ source: `data:${mediaType};base64,${base64}`
2282
+ });
2283
+ if (verbose) {
2284
+ const label = messageType === "input" ? "[Image]" : "[Expected Output Image]";
2285
+ console.log(` ${label} Found: ${displayPath}`);
2286
+ console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
2287
+ }
2288
+ } catch (error) {
2289
+ const context = messageType === "input" ? "" : " expected output";
2290
+ logWarning3(`Could not read${context} image ${resolvedPath}: ${error.message}`);
2291
+ }
2292
+ continue;
2293
+ }
2215
2294
  const clonedSegment = cloneJsonObject(rawSegment);
2216
2295
  processedContent.push(clonedSegment);
2217
2296
  const inlineValue = clonedSegment.value;
@@ -2289,6 +2368,46 @@ async function processExpectedMessages(options) {
2289
2368
  }
2290
2369
  continue;
2291
2370
  }
2371
+ if (segmentType === "image") {
2372
+ const rawValue = asString3(rawSegment.value);
2373
+ if (!rawValue) {
2374
+ continue;
2375
+ }
2376
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
2377
+ rawValue,
2378
+ searchRoots
2379
+ );
2380
+ if (!resolvedPath) {
2381
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
2382
+ logWarning3(`Image file not found in expected_output: ${displayPath}`, attempts);
2383
+ continue;
2384
+ }
2385
+ const mediaType = detectImageMediaType(resolvedPath);
2386
+ if (!mediaType) {
2387
+ logWarning3(
2388
+ `Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
2389
+ );
2390
+ continue;
2391
+ }
2392
+ try {
2393
+ const imageBuffer = await readFile4(resolvedPath);
2394
+ const base64 = imageBuffer.toString("base64");
2395
+ processedContent.push({
2396
+ type: "image",
2397
+ media_type: mediaType,
2398
+ source: `data:${mediaType};base64,${base64}`
2399
+ });
2400
+ if (verbose) {
2401
+ console.log(` [Expected Output Image] Found: ${displayPath}`);
2402
+ console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
2403
+ }
2404
+ } catch (error) {
2405
+ logWarning3(
2406
+ `Could not read expected output image ${resolvedPath}: ${error.message}`
2407
+ );
2408
+ }
2409
+ continue;
2410
+ }
2292
2411
  processedContent.push(cloneJsonObject(rawSegment));
2293
2412
  }
2294
2413
  segment.content = processedContent;
@@ -2535,7 +2654,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2535
2654
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
2536
2655
  const testCase = {
2537
2656
  id,
2538
- eval_set: evalSetName,
2657
+ dataset: evalSetName,
2539
2658
  conversation_id: conversationId,
2540
2659
  question,
2541
2660
  input: inputMessages,
@@ -2806,7 +2925,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2806
2925
  }
2807
2926
  const suite = interpolated;
2808
2927
  const evalSetNameFromSuite = asString5(suite.name)?.trim();
2809
- const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
2928
+ const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
2810
2929
  const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
2811
2930
  const rawTestcases = resolveTests(suite);
2812
2931
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
@@ -2927,7 +3046,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2927
3046
  const caseTargets = extractTargetsFromTestCase(evalcase);
2928
3047
  const testCase = {
2929
3048
  id,
2930
- eval_set: evalSetName,
3049
+ dataset: evalSetName,
3050
+ category: options?.category,
2931
3051
  conversation_id: conversationId,
2932
3052
  question,
2933
3053
  input: inputMessages,
@@ -3851,6 +3971,49 @@ import { createWriteStream } from "node:fs";
3851
3971
  import { mkdir } from "node:fs/promises";
3852
3972
  import path10 from "node:path";
3853
3973
 
3974
+ // src/evaluation/providers/claude-content.ts
3975
+ function toContentArray(content) {
3976
+ if (!Array.isArray(content)) return void 0;
3977
+ let hasNonText = false;
3978
+ const blocks = [];
3979
+ for (const part of content) {
3980
+ if (!part || typeof part !== "object") continue;
3981
+ const p = part;
3982
+ if (p.type === "text" && typeof p.text === "string") {
3983
+ blocks.push({ type: "text", text: p.text });
3984
+ } else if (p.type === "image" && typeof p.source === "object" && p.source !== null) {
3985
+ const src = p.source;
3986
+ const mediaType = typeof p.media_type === "string" ? p.media_type : typeof src.media_type === "string" ? src.media_type : "application/octet-stream";
3987
+ const data = typeof src.data === "string" && src.data !== "" ? `data:${mediaType};base64,${src.data}` : typeof p.url === "string" && p.url !== "" ? p.url : "";
3988
+ if (!data) continue;
3989
+ blocks.push({ type: "image", media_type: mediaType, source: data });
3990
+ hasNonText = true;
3991
+ } else if (p.type === "tool_use") {
3992
+ } else if (p.type === "tool_result") {
3993
+ }
3994
+ }
3995
+ return hasNonText && blocks.length > 0 ? blocks : void 0;
3996
+ }
3997
+ function extractTextContent(content) {
3998
+ if (typeof content === "string") {
3999
+ return content;
4000
+ }
4001
+ if (!Array.isArray(content)) {
4002
+ return void 0;
4003
+ }
4004
+ const textParts = [];
4005
+ for (const part of content) {
4006
+ if (!part || typeof part !== "object") {
4007
+ continue;
4008
+ }
4009
+ const p = part;
4010
+ if (p.type === "text" && typeof p.text === "string") {
4011
+ textParts.push(p.text);
4012
+ }
4013
+ }
4014
+ return textParts.length > 0 ? textParts.join("\n") : void 0;
4015
+ }
4016
+
3854
4017
  // src/evaluation/providers/claude-log-tracker.ts
3855
4018
  var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeLogs");
3856
4019
  var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeLogSubscribers");
@@ -4016,11 +4179,12 @@ var ClaudeCliProvider = class {
4016
4179
  if (betaMessage && typeof betaMessage === "object") {
4017
4180
  const msg = betaMessage;
4018
4181
  const content = msg.content;
4182
+ const structuredContent = toContentArray(content);
4019
4183
  const textContent = extractTextContent(content);
4020
4184
  const toolCalls = extractToolCalls(content);
4021
4185
  const outputMsg = {
4022
4186
  role: "assistant",
4023
- content: textContent,
4187
+ content: structuredContent ?? textContent,
4024
4188
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0
4025
4189
  };
4026
4190
  output.push(outputMsg);
@@ -4359,25 +4523,6 @@ function summarizeEvent(event) {
4359
4523
  return void 0;
4360
4524
  }
4361
4525
  }
4362
- function extractTextContent(content) {
4363
- if (typeof content === "string") {
4364
- return content;
4365
- }
4366
- if (!Array.isArray(content)) {
4367
- return void 0;
4368
- }
4369
- const textParts = [];
4370
- for (const part of content) {
4371
- if (!part || typeof part !== "object") {
4372
- continue;
4373
- }
4374
- const p = part;
4375
- if (p.type === "text" && typeof p.text === "string") {
4376
- textParts.push(p.text);
4377
- }
4378
- }
4379
- return textParts.length > 0 ? textParts.join("\n") : void 0;
4380
- }
4381
4526
  function extractToolCalls(content) {
4382
4527
  if (!Array.isArray(content)) {
4383
4528
  return [];
@@ -4550,11 +4695,12 @@ var ClaudeSdkProvider = class {
4550
4695
  if (betaMessage && typeof betaMessage === "object") {
4551
4696
  const msg = betaMessage;
4552
4697
  const content = msg.content;
4553
- const textContent = extractTextContent2(content);
4698
+ const structuredContent = toContentArray(content);
4699
+ const textContent = extractTextContent(content);
4554
4700
  const toolCalls = extractToolCalls2(content);
4555
4701
  const outputMsg = {
4556
4702
  role: "assistant",
4557
- content: textContent,
4703
+ content: structuredContent ?? textContent,
4558
4704
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0
4559
4705
  };
4560
4706
  output.push(outputMsg);
@@ -4672,25 +4818,6 @@ var ClaudeSdkProvider = class {
4672
4818
  }
4673
4819
  }
4674
4820
  };
4675
- function extractTextContent2(content) {
4676
- if (typeof content === "string") {
4677
- return content;
4678
- }
4679
- if (!Array.isArray(content)) {
4680
- return void 0;
4681
- }
4682
- const textParts = [];
4683
- for (const part of content) {
4684
- if (!part || typeof part !== "object") {
4685
- continue;
4686
- }
4687
- const p = part;
4688
- if (p.type === "text" && typeof p.text === "string") {
4689
- textParts.push(p.text);
4690
- }
4691
- }
4692
- return textParts.length > 0 ? textParts.join("\n") : void 0;
4693
- }
4694
4821
  function extractToolCalls2(content) {
4695
4822
  if (!Array.isArray(content)) {
4696
4823
  return [];
@@ -4914,7 +5041,7 @@ function convertMessages(messages) {
4914
5041
  return messages.map((msg) => ({
4915
5042
  role: msg.role,
4916
5043
  name: msg.name,
4917
- content: msg.content,
5044
+ content: isContentArray(msg.content) ? msg.content : typeof msg.content === "string" ? msg.content : void 0,
4918
5045
  toolCalls: msg.tool_calls?.map((tc) => ({
4919
5046
  tool: tc.tool,
4920
5047
  input: tc.input,
@@ -7167,6 +7294,35 @@ function extractPiTextContent(content) {
7167
7294
  }
7168
7295
  return textParts.length > 0 ? textParts.join("\n") : void 0;
7169
7296
  }
7297
+ function toPiContentArray(content) {
7298
+ if (!Array.isArray(content)) return void 0;
7299
+ let hasNonText = false;
7300
+ const blocks = [];
7301
+ for (const part of content) {
7302
+ if (!part || typeof part !== "object") continue;
7303
+ const p = part;
7304
+ if (p.type === "text" && typeof p.text === "string") {
7305
+ blocks.push({ type: "text", text: p.text });
7306
+ } else if (p.type === "image") {
7307
+ const mediaType = typeof p.media_type === "string" ? p.media_type : "application/octet-stream";
7308
+ let source = "";
7309
+ if (typeof p.source === "object" && p.source !== null) {
7310
+ const src = p.source;
7311
+ const srcMediaType = typeof src.media_type === "string" ? src.media_type : mediaType;
7312
+ source = typeof src.data === "string" ? `data:${srcMediaType};base64,${src.data}` : "";
7313
+ }
7314
+ if (!source && typeof p.url === "string") {
7315
+ source = p.url;
7316
+ }
7317
+ if (source) {
7318
+ blocks.push({ type: "image", media_type: mediaType, source });
7319
+ hasNonText = true;
7320
+ }
7321
+ } else if (p.type === "tool_use" || p.type === "tool_result") {
7322
+ }
7323
+ }
7324
+ return hasNonText && blocks.length > 0 ? blocks : void 0;
7325
+ }
7170
7326
  function toFiniteNumber(value) {
7171
7327
  if (typeof value === "number" && Number.isFinite(value)) return value;
7172
7328
  return void 0;
@@ -8337,7 +8493,8 @@ function convertAgentMessage(message, toolTrackers, completedToolResults) {
8337
8493
  }
8338
8494
  const msg = message;
8339
8495
  const role = typeof msg.role === "string" ? msg.role : "unknown";
8340
- const content = extractPiTextContent(msg.content);
8496
+ const structuredContent = toPiContentArray(msg.content);
8497
+ const content = structuredContent ?? extractPiTextContent(msg.content);
8341
8498
  const toolCalls = extractToolCalls4(msg.content, toolTrackers, completedToolResults);
8342
8499
  const startTimeVal = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
8343
8500
  let msgTokenUsage;
@@ -10187,13 +10344,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
10187
10344
  async function execShellWithStdin(command, stdinPayload, options = {}) {
10188
10345
  const { mkdir: mkdir16, readFile: readFile14, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
10189
10346
  const { tmpdir: tmpdir3 } = await import("node:os");
10190
- const path47 = await import("node:path");
10347
+ const path48 = await import("node:path");
10191
10348
  const { randomUUID: randomUUID10 } = await import("node:crypto");
10192
- const dir = path47.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
10349
+ const dir = path48.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
10193
10350
  await mkdir16(dir, { recursive: true });
10194
- const stdinPath = path47.join(dir, "stdin.txt");
10195
- const stdoutPath = path47.join(dir, "stdout.txt");
10196
- const stderrPath = path47.join(dir, "stderr.txt");
10351
+ const stdinPath = path48.join(dir, "stdin.txt");
10352
+ const stdoutPath = path48.join(dir, "stdout.txt");
10353
+ const stderrPath = path48.join(dir, "stderr.txt");
10197
10354
  await writeFile9(stdinPath, stdinPayload, "utf8");
10198
10355
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
10199
10356
  const { spawn: spawn5 } = await import("node:child_process");
@@ -10509,6 +10666,56 @@ function toCamelCaseDeep(obj) {
10509
10666
 
10510
10667
  // src/evaluation/evaluators/code-evaluator.ts
10511
10668
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
10669
+ var DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
10670
+ async function materializeContentForGrader(messages, getWorkDir) {
10671
+ if (!messages || messages.length === 0) return messages ?? null;
10672
+ let hasAnyImage = false;
10673
+ for (const msg of messages) {
10674
+ if (isContentArray(msg.content)) {
10675
+ for (const block of msg.content) {
10676
+ if (block.type === "image") {
10677
+ hasAnyImage = true;
10678
+ break;
10679
+ }
10680
+ }
10681
+ }
10682
+ if (hasAnyImage) break;
10683
+ }
10684
+ if (!hasAnyImage) return messages;
10685
+ let counter = 0;
10686
+ const result = [];
10687
+ for (const msg of messages) {
10688
+ if (!isContentArray(msg.content)) {
10689
+ result.push(msg);
10690
+ continue;
10691
+ }
10692
+ if (!msg.content.some((b) => b.type === "image")) {
10693
+ result.push(msg);
10694
+ continue;
10695
+ }
10696
+ const blocks = [];
10697
+ for (const block of msg.content) {
10698
+ if (block.type !== "image") {
10699
+ blocks.push({ ...block });
10700
+ continue;
10701
+ }
10702
+ const img = block;
10703
+ const match = DATA_URI_RE.exec(img.source);
10704
+ if (match) {
10705
+ const [, mediaType, base64Data] = match;
10706
+ const ext = mediaType.split("/")[1] === "jpeg" ? "jpg" : mediaType.split("/")[1] ?? "bin";
10707
+ const dir = await getWorkDir();
10708
+ const filePath = join(dir, `img-${counter++}.${ext}`);
10709
+ await writeFile6(filePath, Buffer.from(base64Data, "base64"));
10710
+ blocks.push({ type: "image", media_type: img.media_type, path: filePath });
10711
+ } else {
10712
+ blocks.push({ type: "image", media_type: img.media_type, path: img.source });
10713
+ }
10714
+ }
10715
+ result.push({ ...msg, content: blocks });
10716
+ }
10717
+ return result;
10718
+ }
10512
10719
  var CodeEvaluator = class {
10513
10720
  kind = "code-grader";
10514
10721
  command;
@@ -10524,7 +10731,18 @@ var CodeEvaluator = class {
10524
10731
  this.target = options.target;
10525
10732
  }
10526
10733
  async evaluate(context) {
10527
- let outputForPayload = context.output ?? null;
10734
+ let imageTmpDir;
10735
+ const getImageDir = async () => {
10736
+ if (!imageTmpDir) {
10737
+ imageTmpDir = await mkdtemp2(join(tmpdir2(), "agentv-img-"));
10738
+ }
10739
+ return imageTmpDir;
10740
+ };
10741
+ const materializedOutput = await materializeContentForGrader(
10742
+ context.output,
10743
+ getImageDir
10744
+ );
10745
+ let outputForPayload = materializedOutput;
10528
10746
  let outputPath;
10529
10747
  if (outputForPayload) {
10530
10748
  const serialized = JSON.stringify(outputForPayload);
@@ -10537,12 +10755,17 @@ var CodeEvaluator = class {
10537
10755
  }
10538
10756
  const payload = {
10539
10757
  criteria: context.evalCase.criteria,
10540
- expectedOutput: context.evalCase.expected_output,
10541
- outputText: context.candidate,
10758
+ expectedOutput: await materializeContentForGrader(
10759
+ context.evalCase.expected_output,
10760
+ getImageDir
10761
+ ),
10542
10762
  output: outputForPayload,
10543
10763
  outputPath,
10544
10764
  inputFiles: context.evalCase.file_paths,
10545
- input: context.evalCase.input,
10765
+ input: await materializeContentForGrader(
10766
+ context.evalCase.input,
10767
+ getImageDir
10768
+ ),
10546
10769
  trace: context.trace ?? null,
10547
10770
  tokenUsage: context.tokenUsage ?? null,
10548
10771
  costUsd: context.costUsd ?? null,
@@ -10551,9 +10774,7 @@ var CodeEvaluator = class {
10551
10774
  endTime: context.endTime ?? null,
10552
10775
  fileChanges: context.fileChanges ?? null,
10553
10776
  workspacePath: context.workspacePath ?? null,
10554
- config: this.config ?? null,
10555
- inputText: context.evalCase.question,
10556
- expectedOutputText: context.evalCase.reference_answer ?? ""
10777
+ config: this.config ?? null
10557
10778
  };
10558
10779
  const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
10559
10780
  let proxyEnv;
@@ -10643,6 +10864,10 @@ var CodeEvaluator = class {
10643
10864
  await rm3(dirname(outputPath), { recursive: true, force: true }).catch(() => {
10644
10865
  });
10645
10866
  }
10867
+ if (imageTmpDir) {
10868
+ await rm3(imageTmpDir, { recursive: true, force: true }).catch(() => {
10869
+ });
10870
+ }
10646
10871
  }
10647
10872
  }
10648
10873
  };
@@ -10720,13 +10945,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
10720
10945
  {{${TEMPLATE_VARIABLES.CRITERIA}}}
10721
10946
 
10722
10947
  [[ ## question ## ]]
10723
- {{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
10948
+ {{${TEMPLATE_VARIABLES.INPUT}}}
10724
10949
 
10725
10950
  [[ ## reference_answer ## ]]
10726
- {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
10951
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT}}}
10727
10952
 
10728
10953
  [[ ## answer ## ]]
10729
- {{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
10954
+ {{${TEMPLATE_VARIABLES.OUTPUT}}}`;
10730
10955
  var freeformEvaluationSchema = z3.object({
10731
10956
  score: z3.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
10732
10957
  assertions: z3.array(
@@ -10798,21 +11023,19 @@ var LlmGraderEvaluator = class {
10798
11023
  async evaluateFreeform(context, graderProvider) {
10799
11024
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
10800
11025
  const variables = {
10801
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context.evalCase.input, null, 2),
10802
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(
10803
- context.evalCase.expected_output,
10804
- null,
10805
- 2
10806
- ),
10807
- [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context.output ?? [], null, 2),
11026
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
11027
+ [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
11028
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
10808
11029
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
10809
11030
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
11031
+ // Deprecated aliases — same values as the primary variables above
10810
11032
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
10811
11033
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
10812
11034
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
10813
11035
  };
10814
11036
  const systemPrompt = buildOutputSchema();
10815
11037
  const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
11038
+ warnDeprecatedTemplateVars(evaluatorTemplate);
10816
11039
  let userPrompt = substituteVariables(evaluatorTemplate, variables);
10817
11040
  if (context.fileChanges && !context.evaluatorTemplateOverride && !this.evaluatorTemplate) {
10818
11041
  userPrompt += `
@@ -10824,13 +11047,15 @@ ${context.fileChanges}`;
10824
11047
  userPrompt,
10825
11048
  systemPrompt
10826
11049
  };
11050
+ const images = context.output ? extractImageBlocks(context.output) : [];
10827
11051
  try {
10828
11052
  const { data, tokenUsage } = await this.runWithRetry({
10829
11053
  context,
10830
11054
  graderProvider,
10831
11055
  systemPrompt,
10832
11056
  userPrompt,
10833
- schema: freeformEvaluationSchema
11057
+ schema: freeformEvaluationSchema,
11058
+ images
10834
11059
  });
10835
11060
  const score = clampScore(data.score);
10836
11061
  const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
@@ -10874,13 +11099,15 @@ ${context.fileChanges}`;
10874
11099
  userPrompt: prompt,
10875
11100
  systemPrompt
10876
11101
  };
11102
+ const images = context.output ? extractImageBlocks(context.output) : [];
10877
11103
  try {
10878
11104
  const { data, tokenUsage } = await this.runWithRetry({
10879
11105
  context,
10880
11106
  graderProvider,
10881
11107
  systemPrompt,
10882
11108
  userPrompt: prompt,
10883
- schema: rubricEvaluationSchema
11109
+ schema: rubricEvaluationSchema,
11110
+ images
10884
11111
  });
10885
11112
  const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
10886
11113
  return {
@@ -10917,13 +11144,15 @@ ${context.fileChanges}`;
10917
11144
  userPrompt: prompt,
10918
11145
  systemPrompt
10919
11146
  };
11147
+ const images = context.output ? extractImageBlocks(context.output) : [];
10920
11148
  try {
10921
11149
  const { data, tokenUsage } = await this.runWithRetry({
10922
11150
  context,
10923
11151
  graderProvider,
10924
11152
  systemPrompt,
10925
11153
  userPrompt: prompt,
10926
- schema: scoreRangeEvaluationSchema
11154
+ schema: scoreRangeEvaluationSchema,
11155
+ images
10927
11156
  });
10928
11157
  const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
10929
11158
  return {
@@ -11130,12 +11359,17 @@ ${context.fileChanges}`;
11130
11359
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
11131
11360
  const variables = {
11132
11361
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
11362
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
11363
+ [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
11364
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
11365
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
11366
+ // Deprecated aliases
11133
11367
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
11134
11368
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
11135
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
11136
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
11369
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
11137
11370
  };
11138
11371
  if (this.evaluatorTemplate) {
11372
+ warnDeprecatedTemplateVars(this.evaluatorTemplate);
11139
11373
  return substituteVariables(this.evaluatorTemplate, variables);
11140
11374
  }
11141
11375
  const config = context.evaluator;
@@ -11186,11 +11420,16 @@ ${context.fileChanges}`;
11186
11420
  if (this.evaluatorTemplate) {
11187
11421
  const variables = {
11188
11422
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
11423
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
11424
+ [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
11425
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
11426
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
11427
+ // Deprecated aliases
11189
11428
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
11190
11429
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
11191
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
11192
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
11430
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
11193
11431
  };
11432
+ warnDeprecatedTemplateVars(this.evaluatorTemplate);
11194
11433
  const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
11195
11434
  const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
11196
11435
  return `${customPrompt}
@@ -11361,18 +11600,35 @@ ${outputSchema}`;
11361
11600
  // LLM mode retry logic
11362
11601
  // ---------------------------------------------------------------------------
11363
11602
  async runWithRetry(options) {
11364
- const { context, graderProvider, systemPrompt, userPrompt, schema } = options;
11603
+ const { context, graderProvider, systemPrompt, userPrompt, schema, images } = options;
11365
11604
  let lastError;
11366
11605
  for (let attempt = 1; attempt <= 3; attempt++) {
11367
11606
  try {
11368
11607
  const model = graderProvider.asLanguageModel?.();
11369
11608
  if (model) {
11370
- const result = await generateText2({
11609
+ const modelOptions = {
11610
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
11611
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
11612
+ };
11613
+ const hasImages = images && images.length > 0;
11614
+ const result = hasImages ? await generateText2({
11615
+ model,
11616
+ system: systemPrompt,
11617
+ messages: [
11618
+ {
11619
+ role: "user",
11620
+ content: [
11621
+ { type: "text", text: userPrompt },
11622
+ ...toAiSdkImageParts(images)
11623
+ ]
11624
+ }
11625
+ ],
11626
+ ...modelOptions
11627
+ }) : await generateText2({
11371
11628
  model,
11372
11629
  system: systemPrompt,
11373
11630
  prompt: userPrompt,
11374
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
11375
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
11631
+ ...modelOptions
11376
11632
  });
11377
11633
  const data2 = schema.parse(parseJsonFromText(result.text));
11378
11634
  const rawUsage = result.usage;
@@ -11432,6 +11688,26 @@ function substituteVariables(template, variables) {
11432
11688
  return variables[varName] ?? match;
11433
11689
  });
11434
11690
  }
11691
+ var ANSI_YELLOW7 = "\x1B[33m";
11692
+ var ANSI_RESET8 = "\x1B[0m";
11693
+ var warnedTemplateStrings = /* @__PURE__ */ new Set();
11694
+ function warnDeprecatedTemplateVars(template) {
11695
+ if (warnedTemplateStrings.has(template)) return;
11696
+ const used = [];
11697
+ for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
11698
+ if (new RegExp(`\\{\\{\\s*${deprecated}\\s*\\}\\}`).test(template)) {
11699
+ used.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
11700
+ }
11701
+ }
11702
+ if (used.length > 0) {
11703
+ warnedTemplateStrings.add(template);
11704
+ console.warn(
11705
+ `${ANSI_YELLOW7}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
11706
+ ${used.join("\n ")}
11707
+ Update your custom evaluator template to use the new names.${ANSI_RESET8}`
11708
+ );
11709
+ }
11710
+ }
11435
11711
  function calculateRubricScore(result, rubrics) {
11436
11712
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
11437
11713
  const assertions = [];
@@ -11526,6 +11802,26 @@ function calculateScoreRangeResult(result, rubrics) {
11526
11802
  }
11527
11803
  };
11528
11804
  }
11805
+ function extractImageBlocks(messages) {
11806
+ const images = [];
11807
+ for (const msg of messages) {
11808
+ if (msg.role !== "assistant") continue;
11809
+ if (!isContentArray(msg.content)) continue;
11810
+ for (const block of msg.content) {
11811
+ if (block.type === "image") {
11812
+ images.push(block);
11813
+ }
11814
+ }
11815
+ }
11816
+ return images;
11817
+ }
11818
+ function toAiSdkImageParts(images) {
11819
+ return images.map((img) => ({
11820
+ type: "image",
11821
+ image: img.source,
11822
+ mediaType: img.media_type || void 0
11823
+ }));
11824
+ }
11529
11825
  function resolveSandboxed(basePath, relativePath) {
11530
11826
  const resolved = path35.resolve(basePath, relativePath);
11531
11827
  if (!resolved.startsWith(basePath + path35.sep) && resolved !== basePath) {
@@ -12267,115 +12563,115 @@ var FieldAccuracyEvaluator = class {
12267
12563
  * Evaluate a single field against the expected value.
12268
12564
  */
12269
12565
  evaluateField(fieldConfig, candidateData, expectedData) {
12270
- const { path: path47, match, required = true, weight = 1 } = fieldConfig;
12271
- const candidateValue = resolvePath(candidateData, path47);
12272
- const expectedValue = resolvePath(expectedData, path47);
12566
+ const { path: path48, match, required = true, weight = 1 } = fieldConfig;
12567
+ const candidateValue = resolvePath(candidateData, path48);
12568
+ const expectedValue = resolvePath(expectedData, path48);
12273
12569
  if (expectedValue === void 0) {
12274
12570
  return {
12275
- path: path47,
12571
+ path: path48,
12276
12572
  score: 1,
12277
12573
  // No expected value means no comparison needed
12278
12574
  weight,
12279
12575
  hit: true,
12280
- message: `${path47}: no expected value`
12576
+ message: `${path48}: no expected value`
12281
12577
  };
12282
12578
  }
12283
12579
  if (candidateValue === void 0) {
12284
12580
  if (required) {
12285
12581
  return {
12286
- path: path47,
12582
+ path: path48,
12287
12583
  score: 0,
12288
12584
  weight,
12289
12585
  hit: false,
12290
- message: `${path47} (required, missing)`
12586
+ message: `${path48} (required, missing)`
12291
12587
  };
12292
12588
  }
12293
12589
  return {
12294
- path: path47,
12590
+ path: path48,
12295
12591
  score: 1,
12296
12592
  // Don't penalize missing optional fields
12297
12593
  weight: 0,
12298
12594
  // Zero weight means it won't affect the score
12299
12595
  hit: true,
12300
- message: `${path47}: optional field missing`
12596
+ message: `${path48}: optional field missing`
12301
12597
  };
12302
12598
  }
12303
12599
  switch (match) {
12304
12600
  case "exact":
12305
- return this.compareExact(path47, candidateValue, expectedValue, weight);
12601
+ return this.compareExact(path48, candidateValue, expectedValue, weight);
12306
12602
  case "numeric_tolerance":
12307
12603
  return this.compareNumericTolerance(
12308
- path47,
12604
+ path48,
12309
12605
  candidateValue,
12310
12606
  expectedValue,
12311
12607
  fieldConfig,
12312
12608
  weight
12313
12609
  );
12314
12610
  case "date":
12315
- return this.compareDate(path47, candidateValue, expectedValue, fieldConfig, weight);
12611
+ return this.compareDate(path48, candidateValue, expectedValue, fieldConfig, weight);
12316
12612
  default:
12317
12613
  return {
12318
- path: path47,
12614
+ path: path48,
12319
12615
  score: 0,
12320
12616
  weight,
12321
12617
  hit: false,
12322
- message: `${path47}: unknown match type "${match}"`
12618
+ message: `${path48}: unknown match type "${match}"`
12323
12619
  };
12324
12620
  }
12325
12621
  }
12326
12622
  /**
12327
12623
  * Exact equality comparison.
12328
12624
  */
12329
- compareExact(path47, candidateValue, expectedValue, weight) {
12625
+ compareExact(path48, candidateValue, expectedValue, weight) {
12330
12626
  if (deepEqual(candidateValue, expectedValue)) {
12331
12627
  return {
12332
- path: path47,
12628
+ path: path48,
12333
12629
  score: 1,
12334
12630
  weight,
12335
12631
  hit: true,
12336
- message: path47
12632
+ message: path48
12337
12633
  };
12338
12634
  }
12339
12635
  if (typeof candidateValue !== typeof expectedValue) {
12340
12636
  return {
12341
- path: path47,
12637
+ path: path48,
12342
12638
  score: 0,
12343
12639
  weight,
12344
12640
  hit: false,
12345
- message: `${path47} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
12641
+ message: `${path48} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
12346
12642
  };
12347
12643
  }
12348
12644
  return {
12349
- path: path47,
12645
+ path: path48,
12350
12646
  score: 0,
12351
12647
  weight,
12352
12648
  hit: false,
12353
- message: `${path47} (value mismatch)`
12649
+ message: `${path48} (value mismatch)`
12354
12650
  };
12355
12651
  }
12356
12652
  /**
12357
12653
  * Numeric comparison with absolute or relative tolerance.
12358
12654
  */
12359
- compareNumericTolerance(path47, candidateValue, expectedValue, fieldConfig, weight) {
12655
+ compareNumericTolerance(path48, candidateValue, expectedValue, fieldConfig, weight) {
12360
12656
  const { tolerance = 0, relative = false } = fieldConfig;
12361
12657
  const candidateNum = toNumber(candidateValue);
12362
12658
  const expectedNum = toNumber(expectedValue);
12363
12659
  if (candidateNum === null || expectedNum === null) {
12364
12660
  return {
12365
- path: path47,
12661
+ path: path48,
12366
12662
  score: 0,
12367
12663
  weight,
12368
12664
  hit: false,
12369
- message: `${path47} (non-numeric value)`
12665
+ message: `${path48} (non-numeric value)`
12370
12666
  };
12371
12667
  }
12372
12668
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
12373
12669
  return {
12374
- path: path47,
12670
+ path: path48,
12375
12671
  score: 0,
12376
12672
  weight,
12377
12673
  hit: false,
12378
- message: `${path47} (invalid numeric value)`
12674
+ message: `${path48} (invalid numeric value)`
12379
12675
  };
12380
12676
  }
12381
12677
  const diff = Math.abs(candidateNum - expectedNum);
@@ -12388,61 +12684,61 @@ var FieldAccuracyEvaluator = class {
12388
12684
  }
12389
12685
  if (withinTolerance) {
12390
12686
  return {
12391
- path: path47,
12687
+ path: path48,
12392
12688
  score: 1,
12393
12689
  weight,
12394
12690
  hit: true,
12395
- message: `${path47} (within tolerance: diff=${diff.toFixed(2)})`
12691
+ message: `${path48} (within tolerance: diff=${diff.toFixed(2)})`
12396
12692
  };
12397
12693
  }
12398
12694
  return {
12399
- path: path47,
12695
+ path: path48,
12400
12696
  score: 0,
12401
12697
  weight,
12402
12698
  hit: false,
12403
- message: `${path47} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
12699
+ message: `${path48} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
12404
12700
  };
12405
12701
  }
12406
12702
  /**
12407
12703
  * Date comparison with format normalization.
12408
12704
  */
12409
- compareDate(path47, candidateValue, expectedValue, fieldConfig, weight) {
12705
+ compareDate(path48, candidateValue, expectedValue, fieldConfig, weight) {
12410
12706
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
12411
12707
  const candidateDate = parseDate(String(candidateValue), formats);
12412
12708
  const expectedDate = parseDate(String(expectedValue), formats);
12413
12709
  if (candidateDate === null) {
12414
12710
  return {
12415
- path: path47,
12711
+ path: path48,
12416
12712
  score: 0,
12417
12713
  weight,
12418
12714
  hit: false,
12419
- message: `${path47} (unparseable candidate date)`
12715
+ message: `${path48} (unparseable candidate date)`
12420
12716
  };
12421
12717
  }
12422
12718
  if (expectedDate === null) {
12423
12719
  return {
12424
- path: path47,
12720
+ path: path48,
12425
12721
  score: 0,
12426
12722
  weight,
12427
12723
  hit: false,
12428
- message: `${path47} (unparseable expected date)`
12724
+ message: `${path48} (unparseable expected date)`
12429
12725
  };
12430
12726
  }
12431
12727
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
12432
12728
  return {
12433
- path: path47,
12729
+ path: path48,
12434
12730
  score: 1,
12435
12731
  weight,
12436
12732
  hit: true,
12437
- message: path47
12733
+ message: path48
12438
12734
  };
12439
12735
  }
12440
12736
  return {
12441
- path: path47,
12737
+ path: path48,
12442
12738
  score: 0,
12443
12739
  weight,
12444
12740
  hit: false,
12445
- message: `${path47} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
12741
+ message: `${path48} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
12446
12742
  };
12447
12743
  }
12448
12744
  /**
@@ -12475,11 +12771,11 @@ var FieldAccuracyEvaluator = class {
12475
12771
  };
12476
12772
  }
12477
12773
  };
12478
- function resolvePath(obj, path47) {
12479
- if (!path47 || !obj) {
12774
+ function resolvePath(obj, path48) {
12775
+ if (!path48 || !obj) {
12480
12776
  return void 0;
12481
12777
  }
12482
- const parts = path47.split(/\.|\[|\]/).filter((p) => p.length > 0);
12778
+ const parts = path48.split(/\.|\[|\]/).filter((p) => p.length > 0);
12483
12779
  let current = obj;
12484
12780
  for (const part of parts) {
12485
12781
  if (current === null || current === void 0) {
@@ -12771,11 +13067,12 @@ function assembleLlmGraderPrompt(input) {
12771
13067
  function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
12772
13068
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
12773
13069
  const variables = {
12774
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2),
12775
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
12776
- [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
13070
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
13071
+ [TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(),
13072
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? "").trim(),
12777
13073
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
12778
13074
  [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
13075
+ // Deprecated aliases
12779
13076
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
12780
13077
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
12781
13078
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -12962,8 +13259,8 @@ var TokenUsageEvaluator = class {
12962
13259
  };
12963
13260
 
12964
13261
  // src/evaluation/evaluators/tool-trajectory.ts
12965
- function getNestedValue(obj, path47) {
12966
- const parts = path47.split(".");
13262
+ function getNestedValue(obj, path48) {
13263
+ const parts = path48.split(".");
12967
13264
  let current = obj;
12968
13265
  for (const part of parts) {
12969
13266
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -13832,16 +14129,13 @@ async function executePromptTemplate(script, context, config, timeoutMs) {
13832
14129
  const payload = {
13833
14130
  criteria: context.evalCase.criteria,
13834
14131
  expectedOutput: context.evalCase.expected_output,
13835
- outputText: context.candidate,
13836
14132
  output: context.output ?? null,
13837
14133
  inputFiles: context.evalCase.file_paths,
13838
14134
  input: context.evalCase.input,
13839
14135
  trace: context.trace ?? null,
13840
14136
  fileChanges: context.fileChanges ?? null,
13841
14137
  workspacePath: context.workspacePath ?? null,
13842
- config: config ?? context.config ?? null,
13843
- inputText: context.evalCase.question,
13844
- expectedOutputText: context.evalCase.reference_answer ?? ""
14138
+ config: config ?? context.config ?? null
13845
14139
  };
13846
14140
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
13847
14141
  const scriptPath = script[script.length - 1];
@@ -15521,7 +15815,8 @@ async function runEvaluation(options) {
15521
15815
  const budgetResult = {
15522
15816
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
15523
15817
  testId: evalCase.id,
15524
- eval_set: evalCase.eval_set,
15818
+ dataset: evalCase.dataset,
15819
+ category: evalCase.category,
15525
15820
  score: 0,
15526
15821
  assertions: [],
15527
15822
  output: [],
@@ -15557,7 +15852,8 @@ async function runEvaluation(options) {
15557
15852
  const haltResult = {
15558
15853
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
15559
15854
  testId: evalCase.id,
15560
- eval_set: evalCase.eval_set,
15855
+ dataset: evalCase.dataset,
15856
+ category: evalCase.category,
15561
15857
  score: 0,
15562
15858
  assertions: [],
15563
15859
  output: [],
@@ -16556,7 +16852,8 @@ async function evaluateCandidate(options) {
16556
16852
  return {
16557
16853
  timestamp: completedAt.toISOString(),
16558
16854
  testId: evalCase.id,
16559
- eval_set: evalCase.eval_set,
16855
+ dataset: evalCase.dataset,
16856
+ category: evalCase.category,
16560
16857
  conversationId: evalCase.conversation_id,
16561
16858
  score: score.score,
16562
16859
  assertions: score.assertions,
@@ -16906,7 +17203,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
16906
17203
  return {
16907
17204
  timestamp: timestamp.toISOString(),
16908
17205
  testId: evalCase.id,
16909
- eval_set: evalCase.eval_set,
17206
+ dataset: evalCase.dataset,
17207
+ category: evalCase.category,
16910
17208
  conversationId: evalCase.conversation_id,
16911
17209
  score: 0,
16912
17210
  assertions: [{ text: `Error: ${message}`, passed: false }],
@@ -17479,6 +17777,18 @@ function trimBaselineResult(result) {
17479
17777
  return trimmed;
17480
17778
  }
17481
17779
 
17780
+ // src/evaluation/category.ts
17781
+ import path47 from "node:path";
17782
+ var DEFAULT_CATEGORY = "Uncategorized";
17783
+ function deriveCategory(relativePath) {
17784
+ const parts = relativePath.split(path47.sep);
17785
+ if (parts.length <= 1) {
17786
+ return DEFAULT_CATEGORY;
17787
+ }
17788
+ const dirs = parts.slice(0, -1).filter((d) => d !== "evals");
17789
+ return dirs.length > 0 ? dirs.join("/") : DEFAULT_CATEGORY;
17790
+ }
17791
+
17482
17792
  // src/observability/otel-exporter.ts
17483
17793
  var OTEL_BACKEND_PRESETS = {
17484
17794
  langfuse: {
@@ -17602,7 +17912,7 @@ var OtelTraceExporter = class {
17602
17912
  rootSpan.setAttribute("gen_ai.system", "agentv");
17603
17913
  rootSpan.setAttribute("agentv.test_id", result.testId);
17604
17914
  rootSpan.setAttribute("agentv.target", result.target);
17605
- if (result.eval_set) rootSpan.setAttribute("agentv.eval_set", result.eval_set);
17915
+ if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
17606
17916
  rootSpan.setAttribute("agentv.score", result.score);
17607
17917
  if (captureContent && result.output.length > 0) {
17608
17918
  const lastMsg = result.output[result.output.length - 1];
@@ -17811,7 +18121,7 @@ var OtelStreamingObserver = class {
17811
18121
  this.rootSpan.setAttribute("gen_ai.system", "agentv");
17812
18122
  this.rootSpan.setAttribute("agentv.test_id", testId);
17813
18123
  this.rootSpan.setAttribute("agentv.target", target);
17814
- if (evalSet) this.rootSpan.setAttribute("agentv.eval_set", evalSet);
18124
+ if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
17815
18125
  this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
17816
18126
  }
17817
18127
  /** Create and immediately export a tool span */
@@ -17983,9 +18293,11 @@ function createAgentKernel() {
17983
18293
  return { status: "stub" };
17984
18294
  }
17985
18295
  export {
18296
+ COMMON_TARGET_SETTINGS,
17986
18297
  CodeEvaluator,
17987
18298
  CompositeEvaluator,
17988
18299
  CostEvaluator,
18300
+ DEFAULT_CATEGORY,
17989
18301
  DEFAULT_EVALUATOR_TEMPLATE,
17990
18302
  DEFAULT_EVAL_PATTERNS,
17991
18303
  DEFAULT_EXPLORATION_TOOLS,
@@ -18039,6 +18351,7 @@ export {
18039
18351
  createTempWorkspace,
18040
18352
  deepEqual,
18041
18353
  defineConfig,
18354
+ deriveCategory,
18042
18355
  detectFormat,
18043
18356
  discoverAssertions,
18044
18357
  discoverCopilotSessions,
@@ -18052,7 +18365,9 @@ export {
18052
18365
  explorationRatio,
18053
18366
  extractCacheConfig,
18054
18367
  extractFailOnError,
18368
+ extractImageBlocks,
18055
18369
  extractJsonBlob,
18370
+ extractLastAssistantContent,
18056
18371
  extractTargetFromSuite,
18057
18372
  extractTargetsFromSuite,
18058
18373
  extractTargetsFromTestCase,
@@ -18066,12 +18381,15 @@ export {
18066
18381
  getAgentvHome,
18067
18382
  getOutputFilenames,
18068
18383
  getSubagentsRoot,
18384
+ getTextContent,
18069
18385
  getTraceStateRoot,
18070
18386
  getWorkspacePath,
18071
18387
  getWorkspacePoolRoot,
18072
18388
  getWorkspacesRoot,
18073
18389
  initializeBaseline,
18074
18390
  isAgentSkillsFormat,
18391
+ isContent,
18392
+ isContentArray,
18075
18393
  isEvaluatorKind,
18076
18394
  isJsonObject,
18077
18395
  isJsonValue,