@agentv/core 3.14.5 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1,4 +1,5 @@
1
1
  import {
2
+ COMMON_TARGET_SETTINGS,
2
3
  TEST_MESSAGE_ROLES,
3
4
  buildDirectoryChain,
4
5
  buildSearchRoots,
@@ -6,8 +7,11 @@ import {
6
7
  extractLastAssistantContent,
7
8
  fileExists,
8
9
  findGitRoot,
10
+ getTextContent,
9
11
  interpolateEnv,
10
12
  isAgentProvider,
13
+ isContent,
14
+ isContentArray,
11
15
  isEvaluatorKind,
12
16
  isJsonObject,
13
17
  isJsonValue,
@@ -19,7 +23,7 @@ import {
19
23
  readTextFile,
20
24
  resolveFileReference,
21
25
  resolveTargetDefinition
22
- } from "./chunk-HP5PFOVK.js";
26
+ } from "./chunk-PXYYRDHH.js";
23
27
  import {
24
28
  AgentvProvider
25
29
  } from "./chunk-W5YDZWT4.js";
@@ -690,15 +694,23 @@ var TEMPLATE_VARIABLES = {
690
694
  INPUT: "input",
691
695
  OUTPUT: "output",
692
696
  FILE_CHANGES: "file_changes",
697
+ /** @deprecated Use INPUT instead — resolves to the same text value. */
693
698
  INPUT_TEXT: "input_text",
699
+ /** @deprecated Use OUTPUT instead — resolves to the same text value. */
694
700
  OUTPUT_TEXT: "output_text",
701
+ /** @deprecated Use EXPECTED_OUTPUT instead — resolves to the same text value. */
695
702
  EXPECTED_OUTPUT_TEXT: "expected_output_text"
696
703
  };
697
704
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
698
705
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
699
- TEMPLATE_VARIABLES.OUTPUT_TEXT,
706
+ TEMPLATE_VARIABLES.OUTPUT,
700
707
  TEMPLATE_VARIABLES.EXPECTED_OUTPUT
701
708
  ]);
709
+ var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
710
+ [TEMPLATE_VARIABLES.INPUT_TEXT, TEMPLATE_VARIABLES.INPUT],
711
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT, TEMPLATE_VARIABLES.OUTPUT],
712
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
713
+ ]);
702
714
 
703
715
  // src/evaluation/validation/prompt-validator.ts
704
716
  var ANSI_YELLOW2 = "\x1B[33m";
@@ -720,16 +732,29 @@ function validateTemplateVariables(content, source) {
720
732
  }
721
733
  match = variablePattern.exec(content);
722
734
  }
723
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
735
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
724
736
  const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
725
737
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
726
738
  if (!hasRequiredFields) {
727
739
  throw new Error(
728
740
  `Missing required fields. Must include at least one of:
729
- - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
741
+ - {{ ${TEMPLATE_VARIABLES.OUTPUT} }}
730
742
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
731
743
  );
732
744
  }
745
+ const deprecatedUsed = [];
746
+ for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
747
+ if (foundVariables.has(deprecated)) {
748
+ deprecatedUsed.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
749
+ }
750
+ }
751
+ if (deprecatedUsed.length > 0) {
752
+ console.warn(
753
+ `${ANSI_YELLOW2}Warning: Template at ${source} uses deprecated variable names:
754
+ ${deprecatedUsed.join("\n ")}
755
+ These still work but will be removed in a future version.${ANSI_RESET3}`
756
+ );
757
+ }
733
758
  if (invalidVariables.length > 0) {
734
759
  const warningMessage = `${ANSI_YELLOW2}Warning: Custom evaluator template at ${source}
735
760
  Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
@@ -2147,6 +2172,19 @@ function asString2(value) {
2147
2172
  }
2148
2173
 
2149
2174
  // src/evaluation/loaders/message-processor.ts
2175
+ var IMAGE_MEDIA_TYPES = {
2176
+ ".png": "image/png",
2177
+ ".jpg": "image/jpeg",
2178
+ ".jpeg": "image/jpeg",
2179
+ ".gif": "image/gif",
2180
+ ".webp": "image/webp",
2181
+ ".svg": "image/svg+xml",
2182
+ ".bmp": "image/bmp"
2183
+ };
2184
+ function detectImageMediaType(filePath) {
2185
+ const ext = path5.extname(filePath).toLowerCase();
2186
+ return IMAGE_MEDIA_TYPES[ext];
2187
+ }
2150
2188
  var ANSI_YELLOW4 = "\x1B[33m";
2151
2189
  var ANSI_RESET5 = "\x1B[0m";
2152
2190
  async function processMessages(options) {
@@ -2212,6 +2250,47 @@ async function processMessages(options) {
2212
2250
  }
2213
2251
  continue;
2214
2252
  }
2253
+ if (segmentType === "image") {
2254
+ const rawValue = asString3(rawSegment.value);
2255
+ if (!rawValue) {
2256
+ continue;
2257
+ }
2258
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
2259
+ rawValue,
2260
+ searchRoots
2261
+ );
2262
+ if (!resolvedPath) {
2263
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
2264
+ const context = messageType === "input" ? "" : " in expected_output";
2265
+ logWarning3(`Image file not found${context}: ${displayPath}`, attempts);
2266
+ continue;
2267
+ }
2268
+ const mediaType = detectImageMediaType(resolvedPath);
2269
+ if (!mediaType) {
2270
+ logWarning3(
2271
+ `Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
2272
+ );
2273
+ continue;
2274
+ }
2275
+ try {
2276
+ const imageBuffer = await readFile4(resolvedPath);
2277
+ const base64 = imageBuffer.toString("base64");
2278
+ processedContent.push({
2279
+ type: "image",
2280
+ media_type: mediaType,
2281
+ source: `data:${mediaType};base64,${base64}`
2282
+ });
2283
+ if (verbose) {
2284
+ const label = messageType === "input" ? "[Image]" : "[Expected Output Image]";
2285
+ console.log(` ${label} Found: ${displayPath}`);
2286
+ console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
2287
+ }
2288
+ } catch (error) {
2289
+ const context = messageType === "input" ? "" : " expected output";
2290
+ logWarning3(`Could not read${context} image ${resolvedPath}: ${error.message}`);
2291
+ }
2292
+ continue;
2293
+ }
2215
2294
  const clonedSegment = cloneJsonObject(rawSegment);
2216
2295
  processedContent.push(clonedSegment);
2217
2296
  const inlineValue = clonedSegment.value;
@@ -2289,6 +2368,46 @@ async function processExpectedMessages(options) {
2289
2368
  }
2290
2369
  continue;
2291
2370
  }
2371
+ if (segmentType === "image") {
2372
+ const rawValue = asString3(rawSegment.value);
2373
+ if (!rawValue) {
2374
+ continue;
2375
+ }
2376
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
2377
+ rawValue,
2378
+ searchRoots
2379
+ );
2380
+ if (!resolvedPath) {
2381
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
2382
+ logWarning3(`Image file not found in expected_output: ${displayPath}`, attempts);
2383
+ continue;
2384
+ }
2385
+ const mediaType = detectImageMediaType(resolvedPath);
2386
+ if (!mediaType) {
2387
+ logWarning3(
2388
+ `Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
2389
+ );
2390
+ continue;
2391
+ }
2392
+ try {
2393
+ const imageBuffer = await readFile4(resolvedPath);
2394
+ const base64 = imageBuffer.toString("base64");
2395
+ processedContent.push({
2396
+ type: "image",
2397
+ media_type: mediaType,
2398
+ source: `data:${mediaType};base64,${base64}`
2399
+ });
2400
+ if (verbose) {
2401
+ console.log(` [Expected Output Image] Found: ${displayPath}`);
2402
+ console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
2403
+ }
2404
+ } catch (error) {
2405
+ logWarning3(
2406
+ `Could not read expected output image ${resolvedPath}: ${error.message}`
2407
+ );
2408
+ }
2409
+ continue;
2410
+ }
2292
2411
  processedContent.push(cloneJsonObject(rawSegment));
2293
2412
  }
2294
2413
  segment.content = processedContent;
@@ -2535,7 +2654,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
2535
2654
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
2536
2655
  const testCase = {
2537
2656
  id,
2538
- eval_set: evalSetName,
2657
+ dataset: evalSetName,
2539
2658
  conversation_id: conversationId,
2540
2659
  question,
2541
2660
  input: inputMessages,
@@ -2806,7 +2925,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2806
2925
  }
2807
2926
  const suite = interpolated;
2808
2927
  const evalSetNameFromSuite = asString5(suite.name)?.trim();
2809
- const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
2928
+ const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
2810
2929
  const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
2811
2930
  const rawTestcases = resolveTests(suite);
2812
2931
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
@@ -2927,7 +3046,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
2927
3046
  const caseTargets = extractTargetsFromTestCase(evalcase);
2928
3047
  const testCase = {
2929
3048
  id,
2930
- eval_set: evalSetName,
3049
+ dataset: evalSetName,
3050
+ category: options?.category,
2931
3051
  conversation_id: conversationId,
2932
3052
  question,
2933
3053
  input: inputMessages,
@@ -3851,6 +3971,48 @@ import { createWriteStream } from "node:fs";
3851
3971
  import { mkdir } from "node:fs/promises";
3852
3972
  import path10 from "node:path";
3853
3973
 
3974
+ // src/evaluation/providers/claude-content.ts
3975
+ function toContentArray(content) {
3976
+ if (!Array.isArray(content)) return void 0;
3977
+ let hasNonText = false;
3978
+ const blocks = [];
3979
+ for (const part of content) {
3980
+ if (!part || typeof part !== "object") continue;
3981
+ const p = part;
3982
+ if (p.type === "text" && typeof p.text === "string") {
3983
+ blocks.push({ type: "text", text: p.text });
3984
+ } else if (p.type === "image" && typeof p.source === "object" && p.source !== null) {
3985
+ const src = p.source;
3986
+ const mediaType = typeof p.media_type === "string" ? p.media_type : typeof src.media_type === "string" ? src.media_type : "application/octet-stream";
3987
+ const data = typeof src.data === "string" ? `data:${mediaType};base64,${src.data}` : typeof p.url === "string" ? p.url : "";
3988
+ blocks.push({ type: "image", media_type: mediaType, source: data });
3989
+ hasNonText = true;
3990
+ } else if (p.type === "tool_use") {
3991
+ } else if (p.type === "tool_result") {
3992
+ }
3993
+ }
3994
+ return hasNonText && blocks.length > 0 ? blocks : void 0;
3995
+ }
3996
+ function extractTextContent(content) {
3997
+ if (typeof content === "string") {
3998
+ return content;
3999
+ }
4000
+ if (!Array.isArray(content)) {
4001
+ return void 0;
4002
+ }
4003
+ const textParts = [];
4004
+ for (const part of content) {
4005
+ if (!part || typeof part !== "object") {
4006
+ continue;
4007
+ }
4008
+ const p = part;
4009
+ if (p.type === "text" && typeof p.text === "string") {
4010
+ textParts.push(p.text);
4011
+ }
4012
+ }
4013
+ return textParts.length > 0 ? textParts.join("\n") : void 0;
4014
+ }
4015
+
3854
4016
  // src/evaluation/providers/claude-log-tracker.ts
3855
4017
  var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeLogs");
3856
4018
  var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeLogSubscribers");
@@ -4016,11 +4178,12 @@ var ClaudeCliProvider = class {
4016
4178
  if (betaMessage && typeof betaMessage === "object") {
4017
4179
  const msg = betaMessage;
4018
4180
  const content = msg.content;
4181
+ const structuredContent = toContentArray(content);
4019
4182
  const textContent = extractTextContent(content);
4020
4183
  const toolCalls = extractToolCalls(content);
4021
4184
  const outputMsg = {
4022
4185
  role: "assistant",
4023
- content: textContent,
4186
+ content: structuredContent ?? textContent,
4024
4187
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0
4025
4188
  };
4026
4189
  output.push(outputMsg);
@@ -4359,25 +4522,6 @@ function summarizeEvent(event) {
4359
4522
  return void 0;
4360
4523
  }
4361
4524
  }
4362
- function extractTextContent(content) {
4363
- if (typeof content === "string") {
4364
- return content;
4365
- }
4366
- if (!Array.isArray(content)) {
4367
- return void 0;
4368
- }
4369
- const textParts = [];
4370
- for (const part of content) {
4371
- if (!part || typeof part !== "object") {
4372
- continue;
4373
- }
4374
- const p = part;
4375
- if (p.type === "text" && typeof p.text === "string") {
4376
- textParts.push(p.text);
4377
- }
4378
- }
4379
- return textParts.length > 0 ? textParts.join("\n") : void 0;
4380
- }
4381
4525
  function extractToolCalls(content) {
4382
4526
  if (!Array.isArray(content)) {
4383
4527
  return [];
@@ -4550,11 +4694,12 @@ var ClaudeSdkProvider = class {
4550
4694
  if (betaMessage && typeof betaMessage === "object") {
4551
4695
  const msg = betaMessage;
4552
4696
  const content = msg.content;
4553
- const textContent = extractTextContent2(content);
4697
+ const structuredContent = toContentArray(content);
4698
+ const textContent = extractTextContent(content);
4554
4699
  const toolCalls = extractToolCalls2(content);
4555
4700
  const outputMsg = {
4556
4701
  role: "assistant",
4557
- content: textContent,
4702
+ content: structuredContent ?? textContent,
4558
4703
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0
4559
4704
  };
4560
4705
  output.push(outputMsg);
@@ -4672,25 +4817,6 @@ var ClaudeSdkProvider = class {
4672
4817
  }
4673
4818
  }
4674
4819
  };
4675
- function extractTextContent2(content) {
4676
- if (typeof content === "string") {
4677
- return content;
4678
- }
4679
- if (!Array.isArray(content)) {
4680
- return void 0;
4681
- }
4682
- const textParts = [];
4683
- for (const part of content) {
4684
- if (!part || typeof part !== "object") {
4685
- continue;
4686
- }
4687
- const p = part;
4688
- if (p.type === "text" && typeof p.text === "string") {
4689
- textParts.push(p.text);
4690
- }
4691
- }
4692
- return textParts.length > 0 ? textParts.join("\n") : void 0;
4693
- }
4694
4820
  function extractToolCalls2(content) {
4695
4821
  if (!Array.isArray(content)) {
4696
4822
  return [];
@@ -4914,7 +5040,7 @@ function convertMessages(messages) {
4914
5040
  return messages.map((msg) => ({
4915
5041
  role: msg.role,
4916
5042
  name: msg.name,
4917
- content: msg.content,
5043
+ content: isContentArray(msg.content) ? msg.content : typeof msg.content === "string" ? msg.content : void 0,
4918
5044
  toolCalls: msg.tool_calls?.map((tc) => ({
4919
5045
  tool: tc.tool,
4920
5046
  input: tc.input,
@@ -7167,6 +7293,35 @@ function extractPiTextContent(content) {
7167
7293
  }
7168
7294
  return textParts.length > 0 ? textParts.join("\n") : void 0;
7169
7295
  }
7296
+ function toPiContentArray(content) {
7297
+ if (!Array.isArray(content)) return void 0;
7298
+ let hasNonText = false;
7299
+ const blocks = [];
7300
+ for (const part of content) {
7301
+ if (!part || typeof part !== "object") continue;
7302
+ const p = part;
7303
+ if (p.type === "text" && typeof p.text === "string") {
7304
+ blocks.push({ type: "text", text: p.text });
7305
+ } else if (p.type === "image") {
7306
+ const mediaType = typeof p.media_type === "string" ? p.media_type : "application/octet-stream";
7307
+ let source = "";
7308
+ if (typeof p.source === "object" && p.source !== null) {
7309
+ const src = p.source;
7310
+ const srcMediaType = typeof src.media_type === "string" ? src.media_type : mediaType;
7311
+ source = typeof src.data === "string" ? `data:${srcMediaType};base64,${src.data}` : "";
7312
+ }
7313
+ if (!source && typeof p.url === "string") {
7314
+ source = p.url;
7315
+ }
7316
+ if (source) {
7317
+ blocks.push({ type: "image", media_type: mediaType, source });
7318
+ hasNonText = true;
7319
+ }
7320
+ } else if (p.type === "tool_use" || p.type === "tool_result") {
7321
+ }
7322
+ }
7323
+ return hasNonText && blocks.length > 0 ? blocks : void 0;
7324
+ }
7170
7325
  function toFiniteNumber(value) {
7171
7326
  if (typeof value === "number" && Number.isFinite(value)) return value;
7172
7327
  return void 0;
@@ -8337,7 +8492,8 @@ function convertAgentMessage(message, toolTrackers, completedToolResults) {
8337
8492
  }
8338
8493
  const msg = message;
8339
8494
  const role = typeof msg.role === "string" ? msg.role : "unknown";
8340
- const content = extractPiTextContent(msg.content);
8495
+ const structuredContent = toPiContentArray(msg.content);
8496
+ const content = structuredContent ?? extractPiTextContent(msg.content);
8341
8497
  const toolCalls = extractToolCalls4(msg.content, toolTrackers, completedToolResults);
8342
8498
  const startTimeVal = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
8343
8499
  let msgTokenUsage;
@@ -10187,13 +10343,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
10187
10343
  async function execShellWithStdin(command, stdinPayload, options = {}) {
10188
10344
  const { mkdir: mkdir16, readFile: readFile14, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
10189
10345
  const { tmpdir: tmpdir3 } = await import("node:os");
10190
- const path47 = await import("node:path");
10346
+ const path48 = await import("node:path");
10191
10347
  const { randomUUID: randomUUID10 } = await import("node:crypto");
10192
- const dir = path47.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
10348
+ const dir = path48.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
10193
10349
  await mkdir16(dir, { recursive: true });
10194
- const stdinPath = path47.join(dir, "stdin.txt");
10195
- const stdoutPath = path47.join(dir, "stdout.txt");
10196
- const stderrPath = path47.join(dir, "stderr.txt");
10350
+ const stdinPath = path48.join(dir, "stdin.txt");
10351
+ const stdoutPath = path48.join(dir, "stdout.txt");
10352
+ const stderrPath = path48.join(dir, "stderr.txt");
10197
10353
  await writeFile9(stdinPath, stdinPayload, "utf8");
10198
10354
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
10199
10355
  const { spawn: spawn5 } = await import("node:child_process");
@@ -10509,6 +10665,56 @@ function toCamelCaseDeep(obj) {
10509
10665
 
10510
10666
  // src/evaluation/evaluators/code-evaluator.ts
10511
10667
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
10668
+ var DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
10669
+ async function materializeContentForGrader(messages, getWorkDir) {
10670
+ if (!messages || messages.length === 0) return messages ?? null;
10671
+ let hasAnyImage = false;
10672
+ for (const msg of messages) {
10673
+ if (isContentArray(msg.content)) {
10674
+ for (const block of msg.content) {
10675
+ if (block.type === "image") {
10676
+ hasAnyImage = true;
10677
+ break;
10678
+ }
10679
+ }
10680
+ }
10681
+ if (hasAnyImage) break;
10682
+ }
10683
+ if (!hasAnyImage) return messages;
10684
+ let counter = 0;
10685
+ const result = [];
10686
+ for (const msg of messages) {
10687
+ if (!isContentArray(msg.content)) {
10688
+ result.push(msg);
10689
+ continue;
10690
+ }
10691
+ if (!msg.content.some((b) => b.type === "image")) {
10692
+ result.push(msg);
10693
+ continue;
10694
+ }
10695
+ const blocks = [];
10696
+ for (const block of msg.content) {
10697
+ if (block.type !== "image") {
10698
+ blocks.push({ ...block });
10699
+ continue;
10700
+ }
10701
+ const img = block;
10702
+ const match = DATA_URI_RE.exec(img.source);
10703
+ if (match) {
10704
+ const [, mediaType, base64Data] = match;
10705
+ const ext = mediaType.split("/")[1] === "jpeg" ? "jpg" : mediaType.split("/")[1] ?? "bin";
10706
+ const dir = await getWorkDir();
10707
+ const filePath = join(dir, `img-${counter++}.${ext}`);
10708
+ await writeFile6(filePath, Buffer.from(base64Data, "base64"));
10709
+ blocks.push({ type: "image", media_type: img.media_type, path: filePath });
10710
+ } else {
10711
+ blocks.push({ type: "image", media_type: img.media_type, path: img.source });
10712
+ }
10713
+ }
10714
+ result.push({ ...msg, content: blocks });
10715
+ }
10716
+ return result;
10717
+ }
10512
10718
  var CodeEvaluator = class {
10513
10719
  kind = "code-grader";
10514
10720
  command;
@@ -10524,7 +10730,18 @@ var CodeEvaluator = class {
10524
10730
  this.target = options.target;
10525
10731
  }
10526
10732
  async evaluate(context) {
10527
- let outputForPayload = context.output ?? null;
10733
+ let imageTmpDir;
10734
+ const getImageDir = async () => {
10735
+ if (!imageTmpDir) {
10736
+ imageTmpDir = await mkdtemp2(join(tmpdir2(), "agentv-img-"));
10737
+ }
10738
+ return imageTmpDir;
10739
+ };
10740
+ const materializedOutput = await materializeContentForGrader(
10741
+ context.output,
10742
+ getImageDir
10743
+ );
10744
+ let outputForPayload = materializedOutput;
10528
10745
  let outputPath;
10529
10746
  if (outputForPayload) {
10530
10747
  const serialized = JSON.stringify(outputForPayload);
@@ -10537,12 +10754,17 @@ var CodeEvaluator = class {
10537
10754
  }
10538
10755
  const payload = {
10539
10756
  criteria: context.evalCase.criteria,
10540
- expectedOutput: context.evalCase.expected_output,
10541
- outputText: context.candidate,
10757
+ expectedOutput: await materializeContentForGrader(
10758
+ context.evalCase.expected_output,
10759
+ getImageDir
10760
+ ),
10542
10761
  output: outputForPayload,
10543
10762
  outputPath,
10544
10763
  inputFiles: context.evalCase.file_paths,
10545
- input: context.evalCase.input,
10764
+ input: await materializeContentForGrader(
10765
+ context.evalCase.input,
10766
+ getImageDir
10767
+ ),
10546
10768
  trace: context.trace ?? null,
10547
10769
  tokenUsage: context.tokenUsage ?? null,
10548
10770
  costUsd: context.costUsd ?? null,
@@ -10551,9 +10773,7 @@ var CodeEvaluator = class {
10551
10773
  endTime: context.endTime ?? null,
10552
10774
  fileChanges: context.fileChanges ?? null,
10553
10775
  workspacePath: context.workspacePath ?? null,
10554
- config: this.config ?? null,
10555
- inputText: context.evalCase.question,
10556
- expectedOutputText: context.evalCase.reference_answer ?? ""
10776
+ config: this.config ?? null
10557
10777
  };
10558
10778
  const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
10559
10779
  let proxyEnv;
@@ -10643,6 +10863,10 @@ var CodeEvaluator = class {
10643
10863
  await rm3(dirname(outputPath), { recursive: true, force: true }).catch(() => {
10644
10864
  });
10645
10865
  }
10866
+ if (imageTmpDir) {
10867
+ await rm3(imageTmpDir, { recursive: true, force: true }).catch(() => {
10868
+ });
10869
+ }
10646
10870
  }
10647
10871
  }
10648
10872
  };
@@ -10720,13 +10944,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
10720
10944
  {{${TEMPLATE_VARIABLES.CRITERIA}}}
10721
10945
 
10722
10946
  [[ ## question ## ]]
10723
- {{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
10947
+ {{${TEMPLATE_VARIABLES.INPUT}}}
10724
10948
 
10725
10949
  [[ ## reference_answer ## ]]
10726
- {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
10950
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT}}}
10727
10951
 
10728
10952
  [[ ## answer ## ]]
10729
- {{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
10953
+ {{${TEMPLATE_VARIABLES.OUTPUT}}}`;
10730
10954
  var freeformEvaluationSchema = z3.object({
10731
10955
  score: z3.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
10732
10956
  assertions: z3.array(
@@ -10798,21 +11022,19 @@ var LlmGraderEvaluator = class {
10798
11022
  async evaluateFreeform(context, graderProvider) {
10799
11023
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
10800
11024
  const variables = {
10801
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context.evalCase.input, null, 2),
10802
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(
10803
- context.evalCase.expected_output,
10804
- null,
10805
- 2
10806
- ),
10807
- [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context.output ?? [], null, 2),
11025
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
11026
+ [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
11027
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
10808
11028
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
10809
11029
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
11030
+ // Deprecated aliases — same values as the primary variables above
10810
11031
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
10811
11032
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
10812
11033
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
10813
11034
  };
10814
11035
  const systemPrompt = buildOutputSchema();
10815
11036
  const evaluatorTemplate = context.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
11037
+ warnDeprecatedTemplateVars(evaluatorTemplate);
10816
11038
  let userPrompt = substituteVariables(evaluatorTemplate, variables);
10817
11039
  if (context.fileChanges && !context.evaluatorTemplateOverride && !this.evaluatorTemplate) {
10818
11040
  userPrompt += `
@@ -10824,13 +11046,15 @@ ${context.fileChanges}`;
10824
11046
  userPrompt,
10825
11047
  systemPrompt
10826
11048
  };
11049
+ const images = context.output ? extractImageBlocks(context.output) : [];
10827
11050
  try {
10828
11051
  const { data, tokenUsage } = await this.runWithRetry({
10829
11052
  context,
10830
11053
  graderProvider,
10831
11054
  systemPrompt,
10832
11055
  userPrompt,
10833
- schema: freeformEvaluationSchema
11056
+ schema: freeformEvaluationSchema,
11057
+ images
10834
11058
  });
10835
11059
  const score = clampScore(data.score);
10836
11060
  const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
@@ -10874,13 +11098,15 @@ ${context.fileChanges}`;
10874
11098
  userPrompt: prompt,
10875
11099
  systemPrompt
10876
11100
  };
11101
+ const images = context.output ? extractImageBlocks(context.output) : [];
10877
11102
  try {
10878
11103
  const { data, tokenUsage } = await this.runWithRetry({
10879
11104
  context,
10880
11105
  graderProvider,
10881
11106
  systemPrompt,
10882
11107
  userPrompt: prompt,
10883
- schema: rubricEvaluationSchema
11108
+ schema: rubricEvaluationSchema,
11109
+ images
10884
11110
  });
10885
11111
  const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
10886
11112
  return {
@@ -10917,13 +11143,15 @@ ${context.fileChanges}`;
10917
11143
  userPrompt: prompt,
10918
11144
  systemPrompt
10919
11145
  };
11146
+ const images = context.output ? extractImageBlocks(context.output) : [];
10920
11147
  try {
10921
11148
  const { data, tokenUsage } = await this.runWithRetry({
10922
11149
  context,
10923
11150
  graderProvider,
10924
11151
  systemPrompt,
10925
11152
  userPrompt: prompt,
10926
- schema: scoreRangeEvaluationSchema
11153
+ schema: scoreRangeEvaluationSchema,
11154
+ images
10927
11155
  });
10928
11156
  const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
10929
11157
  return {
@@ -11130,12 +11358,17 @@ ${context.fileChanges}`;
11130
11358
  const formattedQuestion = context.promptInputs.question && context.promptInputs.question.trim().length > 0 ? context.promptInputs.question : context.evalCase.question;
11131
11359
  const variables = {
11132
11360
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
11361
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
11362
+ [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
11363
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
11364
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
11365
+ // Deprecated aliases
11133
11366
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
11134
11367
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
11135
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
11136
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
11368
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
11137
11369
  };
11138
11370
  if (this.evaluatorTemplate) {
11371
+ warnDeprecatedTemplateVars(this.evaluatorTemplate);
11139
11372
  return substituteVariables(this.evaluatorTemplate, variables);
11140
11373
  }
11141
11374
  const config = context.evaluator;
@@ -11186,11 +11419,16 @@ ${context.fileChanges}`;
11186
11419
  if (this.evaluatorTemplate) {
11187
11420
  const variables = {
11188
11421
  [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(),
11422
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
11423
+ [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(),
11424
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? "").trim(),
11425
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? "",
11426
+ // Deprecated aliases
11189
11427
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
11190
11428
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(),
11191
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim(),
11192
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? ""
11429
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context.evalCase.reference_answer ?? "").trim()
11193
11430
  };
11431
+ warnDeprecatedTemplateVars(this.evaluatorTemplate);
11194
11432
  const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
11195
11433
  const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
11196
11434
  return `${customPrompt}
@@ -11361,18 +11599,35 @@ ${outputSchema}`;
11361
11599
  // LLM mode retry logic
11362
11600
  // ---------------------------------------------------------------------------
11363
11601
  async runWithRetry(options) {
11364
- const { context, graderProvider, systemPrompt, userPrompt, schema } = options;
11602
+ const { context, graderProvider, systemPrompt, userPrompt, schema, images } = options;
11365
11603
  let lastError;
11366
11604
  for (let attempt = 1; attempt <= 3; attempt++) {
11367
11605
  try {
11368
11606
  const model = graderProvider.asLanguageModel?.();
11369
11607
  if (model) {
11370
- const result = await generateText2({
11608
+ const modelOptions = {
11609
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
11610
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
11611
+ };
11612
+ const hasImages = images && images.length > 0;
11613
+ const result = hasImages ? await generateText2({
11614
+ model,
11615
+ system: systemPrompt,
11616
+ messages: [
11617
+ {
11618
+ role: "user",
11619
+ content: [
11620
+ { type: "text", text: userPrompt },
11621
+ ...toAiSdkImageParts(images)
11622
+ ]
11623
+ }
11624
+ ],
11625
+ ...modelOptions
11626
+ }) : await generateText2({
11371
11627
  model,
11372
11628
  system: systemPrompt,
11373
11629
  prompt: userPrompt,
11374
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
11375
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
11630
+ ...modelOptions
11376
11631
  });
11377
11632
  const data2 = schema.parse(parseJsonFromText(result.text));
11378
11633
  const rawUsage = result.usage;
@@ -11432,6 +11687,26 @@ function substituteVariables(template, variables) {
11432
11687
  return variables[varName] ?? match;
11433
11688
  });
11434
11689
  }
11690
+ var ANSI_YELLOW7 = "\x1B[33m";
11691
+ var ANSI_RESET8 = "\x1B[0m";
11692
+ var warnedTemplateStrings = /* @__PURE__ */ new Set();
11693
+ function warnDeprecatedTemplateVars(template) {
11694
+ if (warnedTemplateStrings.has(template)) return;
11695
+ const used = [];
11696
+ for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
11697
+ if (new RegExp(`\\{\\{\\s*${deprecated}\\s*\\}\\}`).test(template)) {
11698
+ used.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
11699
+ }
11700
+ }
11701
+ if (used.length > 0) {
11702
+ warnedTemplateStrings.add(template);
11703
+ console.warn(
11704
+ `${ANSI_YELLOW7}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
11705
+ ${used.join("\n ")}
11706
+ Update your custom evaluator template to use the new names.${ANSI_RESET8}`
11707
+ );
11708
+ }
11709
+ }
11435
11710
  function calculateRubricScore(result, rubrics) {
11436
11711
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
11437
11712
  const assertions = [];
@@ -11526,6 +11801,26 @@ function calculateScoreRangeResult(result, rubrics) {
11526
11801
  }
11527
11802
  };
11528
11803
  }
11804
+ function extractImageBlocks(messages) {
11805
+ const images = [];
11806
+ for (const msg of messages) {
11807
+ if (msg.role !== "assistant") continue;
11808
+ if (!isContentArray(msg.content)) continue;
11809
+ for (const block of msg.content) {
11810
+ if (block.type === "image") {
11811
+ images.push(block);
11812
+ }
11813
+ }
11814
+ }
11815
+ return images;
11816
+ }
11817
+ function toAiSdkImageParts(images) {
11818
+ return images.map((img) => ({
11819
+ type: "image",
11820
+ image: img.source,
11821
+ mediaType: img.media_type || void 0
11822
+ }));
11823
+ }
11529
11824
  function resolveSandboxed(basePath, relativePath) {
11530
11825
  const resolved = path35.resolve(basePath, relativePath);
11531
11826
  if (!resolved.startsWith(basePath + path35.sep) && resolved !== basePath) {
@@ -12267,115 +12562,115 @@ var FieldAccuracyEvaluator = class {
12267
12562
  * Evaluate a single field against the expected value.
12268
12563
  */
12269
12564
  evaluateField(fieldConfig, candidateData, expectedData) {
12270
- const { path: path47, match, required = true, weight = 1 } = fieldConfig;
12271
- const candidateValue = resolvePath(candidateData, path47);
12272
- const expectedValue = resolvePath(expectedData, path47);
12565
+ const { path: path48, match, required = true, weight = 1 } = fieldConfig;
12566
+ const candidateValue = resolvePath(candidateData, path48);
12567
+ const expectedValue = resolvePath(expectedData, path48);
12273
12568
  if (expectedValue === void 0) {
12274
12569
  return {
12275
- path: path47,
12570
+ path: path48,
12276
12571
  score: 1,
12277
12572
  // No expected value means no comparison needed
12278
12573
  weight,
12279
12574
  hit: true,
12280
- message: `${path47}: no expected value`
12575
+ message: `${path48}: no expected value`
12281
12576
  };
12282
12577
  }
12283
12578
  if (candidateValue === void 0) {
12284
12579
  if (required) {
12285
12580
  return {
12286
- path: path47,
12581
+ path: path48,
12287
12582
  score: 0,
12288
12583
  weight,
12289
12584
  hit: false,
12290
- message: `${path47} (required, missing)`
12585
+ message: `${path48} (required, missing)`
12291
12586
  };
12292
12587
  }
12293
12588
  return {
12294
- path: path47,
12589
+ path: path48,
12295
12590
  score: 1,
12296
12591
  // Don't penalize missing optional fields
12297
12592
  weight: 0,
12298
12593
  // Zero weight means it won't affect the score
12299
12594
  hit: true,
12300
- message: `${path47}: optional field missing`
12595
+ message: `${path48}: optional field missing`
12301
12596
  };
12302
12597
  }
12303
12598
  switch (match) {
12304
12599
  case "exact":
12305
- return this.compareExact(path47, candidateValue, expectedValue, weight);
12600
+ return this.compareExact(path48, candidateValue, expectedValue, weight);
12306
12601
  case "numeric_tolerance":
12307
12602
  return this.compareNumericTolerance(
12308
- path47,
12603
+ path48,
12309
12604
  candidateValue,
12310
12605
  expectedValue,
12311
12606
  fieldConfig,
12312
12607
  weight
12313
12608
  );
12314
12609
  case "date":
12315
- return this.compareDate(path47, candidateValue, expectedValue, fieldConfig, weight);
12610
+ return this.compareDate(path48, candidateValue, expectedValue, fieldConfig, weight);
12316
12611
  default:
12317
12612
  return {
12318
- path: path47,
12613
+ path: path48,
12319
12614
  score: 0,
12320
12615
  weight,
12321
12616
  hit: false,
12322
- message: `${path47}: unknown match type "${match}"`
12617
+ message: `${path48}: unknown match type "${match}"`
12323
12618
  };
12324
12619
  }
12325
12620
  }
12326
12621
  /**
12327
12622
  * Exact equality comparison.
12328
12623
  */
12329
- compareExact(path47, candidateValue, expectedValue, weight) {
12624
+ compareExact(path48, candidateValue, expectedValue, weight) {
12330
12625
  if (deepEqual(candidateValue, expectedValue)) {
12331
12626
  return {
12332
- path: path47,
12627
+ path: path48,
12333
12628
  score: 1,
12334
12629
  weight,
12335
12630
  hit: true,
12336
- message: path47
12631
+ message: path48
12337
12632
  };
12338
12633
  }
12339
12634
  if (typeof candidateValue !== typeof expectedValue) {
12340
12635
  return {
12341
- path: path47,
12636
+ path: path48,
12342
12637
  score: 0,
12343
12638
  weight,
12344
12639
  hit: false,
12345
- message: `${path47} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
12640
+ message: `${path48} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
12346
12641
  };
12347
12642
  }
12348
12643
  return {
12349
- path: path47,
12644
+ path: path48,
12350
12645
  score: 0,
12351
12646
  weight,
12352
12647
  hit: false,
12353
- message: `${path47} (value mismatch)`
12648
+ message: `${path48} (value mismatch)`
12354
12649
  };
12355
12650
  }
12356
12651
  /**
12357
12652
  * Numeric comparison with absolute or relative tolerance.
12358
12653
  */
12359
- compareNumericTolerance(path47, candidateValue, expectedValue, fieldConfig, weight) {
12654
+ compareNumericTolerance(path48, candidateValue, expectedValue, fieldConfig, weight) {
12360
12655
  const { tolerance = 0, relative = false } = fieldConfig;
12361
12656
  const candidateNum = toNumber(candidateValue);
12362
12657
  const expectedNum = toNumber(expectedValue);
12363
12658
  if (candidateNum === null || expectedNum === null) {
12364
12659
  return {
12365
- path: path47,
12660
+ path: path48,
12366
12661
  score: 0,
12367
12662
  weight,
12368
12663
  hit: false,
12369
- message: `${path47} (non-numeric value)`
12664
+ message: `${path48} (non-numeric value)`
12370
12665
  };
12371
12666
  }
12372
12667
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
12373
12668
  return {
12374
- path: path47,
12669
+ path: path48,
12375
12670
  score: 0,
12376
12671
  weight,
12377
12672
  hit: false,
12378
- message: `${path47} (invalid numeric value)`
12673
+ message: `${path48} (invalid numeric value)`
12379
12674
  };
12380
12675
  }
12381
12676
  const diff = Math.abs(candidateNum - expectedNum);
@@ -12388,61 +12683,61 @@ var FieldAccuracyEvaluator = class {
12388
12683
  }
12389
12684
  if (withinTolerance) {
12390
12685
  return {
12391
- path: path47,
12686
+ path: path48,
12392
12687
  score: 1,
12393
12688
  weight,
12394
12689
  hit: true,
12395
- message: `${path47} (within tolerance: diff=${diff.toFixed(2)})`
12690
+ message: `${path48} (within tolerance: diff=${diff.toFixed(2)})`
12396
12691
  };
12397
12692
  }
12398
12693
  return {
12399
- path: path47,
12694
+ path: path48,
12400
12695
  score: 0,
12401
12696
  weight,
12402
12697
  hit: false,
12403
- message: `${path47} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
12698
+ message: `${path48} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
12404
12699
  };
12405
12700
  }
12406
12701
  /**
12407
12702
  * Date comparison with format normalization.
12408
12703
  */
12409
- compareDate(path47, candidateValue, expectedValue, fieldConfig, weight) {
12704
+ compareDate(path48, candidateValue, expectedValue, fieldConfig, weight) {
12410
12705
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
12411
12706
  const candidateDate = parseDate(String(candidateValue), formats);
12412
12707
  const expectedDate = parseDate(String(expectedValue), formats);
12413
12708
  if (candidateDate === null) {
12414
12709
  return {
12415
- path: path47,
12710
+ path: path48,
12416
12711
  score: 0,
12417
12712
  weight,
12418
12713
  hit: false,
12419
- message: `${path47} (unparseable candidate date)`
12714
+ message: `${path48} (unparseable candidate date)`
12420
12715
  };
12421
12716
  }
12422
12717
  if (expectedDate === null) {
12423
12718
  return {
12424
- path: path47,
12719
+ path: path48,
12425
12720
  score: 0,
12426
12721
  weight,
12427
12722
  hit: false,
12428
- message: `${path47} (unparseable expected date)`
12723
+ message: `${path48} (unparseable expected date)`
12429
12724
  };
12430
12725
  }
12431
12726
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
12432
12727
  return {
12433
- path: path47,
12728
+ path: path48,
12434
12729
  score: 1,
12435
12730
  weight,
12436
12731
  hit: true,
12437
- message: path47
12732
+ message: path48
12438
12733
  };
12439
12734
  }
12440
12735
  return {
12441
- path: path47,
12736
+ path: path48,
12442
12737
  score: 0,
12443
12738
  weight,
12444
12739
  hit: false,
12445
- message: `${path47} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
12740
+ message: `${path48} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
12446
12741
  };
12447
12742
  }
12448
12743
  /**
@@ -12475,11 +12770,11 @@ var FieldAccuracyEvaluator = class {
12475
12770
  };
12476
12771
  }
12477
12772
  };
12478
- function resolvePath(obj, path47) {
12479
- if (!path47 || !obj) {
12773
+ function resolvePath(obj, path48) {
12774
+ if (!path48 || !obj) {
12480
12775
  return void 0;
12481
12776
  }
12482
- const parts = path47.split(/\.|\[|\]/).filter((p) => p.length > 0);
12777
+ const parts = path48.split(/\.|\[|\]/).filter((p) => p.length > 0);
12483
12778
  let current = obj;
12484
12779
  for (const part of parts) {
12485
12780
  if (current === null || current === void 0) {
@@ -12771,11 +13066,12 @@ function assembleLlmGraderPrompt(input) {
12771
13066
  function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
12772
13067
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
12773
13068
  const variables = {
12774
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2),
12775
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
12776
- [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
13069
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
13070
+ [TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(),
13071
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? "").trim(),
12777
13072
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
12778
13073
  [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
13074
+ // Deprecated aliases
12779
13075
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
12780
13076
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
12781
13077
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -12962,8 +13258,8 @@ var TokenUsageEvaluator = class {
12962
13258
  };
12963
13259
 
12964
13260
  // src/evaluation/evaluators/tool-trajectory.ts
12965
- function getNestedValue(obj, path47) {
12966
- const parts = path47.split(".");
13261
+ function getNestedValue(obj, path48) {
13262
+ const parts = path48.split(".");
12967
13263
  let current = obj;
12968
13264
  for (const part of parts) {
12969
13265
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -13832,16 +14128,13 @@ async function executePromptTemplate(script, context, config, timeoutMs) {
13832
14128
  const payload = {
13833
14129
  criteria: context.evalCase.criteria,
13834
14130
  expectedOutput: context.evalCase.expected_output,
13835
- outputText: context.candidate,
13836
14131
  output: context.output ?? null,
13837
14132
  inputFiles: context.evalCase.file_paths,
13838
14133
  input: context.evalCase.input,
13839
14134
  trace: context.trace ?? null,
13840
14135
  fileChanges: context.fileChanges ?? null,
13841
14136
  workspacePath: context.workspacePath ?? null,
13842
- config: config ?? context.config ?? null,
13843
- inputText: context.evalCase.question,
13844
- expectedOutputText: context.evalCase.reference_answer ?? ""
14137
+ config: config ?? context.config ?? null
13845
14138
  };
13846
14139
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
13847
14140
  const scriptPath = script[script.length - 1];
@@ -15521,7 +15814,8 @@ async function runEvaluation(options) {
15521
15814
  const budgetResult = {
15522
15815
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
15523
15816
  testId: evalCase.id,
15524
- eval_set: evalCase.eval_set,
15817
+ dataset: evalCase.dataset,
15818
+ category: evalCase.category,
15525
15819
  score: 0,
15526
15820
  assertions: [],
15527
15821
  output: [],
@@ -15557,7 +15851,8 @@ async function runEvaluation(options) {
15557
15851
  const haltResult = {
15558
15852
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
15559
15853
  testId: evalCase.id,
15560
- eval_set: evalCase.eval_set,
15854
+ dataset: evalCase.dataset,
15855
+ category: evalCase.category,
15561
15856
  score: 0,
15562
15857
  assertions: [],
15563
15858
  output: [],
@@ -16556,7 +16851,8 @@ async function evaluateCandidate(options) {
16556
16851
  return {
16557
16852
  timestamp: completedAt.toISOString(),
16558
16853
  testId: evalCase.id,
16559
- eval_set: evalCase.eval_set,
16854
+ dataset: evalCase.dataset,
16855
+ category: evalCase.category,
16560
16856
  conversationId: evalCase.conversation_id,
16561
16857
  score: score.score,
16562
16858
  assertions: score.assertions,
@@ -16906,7 +17202,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
16906
17202
  return {
16907
17203
  timestamp: timestamp.toISOString(),
16908
17204
  testId: evalCase.id,
16909
- eval_set: evalCase.eval_set,
17205
+ dataset: evalCase.dataset,
17206
+ category: evalCase.category,
16910
17207
  conversationId: evalCase.conversation_id,
16911
17208
  score: 0,
16912
17209
  assertions: [{ text: `Error: ${message}`, passed: false }],
@@ -17479,6 +17776,18 @@ function trimBaselineResult(result) {
17479
17776
  return trimmed;
17480
17777
  }
17481
17778
 
17779
+ // src/evaluation/category.ts
17780
+ import path47 from "node:path";
17781
+ var DEFAULT_CATEGORY = "Uncategorized";
17782
+ function deriveCategory(relativePath) {
17783
+ const parts = relativePath.split(path47.sep);
17784
+ if (parts.length <= 1) {
17785
+ return DEFAULT_CATEGORY;
17786
+ }
17787
+ const dirs = parts.slice(0, -1).filter((d) => d !== "evals");
17788
+ return dirs.length > 0 ? dirs.join("/") : DEFAULT_CATEGORY;
17789
+ }
17790
+
17482
17791
  // src/observability/otel-exporter.ts
17483
17792
  var OTEL_BACKEND_PRESETS = {
17484
17793
  langfuse: {
@@ -17602,7 +17911,7 @@ var OtelTraceExporter = class {
17602
17911
  rootSpan.setAttribute("gen_ai.system", "agentv");
17603
17912
  rootSpan.setAttribute("agentv.test_id", result.testId);
17604
17913
  rootSpan.setAttribute("agentv.target", result.target);
17605
- if (result.eval_set) rootSpan.setAttribute("agentv.eval_set", result.eval_set);
17914
+ if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
17606
17915
  rootSpan.setAttribute("agentv.score", result.score);
17607
17916
  if (captureContent && result.output.length > 0) {
17608
17917
  const lastMsg = result.output[result.output.length - 1];
@@ -17811,7 +18120,7 @@ var OtelStreamingObserver = class {
17811
18120
  this.rootSpan.setAttribute("gen_ai.system", "agentv");
17812
18121
  this.rootSpan.setAttribute("agentv.test_id", testId);
17813
18122
  this.rootSpan.setAttribute("agentv.target", target);
17814
- if (evalSet) this.rootSpan.setAttribute("agentv.eval_set", evalSet);
18123
+ if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
17815
18124
  this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
17816
18125
  }
17817
18126
  /** Create and immediately export a tool span */
@@ -17983,9 +18292,11 @@ function createAgentKernel() {
17983
18292
  return { status: "stub" };
17984
18293
  }
17985
18294
  export {
18295
+ COMMON_TARGET_SETTINGS,
17986
18296
  CodeEvaluator,
17987
18297
  CompositeEvaluator,
17988
18298
  CostEvaluator,
18299
+ DEFAULT_CATEGORY,
17989
18300
  DEFAULT_EVALUATOR_TEMPLATE,
17990
18301
  DEFAULT_EVAL_PATTERNS,
17991
18302
  DEFAULT_EXPLORATION_TOOLS,
@@ -18039,6 +18350,7 @@ export {
18039
18350
  createTempWorkspace,
18040
18351
  deepEqual,
18041
18352
  defineConfig,
18353
+ deriveCategory,
18042
18354
  detectFormat,
18043
18355
  discoverAssertions,
18044
18356
  discoverCopilotSessions,
@@ -18052,7 +18364,9 @@ export {
18052
18364
  explorationRatio,
18053
18365
  extractCacheConfig,
18054
18366
  extractFailOnError,
18367
+ extractImageBlocks,
18055
18368
  extractJsonBlob,
18369
+ extractLastAssistantContent,
18056
18370
  extractTargetFromSuite,
18057
18371
  extractTargetsFromSuite,
18058
18372
  extractTargetsFromTestCase,
@@ -18066,12 +18380,15 @@ export {
18066
18380
  getAgentvHome,
18067
18381
  getOutputFilenames,
18068
18382
  getSubagentsRoot,
18383
+ getTextContent,
18069
18384
  getTraceStateRoot,
18070
18385
  getWorkspacePath,
18071
18386
  getWorkspacePoolRoot,
18072
18387
  getWorkspacesRoot,
18073
18388
  initializeBaseline,
18074
18389
  isAgentSkillsFormat,
18390
+ isContent,
18391
+ isContentArray,
18075
18392
  isEvaluatorKind,
18076
18393
  isJsonObject,
18077
18394
  isJsonValue,