@agentv/core 3.14.6 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1315,12 +1315,12 @@ function serializeAttributeValue(value) {
1315
1315
  if (Array.isArray(value)) return { arrayValue: { values: value.map(serializeAttributeValue) } };
1316
1316
  return { stringValue: String(value) };
1317
1317
  }
1318
- var import_promises35, import_node_path51, OtlpJsonFileExporter;
1318
+ var import_promises35, import_node_path52, OtlpJsonFileExporter;
1319
1319
  var init_otlp_json_file_exporter = __esm({
1320
1320
  "src/observability/otlp-json-file-exporter.ts"() {
1321
1321
  "use strict";
1322
1322
  import_promises35 = require("fs/promises");
1323
- import_node_path51 = require("path");
1323
+ import_node_path52 = require("path");
1324
1324
  OtlpJsonFileExporter = class {
1325
1325
  // biome-ignore lint/suspicious/noExplicitAny: serialized span data
1326
1326
  spans = [];
@@ -1359,7 +1359,7 @@ var init_otlp_json_file_exporter = __esm({
1359
1359
  }
1360
1360
  async flush() {
1361
1361
  if (this.spans.length === 0) return;
1362
- await (0, import_promises35.mkdir)((0, import_node_path51.dirname)(this.filePath), { recursive: true });
1362
+ await (0, import_promises35.mkdir)((0, import_node_path52.dirname)(this.filePath), { recursive: true });
1363
1363
  const otlpJson = {
1364
1364
  resourceSpans: [
1365
1365
  {
@@ -1383,9 +1383,11 @@ var init_otlp_json_file_exporter = __esm({
1383
1383
  // src/index.ts
1384
1384
  var index_exports = {};
1385
1385
  __export(index_exports, {
1386
+ COMMON_TARGET_SETTINGS: () => COMMON_TARGET_SETTINGS,
1386
1387
  CodeEvaluator: () => CodeEvaluator,
1387
1388
  CompositeEvaluator: () => CompositeEvaluator,
1388
1389
  CostEvaluator: () => CostEvaluator,
1390
+ DEFAULT_CATEGORY: () => DEFAULT_CATEGORY,
1389
1391
  DEFAULT_EVALUATOR_TEMPLATE: () => DEFAULT_EVALUATOR_TEMPLATE,
1390
1392
  DEFAULT_EVAL_PATTERNS: () => DEFAULT_EVAL_PATTERNS,
1391
1393
  DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
@@ -1439,6 +1441,7 @@ __export(index_exports, {
1439
1441
  createTempWorkspace: () => createTempWorkspace,
1440
1442
  deepEqual: () => deepEqual,
1441
1443
  defineConfig: () => defineConfig,
1444
+ deriveCategory: () => deriveCategory,
1442
1445
  detectFormat: () => detectFormat,
1443
1446
  discoverAssertions: () => discoverAssertions,
1444
1447
  discoverCopilotSessions: () => discoverCopilotSessions,
@@ -1452,7 +1455,9 @@ __export(index_exports, {
1452
1455
  explorationRatio: () => explorationRatio,
1453
1456
  extractCacheConfig: () => extractCacheConfig,
1454
1457
  extractFailOnError: () => extractFailOnError,
1458
+ extractImageBlocks: () => extractImageBlocks,
1455
1459
  extractJsonBlob: () => extractJsonBlob,
1460
+ extractLastAssistantContent: () => extractLastAssistantContent,
1456
1461
  extractTargetFromSuite: () => extractTargetFromSuite,
1457
1462
  extractTargetsFromSuite: () => extractTargetsFromSuite,
1458
1463
  extractTargetsFromTestCase: () => extractTargetsFromTestCase,
@@ -1466,12 +1471,15 @@ __export(index_exports, {
1466
1471
  getAgentvHome: () => getAgentvHome,
1467
1472
  getOutputFilenames: () => getOutputFilenames,
1468
1473
  getSubagentsRoot: () => getSubagentsRoot,
1474
+ getTextContent: () => getTextContent,
1469
1475
  getTraceStateRoot: () => getTraceStateRoot,
1470
1476
  getWorkspacePath: () => getWorkspacePath,
1471
1477
  getWorkspacePoolRoot: () => getWorkspacePoolRoot,
1472
1478
  getWorkspacesRoot: () => getWorkspacesRoot,
1473
1479
  initializeBaseline: () => initializeBaseline,
1474
1480
  isAgentSkillsFormat: () => isAgentSkillsFormat,
1481
+ isContent: () => isContent,
1482
+ isContentArray: () => isContentArray,
1475
1483
  isEvaluatorKind: () => isEvaluatorKind,
1476
1484
  isJsonObject: () => isJsonObject,
1477
1485
  isJsonValue: () => isJsonValue,
@@ -1533,6 +1541,29 @@ __export(index_exports, {
1533
1541
  });
1534
1542
  module.exports = __toCommonJS(index_exports);
1535
1543
 
1544
+ // src/evaluation/content.ts
1545
+ var CONTENT_TYPES = /* @__PURE__ */ new Set(["text", "image", "file"]);
1546
+ function isContent(value) {
1547
+ if (!value || typeof value !== "object") return false;
1548
+ const v = value;
1549
+ return typeof v.type === "string" && CONTENT_TYPES.has(v.type);
1550
+ }
1551
+ function isContentArray(value) {
1552
+ return Array.isArray(value) && value.length > 0 && value.every(isContent);
1553
+ }
1554
+ function getTextContent(content) {
1555
+ if (content == null) return "";
1556
+ if (typeof content === "string") return content;
1557
+ if (!Array.isArray(content)) return "";
1558
+ const parts = [];
1559
+ for (const block of content) {
1560
+ if (block.type === "text") {
1561
+ parts.push(block.text);
1562
+ }
1563
+ }
1564
+ return parts.join("\n");
1565
+ }
1566
+
1536
1567
  // src/evaluation/types.ts
1537
1568
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
1538
1569
  var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
@@ -2411,15 +2442,23 @@ var TEMPLATE_VARIABLES = {
2411
2442
  INPUT: "input",
2412
2443
  OUTPUT: "output",
2413
2444
  FILE_CHANGES: "file_changes",
2445
+ /** @deprecated Use INPUT instead — resolves to the same text value. */
2414
2446
  INPUT_TEXT: "input_text",
2447
+ /** @deprecated Use OUTPUT instead — resolves to the same text value. */
2415
2448
  OUTPUT_TEXT: "output_text",
2449
+ /** @deprecated Use EXPECTED_OUTPUT instead — resolves to the same text value. */
2416
2450
  EXPECTED_OUTPUT_TEXT: "expected_output_text"
2417
2451
  };
2418
2452
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
2419
2453
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
2420
- TEMPLATE_VARIABLES.OUTPUT_TEXT,
2454
+ TEMPLATE_VARIABLES.OUTPUT,
2421
2455
  TEMPLATE_VARIABLES.EXPECTED_OUTPUT
2422
2456
  ]);
2457
+ var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
2458
+ [TEMPLATE_VARIABLES.INPUT_TEXT, TEMPLATE_VARIABLES.INPUT],
2459
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT, TEMPLATE_VARIABLES.OUTPUT],
2460
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
2461
+ ]);
2423
2462
 
2424
2463
  // src/evaluation/validation/prompt-validator.ts
2425
2464
  var ANSI_YELLOW3 = "\x1B[33m";
@@ -2441,16 +2480,29 @@ function validateTemplateVariables(content, source) {
2441
2480
  }
2442
2481
  match = variablePattern.exec(content);
2443
2482
  }
2444
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
2445
- const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
2483
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
2484
+ const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT);
2446
2485
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
2447
2486
  if (!hasRequiredFields) {
2448
2487
  throw new Error(
2449
2488
  `Missing required fields. Must include at least one of:
2450
- - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
2489
+ - {{ ${TEMPLATE_VARIABLES.OUTPUT} }}
2451
2490
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
2452
2491
  );
2453
2492
  }
2493
+ const deprecatedUsed = [];
2494
+ for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
2495
+ if (foundVariables.has(deprecated)) {
2496
+ deprecatedUsed.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
2497
+ }
2498
+ }
2499
+ if (deprecatedUsed.length > 0) {
2500
+ console.warn(
2501
+ `${ANSI_YELLOW3}Warning: Template at ${source} uses deprecated variable names:
2502
+ ${deprecatedUsed.join("\n ")}
2503
+ These still work but will be removed in a future version.${ANSI_RESET4}`
2504
+ );
2505
+ }
2454
2506
  if (invalidVariables.length > 0) {
2455
2507
  const warningMessage = `${ANSI_YELLOW3}Warning: Custom evaluator template at ${source}
2456
2508
  Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
@@ -3868,6 +3920,19 @@ function asString2(value) {
3868
3920
  }
3869
3921
 
3870
3922
  // src/evaluation/loaders/message-processor.ts
3923
+ var IMAGE_MEDIA_TYPES = {
3924
+ ".png": "image/png",
3925
+ ".jpg": "image/jpeg",
3926
+ ".jpeg": "image/jpeg",
3927
+ ".gif": "image/gif",
3928
+ ".webp": "image/webp",
3929
+ ".svg": "image/svg+xml",
3930
+ ".bmp": "image/bmp"
3931
+ };
3932
+ function detectImageMediaType(filePath) {
3933
+ const ext = import_node_path6.default.extname(filePath).toLowerCase();
3934
+ return IMAGE_MEDIA_TYPES[ext];
3935
+ }
3871
3936
  var ANSI_YELLOW5 = "\x1B[33m";
3872
3937
  var ANSI_RESET6 = "\x1B[0m";
3873
3938
  async function processMessages(options) {
@@ -3933,6 +3998,47 @@ async function processMessages(options) {
3933
3998
  }
3934
3999
  continue;
3935
4000
  }
4001
+ if (segmentType === "image") {
4002
+ const rawValue = asString3(rawSegment.value);
4003
+ if (!rawValue) {
4004
+ continue;
4005
+ }
4006
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
4007
+ rawValue,
4008
+ searchRoots
4009
+ );
4010
+ if (!resolvedPath) {
4011
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
4012
+ const context2 = messageType === "input" ? "" : " in expected_output";
4013
+ logWarning3(`Image file not found${context2}: ${displayPath}`, attempts);
4014
+ continue;
4015
+ }
4016
+ const mediaType = detectImageMediaType(resolvedPath);
4017
+ if (!mediaType) {
4018
+ logWarning3(
4019
+ `Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
4020
+ );
4021
+ continue;
4022
+ }
4023
+ try {
4024
+ const imageBuffer = await (0, import_promises6.readFile)(resolvedPath);
4025
+ const base64 = imageBuffer.toString("base64");
4026
+ processedContent.push({
4027
+ type: "image",
4028
+ media_type: mediaType,
4029
+ source: `data:${mediaType};base64,${base64}`
4030
+ });
4031
+ if (verbose) {
4032
+ const label = messageType === "input" ? "[Image]" : "[Expected Output Image]";
4033
+ console.log(` ${label} Found: ${displayPath}`);
4034
+ console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
4035
+ }
4036
+ } catch (error) {
4037
+ const context2 = messageType === "input" ? "" : " expected output";
4038
+ logWarning3(`Could not read${context2} image ${resolvedPath}: ${error.message}`);
4039
+ }
4040
+ continue;
4041
+ }
3936
4042
  const clonedSegment = cloneJsonObject(rawSegment);
3937
4043
  processedContent.push(clonedSegment);
3938
4044
  const inlineValue = clonedSegment.value;
@@ -4010,6 +4116,46 @@ async function processExpectedMessages(options) {
4010
4116
  }
4011
4117
  continue;
4012
4118
  }
4119
+ if (segmentType === "image") {
4120
+ const rawValue = asString3(rawSegment.value);
4121
+ if (!rawValue) {
4122
+ continue;
4123
+ }
4124
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
4125
+ rawValue,
4126
+ searchRoots
4127
+ );
4128
+ if (!resolvedPath) {
4129
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
4130
+ logWarning3(`Image file not found in expected_output: ${displayPath}`, attempts);
4131
+ continue;
4132
+ }
4133
+ const mediaType = detectImageMediaType(resolvedPath);
4134
+ if (!mediaType) {
4135
+ logWarning3(
4136
+ `Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
4137
+ );
4138
+ continue;
4139
+ }
4140
+ try {
4141
+ const imageBuffer = await (0, import_promises6.readFile)(resolvedPath);
4142
+ const base64 = imageBuffer.toString("base64");
4143
+ processedContent.push({
4144
+ type: "image",
4145
+ media_type: mediaType,
4146
+ source: `data:${mediaType};base64,${base64}`
4147
+ });
4148
+ if (verbose) {
4149
+ console.log(` [Expected Output Image] Found: ${displayPath}`);
4150
+ console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
4151
+ }
4152
+ } catch (error) {
4153
+ logWarning3(
4154
+ `Could not read expected output image ${resolvedPath}: ${error.message}`
4155
+ );
4156
+ }
4157
+ continue;
4158
+ }
4013
4159
  processedContent.push(cloneJsonObject(rawSegment));
4014
4160
  }
4015
4161
  segment.content = processedContent;
@@ -4256,7 +4402,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4256
4402
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
4257
4403
  const testCase = {
4258
4404
  id,
4259
- eval_set: evalSetName,
4405
+ dataset: evalSetName,
4260
4406
  conversation_id: conversationId,
4261
4407
  question,
4262
4408
  input: inputMessages,
@@ -4527,7 +4673,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4527
4673
  }
4528
4674
  const suite = interpolated;
4529
4675
  const evalSetNameFromSuite = asString5(suite.name)?.trim();
4530
- const fallbackEvalSet = import_node_path8.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
4676
+ const fallbackEvalSet = import_node_path8.default.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
4531
4677
  const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
4532
4678
  const rawTestcases = resolveTests(suite);
4533
4679
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
@@ -4648,7 +4794,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4648
4794
  const caseTargets = extractTargetsFromTestCase(evalcase);
4649
4795
  const testCase = {
4650
4796
  id,
4651
- eval_set: evalSetName,
4797
+ dataset: evalSetName,
4798
+ category: options?.category,
4652
4799
  conversation_id: conversationId,
4653
4800
  question,
4654
4801
  input: inputMessages,
@@ -5690,6 +5837,49 @@ var import_node_fs4 = require("fs");
5690
5837
  var import_promises10 = require("fs/promises");
5691
5838
  var import_node_path12 = __toESM(require("path"), 1);
5692
5839
 
5840
+ // src/evaluation/providers/claude-content.ts
5841
+ function toContentArray(content) {
5842
+ if (!Array.isArray(content)) return void 0;
5843
+ let hasNonText = false;
5844
+ const blocks = [];
5845
+ for (const part of content) {
5846
+ if (!part || typeof part !== "object") continue;
5847
+ const p = part;
5848
+ if (p.type === "text" && typeof p.text === "string") {
5849
+ blocks.push({ type: "text", text: p.text });
5850
+ } else if (p.type === "image" && typeof p.source === "object" && p.source !== null) {
5851
+ const src = p.source;
5852
+ const mediaType = typeof p.media_type === "string" ? p.media_type : typeof src.media_type === "string" ? src.media_type : "application/octet-stream";
5853
+ const data = typeof src.data === "string" && src.data !== "" ? `data:${mediaType};base64,${src.data}` : typeof p.url === "string" && p.url !== "" ? p.url : "";
5854
+ if (!data) continue;
5855
+ blocks.push({ type: "image", media_type: mediaType, source: data });
5856
+ hasNonText = true;
5857
+ } else if (p.type === "tool_use") {
5858
+ } else if (p.type === "tool_result") {
5859
+ }
5860
+ }
5861
+ return hasNonText && blocks.length > 0 ? blocks : void 0;
5862
+ }
5863
+ function extractTextContent(content) {
5864
+ if (typeof content === "string") {
5865
+ return content;
5866
+ }
5867
+ if (!Array.isArray(content)) {
5868
+ return void 0;
5869
+ }
5870
+ const textParts = [];
5871
+ for (const part of content) {
5872
+ if (!part || typeof part !== "object") {
5873
+ continue;
5874
+ }
5875
+ const p = part;
5876
+ if (p.type === "text" && typeof p.text === "string") {
5877
+ textParts.push(p.text);
5878
+ }
5879
+ }
5880
+ return textParts.length > 0 ? textParts.join("\n") : void 0;
5881
+ }
5882
+
5693
5883
  // src/evaluation/providers/claude-log-tracker.ts
5694
5884
  var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeLogs");
5695
5885
  var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeLogSubscribers");
@@ -5855,11 +6045,12 @@ var ClaudeCliProvider = class {
5855
6045
  if (betaMessage && typeof betaMessage === "object") {
5856
6046
  const msg = betaMessage;
5857
6047
  const content = msg.content;
6048
+ const structuredContent = toContentArray(content);
5858
6049
  const textContent = extractTextContent(content);
5859
6050
  const toolCalls = extractToolCalls(content);
5860
6051
  const outputMsg = {
5861
6052
  role: "assistant",
5862
- content: textContent,
6053
+ content: structuredContent ?? textContent,
5863
6054
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0
5864
6055
  };
5865
6056
  output.push(outputMsg);
@@ -6198,25 +6389,6 @@ function summarizeEvent(event) {
6198
6389
  return void 0;
6199
6390
  }
6200
6391
  }
6201
- function extractTextContent(content) {
6202
- if (typeof content === "string") {
6203
- return content;
6204
- }
6205
- if (!Array.isArray(content)) {
6206
- return void 0;
6207
- }
6208
- const textParts = [];
6209
- for (const part of content) {
6210
- if (!part || typeof part !== "object") {
6211
- continue;
6212
- }
6213
- const p = part;
6214
- if (p.type === "text" && typeof p.text === "string") {
6215
- textParts.push(p.text);
6216
- }
6217
- }
6218
- return textParts.length > 0 ? textParts.join("\n") : void 0;
6219
- }
6220
6392
  function extractToolCalls(content) {
6221
6393
  if (!Array.isArray(content)) {
6222
6394
  return [];
@@ -6389,11 +6561,12 @@ var ClaudeSdkProvider = class {
6389
6561
  if (betaMessage && typeof betaMessage === "object") {
6390
6562
  const msg = betaMessage;
6391
6563
  const content = msg.content;
6392
- const textContent = extractTextContent2(content);
6564
+ const structuredContent = toContentArray(content);
6565
+ const textContent = extractTextContent(content);
6393
6566
  const toolCalls = extractToolCalls2(content);
6394
6567
  const outputMsg = {
6395
6568
  role: "assistant",
6396
- content: textContent,
6569
+ content: structuredContent ?? textContent,
6397
6570
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0
6398
6571
  };
6399
6572
  output.push(outputMsg);
@@ -6511,25 +6684,6 @@ var ClaudeSdkProvider = class {
6511
6684
  }
6512
6685
  }
6513
6686
  };
6514
- function extractTextContent2(content) {
6515
- if (typeof content === "string") {
6516
- return content;
6517
- }
6518
- if (!Array.isArray(content)) {
6519
- return void 0;
6520
- }
6521
- const textParts = [];
6522
- for (const part of content) {
6523
- if (!part || typeof part !== "object") {
6524
- continue;
6525
- }
6526
- const p = part;
6527
- if (p.type === "text" && typeof p.text === "string") {
6528
- textParts.push(p.text);
6529
- }
6530
- }
6531
- return textParts.length > 0 ? textParts.join("\n") : void 0;
6532
- }
6533
6687
  function extractToolCalls2(content) {
6534
6688
  if (!Array.isArray(content)) {
6535
6689
  return [];
@@ -6753,7 +6907,7 @@ function convertMessages(messages) {
6753
6907
  return messages.map((msg) => ({
6754
6908
  role: msg.role,
6755
6909
  name: msg.name,
6756
- content: msg.content,
6910
+ content: isContentArray(msg.content) ? msg.content : typeof msg.content === "string" ? msg.content : void 0,
6757
6911
  toolCalls: msg.tool_calls?.map((tc) => ({
6758
6912
  tool: tc.tool,
6759
6913
  input: tc.input,
@@ -9007,6 +9161,35 @@ function extractPiTextContent(content) {
9007
9161
  }
9008
9162
  return textParts.length > 0 ? textParts.join("\n") : void 0;
9009
9163
  }
9164
+ function toPiContentArray(content) {
9165
+ if (!Array.isArray(content)) return void 0;
9166
+ let hasNonText = false;
9167
+ const blocks = [];
9168
+ for (const part of content) {
9169
+ if (!part || typeof part !== "object") continue;
9170
+ const p = part;
9171
+ if (p.type === "text" && typeof p.text === "string") {
9172
+ blocks.push({ type: "text", text: p.text });
9173
+ } else if (p.type === "image") {
9174
+ const mediaType = typeof p.media_type === "string" ? p.media_type : "application/octet-stream";
9175
+ let source = "";
9176
+ if (typeof p.source === "object" && p.source !== null) {
9177
+ const src = p.source;
9178
+ const srcMediaType = typeof src.media_type === "string" ? src.media_type : mediaType;
9179
+ source = typeof src.data === "string" ? `data:${srcMediaType};base64,${src.data}` : "";
9180
+ }
9181
+ if (!source && typeof p.url === "string") {
9182
+ source = p.url;
9183
+ }
9184
+ if (source) {
9185
+ blocks.push({ type: "image", media_type: mediaType, source });
9186
+ hasNonText = true;
9187
+ }
9188
+ } else if (p.type === "tool_use" || p.type === "tool_result") {
9189
+ }
9190
+ }
9191
+ return hasNonText && blocks.length > 0 ? blocks : void 0;
9192
+ }
9010
9193
  function toFiniteNumber(value) {
9011
9194
  if (typeof value === "number" && Number.isFinite(value)) return value;
9012
9195
  return void 0;
@@ -10178,7 +10361,8 @@ function convertAgentMessage(message, toolTrackers, completedToolResults) {
10178
10361
  }
10179
10362
  const msg = message;
10180
10363
  const role = typeof msg.role === "string" ? msg.role : "unknown";
10181
- const content = extractPiTextContent(msg.content);
10364
+ const structuredContent = toPiContentArray(msg.content);
10365
+ const content = structuredContent ?? extractPiTextContent(msg.content);
10182
10366
  const toolCalls = extractToolCalls4(msg.content, toolTrackers, completedToolResults);
10183
10367
  const startTimeVal = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
10184
10368
  let msgTokenUsage;
@@ -10440,6 +10624,12 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
10440
10624
  "FILES",
10441
10625
  "OUTPUT_FILE"
10442
10626
  ]);
10627
+ var COMMON_TARGET_SETTINGS = [
10628
+ "provider_batching",
10629
+ "providerBatching",
10630
+ "subagent_mode_allowed",
10631
+ "subagentModeAllowed"
10632
+ ];
10443
10633
  var BASE_TARGET_SCHEMA = import_zod3.z.object({
10444
10634
  name: import_zod3.z.string().min(1, "target name is required"),
10445
10635
  provider: import_zod3.z.string().min(1, "provider is required"),
@@ -10448,7 +10638,8 @@ var BASE_TARGET_SCHEMA = import_zod3.z.object({
10448
10638
  // backward compat
10449
10639
  workers: import_zod3.z.number().int().min(1).optional(),
10450
10640
  workspace_template: import_zod3.z.string().optional(),
10451
- workspaceTemplate: import_zod3.z.string().optional()
10641
+ workspaceTemplate: import_zod3.z.string().optional(),
10642
+ subagent_mode_allowed: import_zod3.z.boolean().optional()
10452
10643
  }).passthrough();
10453
10644
  var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
10454
10645
  var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
@@ -10511,42 +10702,40 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
10511
10702
  const providerBatching = resolveOptionalBoolean(
10512
10703
  parsed.provider_batching ?? parsed.providerBatching
10513
10704
  );
10705
+ const subagentModeAllowed = resolveOptionalBoolean(
10706
+ parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
10707
+ );
10708
+ const base = {
10709
+ name: parsed.name,
10710
+ graderTarget: parsed.grader_target ?? parsed.judge_target,
10711
+ workers: parsed.workers,
10712
+ providerBatching,
10713
+ subagentModeAllowed
10714
+ };
10514
10715
  switch (provider) {
10515
10716
  case "openai":
10516
10717
  return {
10517
10718
  kind: "openai",
10518
- name: parsed.name,
10519
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10520
- workers: parsed.workers,
10521
- providerBatching,
10719
+ ...base,
10522
10720
  config: resolveOpenAIConfig(parsed, env)
10523
10721
  };
10524
10722
  case "openrouter":
10525
10723
  return {
10526
10724
  kind: "openrouter",
10527
- name: parsed.name,
10528
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10529
- workers: parsed.workers,
10530
- providerBatching,
10725
+ ...base,
10531
10726
  config: resolveOpenRouterConfig(parsed, env)
10532
10727
  };
10533
10728
  case "azure":
10534
10729
  case "azure-openai":
10535
10730
  return {
10536
10731
  kind: "azure",
10537
- name: parsed.name,
10538
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10539
- workers: parsed.workers,
10540
- providerBatching,
10732
+ ...base,
10541
10733
  config: resolveAzureConfig(parsed, env)
10542
10734
  };
10543
10735
  case "anthropic":
10544
10736
  return {
10545
10737
  kind: "anthropic",
10546
- name: parsed.name,
10547
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10548
- workers: parsed.workers,
10549
- providerBatching,
10738
+ ...base,
10550
10739
  config: resolveAnthropicConfig(parsed, env)
10551
10740
  };
10552
10741
  case "gemini":
@@ -10554,68 +10743,47 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
10554
10743
  case "google-gemini":
10555
10744
  return {
10556
10745
  kind: "gemini",
10557
- name: parsed.name,
10558
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10559
- workers: parsed.workers,
10560
- providerBatching,
10746
+ ...base,
10561
10747
  config: resolveGeminiConfig(parsed, env)
10562
10748
  };
10563
10749
  case "codex":
10564
10750
  case "codex-cli":
10565
10751
  return {
10566
10752
  kind: "codex",
10567
- name: parsed.name,
10568
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10569
- workers: parsed.workers,
10570
- providerBatching,
10753
+ ...base,
10571
10754
  config: resolveCodexConfig(parsed, env, evalFilePath)
10572
10755
  };
10573
10756
  case "copilot-sdk":
10574
10757
  case "copilot_sdk":
10575
10758
  return {
10576
10759
  kind: "copilot-sdk",
10577
- name: parsed.name,
10578
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10579
- workers: parsed.workers,
10580
- providerBatching,
10760
+ ...base,
10581
10761
  config: resolveCopilotSdkConfig(parsed, env, evalFilePath)
10582
10762
  };
10583
10763
  case "copilot":
10584
10764
  case "copilot-cli":
10585
10765
  return {
10586
10766
  kind: "copilot-cli",
10587
- name: parsed.name,
10588
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10589
- workers: parsed.workers,
10590
- providerBatching,
10767
+ ...base,
10591
10768
  config: resolveCopilotCliConfig(parsed, env, evalFilePath)
10592
10769
  };
10593
10770
  case "copilot-log":
10594
10771
  return {
10595
10772
  kind: "copilot-log",
10596
- name: parsed.name,
10597
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10598
- workers: parsed.workers,
10599
- providerBatching,
10773
+ ...base,
10600
10774
  config: resolveCopilotLogConfig(parsed, env)
10601
10775
  };
10602
10776
  case "pi":
10603
10777
  case "pi-coding-agent":
10604
10778
  return {
10605
10779
  kind: "pi-coding-agent",
10606
- name: parsed.name,
10607
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10608
- workers: parsed.workers,
10609
- providerBatching,
10780
+ ...base,
10610
10781
  config: resolvePiCodingAgentConfig(parsed, env, evalFilePath)
10611
10782
  };
10612
10783
  case "pi-cli":
10613
10784
  return {
10614
10785
  kind: "pi-cli",
10615
- name: parsed.name,
10616
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10617
- workers: parsed.workers,
10618
- providerBatching,
10786
+ ...base,
10619
10787
  config: resolvePiCliConfig(parsed, env, evalFilePath)
10620
10788
  };
10621
10789
  case "claude":
@@ -10623,38 +10791,26 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
10623
10791
  case "claude-cli":
10624
10792
  return {
10625
10793
  kind: "claude-cli",
10626
- name: parsed.name,
10627
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10628
- workers: parsed.workers,
10629
- providerBatching,
10794
+ ...base,
10630
10795
  config: resolveClaudeConfig(parsed, env, evalFilePath)
10631
10796
  };
10632
10797
  case "claude-sdk":
10633
10798
  return {
10634
10799
  kind: "claude-sdk",
10635
- name: parsed.name,
10636
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10637
- workers: parsed.workers,
10638
- providerBatching,
10800
+ ...base,
10639
10801
  config: resolveClaudeConfig(parsed, env, evalFilePath)
10640
10802
  };
10641
10803
  case "mock":
10642
10804
  return {
10643
10805
  kind: "mock",
10644
- name: parsed.name,
10645
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10646
- workers: parsed.workers,
10647
- providerBatching,
10806
+ ...base,
10648
10807
  config: resolveMockConfig(parsed)
10649
10808
  };
10650
10809
  case "vscode":
10651
10810
  case "vscode-insiders":
10652
10811
  return {
10653
10812
  kind: provider,
10654
- name: parsed.name,
10655
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10656
- workers: parsed.workers,
10657
- providerBatching,
10813
+ ...base,
10658
10814
  config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders", evalFilePath)
10659
10815
  };
10660
10816
  case "agentv": {
@@ -10667,29 +10823,21 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
10667
10823
  const temperature = typeof parsed.temperature === "number" ? parsed.temperature : 0;
10668
10824
  return {
10669
10825
  kind: "agentv",
10670
- name: parsed.name,
10671
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10826
+ ...base,
10672
10827
  workers: typeof parsed.workers === "number" ? parsed.workers : void 0,
10673
- providerBatching,
10674
10828
  config: { model, temperature }
10675
10829
  };
10676
10830
  }
10677
10831
  case "cli":
10678
10832
  return {
10679
10833
  kind: "cli",
10680
- name: parsed.name,
10681
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10682
- workers: parsed.workers,
10683
- providerBatching,
10834
+ ...base,
10684
10835
  config: resolveCliConfig(parsed, env, evalFilePath)
10685
10836
  };
10686
10837
  default:
10687
10838
  return {
10688
10839
  kind: "cli",
10689
- name: parsed.name,
10690
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10691
- workers: parsed.workers,
10692
- providerBatching,
10840
+ ...base,
10693
10841
  config: resolveDiscoveredProviderConfig(parsed, provider, env, evalFilePath)
10694
10842
  };
10695
10843
  }
@@ -11317,8 +11465,8 @@ function resolveCliConfig(target, env, evalFilePath) {
11317
11465
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
11318
11466
  if (!parseResult.success) {
11319
11467
  const firstError = parseResult.error.errors[0];
11320
- const path50 = firstError?.path.join(".") || "";
11321
- const prefix = path50 ? `${target.name} ${path50}: ` : `${target.name}: `;
11468
+ const path51 = firstError?.path.join(".") || "";
11469
+ const prefix = path51 ? `${target.name} ${path51}: ` : `${target.name}: `;
11322
11470
  throw new Error(`${prefix}${firstError?.message}`);
11323
11471
  }
11324
11472
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -13007,6 +13155,41 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
13007
13155
  }
13008
13156
  }
13009
13157
 
13158
+ // src/evaluation/providers/types.ts
13159
+ var AGENT_PROVIDER_KINDS = [
13160
+ "codex",
13161
+ "copilot-sdk",
13162
+ "copilot-cli",
13163
+ "pi-coding-agent",
13164
+ "pi-cli",
13165
+ "claude",
13166
+ "claude-cli",
13167
+ "claude-sdk",
13168
+ "vscode",
13169
+ "vscode-insiders"
13170
+ ];
13171
+ function extractLastAssistantContent(messages) {
13172
+ if (!messages || messages.length === 0) {
13173
+ return "";
13174
+ }
13175
+ for (let i = messages.length - 1; i >= 0; i--) {
13176
+ const msg = messages[i];
13177
+ if (msg.role === "assistant" && msg.content !== void 0) {
13178
+ if (typeof msg.content === "string") {
13179
+ return msg.content;
13180
+ }
13181
+ if (isContentArray(msg.content)) {
13182
+ return getTextContent(msg.content);
13183
+ }
13184
+ return JSON.stringify(msg.content);
13185
+ }
13186
+ }
13187
+ return "";
13188
+ }
13189
+ function isAgentProvider(provider) {
13190
+ return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
13191
+ }
13192
+
13010
13193
  // src/evaluation/providers/targets-file.ts
13011
13194
  var import_node_fs11 = require("fs");
13012
13195
  var import_promises27 = require("fs/promises");
@@ -13319,13 +13502,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
13319
13502
  async function execShellWithStdin(command, stdinPayload, options = {}) {
13320
13503
  const { mkdir: mkdir17, readFile: readFile16, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
13321
13504
  const { tmpdir: tmpdir3 } = await import("os");
13322
- const path50 = await import("path");
13505
+ const path51 = await import("path");
13323
13506
  const { randomUUID: randomUUID10 } = await import("crypto");
13324
- const dir = path50.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
13507
+ const dir = path51.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
13325
13508
  await mkdir17(dir, { recursive: true });
13326
- const stdinPath = path50.join(dir, "stdin.txt");
13327
- const stdoutPath = path50.join(dir, "stdout.txt");
13328
- const stderrPath = path50.join(dir, "stderr.txt");
13509
+ const stdinPath = path51.join(dir, "stdin.txt");
13510
+ const stdoutPath = path51.join(dir, "stdout.txt");
13511
+ const stderrPath = path51.join(dir, "stderr.txt");
13329
13512
  await writeFile9(stdinPath, stdinPayload, "utf8");
13330
13513
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
13331
13514
  const { spawn: spawn5 } = await import("child_process");
@@ -13457,7 +13640,7 @@ async function createTargetProxy(options) {
13457
13640
  totalOutputTokens += response.tokenUsage.output;
13458
13641
  }
13459
13642
  const output = response.output ?? [];
13460
- const rawText = extractLastAssistantContent(output);
13643
+ const rawText = extractLastAssistantContent2(output);
13461
13644
  const result = {
13462
13645
  output,
13463
13646
  rawText,
@@ -13515,7 +13698,7 @@ async function createTargetProxy(options) {
13515
13698
  const output = response.output ?? [];
13516
13699
  responses.push({
13517
13700
  output,
13518
- rawText: extractLastAssistantContent(output),
13701
+ rawText: extractLastAssistantContent2(output),
13519
13702
  tokenUsage: response.tokenUsage
13520
13703
  });
13521
13704
  } catch (error) {
@@ -13572,7 +13755,7 @@ function readBody(req) {
13572
13755
  req.on("error", reject);
13573
13756
  });
13574
13757
  }
13575
- function extractLastAssistantContent(messages) {
13758
+ function extractLastAssistantContent2(messages) {
13576
13759
  for (let i = messages.length - 1; i >= 0; i--) {
13577
13760
  const msg = messages[i];
13578
13761
  if (msg.role === "assistant" && msg.content !== void 0) {
@@ -13641,6 +13824,56 @@ function toCamelCaseDeep(obj) {
13641
13824
 
13642
13825
  // src/evaluation/evaluators/code-evaluator.ts
13643
13826
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
13827
+ var DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
13828
+ async function materializeContentForGrader(messages, getWorkDir) {
13829
+ if (!messages || messages.length === 0) return messages ?? null;
13830
+ let hasAnyImage = false;
13831
+ for (const msg of messages) {
13832
+ if (isContentArray(msg.content)) {
13833
+ for (const block of msg.content) {
13834
+ if (block.type === "image") {
13835
+ hasAnyImage = true;
13836
+ break;
13837
+ }
13838
+ }
13839
+ }
13840
+ if (hasAnyImage) break;
13841
+ }
13842
+ if (!hasAnyImage) return messages;
13843
+ let counter = 0;
13844
+ const result = [];
13845
+ for (const msg of messages) {
13846
+ if (!isContentArray(msg.content)) {
13847
+ result.push(msg);
13848
+ continue;
13849
+ }
13850
+ if (!msg.content.some((b) => b.type === "image")) {
13851
+ result.push(msg);
13852
+ continue;
13853
+ }
13854
+ const blocks = [];
13855
+ for (const block of msg.content) {
13856
+ if (block.type !== "image") {
13857
+ blocks.push({ ...block });
13858
+ continue;
13859
+ }
13860
+ const img = block;
13861
+ const match = DATA_URI_RE.exec(img.source);
13862
+ if (match) {
13863
+ const [, mediaType, base64Data] = match;
13864
+ const ext = mediaType.split("/")[1] === "jpeg" ? "jpg" : mediaType.split("/")[1] ?? "bin";
13865
+ const dir = await getWorkDir();
13866
+ const filePath = (0, import_node_path38.join)(dir, `img-${counter++}.${ext}`);
13867
+ await (0, import_promises28.writeFile)(filePath, Buffer.from(base64Data, "base64"));
13868
+ blocks.push({ type: "image", media_type: img.media_type, path: filePath });
13869
+ } else {
13870
+ blocks.push({ type: "image", media_type: img.media_type, path: img.source });
13871
+ }
13872
+ }
13873
+ result.push({ ...msg, content: blocks });
13874
+ }
13875
+ return result;
13876
+ }
13644
13877
  var CodeEvaluator = class {
13645
13878
  kind = "code-grader";
13646
13879
  command;
@@ -13656,7 +13889,18 @@ var CodeEvaluator = class {
13656
13889
  this.target = options.target;
13657
13890
  }
13658
13891
  async evaluate(context2) {
13659
- let outputForPayload = context2.output ?? null;
13892
+ let imageTmpDir;
13893
+ const getImageDir = async () => {
13894
+ if (!imageTmpDir) {
13895
+ imageTmpDir = await (0, import_promises28.mkdtemp)((0, import_node_path38.join)((0, import_node_os7.tmpdir)(), "agentv-img-"));
13896
+ }
13897
+ return imageTmpDir;
13898
+ };
13899
+ const materializedOutput = await materializeContentForGrader(
13900
+ context2.output,
13901
+ getImageDir
13902
+ );
13903
+ let outputForPayload = materializedOutput;
13660
13904
  let outputPath;
13661
13905
  if (outputForPayload) {
13662
13906
  const serialized = JSON.stringify(outputForPayload);
@@ -13669,12 +13913,17 @@ var CodeEvaluator = class {
13669
13913
  }
13670
13914
  const payload = {
13671
13915
  criteria: context2.evalCase.criteria,
13672
- expectedOutput: context2.evalCase.expected_output,
13673
- outputText: context2.candidate,
13916
+ expectedOutput: await materializeContentForGrader(
13917
+ context2.evalCase.expected_output,
13918
+ getImageDir
13919
+ ),
13674
13920
  output: outputForPayload,
13675
13921
  outputPath,
13676
13922
  inputFiles: context2.evalCase.file_paths,
13677
- input: context2.evalCase.input,
13923
+ input: await materializeContentForGrader(
13924
+ context2.evalCase.input,
13925
+ getImageDir
13926
+ ),
13678
13927
  trace: context2.trace ?? null,
13679
13928
  tokenUsage: context2.tokenUsage ?? null,
13680
13929
  costUsd: context2.costUsd ?? null,
@@ -13683,9 +13932,7 @@ var CodeEvaluator = class {
13683
13932
  endTime: context2.endTime ?? null,
13684
13933
  fileChanges: context2.fileChanges ?? null,
13685
13934
  workspacePath: context2.workspacePath ?? null,
13686
- config: this.config ?? null,
13687
- inputText: context2.evalCase.question,
13688
- expectedOutputText: context2.evalCase.reference_answer ?? ""
13935
+ config: this.config ?? null
13689
13936
  };
13690
13937
  const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
13691
13938
  let proxyEnv;
@@ -13775,6 +14022,10 @@ var CodeEvaluator = class {
13775
14022
  await (0, import_promises28.rm)((0, import_node_path38.dirname)(outputPath), { recursive: true, force: true }).catch(() => {
13776
14023
  });
13777
14024
  }
14025
+ if (imageTmpDir) {
14026
+ await (0, import_promises28.rm)(imageTmpDir, { recursive: true, force: true }).catch(() => {
14027
+ });
14028
+ }
13778
14029
  }
13779
14030
  }
13780
14031
  };
@@ -13802,38 +14053,6 @@ ${tail}`;
13802
14053
  // src/evaluation/evaluators/composite.ts
13803
14054
  var import_ai3 = require("ai");
13804
14055
 
13805
- // src/evaluation/providers/types.ts
13806
- var AGENT_PROVIDER_KINDS = [
13807
- "codex",
13808
- "copilot-sdk",
13809
- "copilot-cli",
13810
- "pi-coding-agent",
13811
- "pi-cli",
13812
- "claude",
13813
- "claude-cli",
13814
- "claude-sdk",
13815
- "vscode",
13816
- "vscode-insiders"
13817
- ];
13818
- function extractLastAssistantContent2(messages) {
13819
- if (!messages || messages.length === 0) {
13820
- return "";
13821
- }
13822
- for (let i = messages.length - 1; i >= 0; i--) {
13823
- const msg = messages[i];
13824
- if (msg.role === "assistant" && msg.content !== void 0) {
13825
- if (typeof msg.content === "string") {
13826
- return msg.content;
13827
- }
13828
- return JSON.stringify(msg.content);
13829
- }
13830
- }
13831
- return "";
13832
- }
13833
- function isAgentProvider(provider) {
13834
- return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
13835
- }
13836
-
13837
14056
  // src/evaluation/evaluators/llm-grader.ts
13838
14057
  var import_promises29 = __toESM(require("fs/promises"), 1);
13839
14058
  var import_node_path39 = __toESM(require("path"), 1);
@@ -13884,13 +14103,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
13884
14103
  {{${TEMPLATE_VARIABLES.CRITERIA}}}
13885
14104
 
13886
14105
  [[ ## question ## ]]
13887
- {{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
14106
+ {{${TEMPLATE_VARIABLES.INPUT}}}
13888
14107
 
13889
14108
  [[ ## reference_answer ## ]]
13890
- {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
14109
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT}}}
13891
14110
 
13892
14111
  [[ ## answer ## ]]
13893
- {{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
14112
+ {{${TEMPLATE_VARIABLES.OUTPUT}}}`;
13894
14113
  var freeformEvaluationSchema = import_zod4.z.object({
13895
14114
  score: import_zod4.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
13896
14115
  assertions: import_zod4.z.array(
@@ -13962,21 +14181,19 @@ var LlmGraderEvaluator = class {
13962
14181
  async evaluateFreeform(context2, graderProvider) {
13963
14182
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
13964
14183
  const variables = {
13965
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context2.evalCase.input, null, 2),
13966
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(
13967
- context2.evalCase.expected_output,
13968
- null,
13969
- 2
13970
- ),
13971
- [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
14184
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
14185
+ [TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
14186
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
13972
14187
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
13973
14188
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
14189
+ // Deprecated aliases — same values as the primary variables above
13974
14190
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
13975
14191
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
13976
14192
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
13977
14193
  };
13978
14194
  const systemPrompt = buildOutputSchema();
13979
14195
  const evaluatorTemplate = context2.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
14196
+ warnDeprecatedTemplateVars(evaluatorTemplate);
13980
14197
  let userPrompt = substituteVariables(evaluatorTemplate, variables);
13981
14198
  if (context2.fileChanges && !context2.evaluatorTemplateOverride && !this.evaluatorTemplate) {
13982
14199
  userPrompt += `
@@ -13988,13 +14205,15 @@ ${context2.fileChanges}`;
13988
14205
  userPrompt,
13989
14206
  systemPrompt
13990
14207
  };
14208
+ const images = context2.output ? extractImageBlocks(context2.output) : [];
13991
14209
  try {
13992
14210
  const { data, tokenUsage } = await this.runWithRetry({
13993
14211
  context: context2,
13994
14212
  graderProvider,
13995
14213
  systemPrompt,
13996
14214
  userPrompt,
13997
- schema: freeformEvaluationSchema
14215
+ schema: freeformEvaluationSchema,
14216
+ images
13998
14217
  });
13999
14218
  const score = clampScore(data.score);
14000
14219
  const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
@@ -14038,13 +14257,15 @@ ${context2.fileChanges}`;
14038
14257
  userPrompt: prompt,
14039
14258
  systemPrompt
14040
14259
  };
14260
+ const images = context2.output ? extractImageBlocks(context2.output) : [];
14041
14261
  try {
14042
14262
  const { data, tokenUsage } = await this.runWithRetry({
14043
14263
  context: context2,
14044
14264
  graderProvider,
14045
14265
  systemPrompt,
14046
14266
  userPrompt: prompt,
14047
- schema: rubricEvaluationSchema
14267
+ schema: rubricEvaluationSchema,
14268
+ images
14048
14269
  });
14049
14270
  const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
14050
14271
  return {
@@ -14081,13 +14302,15 @@ ${context2.fileChanges}`;
14081
14302
  userPrompt: prompt,
14082
14303
  systemPrompt
14083
14304
  };
14305
+ const images = context2.output ? extractImageBlocks(context2.output) : [];
14084
14306
  try {
14085
14307
  const { data, tokenUsage } = await this.runWithRetry({
14086
14308
  context: context2,
14087
14309
  graderProvider,
14088
14310
  systemPrompt,
14089
14311
  userPrompt: prompt,
14090
- schema: scoreRangeEvaluationSchema
14312
+ schema: scoreRangeEvaluationSchema,
14313
+ images
14091
14314
  });
14092
14315
  const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
14093
14316
  return {
@@ -14217,7 +14440,7 @@ ${context2.fileChanges}`;
14217
14440
  evalCaseId: context2.evalCase.id,
14218
14441
  attempt: context2.attempt
14219
14442
  });
14220
- const assistantContent = extractLastAssistantContent2(response.output);
14443
+ const assistantContent = extractLastAssistantContent(response.output);
14221
14444
  if (!assistantContent) {
14222
14445
  return {
14223
14446
  score: 0,
@@ -14294,12 +14517,17 @@ ${context2.fileChanges}`;
14294
14517
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
14295
14518
  const variables = {
14296
14519
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
14520
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
14521
+ [TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
14522
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
14523
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
14524
+ // Deprecated aliases
14297
14525
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
14298
14526
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
14299
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
14300
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
14527
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
14301
14528
  };
14302
14529
  if (this.evaluatorTemplate) {
14530
+ warnDeprecatedTemplateVars(this.evaluatorTemplate);
14303
14531
  return substituteVariables(this.evaluatorTemplate, variables);
14304
14532
  }
14305
14533
  const config = context2.evaluator;
@@ -14350,11 +14578,16 @@ ${context2.fileChanges}`;
14350
14578
  if (this.evaluatorTemplate) {
14351
14579
  const variables = {
14352
14580
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
14581
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
14582
+ [TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
14583
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
14584
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
14585
+ // Deprecated aliases
14353
14586
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
14354
14587
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
14355
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
14356
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
14588
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
14357
14589
  };
14590
+ warnDeprecatedTemplateVars(this.evaluatorTemplate);
14358
14591
  const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
14359
14592
  const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
14360
14593
  return `${customPrompt}
@@ -14525,18 +14758,35 @@ ${outputSchema}`;
14525
14758
  // LLM mode retry logic
14526
14759
  // ---------------------------------------------------------------------------
14527
14760
  async runWithRetry(options) {
14528
- const { context: context2, graderProvider, systemPrompt, userPrompt, schema } = options;
14761
+ const { context: context2, graderProvider, systemPrompt, userPrompt, schema, images } = options;
14529
14762
  let lastError;
14530
14763
  for (let attempt = 1; attempt <= 3; attempt++) {
14531
14764
  try {
14532
14765
  const model = graderProvider.asLanguageModel?.();
14533
14766
  if (model) {
14534
- const result = await (0, import_ai2.generateText)({
14767
+ const modelOptions = {
14768
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
14769
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
14770
+ };
14771
+ const hasImages = images && images.length > 0;
14772
+ const result = hasImages ? await (0, import_ai2.generateText)({
14773
+ model,
14774
+ system: systemPrompt,
14775
+ messages: [
14776
+ {
14777
+ role: "user",
14778
+ content: [
14779
+ { type: "text", text: userPrompt },
14780
+ ...toAiSdkImageParts(images)
14781
+ ]
14782
+ }
14783
+ ],
14784
+ ...modelOptions
14785
+ }) : await (0, import_ai2.generateText)({
14535
14786
  model,
14536
14787
  system: systemPrompt,
14537
14788
  prompt: userPrompt,
14538
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
14539
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
14789
+ ...modelOptions
14540
14790
  });
14541
14791
  const data2 = schema.parse(parseJsonFromText(result.text));
14542
14792
  const rawUsage = result.usage;
@@ -14551,7 +14801,7 @@ ${outputSchema}`;
14551
14801
  maxOutputTokens: this.maxOutputTokens,
14552
14802
  temperature: this.temperature
14553
14803
  });
14554
- const data = schema.parse(parseJsonFromText(extractLastAssistantContent2(response.output)));
14804
+ const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
14555
14805
  return { data, providerResponse: response, tokenUsage: response.tokenUsage };
14556
14806
  } catch (e) {
14557
14807
  lastError = e instanceof Error ? e : new Error(String(e));
@@ -14596,6 +14846,26 @@ function substituteVariables(template, variables) {
14596
14846
  return variables[varName] ?? match;
14597
14847
  });
14598
14848
  }
14849
+ var ANSI_YELLOW8 = "\x1B[33m";
14850
+ var ANSI_RESET9 = "\x1B[0m";
14851
+ var warnedTemplateStrings = /* @__PURE__ */ new Set();
14852
+ function warnDeprecatedTemplateVars(template) {
14853
+ if (warnedTemplateStrings.has(template)) return;
14854
+ const used = [];
14855
+ for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
14856
+ if (new RegExp(`\\{\\{\\s*${deprecated}\\s*\\}\\}`).test(template)) {
14857
+ used.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
14858
+ }
14859
+ }
14860
+ if (used.length > 0) {
14861
+ warnedTemplateStrings.add(template);
14862
+ console.warn(
14863
+ `${ANSI_YELLOW8}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
14864
+ ${used.join("\n ")}
14865
+ Update your custom evaluator template to use the new names.${ANSI_RESET9}`
14866
+ );
14867
+ }
14868
+ }
14599
14869
  function calculateRubricScore(result, rubrics) {
14600
14870
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
14601
14871
  const assertions = [];
@@ -14690,6 +14960,26 @@ function calculateScoreRangeResult(result, rubrics) {
14690
14960
  }
14691
14961
  };
14692
14962
  }
14963
+ function extractImageBlocks(messages) {
14964
+ const images = [];
14965
+ for (const msg of messages) {
14966
+ if (msg.role !== "assistant") continue;
14967
+ if (!isContentArray(msg.content)) continue;
14968
+ for (const block of msg.content) {
14969
+ if (block.type === "image") {
14970
+ images.push(block);
14971
+ }
14972
+ }
14973
+ }
14974
+ return images;
14975
+ }
14976
+ function toAiSdkImageParts(images) {
14977
+ return images.map((img) => ({
14978
+ type: "image",
14979
+ image: img.source,
14980
+ mediaType: img.media_type || void 0
14981
+ }));
14982
+ }
14693
14983
  function resolveSandboxed(basePath, relativePath) {
14694
14984
  const resolved = import_node_path39.default.resolve(basePath, relativePath);
14695
14985
  if (!resolved.startsWith(basePath + import_node_path39.default.sep) && resolved !== basePath) {
@@ -15075,7 +15365,7 @@ var CompositeEvaluator = class {
15075
15365
  attempt: context2.attempt
15076
15366
  });
15077
15367
  const data = freeformEvaluationSchema.parse(
15078
- parseJsonFromText(extractLastAssistantContent2(response.output))
15368
+ parseJsonFromText(extractLastAssistantContent(response.output))
15079
15369
  );
15080
15370
  const score = clampScore(data.score);
15081
15371
  const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
@@ -15431,115 +15721,115 @@ var FieldAccuracyEvaluator = class {
15431
15721
  * Evaluate a single field against the expected value.
15432
15722
  */
15433
15723
  evaluateField(fieldConfig, candidateData, expectedData) {
15434
- const { path: path50, match, required = true, weight = 1 } = fieldConfig;
15435
- const candidateValue = resolvePath(candidateData, path50);
15436
- const expectedValue = resolvePath(expectedData, path50);
15724
+ const { path: path51, match, required = true, weight = 1 } = fieldConfig;
15725
+ const candidateValue = resolvePath(candidateData, path51);
15726
+ const expectedValue = resolvePath(expectedData, path51);
15437
15727
  if (expectedValue === void 0) {
15438
15728
  return {
15439
- path: path50,
15729
+ path: path51,
15440
15730
  score: 1,
15441
15731
  // No expected value means no comparison needed
15442
15732
  weight,
15443
15733
  hit: true,
15444
- message: `${path50}: no expected value`
15734
+ message: `${path51}: no expected value`
15445
15735
  };
15446
15736
  }
15447
15737
  if (candidateValue === void 0) {
15448
15738
  if (required) {
15449
15739
  return {
15450
- path: path50,
15740
+ path: path51,
15451
15741
  score: 0,
15452
15742
  weight,
15453
15743
  hit: false,
15454
- message: `${path50} (required, missing)`
15744
+ message: `${path51} (required, missing)`
15455
15745
  };
15456
15746
  }
15457
15747
  return {
15458
- path: path50,
15748
+ path: path51,
15459
15749
  score: 1,
15460
15750
  // Don't penalize missing optional fields
15461
15751
  weight: 0,
15462
15752
  // Zero weight means it won't affect the score
15463
15753
  hit: true,
15464
- message: `${path50}: optional field missing`
15754
+ message: `${path51}: optional field missing`
15465
15755
  };
15466
15756
  }
15467
15757
  switch (match) {
15468
15758
  case "exact":
15469
- return this.compareExact(path50, candidateValue, expectedValue, weight);
15759
+ return this.compareExact(path51, candidateValue, expectedValue, weight);
15470
15760
  case "numeric_tolerance":
15471
15761
  return this.compareNumericTolerance(
15472
- path50,
15762
+ path51,
15473
15763
  candidateValue,
15474
15764
  expectedValue,
15475
15765
  fieldConfig,
15476
15766
  weight
15477
15767
  );
15478
15768
  case "date":
15479
- return this.compareDate(path50, candidateValue, expectedValue, fieldConfig, weight);
15769
+ return this.compareDate(path51, candidateValue, expectedValue, fieldConfig, weight);
15480
15770
  default:
15481
15771
  return {
15482
- path: path50,
15772
+ path: path51,
15483
15773
  score: 0,
15484
15774
  weight,
15485
15775
  hit: false,
15486
- message: `${path50}: unknown match type "${match}"`
15776
+ message: `${path51}: unknown match type "${match}"`
15487
15777
  };
15488
15778
  }
15489
15779
  }
15490
15780
  /**
15491
15781
  * Exact equality comparison.
15492
15782
  */
15493
- compareExact(path50, candidateValue, expectedValue, weight) {
15783
+ compareExact(path51, candidateValue, expectedValue, weight) {
15494
15784
  if (deepEqual(candidateValue, expectedValue)) {
15495
15785
  return {
15496
- path: path50,
15786
+ path: path51,
15497
15787
  score: 1,
15498
15788
  weight,
15499
15789
  hit: true,
15500
- message: path50
15790
+ message: path51
15501
15791
  };
15502
15792
  }
15503
15793
  if (typeof candidateValue !== typeof expectedValue) {
15504
15794
  return {
15505
- path: path50,
15795
+ path: path51,
15506
15796
  score: 0,
15507
15797
  weight,
15508
15798
  hit: false,
15509
- message: `${path50} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
15799
+ message: `${path51} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
15510
15800
  };
15511
15801
  }
15512
15802
  return {
15513
- path: path50,
15803
+ path: path51,
15514
15804
  score: 0,
15515
15805
  weight,
15516
15806
  hit: false,
15517
- message: `${path50} (value mismatch)`
15807
+ message: `${path51} (value mismatch)`
15518
15808
  };
15519
15809
  }
15520
15810
  /**
15521
15811
  * Numeric comparison with absolute or relative tolerance.
15522
15812
  */
15523
- compareNumericTolerance(path50, candidateValue, expectedValue, fieldConfig, weight) {
15813
+ compareNumericTolerance(path51, candidateValue, expectedValue, fieldConfig, weight) {
15524
15814
  const { tolerance = 0, relative = false } = fieldConfig;
15525
15815
  const candidateNum = toNumber(candidateValue);
15526
15816
  const expectedNum = toNumber(expectedValue);
15527
15817
  if (candidateNum === null || expectedNum === null) {
15528
15818
  return {
15529
- path: path50,
15819
+ path: path51,
15530
15820
  score: 0,
15531
15821
  weight,
15532
15822
  hit: false,
15533
- message: `${path50} (non-numeric value)`
15823
+ message: `${path51} (non-numeric value)`
15534
15824
  };
15535
15825
  }
15536
15826
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
15537
15827
  return {
15538
- path: path50,
15828
+ path: path51,
15539
15829
  score: 0,
15540
15830
  weight,
15541
15831
  hit: false,
15542
- message: `${path50} (invalid numeric value)`
15832
+ message: `${path51} (invalid numeric value)`
15543
15833
  };
15544
15834
  }
15545
15835
  const diff = Math.abs(candidateNum - expectedNum);
@@ -15552,61 +15842,61 @@ var FieldAccuracyEvaluator = class {
15552
15842
  }
15553
15843
  if (withinTolerance) {
15554
15844
  return {
15555
- path: path50,
15845
+ path: path51,
15556
15846
  score: 1,
15557
15847
  weight,
15558
15848
  hit: true,
15559
- message: `${path50} (within tolerance: diff=${diff.toFixed(2)})`
15849
+ message: `${path51} (within tolerance: diff=${diff.toFixed(2)})`
15560
15850
  };
15561
15851
  }
15562
15852
  return {
15563
- path: path50,
15853
+ path: path51,
15564
15854
  score: 0,
15565
15855
  weight,
15566
15856
  hit: false,
15567
- message: `${path50} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
15857
+ message: `${path51} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
15568
15858
  };
15569
15859
  }
15570
15860
  /**
15571
15861
  * Date comparison with format normalization.
15572
15862
  */
15573
- compareDate(path50, candidateValue, expectedValue, fieldConfig, weight) {
15863
+ compareDate(path51, candidateValue, expectedValue, fieldConfig, weight) {
15574
15864
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
15575
15865
  const candidateDate = parseDate(String(candidateValue), formats);
15576
15866
  const expectedDate = parseDate(String(expectedValue), formats);
15577
15867
  if (candidateDate === null) {
15578
15868
  return {
15579
- path: path50,
15869
+ path: path51,
15580
15870
  score: 0,
15581
15871
  weight,
15582
15872
  hit: false,
15583
- message: `${path50} (unparseable candidate date)`
15873
+ message: `${path51} (unparseable candidate date)`
15584
15874
  };
15585
15875
  }
15586
15876
  if (expectedDate === null) {
15587
15877
  return {
15588
- path: path50,
15878
+ path: path51,
15589
15879
  score: 0,
15590
15880
  weight,
15591
15881
  hit: false,
15592
- message: `${path50} (unparseable expected date)`
15882
+ message: `${path51} (unparseable expected date)`
15593
15883
  };
15594
15884
  }
15595
15885
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
15596
15886
  return {
15597
- path: path50,
15887
+ path: path51,
15598
15888
  score: 1,
15599
15889
  weight,
15600
15890
  hit: true,
15601
- message: path50
15891
+ message: path51
15602
15892
  };
15603
15893
  }
15604
15894
  return {
15605
- path: path50,
15895
+ path: path51,
15606
15896
  score: 0,
15607
15897
  weight,
15608
15898
  hit: false,
15609
- message: `${path50} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
15899
+ message: `${path51} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
15610
15900
  };
15611
15901
  }
15612
15902
  /**
@@ -15639,11 +15929,11 @@ var FieldAccuracyEvaluator = class {
15639
15929
  };
15640
15930
  }
15641
15931
  };
15642
- function resolvePath(obj, path50) {
15643
- if (!path50 || !obj) {
15932
+ function resolvePath(obj, path51) {
15933
+ if (!path51 || !obj) {
15644
15934
  return void 0;
15645
15935
  }
15646
- const parts = path50.split(/\.|\[|\]/).filter((p) => p.length > 0);
15936
+ const parts = path51.split(/\.|\[|\]/).filter((p) => p.length > 0);
15647
15937
  let current = obj;
15648
15938
  for (const part of parts) {
15649
15939
  if (current === null || current === void 0) {
@@ -15935,11 +16225,12 @@ function assembleLlmGraderPrompt(input) {
15935
16225
  function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
15936
16226
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
15937
16227
  const variables = {
15938
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2),
15939
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
15940
- [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
16228
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
16229
+ [TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(),
16230
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? "").trim(),
15941
16231
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
15942
16232
  [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
16233
+ // Deprecated aliases
15943
16234
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
15944
16235
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
15945
16236
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -16126,8 +16417,8 @@ var TokenUsageEvaluator = class {
16126
16417
  };
16127
16418
 
16128
16419
  // src/evaluation/evaluators/tool-trajectory.ts
16129
- function getNestedValue(obj, path50) {
16130
- const parts = path50.split(".");
16420
+ function getNestedValue(obj, path51) {
16421
+ const parts = path51.split(".");
16131
16422
  let current = obj;
16132
16423
  for (const part of parts) {
16133
16424
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -16996,16 +17287,13 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
16996
17287
  const payload = {
16997
17288
  criteria: context2.evalCase.criteria,
16998
17289
  expectedOutput: context2.evalCase.expected_output,
16999
- outputText: context2.candidate,
17000
17290
  output: context2.output ?? null,
17001
17291
  inputFiles: context2.evalCase.file_paths,
17002
17292
  input: context2.evalCase.input,
17003
17293
  trace: context2.trace ?? null,
17004
17294
  fileChanges: context2.fileChanges ?? null,
17005
17295
  workspacePath: context2.workspacePath ?? null,
17006
- config: config ?? context2.config ?? null,
17007
- inputText: context2.evalCase.question,
17008
- expectedOutputText: context2.evalCase.reference_answer ?? ""
17296
+ config: config ?? context2.config ?? null
17009
17297
  };
17010
17298
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
17011
17299
  const scriptPath = script[script.length - 1];
@@ -18685,7 +18973,8 @@ async function runEvaluation(options) {
18685
18973
  const budgetResult = {
18686
18974
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
18687
18975
  testId: evalCase.id,
18688
- eval_set: evalCase.eval_set,
18976
+ dataset: evalCase.dataset,
18977
+ category: evalCase.category,
18689
18978
  score: 0,
18690
18979
  assertions: [],
18691
18980
  output: [],
@@ -18721,7 +19010,8 @@ async function runEvaluation(options) {
18721
19010
  const haltResult = {
18722
19011
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
18723
19012
  testId: evalCase.id,
18724
- eval_set: evalCase.eval_set,
19013
+ dataset: evalCase.dataset,
19014
+ category: evalCase.category,
18725
19015
  score: 0,
18726
19016
  assertions: [],
18727
19017
  output: [],
@@ -19004,7 +19294,7 @@ async function runBatchEvaluation(options) {
19004
19294
  const tokenUsage = merged?.tokenUsage;
19005
19295
  const startTime = merged?.startTime;
19006
19296
  const endTime = merged?.endTime;
19007
- const candidate = extractLastAssistantContent2(output);
19297
+ const candidate = extractLastAssistantContent(output);
19008
19298
  const providerError = extractProviderError(providerResponse);
19009
19299
  let result;
19010
19300
  try {
@@ -19412,7 +19702,7 @@ async function runEvalCase(options) {
19412
19702
  const tokenUsage = merged?.tokenUsage;
19413
19703
  const startTime = merged?.startTime;
19414
19704
  const endTime = merged?.endTime;
19415
- const candidate = extractLastAssistantContent2(output);
19705
+ const candidate = extractLastAssistantContent(output);
19416
19706
  let fileChanges;
19417
19707
  if (baselineCommit && workspacePath) {
19418
19708
  try {
@@ -19720,7 +20010,8 @@ async function evaluateCandidate(options) {
19720
20010
  return {
19721
20011
  timestamp: completedAt.toISOString(),
19722
20012
  testId: evalCase.id,
19723
- eval_set: evalCase.eval_set,
20013
+ dataset: evalCase.dataset,
20014
+ category: evalCase.category,
19724
20015
  conversationId: evalCase.conversation_id,
19725
20016
  score: score.score,
19726
20017
  assertions: score.assertions,
@@ -20070,7 +20361,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
20070
20361
  return {
20071
20362
  timestamp: timestamp.toISOString(),
20072
20363
  testId: evalCase.id,
20073
- eval_set: evalCase.eval_set,
20364
+ dataset: evalCase.dataset,
20365
+ category: evalCase.category,
20074
20366
  conversationId: evalCase.conversation_id,
20075
20367
  score: 0,
20076
20368
  assertions: [{ text: `Error: ${message}`, passed: false }],
@@ -20643,6 +20935,18 @@ function trimBaselineResult(result) {
20643
20935
  return trimmed;
20644
20936
  }
20645
20937
 
20938
+ // src/evaluation/category.ts
20939
+ var import_node_path51 = __toESM(require("path"), 1);
20940
+ var DEFAULT_CATEGORY = "Uncategorized";
20941
+ function deriveCategory(relativePath) {
20942
+ const parts = relativePath.split(import_node_path51.default.sep);
20943
+ if (parts.length <= 1) {
20944
+ return DEFAULT_CATEGORY;
20945
+ }
20946
+ const dirs = parts.slice(0, -1).filter((d) => d !== "evals");
20947
+ return dirs.length > 0 ? dirs.join("/") : DEFAULT_CATEGORY;
20948
+ }
20949
+
20646
20950
  // src/observability/otel-exporter.ts
20647
20951
  var OTEL_BACKEND_PRESETS = {
20648
20952
  langfuse: {
@@ -20766,7 +21070,7 @@ var OtelTraceExporter = class {
20766
21070
  rootSpan.setAttribute("gen_ai.system", "agentv");
20767
21071
  rootSpan.setAttribute("agentv.test_id", result.testId);
20768
21072
  rootSpan.setAttribute("agentv.target", result.target);
20769
- if (result.eval_set) rootSpan.setAttribute("agentv.eval_set", result.eval_set);
21073
+ if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
20770
21074
  rootSpan.setAttribute("agentv.score", result.score);
20771
21075
  if (captureContent && result.output.length > 0) {
20772
21076
  const lastMsg = result.output[result.output.length - 1];
@@ -20975,7 +21279,7 @@ var OtelStreamingObserver = class {
20975
21279
  this.rootSpan.setAttribute("gen_ai.system", "agentv");
20976
21280
  this.rootSpan.setAttribute("agentv.test_id", testId);
20977
21281
  this.rootSpan.setAttribute("agentv.target", target);
20978
- if (evalSet) this.rootSpan.setAttribute("agentv.eval_set", evalSet);
21282
+ if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
20979
21283
  this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
20980
21284
  }
20981
21285
  /** Create and immediately export a tool span */
@@ -21151,9 +21455,11 @@ function createAgentKernel() {
21151
21455
  }
21152
21456
  // Annotate the CommonJS export names for ESM import in node:
21153
21457
  0 && (module.exports = {
21458
+ COMMON_TARGET_SETTINGS,
21154
21459
  CodeEvaluator,
21155
21460
  CompositeEvaluator,
21156
21461
  CostEvaluator,
21462
+ DEFAULT_CATEGORY,
21157
21463
  DEFAULT_EVALUATOR_TEMPLATE,
21158
21464
  DEFAULT_EVAL_PATTERNS,
21159
21465
  DEFAULT_EXPLORATION_TOOLS,
@@ -21207,6 +21513,7 @@ function createAgentKernel() {
21207
21513
  createTempWorkspace,
21208
21514
  deepEqual,
21209
21515
  defineConfig,
21516
+ deriveCategory,
21210
21517
  detectFormat,
21211
21518
  discoverAssertions,
21212
21519
  discoverCopilotSessions,
@@ -21220,7 +21527,9 @@ function createAgentKernel() {
21220
21527
  explorationRatio,
21221
21528
  extractCacheConfig,
21222
21529
  extractFailOnError,
21530
+ extractImageBlocks,
21223
21531
  extractJsonBlob,
21532
+ extractLastAssistantContent,
21224
21533
  extractTargetFromSuite,
21225
21534
  extractTargetsFromSuite,
21226
21535
  extractTargetsFromTestCase,
@@ -21234,12 +21543,15 @@ function createAgentKernel() {
21234
21543
  getAgentvHome,
21235
21544
  getOutputFilenames,
21236
21545
  getSubagentsRoot,
21546
+ getTextContent,
21237
21547
  getTraceStateRoot,
21238
21548
  getWorkspacePath,
21239
21549
  getWorkspacePoolRoot,
21240
21550
  getWorkspacesRoot,
21241
21551
  initializeBaseline,
21242
21552
  isAgentSkillsFormat,
21553
+ isContent,
21554
+ isContentArray,
21243
21555
  isEvaluatorKind,
21244
21556
  isJsonObject,
21245
21557
  isJsonValue,