@agentv/core 3.14.6 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1315,12 +1315,12 @@ function serializeAttributeValue(value) {
1315
1315
  if (Array.isArray(value)) return { arrayValue: { values: value.map(serializeAttributeValue) } };
1316
1316
  return { stringValue: String(value) };
1317
1317
  }
1318
- var import_promises35, import_node_path51, OtlpJsonFileExporter;
1318
+ var import_promises35, import_node_path52, OtlpJsonFileExporter;
1319
1319
  var init_otlp_json_file_exporter = __esm({
1320
1320
  "src/observability/otlp-json-file-exporter.ts"() {
1321
1321
  "use strict";
1322
1322
  import_promises35 = require("fs/promises");
1323
- import_node_path51 = require("path");
1323
+ import_node_path52 = require("path");
1324
1324
  OtlpJsonFileExporter = class {
1325
1325
  // biome-ignore lint/suspicious/noExplicitAny: serialized span data
1326
1326
  spans = [];
@@ -1359,7 +1359,7 @@ var init_otlp_json_file_exporter = __esm({
1359
1359
  }
1360
1360
  async flush() {
1361
1361
  if (this.spans.length === 0) return;
1362
- await (0, import_promises35.mkdir)((0, import_node_path51.dirname)(this.filePath), { recursive: true });
1362
+ await (0, import_promises35.mkdir)((0, import_node_path52.dirname)(this.filePath), { recursive: true });
1363
1363
  const otlpJson = {
1364
1364
  resourceSpans: [
1365
1365
  {
@@ -1383,9 +1383,11 @@ var init_otlp_json_file_exporter = __esm({
1383
1383
  // src/index.ts
1384
1384
  var index_exports = {};
1385
1385
  __export(index_exports, {
1386
+ COMMON_TARGET_SETTINGS: () => COMMON_TARGET_SETTINGS,
1386
1387
  CodeEvaluator: () => CodeEvaluator,
1387
1388
  CompositeEvaluator: () => CompositeEvaluator,
1388
1389
  CostEvaluator: () => CostEvaluator,
1390
+ DEFAULT_CATEGORY: () => DEFAULT_CATEGORY,
1389
1391
  DEFAULT_EVALUATOR_TEMPLATE: () => DEFAULT_EVALUATOR_TEMPLATE,
1390
1392
  DEFAULT_EVAL_PATTERNS: () => DEFAULT_EVAL_PATTERNS,
1391
1393
  DEFAULT_EXPLORATION_TOOLS: () => DEFAULT_EXPLORATION_TOOLS,
@@ -1439,6 +1441,7 @@ __export(index_exports, {
1439
1441
  createTempWorkspace: () => createTempWorkspace,
1440
1442
  deepEqual: () => deepEqual,
1441
1443
  defineConfig: () => defineConfig,
1444
+ deriveCategory: () => deriveCategory,
1442
1445
  detectFormat: () => detectFormat,
1443
1446
  discoverAssertions: () => discoverAssertions,
1444
1447
  discoverCopilotSessions: () => discoverCopilotSessions,
@@ -1452,7 +1455,9 @@ __export(index_exports, {
1452
1455
  explorationRatio: () => explorationRatio,
1453
1456
  extractCacheConfig: () => extractCacheConfig,
1454
1457
  extractFailOnError: () => extractFailOnError,
1458
+ extractImageBlocks: () => extractImageBlocks,
1455
1459
  extractJsonBlob: () => extractJsonBlob,
1460
+ extractLastAssistantContent: () => extractLastAssistantContent,
1456
1461
  extractTargetFromSuite: () => extractTargetFromSuite,
1457
1462
  extractTargetsFromSuite: () => extractTargetsFromSuite,
1458
1463
  extractTargetsFromTestCase: () => extractTargetsFromTestCase,
@@ -1466,12 +1471,15 @@ __export(index_exports, {
1466
1471
  getAgentvHome: () => getAgentvHome,
1467
1472
  getOutputFilenames: () => getOutputFilenames,
1468
1473
  getSubagentsRoot: () => getSubagentsRoot,
1474
+ getTextContent: () => getTextContent,
1469
1475
  getTraceStateRoot: () => getTraceStateRoot,
1470
1476
  getWorkspacePath: () => getWorkspacePath,
1471
1477
  getWorkspacePoolRoot: () => getWorkspacePoolRoot,
1472
1478
  getWorkspacesRoot: () => getWorkspacesRoot,
1473
1479
  initializeBaseline: () => initializeBaseline,
1474
1480
  isAgentSkillsFormat: () => isAgentSkillsFormat,
1481
+ isContent: () => isContent,
1482
+ isContentArray: () => isContentArray,
1475
1483
  isEvaluatorKind: () => isEvaluatorKind,
1476
1484
  isJsonObject: () => isJsonObject,
1477
1485
  isJsonValue: () => isJsonValue,
@@ -1533,6 +1541,29 @@ __export(index_exports, {
1533
1541
  });
1534
1542
  module.exports = __toCommonJS(index_exports);
1535
1543
 
1544
+ // src/evaluation/content.ts
1545
+ var CONTENT_TYPES = /* @__PURE__ */ new Set(["text", "image", "file"]);
1546
+ function isContent(value) {
1547
+ if (!value || typeof value !== "object") return false;
1548
+ const v = value;
1549
+ return typeof v.type === "string" && CONTENT_TYPES.has(v.type);
1550
+ }
1551
+ function isContentArray(value) {
1552
+ return Array.isArray(value) && value.length > 0 && value.every(isContent);
1553
+ }
1554
+ function getTextContent(content) {
1555
+ if (content == null) return "";
1556
+ if (typeof content === "string") return content;
1557
+ if (!Array.isArray(content)) return "";
1558
+ const parts = [];
1559
+ for (const block of content) {
1560
+ if (block.type === "text") {
1561
+ parts.push(block.text);
1562
+ }
1563
+ }
1564
+ return parts.join("\n");
1565
+ }
1566
+
1536
1567
  // src/evaluation/types.ts
1537
1568
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
1538
1569
  var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
@@ -2411,15 +2442,23 @@ var TEMPLATE_VARIABLES = {
2411
2442
  INPUT: "input",
2412
2443
  OUTPUT: "output",
2413
2444
  FILE_CHANGES: "file_changes",
2445
+ /** @deprecated Use INPUT instead — resolves to the same text value. */
2414
2446
  INPUT_TEXT: "input_text",
2447
+ /** @deprecated Use OUTPUT instead — resolves to the same text value. */
2415
2448
  OUTPUT_TEXT: "output_text",
2449
+ /** @deprecated Use EXPECTED_OUTPUT instead — resolves to the same text value. */
2416
2450
  EXPECTED_OUTPUT_TEXT: "expected_output_text"
2417
2451
  };
2418
2452
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
2419
2453
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
2420
- TEMPLATE_VARIABLES.OUTPUT_TEXT,
2454
+ TEMPLATE_VARIABLES.OUTPUT,
2421
2455
  TEMPLATE_VARIABLES.EXPECTED_OUTPUT
2422
2456
  ]);
2457
+ var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
2458
+ [TEMPLATE_VARIABLES.INPUT_TEXT, TEMPLATE_VARIABLES.INPUT],
2459
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT, TEMPLATE_VARIABLES.OUTPUT],
2460
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
2461
+ ]);
2423
2462
 
2424
2463
  // src/evaluation/validation/prompt-validator.ts
2425
2464
  var ANSI_YELLOW3 = "\x1B[33m";
@@ -2441,16 +2480,29 @@ function validateTemplateVariables(content, source) {
2441
2480
  }
2442
2481
  match = variablePattern.exec(content);
2443
2482
  }
2444
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
2483
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
2445
2484
  const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
2446
2485
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
2447
2486
  if (!hasRequiredFields) {
2448
2487
  throw new Error(
2449
2488
  `Missing required fields. Must include at least one of:
2450
- - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
2489
+ - {{ ${TEMPLATE_VARIABLES.OUTPUT} }}
2451
2490
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
2452
2491
  );
2453
2492
  }
2493
+ const deprecatedUsed = [];
2494
+ for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
2495
+ if (foundVariables.has(deprecated)) {
2496
+ deprecatedUsed.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
2497
+ }
2498
+ }
2499
+ if (deprecatedUsed.length > 0) {
2500
+ console.warn(
2501
+ `${ANSI_YELLOW3}Warning: Template at ${source} uses deprecated variable names:
2502
+ ${deprecatedUsed.join("\n ")}
2503
+ These still work but will be removed in a future version.${ANSI_RESET4}`
2504
+ );
2505
+ }
2454
2506
  if (invalidVariables.length > 0) {
2455
2507
  const warningMessage = `${ANSI_YELLOW3}Warning: Custom evaluator template at ${source}
2456
2508
  Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
@@ -3868,6 +3920,19 @@ function asString2(value) {
3868
3920
  }
3869
3921
 
3870
3922
  // src/evaluation/loaders/message-processor.ts
3923
+ var IMAGE_MEDIA_TYPES = {
3924
+ ".png": "image/png",
3925
+ ".jpg": "image/jpeg",
3926
+ ".jpeg": "image/jpeg",
3927
+ ".gif": "image/gif",
3928
+ ".webp": "image/webp",
3929
+ ".svg": "image/svg+xml",
3930
+ ".bmp": "image/bmp"
3931
+ };
3932
+ function detectImageMediaType(filePath) {
3933
+ const ext = import_node_path6.default.extname(filePath).toLowerCase();
3934
+ return IMAGE_MEDIA_TYPES[ext];
3935
+ }
3871
3936
  var ANSI_YELLOW5 = "\x1B[33m";
3872
3937
  var ANSI_RESET6 = "\x1B[0m";
3873
3938
  async function processMessages(options) {
@@ -3933,6 +3998,47 @@ async function processMessages(options) {
3933
3998
  }
3934
3999
  continue;
3935
4000
  }
4001
+ if (segmentType === "image") {
4002
+ const rawValue = asString3(rawSegment.value);
4003
+ if (!rawValue) {
4004
+ continue;
4005
+ }
4006
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
4007
+ rawValue,
4008
+ searchRoots
4009
+ );
4010
+ if (!resolvedPath) {
4011
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
4012
+ const context2 = messageType === "input" ? "" : " in expected_output";
4013
+ logWarning3(`Image file not found${context2}: ${displayPath}`, attempts);
4014
+ continue;
4015
+ }
4016
+ const mediaType = detectImageMediaType(resolvedPath);
4017
+ if (!mediaType) {
4018
+ logWarning3(
4019
+ `Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
4020
+ );
4021
+ continue;
4022
+ }
4023
+ try {
4024
+ const imageBuffer = await (0, import_promises6.readFile)(resolvedPath);
4025
+ const base64 = imageBuffer.toString("base64");
4026
+ processedContent.push({
4027
+ type: "image",
4028
+ media_type: mediaType,
4029
+ source: `data:${mediaType};base64,${base64}`
4030
+ });
4031
+ if (verbose) {
4032
+ const label = messageType === "input" ? "[Image]" : "[Expected Output Image]";
4033
+ console.log(` ${label} Found: ${displayPath}`);
4034
+ console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
4035
+ }
4036
+ } catch (error) {
4037
+ const context2 = messageType === "input" ? "" : " expected output";
4038
+ logWarning3(`Could not read${context2} image ${resolvedPath}: ${error.message}`);
4039
+ }
4040
+ continue;
4041
+ }
3936
4042
  const clonedSegment = cloneJsonObject(rawSegment);
3937
4043
  processedContent.push(clonedSegment);
3938
4044
  const inlineValue = clonedSegment.value;
@@ -4010,6 +4116,46 @@ async function processExpectedMessages(options) {
4010
4116
  }
4011
4117
  continue;
4012
4118
  }
4119
+ if (segmentType === "image") {
4120
+ const rawValue = asString3(rawSegment.value);
4121
+ if (!rawValue) {
4122
+ continue;
4123
+ }
4124
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference2(
4125
+ rawValue,
4126
+ searchRoots
4127
+ );
4128
+ if (!resolvedPath) {
4129
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
4130
+ logWarning3(`Image file not found in expected_output: ${displayPath}`, attempts);
4131
+ continue;
4132
+ }
4133
+ const mediaType = detectImageMediaType(resolvedPath);
4134
+ if (!mediaType) {
4135
+ logWarning3(
4136
+ `Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
4137
+ );
4138
+ continue;
4139
+ }
4140
+ try {
4141
+ const imageBuffer = await (0, import_promises6.readFile)(resolvedPath);
4142
+ const base64 = imageBuffer.toString("base64");
4143
+ processedContent.push({
4144
+ type: "image",
4145
+ media_type: mediaType,
4146
+ source: `data:${mediaType};base64,${base64}`
4147
+ });
4148
+ if (verbose) {
4149
+ console.log(` [Expected Output Image] Found: ${displayPath}`);
4150
+ console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
4151
+ }
4152
+ } catch (error) {
4153
+ logWarning3(
4154
+ `Could not read expected output image ${resolvedPath}: ${error.message}`
4155
+ );
4156
+ }
4157
+ continue;
4158
+ }
4013
4159
  processedContent.push(cloneJsonObject(rawSegment));
4014
4160
  }
4015
4161
  segment.content = processedContent;
@@ -4256,7 +4402,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
4256
4402
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
4257
4403
  const testCase = {
4258
4404
  id,
4259
- eval_set: evalSetName,
4405
+ dataset: evalSetName,
4260
4406
  conversation_id: conversationId,
4261
4407
  question,
4262
4408
  input: inputMessages,
@@ -4527,7 +4673,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4527
4673
  }
4528
4674
  const suite = interpolated;
4529
4675
  const evalSetNameFromSuite = asString5(suite.name)?.trim();
4530
- const fallbackEvalSet = import_node_path8.default.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
4676
+ const fallbackEvalSet = import_node_path8.default.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
4531
4677
  const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
4532
4678
  const rawTestcases = resolveTests(suite);
4533
4679
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
@@ -4648,7 +4794,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
4648
4794
  const caseTargets = extractTargetsFromTestCase(evalcase);
4649
4795
  const testCase = {
4650
4796
  id,
4651
- eval_set: evalSetName,
4797
+ dataset: evalSetName,
4798
+ category: options?.category,
4652
4799
  conversation_id: conversationId,
4653
4800
  question,
4654
4801
  input: inputMessages,
@@ -5690,6 +5837,48 @@ var import_node_fs4 = require("fs");
5690
5837
  var import_promises10 = require("fs/promises");
5691
5838
  var import_node_path12 = __toESM(require("path"), 1);
5692
5839
 
5840
+ // src/evaluation/providers/claude-content.ts
5841
+ function toContentArray(content) {
5842
+ if (!Array.isArray(content)) return void 0;
5843
+ let hasNonText = false;
5844
+ const blocks = [];
5845
+ for (const part of content) {
5846
+ if (!part || typeof part !== "object") continue;
5847
+ const p = part;
5848
+ if (p.type === "text" && typeof p.text === "string") {
5849
+ blocks.push({ type: "text", text: p.text });
5850
+ } else if (p.type === "image" && typeof p.source === "object" && p.source !== null) {
5851
+ const src = p.source;
5852
+ const mediaType = typeof p.media_type === "string" ? p.media_type : typeof src.media_type === "string" ? src.media_type : "application/octet-stream";
5853
+ const data = typeof src.data === "string" ? `data:${mediaType};base64,${src.data}` : typeof p.url === "string" ? p.url : "";
5854
+ blocks.push({ type: "image", media_type: mediaType, source: data });
5855
+ hasNonText = true;
5856
+ } else if (p.type === "tool_use") {
5857
+ } else if (p.type === "tool_result") {
5858
+ }
5859
+ }
5860
+ return hasNonText && blocks.length > 0 ? blocks : void 0;
5861
+ }
5862
+ function extractTextContent(content) {
5863
+ if (typeof content === "string") {
5864
+ return content;
5865
+ }
5866
+ if (!Array.isArray(content)) {
5867
+ return void 0;
5868
+ }
5869
+ const textParts = [];
5870
+ for (const part of content) {
5871
+ if (!part || typeof part !== "object") {
5872
+ continue;
5873
+ }
5874
+ const p = part;
5875
+ if (p.type === "text" && typeof p.text === "string") {
5876
+ textParts.push(p.text);
5877
+ }
5878
+ }
5879
+ return textParts.length > 0 ? textParts.join("\n") : void 0;
5880
+ }
5881
+
5693
5882
  // src/evaluation/providers/claude-log-tracker.ts
5694
5883
  var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeLogs");
5695
5884
  var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeLogSubscribers");
@@ -5855,11 +6044,12 @@ var ClaudeCliProvider = class {
5855
6044
  if (betaMessage && typeof betaMessage === "object") {
5856
6045
  const msg = betaMessage;
5857
6046
  const content = msg.content;
6047
+ const structuredContent = toContentArray(content);
5858
6048
  const textContent = extractTextContent(content);
5859
6049
  const toolCalls = extractToolCalls(content);
5860
6050
  const outputMsg = {
5861
6051
  role: "assistant",
5862
- content: textContent,
6052
+ content: structuredContent ?? textContent,
5863
6053
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0
5864
6054
  };
5865
6055
  output.push(outputMsg);
@@ -6198,25 +6388,6 @@ function summarizeEvent(event) {
6198
6388
  return void 0;
6199
6389
  }
6200
6390
  }
6201
- function extractTextContent(content) {
6202
- if (typeof content === "string") {
6203
- return content;
6204
- }
6205
- if (!Array.isArray(content)) {
6206
- return void 0;
6207
- }
6208
- const textParts = [];
6209
- for (const part of content) {
6210
- if (!part || typeof part !== "object") {
6211
- continue;
6212
- }
6213
- const p = part;
6214
- if (p.type === "text" && typeof p.text === "string") {
6215
- textParts.push(p.text);
6216
- }
6217
- }
6218
- return textParts.length > 0 ? textParts.join("\n") : void 0;
6219
- }
6220
6391
  function extractToolCalls(content) {
6221
6392
  if (!Array.isArray(content)) {
6222
6393
  return [];
@@ -6389,11 +6560,12 @@ var ClaudeSdkProvider = class {
6389
6560
  if (betaMessage && typeof betaMessage === "object") {
6390
6561
  const msg = betaMessage;
6391
6562
  const content = msg.content;
6392
- const textContent = extractTextContent2(content);
6563
+ const structuredContent = toContentArray(content);
6564
+ const textContent = extractTextContent(content);
6393
6565
  const toolCalls = extractToolCalls2(content);
6394
6566
  const outputMsg = {
6395
6567
  role: "assistant",
6396
- content: textContent,
6568
+ content: structuredContent ?? textContent,
6397
6569
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0
6398
6570
  };
6399
6571
  output.push(outputMsg);
@@ -6511,25 +6683,6 @@ var ClaudeSdkProvider = class {
6511
6683
  }
6512
6684
  }
6513
6685
  };
6514
- function extractTextContent2(content) {
6515
- if (typeof content === "string") {
6516
- return content;
6517
- }
6518
- if (!Array.isArray(content)) {
6519
- return void 0;
6520
- }
6521
- const textParts = [];
6522
- for (const part of content) {
6523
- if (!part || typeof part !== "object") {
6524
- continue;
6525
- }
6526
- const p = part;
6527
- if (p.type === "text" && typeof p.text === "string") {
6528
- textParts.push(p.text);
6529
- }
6530
- }
6531
- return textParts.length > 0 ? textParts.join("\n") : void 0;
6532
- }
6533
6686
  function extractToolCalls2(content) {
6534
6687
  if (!Array.isArray(content)) {
6535
6688
  return [];
@@ -6753,7 +6906,7 @@ function convertMessages(messages) {
6753
6906
  return messages.map((msg) => ({
6754
6907
  role: msg.role,
6755
6908
  name: msg.name,
6756
- content: msg.content,
6909
+ content: isContentArray(msg.content) ? msg.content : typeof msg.content === "string" ? msg.content : void 0,
6757
6910
  toolCalls: msg.tool_calls?.map((tc) => ({
6758
6911
  tool: tc.tool,
6759
6912
  input: tc.input,
@@ -9007,6 +9160,35 @@ function extractPiTextContent(content) {
9007
9160
  }
9008
9161
  return textParts.length > 0 ? textParts.join("\n") : void 0;
9009
9162
  }
9163
+ function toPiContentArray(content) {
9164
+ if (!Array.isArray(content)) return void 0;
9165
+ let hasNonText = false;
9166
+ const blocks = [];
9167
+ for (const part of content) {
9168
+ if (!part || typeof part !== "object") continue;
9169
+ const p = part;
9170
+ if (p.type === "text" && typeof p.text === "string") {
9171
+ blocks.push({ type: "text", text: p.text });
9172
+ } else if (p.type === "image") {
9173
+ const mediaType = typeof p.media_type === "string" ? p.media_type : "application/octet-stream";
9174
+ let source = "";
9175
+ if (typeof p.source === "object" && p.source !== null) {
9176
+ const src = p.source;
9177
+ const srcMediaType = typeof src.media_type === "string" ? src.media_type : mediaType;
9178
+ source = typeof src.data === "string" ? `data:${srcMediaType};base64,${src.data}` : "";
9179
+ }
9180
+ if (!source && typeof p.url === "string") {
9181
+ source = p.url;
9182
+ }
9183
+ if (source) {
9184
+ blocks.push({ type: "image", media_type: mediaType, source });
9185
+ hasNonText = true;
9186
+ }
9187
+ } else if (p.type === "tool_use" || p.type === "tool_result") {
9188
+ }
9189
+ }
9190
+ return hasNonText && blocks.length > 0 ? blocks : void 0;
9191
+ }
9010
9192
  function toFiniteNumber(value) {
9011
9193
  if (typeof value === "number" && Number.isFinite(value)) return value;
9012
9194
  return void 0;
@@ -10178,7 +10360,8 @@ function convertAgentMessage(message, toolTrackers, completedToolResults) {
10178
10360
  }
10179
10361
  const msg = message;
10180
10362
  const role = typeof msg.role === "string" ? msg.role : "unknown";
10181
- const content = extractPiTextContent(msg.content);
10363
+ const structuredContent = toPiContentArray(msg.content);
10364
+ const content = structuredContent ?? extractPiTextContent(msg.content);
10182
10365
  const toolCalls = extractToolCalls4(msg.content, toolTrackers, completedToolResults);
10183
10366
  const startTimeVal = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
10184
10367
  let msgTokenUsage;
@@ -10440,6 +10623,12 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
10440
10623
  "FILES",
10441
10624
  "OUTPUT_FILE"
10442
10625
  ]);
10626
+ var COMMON_TARGET_SETTINGS = [
10627
+ "provider_batching",
10628
+ "providerBatching",
10629
+ "subagent_mode_allowed",
10630
+ "subagentModeAllowed"
10631
+ ];
10443
10632
  var BASE_TARGET_SCHEMA = import_zod3.z.object({
10444
10633
  name: import_zod3.z.string().min(1, "target name is required"),
10445
10634
  provider: import_zod3.z.string().min(1, "provider is required"),
@@ -10448,7 +10637,8 @@ var BASE_TARGET_SCHEMA = import_zod3.z.object({
10448
10637
  // backward compat
10449
10638
  workers: import_zod3.z.number().int().min(1).optional(),
10450
10639
  workspace_template: import_zod3.z.string().optional(),
10451
- workspaceTemplate: import_zod3.z.string().optional()
10640
+ workspaceTemplate: import_zod3.z.string().optional(),
10641
+ subagent_mode_allowed: import_zod3.z.boolean().optional()
10452
10642
  }).passthrough();
10453
10643
  var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
10454
10644
  var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
@@ -10511,42 +10701,40 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
10511
10701
  const providerBatching = resolveOptionalBoolean(
10512
10702
  parsed.provider_batching ?? parsed.providerBatching
10513
10703
  );
10704
+ const subagentModeAllowed = resolveOptionalBoolean(
10705
+ parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
10706
+ );
10707
+ const base = {
10708
+ name: parsed.name,
10709
+ graderTarget: parsed.grader_target ?? parsed.judge_target,
10710
+ workers: parsed.workers,
10711
+ providerBatching,
10712
+ subagentModeAllowed
10713
+ };
10514
10714
  switch (provider) {
10515
10715
  case "openai":
10516
10716
  return {
10517
10717
  kind: "openai",
10518
- name: parsed.name,
10519
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10520
- workers: parsed.workers,
10521
- providerBatching,
10718
+ ...base,
10522
10719
  config: resolveOpenAIConfig(parsed, env)
10523
10720
  };
10524
10721
  case "openrouter":
10525
10722
  return {
10526
10723
  kind: "openrouter",
10527
- name: parsed.name,
10528
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10529
- workers: parsed.workers,
10530
- providerBatching,
10724
+ ...base,
10531
10725
  config: resolveOpenRouterConfig(parsed, env)
10532
10726
  };
10533
10727
  case "azure":
10534
10728
  case "azure-openai":
10535
10729
  return {
10536
10730
  kind: "azure",
10537
- name: parsed.name,
10538
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10539
- workers: parsed.workers,
10540
- providerBatching,
10731
+ ...base,
10541
10732
  config: resolveAzureConfig(parsed, env)
10542
10733
  };
10543
10734
  case "anthropic":
10544
10735
  return {
10545
10736
  kind: "anthropic",
10546
- name: parsed.name,
10547
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10548
- workers: parsed.workers,
10549
- providerBatching,
10737
+ ...base,
10550
10738
  config: resolveAnthropicConfig(parsed, env)
10551
10739
  };
10552
10740
  case "gemini":
@@ -10554,68 +10742,47 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
10554
10742
  case "google-gemini":
10555
10743
  return {
10556
10744
  kind: "gemini",
10557
- name: parsed.name,
10558
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10559
- workers: parsed.workers,
10560
- providerBatching,
10745
+ ...base,
10561
10746
  config: resolveGeminiConfig(parsed, env)
10562
10747
  };
10563
10748
  case "codex":
10564
10749
  case "codex-cli":
10565
10750
  return {
10566
10751
  kind: "codex",
10567
- name: parsed.name,
10568
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10569
- workers: parsed.workers,
10570
- providerBatching,
10752
+ ...base,
10571
10753
  config: resolveCodexConfig(parsed, env, evalFilePath)
10572
10754
  };
10573
10755
  case "copilot-sdk":
10574
10756
  case "copilot_sdk":
10575
10757
  return {
10576
10758
  kind: "copilot-sdk",
10577
- name: parsed.name,
10578
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10579
- workers: parsed.workers,
10580
- providerBatching,
10759
+ ...base,
10581
10760
  config: resolveCopilotSdkConfig(parsed, env, evalFilePath)
10582
10761
  };
10583
10762
  case "copilot":
10584
10763
  case "copilot-cli":
10585
10764
  return {
10586
10765
  kind: "copilot-cli",
10587
- name: parsed.name,
10588
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10589
- workers: parsed.workers,
10590
- providerBatching,
10766
+ ...base,
10591
10767
  config: resolveCopilotCliConfig(parsed, env, evalFilePath)
10592
10768
  };
10593
10769
  case "copilot-log":
10594
10770
  return {
10595
10771
  kind: "copilot-log",
10596
- name: parsed.name,
10597
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10598
- workers: parsed.workers,
10599
- providerBatching,
10772
+ ...base,
10600
10773
  config: resolveCopilotLogConfig(parsed, env)
10601
10774
  };
10602
10775
  case "pi":
10603
10776
  case "pi-coding-agent":
10604
10777
  return {
10605
10778
  kind: "pi-coding-agent",
10606
- name: parsed.name,
10607
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10608
- workers: parsed.workers,
10609
- providerBatching,
10779
+ ...base,
10610
10780
  config: resolvePiCodingAgentConfig(parsed, env, evalFilePath)
10611
10781
  };
10612
10782
  case "pi-cli":
10613
10783
  return {
10614
10784
  kind: "pi-cli",
10615
- name: parsed.name,
10616
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10617
- workers: parsed.workers,
10618
- providerBatching,
10785
+ ...base,
10619
10786
  config: resolvePiCliConfig(parsed, env, evalFilePath)
10620
10787
  };
10621
10788
  case "claude":
@@ -10623,38 +10790,26 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
10623
10790
  case "claude-cli":
10624
10791
  return {
10625
10792
  kind: "claude-cli",
10626
- name: parsed.name,
10627
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10628
- workers: parsed.workers,
10629
- providerBatching,
10793
+ ...base,
10630
10794
  config: resolveClaudeConfig(parsed, env, evalFilePath)
10631
10795
  };
10632
10796
  case "claude-sdk":
10633
10797
  return {
10634
10798
  kind: "claude-sdk",
10635
- name: parsed.name,
10636
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10637
- workers: parsed.workers,
10638
- providerBatching,
10799
+ ...base,
10639
10800
  config: resolveClaudeConfig(parsed, env, evalFilePath)
10640
10801
  };
10641
10802
  case "mock":
10642
10803
  return {
10643
10804
  kind: "mock",
10644
- name: parsed.name,
10645
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10646
- workers: parsed.workers,
10647
- providerBatching,
10805
+ ...base,
10648
10806
  config: resolveMockConfig(parsed)
10649
10807
  };
10650
10808
  case "vscode":
10651
10809
  case "vscode-insiders":
10652
10810
  return {
10653
10811
  kind: provider,
10654
- name: parsed.name,
10655
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10656
- workers: parsed.workers,
10657
- providerBatching,
10812
+ ...base,
10658
10813
  config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders", evalFilePath)
10659
10814
  };
10660
10815
  case "agentv": {
@@ -10667,29 +10822,21 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
10667
10822
  const temperature = typeof parsed.temperature === "number" ? parsed.temperature : 0;
10668
10823
  return {
10669
10824
  kind: "agentv",
10670
- name: parsed.name,
10671
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10825
+ ...base,
10672
10826
  workers: typeof parsed.workers === "number" ? parsed.workers : void 0,
10673
- providerBatching,
10674
10827
  config: { model, temperature }
10675
10828
  };
10676
10829
  }
10677
10830
  case "cli":
10678
10831
  return {
10679
10832
  kind: "cli",
10680
- name: parsed.name,
10681
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10682
- workers: parsed.workers,
10683
- providerBatching,
10833
+ ...base,
10684
10834
  config: resolveCliConfig(parsed, env, evalFilePath)
10685
10835
  };
10686
10836
  default:
10687
10837
  return {
10688
10838
  kind: "cli",
10689
- name: parsed.name,
10690
- graderTarget: parsed.grader_target ?? parsed.judge_target,
10691
- workers: parsed.workers,
10692
- providerBatching,
10839
+ ...base,
10693
10840
  config: resolveDiscoveredProviderConfig(parsed, provider, env, evalFilePath)
10694
10841
  };
10695
10842
  }
@@ -11317,8 +11464,8 @@ function resolveCliConfig(target, env, evalFilePath) {
11317
11464
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
11318
11465
  if (!parseResult.success) {
11319
11466
  const firstError = parseResult.error.errors[0];
11320
- const path50 = firstError?.path.join(".") || "";
11321
- const prefix = path50 ? `${target.name} ${path50}: ` : `${target.name}: `;
11467
+ const path51 = firstError?.path.join(".") || "";
11468
+ const prefix = path51 ? `${target.name} ${path51}: ` : `${target.name}: `;
11322
11469
  throw new Error(`${prefix}${firstError?.message}`);
11323
11470
  }
11324
11471
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -13007,6 +13154,41 @@ total unlocked subagents available: ${result.created.length + result.skippedExis
13007
13154
  }
13008
13155
  }
13009
13156
 
13157
+ // src/evaluation/providers/types.ts
13158
+ var AGENT_PROVIDER_KINDS = [
13159
+ "codex",
13160
+ "copilot-sdk",
13161
+ "copilot-cli",
13162
+ "pi-coding-agent",
13163
+ "pi-cli",
13164
+ "claude",
13165
+ "claude-cli",
13166
+ "claude-sdk",
13167
+ "vscode",
13168
+ "vscode-insiders"
13169
+ ];
13170
+ function extractLastAssistantContent(messages) {
13171
+ if (!messages || messages.length === 0) {
13172
+ return "";
13173
+ }
13174
+ for (let i = messages.length - 1; i >= 0; i--) {
13175
+ const msg = messages[i];
13176
+ if (msg.role === "assistant" && msg.content !== void 0) {
13177
+ if (typeof msg.content === "string") {
13178
+ return msg.content;
13179
+ }
13180
+ if (isContentArray(msg.content)) {
13181
+ return getTextContent(msg.content);
13182
+ }
13183
+ return JSON.stringify(msg.content);
13184
+ }
13185
+ }
13186
+ return "";
13187
+ }
13188
+ function isAgentProvider(provider) {
13189
+ return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
13190
+ }
13191
+
13010
13192
  // src/evaluation/providers/targets-file.ts
13011
13193
  var import_node_fs11 = require("fs");
13012
13194
  var import_promises27 = require("fs/promises");
@@ -13319,13 +13501,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
13319
13501
  async function execShellWithStdin(command, stdinPayload, options = {}) {
13320
13502
  const { mkdir: mkdir17, readFile: readFile16, rm: rm6, writeFile: writeFile9 } = await import("fs/promises");
13321
13503
  const { tmpdir: tmpdir3 } = await import("os");
13322
- const path50 = await import("path");
13504
+ const path51 = await import("path");
13323
13505
  const { randomUUID: randomUUID10 } = await import("crypto");
13324
- const dir = path50.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
13506
+ const dir = path51.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
13325
13507
  await mkdir17(dir, { recursive: true });
13326
- const stdinPath = path50.join(dir, "stdin.txt");
13327
- const stdoutPath = path50.join(dir, "stdout.txt");
13328
- const stderrPath = path50.join(dir, "stderr.txt");
13508
+ const stdinPath = path51.join(dir, "stdin.txt");
13509
+ const stdoutPath = path51.join(dir, "stdout.txt");
13510
+ const stderrPath = path51.join(dir, "stderr.txt");
13329
13511
  await writeFile9(stdinPath, stdinPayload, "utf8");
13330
13512
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
13331
13513
  const { spawn: spawn5 } = await import("child_process");
@@ -13457,7 +13639,7 @@ async function createTargetProxy(options) {
13457
13639
  totalOutputTokens += response.tokenUsage.output;
13458
13640
  }
13459
13641
  const output = response.output ?? [];
13460
- const rawText = extractLastAssistantContent(output);
13642
+ const rawText = extractLastAssistantContent2(output);
13461
13643
  const result = {
13462
13644
  output,
13463
13645
  rawText,
@@ -13515,7 +13697,7 @@ async function createTargetProxy(options) {
13515
13697
  const output = response.output ?? [];
13516
13698
  responses.push({
13517
13699
  output,
13518
- rawText: extractLastAssistantContent(output),
13700
+ rawText: extractLastAssistantContent2(output),
13519
13701
  tokenUsage: response.tokenUsage
13520
13702
  });
13521
13703
  } catch (error) {
@@ -13572,7 +13754,7 @@ function readBody(req) {
13572
13754
  req.on("error", reject);
13573
13755
  });
13574
13756
  }
13575
- function extractLastAssistantContent(messages) {
13757
+ function extractLastAssistantContent2(messages) {
13576
13758
  for (let i = messages.length - 1; i >= 0; i--) {
13577
13759
  const msg = messages[i];
13578
13760
  if (msg.role === "assistant" && msg.content !== void 0) {
@@ -13641,6 +13823,56 @@ function toCamelCaseDeep(obj) {
13641
13823
 
13642
13824
  // src/evaluation/evaluators/code-evaluator.ts
13643
13825
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
13826
+ var DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
13827
+ async function materializeContentForGrader(messages, getWorkDir) {
13828
+ if (!messages || messages.length === 0) return messages ?? null;
13829
+ let hasAnyImage = false;
13830
+ for (const msg of messages) {
13831
+ if (isContentArray(msg.content)) {
13832
+ for (const block of msg.content) {
13833
+ if (block.type === "image") {
13834
+ hasAnyImage = true;
13835
+ break;
13836
+ }
13837
+ }
13838
+ }
13839
+ if (hasAnyImage) break;
13840
+ }
13841
+ if (!hasAnyImage) return messages;
13842
+ let counter = 0;
13843
+ const result = [];
13844
+ for (const msg of messages) {
13845
+ if (!isContentArray(msg.content)) {
13846
+ result.push(msg);
13847
+ continue;
13848
+ }
13849
+ if (!msg.content.some((b) => b.type === "image")) {
13850
+ result.push(msg);
13851
+ continue;
13852
+ }
13853
+ const blocks = [];
13854
+ for (const block of msg.content) {
13855
+ if (block.type !== "image") {
13856
+ blocks.push({ ...block });
13857
+ continue;
13858
+ }
13859
+ const img = block;
13860
+ const match = DATA_URI_RE.exec(img.source);
13861
+ if (match) {
13862
+ const [, mediaType, base64Data] = match;
13863
+ const ext = mediaType.split("/")[1] === "jpeg" ? "jpg" : mediaType.split("/")[1] ?? "bin";
13864
+ const dir = await getWorkDir();
13865
+ const filePath = (0, import_node_path38.join)(dir, `img-${counter++}.${ext}`);
13866
+ await (0, import_promises28.writeFile)(filePath, Buffer.from(base64Data, "base64"));
13867
+ blocks.push({ type: "image", media_type: img.media_type, path: filePath });
13868
+ } else {
13869
+ blocks.push({ type: "image", media_type: img.media_type, path: img.source });
13870
+ }
13871
+ }
13872
+ result.push({ ...msg, content: blocks });
13873
+ }
13874
+ return result;
13875
+ }
13644
13876
  var CodeEvaluator = class {
13645
13877
  kind = "code-grader";
13646
13878
  command;
@@ -13656,7 +13888,18 @@ var CodeEvaluator = class {
13656
13888
  this.target = options.target;
13657
13889
  }
13658
13890
  async evaluate(context2) {
13659
- let outputForPayload = context2.output ?? null;
13891
+ let imageTmpDir;
13892
+ const getImageDir = async () => {
13893
+ if (!imageTmpDir) {
13894
+ imageTmpDir = await (0, import_promises28.mkdtemp)((0, import_node_path38.join)((0, import_node_os7.tmpdir)(), "agentv-img-"));
13895
+ }
13896
+ return imageTmpDir;
13897
+ };
13898
+ const materializedOutput = await materializeContentForGrader(
13899
+ context2.output,
13900
+ getImageDir
13901
+ );
13902
+ let outputForPayload = materializedOutput;
13660
13903
  let outputPath;
13661
13904
  if (outputForPayload) {
13662
13905
  const serialized = JSON.stringify(outputForPayload);
@@ -13669,12 +13912,17 @@ var CodeEvaluator = class {
13669
13912
  }
13670
13913
  const payload = {
13671
13914
  criteria: context2.evalCase.criteria,
13672
- expectedOutput: context2.evalCase.expected_output,
13673
- outputText: context2.candidate,
13915
+ expectedOutput: await materializeContentForGrader(
13916
+ context2.evalCase.expected_output,
13917
+ getImageDir
13918
+ ),
13674
13919
  output: outputForPayload,
13675
13920
  outputPath,
13676
13921
  inputFiles: context2.evalCase.file_paths,
13677
- input: context2.evalCase.input,
13922
+ input: await materializeContentForGrader(
13923
+ context2.evalCase.input,
13924
+ getImageDir
13925
+ ),
13678
13926
  trace: context2.trace ?? null,
13679
13927
  tokenUsage: context2.tokenUsage ?? null,
13680
13928
  costUsd: context2.costUsd ?? null,
@@ -13683,9 +13931,7 @@ var CodeEvaluator = class {
13683
13931
  endTime: context2.endTime ?? null,
13684
13932
  fileChanges: context2.fileChanges ?? null,
13685
13933
  workspacePath: context2.workspacePath ?? null,
13686
- config: this.config ?? null,
13687
- inputText: context2.evalCase.question,
13688
- expectedOutputText: context2.evalCase.reference_answer ?? ""
13934
+ config: this.config ?? null
13689
13935
  };
13690
13936
  const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
13691
13937
  let proxyEnv;
@@ -13775,6 +14021,10 @@ var CodeEvaluator = class {
13775
14021
  await (0, import_promises28.rm)((0, import_node_path38.dirname)(outputPath), { recursive: true, force: true }).catch(() => {
13776
14022
  });
13777
14023
  }
14024
+ if (imageTmpDir) {
14025
+ await (0, import_promises28.rm)(imageTmpDir, { recursive: true, force: true }).catch(() => {
14026
+ });
14027
+ }
13778
14028
  }
13779
14029
  }
13780
14030
  };
@@ -13802,38 +14052,6 @@ ${tail}`;
13802
14052
  // src/evaluation/evaluators/composite.ts
13803
14053
  var import_ai3 = require("ai");
13804
14054
 
13805
- // src/evaluation/providers/types.ts
13806
- var AGENT_PROVIDER_KINDS = [
13807
- "codex",
13808
- "copilot-sdk",
13809
- "copilot-cli",
13810
- "pi-coding-agent",
13811
- "pi-cli",
13812
- "claude",
13813
- "claude-cli",
13814
- "claude-sdk",
13815
- "vscode",
13816
- "vscode-insiders"
13817
- ];
13818
- function extractLastAssistantContent2(messages) {
13819
- if (!messages || messages.length === 0) {
13820
- return "";
13821
- }
13822
- for (let i = messages.length - 1; i >= 0; i--) {
13823
- const msg = messages[i];
13824
- if (msg.role === "assistant" && msg.content !== void 0) {
13825
- if (typeof msg.content === "string") {
13826
- return msg.content;
13827
- }
13828
- return JSON.stringify(msg.content);
13829
- }
13830
- }
13831
- return "";
13832
- }
13833
- function isAgentProvider(provider) {
13834
- return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
13835
- }
13836
-
13837
14055
  // src/evaluation/evaluators/llm-grader.ts
13838
14056
  var import_promises29 = __toESM(require("fs/promises"), 1);
13839
14057
  var import_node_path39 = __toESM(require("path"), 1);
@@ -13884,13 +14102,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
13884
14102
  {{${TEMPLATE_VARIABLES.CRITERIA}}}
13885
14103
 
13886
14104
  [[ ## question ## ]]
13887
- {{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
14105
+ {{${TEMPLATE_VARIABLES.INPUT}}}
13888
14106
 
13889
14107
  [[ ## reference_answer ## ]]
13890
- {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
14108
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT}}}
13891
14109
 
13892
14110
  [[ ## answer ## ]]
13893
- {{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
14111
+ {{${TEMPLATE_VARIABLES.OUTPUT}}}`;
13894
14112
  var freeformEvaluationSchema = import_zod4.z.object({
13895
14113
  score: import_zod4.z.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
13896
14114
  assertions: import_zod4.z.array(
@@ -13962,21 +14180,19 @@ var LlmGraderEvaluator = class {
13962
14180
  async evaluateFreeform(context2, graderProvider) {
13963
14181
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
13964
14182
  const variables = {
13965
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context2.evalCase.input, null, 2),
13966
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(
13967
- context2.evalCase.expected_output,
13968
- null,
13969
- 2
13970
- ),
13971
- [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
14183
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
14184
+ [TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
14185
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
13972
14186
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
13973
14187
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
14188
+ // Deprecated aliases — same values as the primary variables above
13974
14189
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
13975
14190
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
13976
14191
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
13977
14192
  };
13978
14193
  const systemPrompt = buildOutputSchema();
13979
14194
  const evaluatorTemplate = context2.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
14195
+ warnDeprecatedTemplateVars(evaluatorTemplate);
13980
14196
  let userPrompt = substituteVariables(evaluatorTemplate, variables);
13981
14197
  if (context2.fileChanges && !context2.evaluatorTemplateOverride && !this.evaluatorTemplate) {
13982
14198
  userPrompt += `
@@ -13988,13 +14204,15 @@ ${context2.fileChanges}`;
13988
14204
  userPrompt,
13989
14205
  systemPrompt
13990
14206
  };
14207
+ const images = context2.output ? extractImageBlocks(context2.output) : [];
13991
14208
  try {
13992
14209
  const { data, tokenUsage } = await this.runWithRetry({
13993
14210
  context: context2,
13994
14211
  graderProvider,
13995
14212
  systemPrompt,
13996
14213
  userPrompt,
13997
- schema: freeformEvaluationSchema
14214
+ schema: freeformEvaluationSchema,
14215
+ images
13998
14216
  });
13999
14217
  const score = clampScore(data.score);
14000
14218
  const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
@@ -14038,13 +14256,15 @@ ${context2.fileChanges}`;
14038
14256
  userPrompt: prompt,
14039
14257
  systemPrompt
14040
14258
  };
14259
+ const images = context2.output ? extractImageBlocks(context2.output) : [];
14041
14260
  try {
14042
14261
  const { data, tokenUsage } = await this.runWithRetry({
14043
14262
  context: context2,
14044
14263
  graderProvider,
14045
14264
  systemPrompt,
14046
14265
  userPrompt: prompt,
14047
- schema: rubricEvaluationSchema
14266
+ schema: rubricEvaluationSchema,
14267
+ images
14048
14268
  });
14049
14269
  const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
14050
14270
  return {
@@ -14081,13 +14301,15 @@ ${context2.fileChanges}`;
14081
14301
  userPrompt: prompt,
14082
14302
  systemPrompt
14083
14303
  };
14304
+ const images = context2.output ? extractImageBlocks(context2.output) : [];
14084
14305
  try {
14085
14306
  const { data, tokenUsage } = await this.runWithRetry({
14086
14307
  context: context2,
14087
14308
  graderProvider,
14088
14309
  systemPrompt,
14089
14310
  userPrompt: prompt,
14090
- schema: scoreRangeEvaluationSchema
14311
+ schema: scoreRangeEvaluationSchema,
14312
+ images
14091
14313
  });
14092
14314
  const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
14093
14315
  return {
@@ -14217,7 +14439,7 @@ ${context2.fileChanges}`;
14217
14439
  evalCaseId: context2.evalCase.id,
14218
14440
  attempt: context2.attempt
14219
14441
  });
14220
- const assistantContent = extractLastAssistantContent2(response.output);
14442
+ const assistantContent = extractLastAssistantContent(response.output);
14221
14443
  if (!assistantContent) {
14222
14444
  return {
14223
14445
  score: 0,
@@ -14294,12 +14516,17 @@ ${context2.fileChanges}`;
14294
14516
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
14295
14517
  const variables = {
14296
14518
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
14519
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
14520
+ [TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
14521
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
14522
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
14523
+ // Deprecated aliases
14297
14524
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
14298
14525
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
14299
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
14300
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
14526
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
14301
14527
  };
14302
14528
  if (this.evaluatorTemplate) {
14529
+ warnDeprecatedTemplateVars(this.evaluatorTemplate);
14303
14530
  return substituteVariables(this.evaluatorTemplate, variables);
14304
14531
  }
14305
14532
  const config = context2.evaluator;
@@ -14350,11 +14577,16 @@ ${context2.fileChanges}`;
14350
14577
  if (this.evaluatorTemplate) {
14351
14578
  const variables = {
14352
14579
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
14580
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
14581
+ [TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
14582
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
14583
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
14584
+ // Deprecated aliases
14353
14585
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
14354
14586
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
14355
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
14356
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
14587
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
14357
14588
  };
14589
+ warnDeprecatedTemplateVars(this.evaluatorTemplate);
14358
14590
  const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
14359
14591
  const outputSchema = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
14360
14592
  return `${customPrompt}
@@ -14525,18 +14757,35 @@ ${outputSchema}`;
14525
14757
  // LLM mode retry logic
14526
14758
  // ---------------------------------------------------------------------------
14527
14759
  async runWithRetry(options) {
14528
- const { context: context2, graderProvider, systemPrompt, userPrompt, schema } = options;
14760
+ const { context: context2, graderProvider, systemPrompt, userPrompt, schema, images } = options;
14529
14761
  let lastError;
14530
14762
  for (let attempt = 1; attempt <= 3; attempt++) {
14531
14763
  try {
14532
14764
  const model = graderProvider.asLanguageModel?.();
14533
14765
  if (model) {
14534
- const result = await (0, import_ai2.generateText)({
14766
+ const modelOptions = {
14767
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
14768
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
14769
+ };
14770
+ const hasImages = images && images.length > 0;
14771
+ const result = hasImages ? await (0, import_ai2.generateText)({
14772
+ model,
14773
+ system: systemPrompt,
14774
+ messages: [
14775
+ {
14776
+ role: "user",
14777
+ content: [
14778
+ { type: "text", text: userPrompt },
14779
+ ...toAiSdkImageParts(images)
14780
+ ]
14781
+ }
14782
+ ],
14783
+ ...modelOptions
14784
+ }) : await (0, import_ai2.generateText)({
14535
14785
  model,
14536
14786
  system: systemPrompt,
14537
14787
  prompt: userPrompt,
14538
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
14539
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
14788
+ ...modelOptions
14540
14789
  });
14541
14790
  const data2 = schema.parse(parseJsonFromText(result.text));
14542
14791
  const rawUsage = result.usage;
@@ -14551,7 +14800,7 @@ ${outputSchema}`;
14551
14800
  maxOutputTokens: this.maxOutputTokens,
14552
14801
  temperature: this.temperature
14553
14802
  });
14554
- const data = schema.parse(parseJsonFromText(extractLastAssistantContent2(response.output)));
14803
+ const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
14555
14804
  return { data, providerResponse: response, tokenUsage: response.tokenUsage };
14556
14805
  } catch (e) {
14557
14806
  lastError = e instanceof Error ? e : new Error(String(e));
@@ -14596,6 +14845,26 @@ function substituteVariables(template, variables) {
14596
14845
  return variables[varName] ?? match;
14597
14846
  });
14598
14847
  }
14848
+ var ANSI_YELLOW8 = "\x1B[33m";
14849
+ var ANSI_RESET9 = "\x1B[0m";
14850
+ var warnedTemplateStrings = /* @__PURE__ */ new Set();
14851
+ function warnDeprecatedTemplateVars(template) {
14852
+ if (warnedTemplateStrings.has(template)) return;
14853
+ const used = [];
14854
+ for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
14855
+ if (new RegExp(`\\{\\{\\s*${deprecated}\\s*\\}\\}`).test(template)) {
14856
+ used.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
14857
+ }
14858
+ }
14859
+ if (used.length > 0) {
14860
+ warnedTemplateStrings.add(template);
14861
+ console.warn(
14862
+ `${ANSI_YELLOW8}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
14863
+ ${used.join("\n ")}
14864
+ Update your custom evaluator template to use the new names.${ANSI_RESET9}`
14865
+ );
14866
+ }
14867
+ }
14599
14868
  function calculateRubricScore(result, rubrics) {
14600
14869
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
14601
14870
  const assertions = [];
@@ -14690,6 +14959,26 @@ function calculateScoreRangeResult(result, rubrics) {
14690
14959
  }
14691
14960
  };
14692
14961
  }
14962
+ function extractImageBlocks(messages) {
14963
+ const images = [];
14964
+ for (const msg of messages) {
14965
+ if (msg.role !== "assistant") continue;
14966
+ if (!isContentArray(msg.content)) continue;
14967
+ for (const block of msg.content) {
14968
+ if (block.type === "image") {
14969
+ images.push(block);
14970
+ }
14971
+ }
14972
+ }
14973
+ return images;
14974
+ }
14975
+ function toAiSdkImageParts(images) {
14976
+ return images.map((img) => ({
14977
+ type: "image",
14978
+ image: img.source,
14979
+ mediaType: img.media_type || void 0
14980
+ }));
14981
+ }
14693
14982
  function resolveSandboxed(basePath, relativePath) {
14694
14983
  const resolved = import_node_path39.default.resolve(basePath, relativePath);
14695
14984
  if (!resolved.startsWith(basePath + import_node_path39.default.sep) && resolved !== basePath) {
@@ -15075,7 +15364,7 @@ var CompositeEvaluator = class {
15075
15364
  attempt: context2.attempt
15076
15365
  });
15077
15366
  const data = freeformEvaluationSchema.parse(
15078
- parseJsonFromText(extractLastAssistantContent2(response.output))
15367
+ parseJsonFromText(extractLastAssistantContent(response.output))
15079
15368
  );
15080
15369
  const score = clampScore(data.score);
15081
15370
  const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
@@ -15431,115 +15720,115 @@ var FieldAccuracyEvaluator = class {
15431
15720
  * Evaluate a single field against the expected value.
15432
15721
  */
15433
15722
  evaluateField(fieldConfig, candidateData, expectedData) {
15434
- const { path: path50, match, required = true, weight = 1 } = fieldConfig;
15435
- const candidateValue = resolvePath(candidateData, path50);
15436
- const expectedValue = resolvePath(expectedData, path50);
15723
+ const { path: path51, match, required = true, weight = 1 } = fieldConfig;
15724
+ const candidateValue = resolvePath(candidateData, path51);
15725
+ const expectedValue = resolvePath(expectedData, path51);
15437
15726
  if (expectedValue === void 0) {
15438
15727
  return {
15439
- path: path50,
15728
+ path: path51,
15440
15729
  score: 1,
15441
15730
  // No expected value means no comparison needed
15442
15731
  weight,
15443
15732
  hit: true,
15444
- message: `${path50}: no expected value`
15733
+ message: `${path51}: no expected value`
15445
15734
  };
15446
15735
  }
15447
15736
  if (candidateValue === void 0) {
15448
15737
  if (required) {
15449
15738
  return {
15450
- path: path50,
15739
+ path: path51,
15451
15740
  score: 0,
15452
15741
  weight,
15453
15742
  hit: false,
15454
- message: `${path50} (required, missing)`
15743
+ message: `${path51} (required, missing)`
15455
15744
  };
15456
15745
  }
15457
15746
  return {
15458
- path: path50,
15747
+ path: path51,
15459
15748
  score: 1,
15460
15749
  // Don't penalize missing optional fields
15461
15750
  weight: 0,
15462
15751
  // Zero weight means it won't affect the score
15463
15752
  hit: true,
15464
- message: `${path50}: optional field missing`
15753
+ message: `${path51}: optional field missing`
15465
15754
  };
15466
15755
  }
15467
15756
  switch (match) {
15468
15757
  case "exact":
15469
- return this.compareExact(path50, candidateValue, expectedValue, weight);
15758
+ return this.compareExact(path51, candidateValue, expectedValue, weight);
15470
15759
  case "numeric_tolerance":
15471
15760
  return this.compareNumericTolerance(
15472
- path50,
15761
+ path51,
15473
15762
  candidateValue,
15474
15763
  expectedValue,
15475
15764
  fieldConfig,
15476
15765
  weight
15477
15766
  );
15478
15767
  case "date":
15479
- return this.compareDate(path50, candidateValue, expectedValue, fieldConfig, weight);
15768
+ return this.compareDate(path51, candidateValue, expectedValue, fieldConfig, weight);
15480
15769
  default:
15481
15770
  return {
15482
- path: path50,
15771
+ path: path51,
15483
15772
  score: 0,
15484
15773
  weight,
15485
15774
  hit: false,
15486
- message: `${path50}: unknown match type "${match}"`
15775
+ message: `${path51}: unknown match type "${match}"`
15487
15776
  };
15488
15777
  }
15489
15778
  }
15490
15779
  /**
15491
15780
  * Exact equality comparison.
15492
15781
  */
15493
- compareExact(path50, candidateValue, expectedValue, weight) {
15782
+ compareExact(path51, candidateValue, expectedValue, weight) {
15494
15783
  if (deepEqual(candidateValue, expectedValue)) {
15495
15784
  return {
15496
- path: path50,
15785
+ path: path51,
15497
15786
  score: 1,
15498
15787
  weight,
15499
15788
  hit: true,
15500
- message: path50
15789
+ message: path51
15501
15790
  };
15502
15791
  }
15503
15792
  if (typeof candidateValue !== typeof expectedValue) {
15504
15793
  return {
15505
- path: path50,
15794
+ path: path51,
15506
15795
  score: 0,
15507
15796
  weight,
15508
15797
  hit: false,
15509
- message: `${path50} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
15798
+ message: `${path51} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
15510
15799
  };
15511
15800
  }
15512
15801
  return {
15513
- path: path50,
15802
+ path: path51,
15514
15803
  score: 0,
15515
15804
  weight,
15516
15805
  hit: false,
15517
- message: `${path50} (value mismatch)`
15806
+ message: `${path51} (value mismatch)`
15518
15807
  };
15519
15808
  }
15520
15809
  /**
15521
15810
  * Numeric comparison with absolute or relative tolerance.
15522
15811
  */
15523
- compareNumericTolerance(path50, candidateValue, expectedValue, fieldConfig, weight) {
15812
+ compareNumericTolerance(path51, candidateValue, expectedValue, fieldConfig, weight) {
15524
15813
  const { tolerance = 0, relative = false } = fieldConfig;
15525
15814
  const candidateNum = toNumber(candidateValue);
15526
15815
  const expectedNum = toNumber(expectedValue);
15527
15816
  if (candidateNum === null || expectedNum === null) {
15528
15817
  return {
15529
- path: path50,
15818
+ path: path51,
15530
15819
  score: 0,
15531
15820
  weight,
15532
15821
  hit: false,
15533
- message: `${path50} (non-numeric value)`
15822
+ message: `${path51} (non-numeric value)`
15534
15823
  };
15535
15824
  }
15536
15825
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
15537
15826
  return {
15538
- path: path50,
15827
+ path: path51,
15539
15828
  score: 0,
15540
15829
  weight,
15541
15830
  hit: false,
15542
- message: `${path50} (invalid numeric value)`
15831
+ message: `${path51} (invalid numeric value)`
15543
15832
  };
15544
15833
  }
15545
15834
  const diff = Math.abs(candidateNum - expectedNum);
@@ -15552,61 +15841,61 @@ var FieldAccuracyEvaluator = class {
15552
15841
  }
15553
15842
  if (withinTolerance) {
15554
15843
  return {
15555
- path: path50,
15844
+ path: path51,
15556
15845
  score: 1,
15557
15846
  weight,
15558
15847
  hit: true,
15559
- message: `${path50} (within tolerance: diff=${diff.toFixed(2)})`
15848
+ message: `${path51} (within tolerance: diff=${diff.toFixed(2)})`
15560
15849
  };
15561
15850
  }
15562
15851
  return {
15563
- path: path50,
15852
+ path: path51,
15564
15853
  score: 0,
15565
15854
  weight,
15566
15855
  hit: false,
15567
- message: `${path50} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
15856
+ message: `${path51} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
15568
15857
  };
15569
15858
  }
15570
15859
  /**
15571
15860
  * Date comparison with format normalization.
15572
15861
  */
15573
- compareDate(path50, candidateValue, expectedValue, fieldConfig, weight) {
15862
+ compareDate(path51, candidateValue, expectedValue, fieldConfig, weight) {
15574
15863
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
15575
15864
  const candidateDate = parseDate(String(candidateValue), formats);
15576
15865
  const expectedDate = parseDate(String(expectedValue), formats);
15577
15866
  if (candidateDate === null) {
15578
15867
  return {
15579
- path: path50,
15868
+ path: path51,
15580
15869
  score: 0,
15581
15870
  weight,
15582
15871
  hit: false,
15583
- message: `${path50} (unparseable candidate date)`
15872
+ message: `${path51} (unparseable candidate date)`
15584
15873
  };
15585
15874
  }
15586
15875
  if (expectedDate === null) {
15587
15876
  return {
15588
- path: path50,
15877
+ path: path51,
15589
15878
  score: 0,
15590
15879
  weight,
15591
15880
  hit: false,
15592
- message: `${path50} (unparseable expected date)`
15881
+ message: `${path51} (unparseable expected date)`
15593
15882
  };
15594
15883
  }
15595
15884
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
15596
15885
  return {
15597
- path: path50,
15886
+ path: path51,
15598
15887
  score: 1,
15599
15888
  weight,
15600
15889
  hit: true,
15601
- message: path50
15890
+ message: path51
15602
15891
  };
15603
15892
  }
15604
15893
  return {
15605
- path: path50,
15894
+ path: path51,
15606
15895
  score: 0,
15607
15896
  weight,
15608
15897
  hit: false,
15609
- message: `${path50} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
15898
+ message: `${path51} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
15610
15899
  };
15611
15900
  }
15612
15901
  /**
@@ -15639,11 +15928,11 @@ var FieldAccuracyEvaluator = class {
15639
15928
  };
15640
15929
  }
15641
15930
  };
15642
- function resolvePath(obj, path50) {
15643
- if (!path50 || !obj) {
15931
+ function resolvePath(obj, path51) {
15932
+ if (!path51 || !obj) {
15644
15933
  return void 0;
15645
15934
  }
15646
- const parts = path50.split(/\.|\[|\]/).filter((p) => p.length > 0);
15935
+ const parts = path51.split(/\.|\[|\]/).filter((p) => p.length > 0);
15647
15936
  let current = obj;
15648
15937
  for (const part of parts) {
15649
15938
  if (current === null || current === void 0) {
@@ -15935,11 +16224,12 @@ function assembleLlmGraderPrompt(input) {
15935
16224
  function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
15936
16225
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
15937
16226
  const variables = {
15938
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2),
15939
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
15940
- [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
16227
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
16228
+ [TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(),
16229
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? "").trim(),
15941
16230
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
15942
16231
  [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
16232
+ // Deprecated aliases
15943
16233
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
15944
16234
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
15945
16235
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -16126,8 +16416,8 @@ var TokenUsageEvaluator = class {
16126
16416
  };
16127
16417
 
16128
16418
  // src/evaluation/evaluators/tool-trajectory.ts
16129
- function getNestedValue(obj, path50) {
16130
- const parts = path50.split(".");
16419
+ function getNestedValue(obj, path51) {
16420
+ const parts = path51.split(".");
16131
16421
  let current = obj;
16132
16422
  for (const part of parts) {
16133
16423
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -16996,16 +17286,13 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
16996
17286
  const payload = {
16997
17287
  criteria: context2.evalCase.criteria,
16998
17288
  expectedOutput: context2.evalCase.expected_output,
16999
- outputText: context2.candidate,
17000
17289
  output: context2.output ?? null,
17001
17290
  inputFiles: context2.evalCase.file_paths,
17002
17291
  input: context2.evalCase.input,
17003
17292
  trace: context2.trace ?? null,
17004
17293
  fileChanges: context2.fileChanges ?? null,
17005
17294
  workspacePath: context2.workspacePath ?? null,
17006
- config: config ?? context2.config ?? null,
17007
- inputText: context2.evalCase.question,
17008
- expectedOutputText: context2.evalCase.reference_answer ?? ""
17295
+ config: config ?? context2.config ?? null
17009
17296
  };
17010
17297
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
17011
17298
  const scriptPath = script[script.length - 1];
@@ -18685,7 +18972,8 @@ async function runEvaluation(options) {
18685
18972
  const budgetResult = {
18686
18973
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
18687
18974
  testId: evalCase.id,
18688
- eval_set: evalCase.eval_set,
18975
+ dataset: evalCase.dataset,
18976
+ category: evalCase.category,
18689
18977
  score: 0,
18690
18978
  assertions: [],
18691
18979
  output: [],
@@ -18721,7 +19009,8 @@ async function runEvaluation(options) {
18721
19009
  const haltResult = {
18722
19010
  timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
18723
19011
  testId: evalCase.id,
18724
- eval_set: evalCase.eval_set,
19012
+ dataset: evalCase.dataset,
19013
+ category: evalCase.category,
18725
19014
  score: 0,
18726
19015
  assertions: [],
18727
19016
  output: [],
@@ -19004,7 +19293,7 @@ async function runBatchEvaluation(options) {
19004
19293
  const tokenUsage = merged?.tokenUsage;
19005
19294
  const startTime = merged?.startTime;
19006
19295
  const endTime = merged?.endTime;
19007
- const candidate = extractLastAssistantContent2(output);
19296
+ const candidate = extractLastAssistantContent(output);
19008
19297
  const providerError = extractProviderError(providerResponse);
19009
19298
  let result;
19010
19299
  try {
@@ -19412,7 +19701,7 @@ async function runEvalCase(options) {
19412
19701
  const tokenUsage = merged?.tokenUsage;
19413
19702
  const startTime = merged?.startTime;
19414
19703
  const endTime = merged?.endTime;
19415
- const candidate = extractLastAssistantContent2(output);
19704
+ const candidate = extractLastAssistantContent(output);
19416
19705
  let fileChanges;
19417
19706
  if (baselineCommit && workspacePath) {
19418
19707
  try {
@@ -19720,7 +20009,8 @@ async function evaluateCandidate(options) {
19720
20009
  return {
19721
20010
  timestamp: completedAt.toISOString(),
19722
20011
  testId: evalCase.id,
19723
- eval_set: evalCase.eval_set,
20012
+ dataset: evalCase.dataset,
20013
+ category: evalCase.category,
19724
20014
  conversationId: evalCase.conversation_id,
19725
20015
  score: score.score,
19726
20016
  assertions: score.assertions,
@@ -20070,7 +20360,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
20070
20360
  return {
20071
20361
  timestamp: timestamp.toISOString(),
20072
20362
  testId: evalCase.id,
20073
- eval_set: evalCase.eval_set,
20363
+ dataset: evalCase.dataset,
20364
+ category: evalCase.category,
20074
20365
  conversationId: evalCase.conversation_id,
20075
20366
  score: 0,
20076
20367
  assertions: [{ text: `Error: ${message}`, passed: false }],
@@ -20643,6 +20934,18 @@ function trimBaselineResult(result) {
20643
20934
  return trimmed;
20644
20935
  }
20645
20936
 
20937
+ // src/evaluation/category.ts
20938
+ var import_node_path51 = __toESM(require("path"), 1);
20939
+ var DEFAULT_CATEGORY = "Uncategorized";
20940
+ function deriveCategory(relativePath) {
20941
+ const parts = relativePath.split(import_node_path51.default.sep);
20942
+ if (parts.length <= 1) {
20943
+ return DEFAULT_CATEGORY;
20944
+ }
20945
+ const dirs = parts.slice(0, -1).filter((d) => d !== "evals");
20946
+ return dirs.length > 0 ? dirs.join("/") : DEFAULT_CATEGORY;
20947
+ }
20948
+
20646
20949
  // src/observability/otel-exporter.ts
20647
20950
  var OTEL_BACKEND_PRESETS = {
20648
20951
  langfuse: {
@@ -20766,7 +21069,7 @@ var OtelTraceExporter = class {
20766
21069
  rootSpan.setAttribute("gen_ai.system", "agentv");
20767
21070
  rootSpan.setAttribute("agentv.test_id", result.testId);
20768
21071
  rootSpan.setAttribute("agentv.target", result.target);
20769
- if (result.eval_set) rootSpan.setAttribute("agentv.eval_set", result.eval_set);
21072
+ if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
20770
21073
  rootSpan.setAttribute("agentv.score", result.score);
20771
21074
  if (captureContent && result.output.length > 0) {
20772
21075
  const lastMsg = result.output[result.output.length - 1];
@@ -20975,7 +21278,7 @@ var OtelStreamingObserver = class {
20975
21278
  this.rootSpan.setAttribute("gen_ai.system", "agentv");
20976
21279
  this.rootSpan.setAttribute("agentv.test_id", testId);
20977
21280
  this.rootSpan.setAttribute("agentv.target", target);
20978
- if (evalSet) this.rootSpan.setAttribute("agentv.eval_set", evalSet);
21281
+ if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
20979
21282
  this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
20980
21283
  }
20981
21284
  /** Create and immediately export a tool span */
@@ -21151,9 +21454,11 @@ function createAgentKernel() {
21151
21454
  }
21152
21455
  // Annotate the CommonJS export names for ESM import in node:
21153
21456
  0 && (module.exports = {
21457
+ COMMON_TARGET_SETTINGS,
21154
21458
  CodeEvaluator,
21155
21459
  CompositeEvaluator,
21156
21460
  CostEvaluator,
21461
+ DEFAULT_CATEGORY,
21157
21462
  DEFAULT_EVALUATOR_TEMPLATE,
21158
21463
  DEFAULT_EVAL_PATTERNS,
21159
21464
  DEFAULT_EXPLORATION_TOOLS,
@@ -21207,6 +21512,7 @@ function createAgentKernel() {
21207
21512
  createTempWorkspace,
21208
21513
  deepEqual,
21209
21514
  defineConfig,
21515
+ deriveCategory,
21210
21516
  detectFormat,
21211
21517
  discoverAssertions,
21212
21518
  discoverCopilotSessions,
@@ -21220,7 +21526,9 @@ function createAgentKernel() {
21220
21526
  explorationRatio,
21221
21527
  extractCacheConfig,
21222
21528
  extractFailOnError,
21529
+ extractImageBlocks,
21223
21530
  extractJsonBlob,
21531
+ extractLastAssistantContent,
21224
21532
  extractTargetFromSuite,
21225
21533
  extractTargetsFromSuite,
21226
21534
  extractTargetsFromTestCase,
@@ -21234,12 +21542,15 @@ function createAgentKernel() {
21234
21542
  getAgentvHome,
21235
21543
  getOutputFilenames,
21236
21544
  getSubagentsRoot,
21545
+ getTextContent,
21237
21546
  getTraceStateRoot,
21238
21547
  getWorkspacePath,
21239
21548
  getWorkspacePoolRoot,
21240
21549
  getWorkspacesRoot,
21241
21550
  initializeBaseline,
21242
21551
  isAgentSkillsFormat,
21552
+ isContent,
21553
+ isContentArray,
21243
21554
  isEvaluatorKind,
21244
21555
  isJsonObject,
21245
21556
  isJsonValue,