agentv 3.14.6 → 4.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
301
301
  }
302
302
  });
303
303
 
304
- // ../../packages/core/dist/chunk-HP5PFOVK.js
304
+ // ../../packages/core/dist/chunk-PXYYRDHH.js
305
305
  import { constants } from "node:fs";
306
306
  import { access, readFile } from "node:fs/promises";
307
307
  import path from "node:path";
@@ -419,11 +419,32 @@ __export(external_exports2, {
419
419
  void: () => voidType
420
420
  });
421
421
 
422
- // ../../packages/core/dist/chunk-HP5PFOVK.js
422
+ // ../../packages/core/dist/chunk-PXYYRDHH.js
423
423
  import { readFile as readFile2 } from "node:fs/promises";
424
424
  import path3 from "node:path";
425
425
  import fg from "fast-glob";
426
426
  import { parse as parseYaml } from "yaml";
427
+ var CONTENT_TYPES = /* @__PURE__ */ new Set(["text", "image", "file"]);
428
+ function isContent(value) {
429
+ if (!value || typeof value !== "object") return false;
430
+ const v = value;
431
+ return typeof v.type === "string" && CONTENT_TYPES.has(v.type);
432
+ }
433
+ function isContentArray(value) {
434
+ return Array.isArray(value) && value.length > 0 && value.every(isContent);
435
+ }
436
+ function getTextContent(content) {
437
+ if (content == null) return "";
438
+ if (typeof content === "string") return content;
439
+ if (!Array.isArray(content)) return "";
440
+ const parts = [];
441
+ for (const block of content) {
442
+ if (block.type === "text") {
443
+ parts.push(block.text);
444
+ }
445
+ }
446
+ return parts.join("\n");
447
+ }
427
448
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
428
449
  var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
429
450
  var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
@@ -776,6 +797,12 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
776
797
  "FILES",
777
798
  "OUTPUT_FILE"
778
799
  ]);
800
+ var COMMON_TARGET_SETTINGS = [
801
+ "provider_batching",
802
+ "providerBatching",
803
+ "subagent_mode_allowed",
804
+ "subagentModeAllowed"
805
+ ];
779
806
  var BASE_TARGET_SCHEMA = external_exports2.object({
780
807
  name: external_exports2.string().min(1, "target name is required"),
781
808
  provider: external_exports2.string().min(1, "provider is required"),
@@ -784,7 +811,8 @@ var BASE_TARGET_SCHEMA = external_exports2.object({
784
811
  // backward compat
785
812
  workers: external_exports2.number().int().min(1).optional(),
786
813
  workspace_template: external_exports2.string().optional(),
787
- workspaceTemplate: external_exports2.string().optional()
814
+ workspaceTemplate: external_exports2.string().optional(),
815
+ subagent_mode_allowed: external_exports2.boolean().optional()
788
816
  }).passthrough();
789
817
  var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
790
818
  var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
@@ -847,42 +875,40 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
847
875
  const providerBatching = resolveOptionalBoolean(
848
876
  parsed.provider_batching ?? parsed.providerBatching
849
877
  );
878
+ const subagentModeAllowed = resolveOptionalBoolean(
879
+ parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
880
+ );
881
+ const base = {
882
+ name: parsed.name,
883
+ graderTarget: parsed.grader_target ?? parsed.judge_target,
884
+ workers: parsed.workers,
885
+ providerBatching,
886
+ subagentModeAllowed
887
+ };
850
888
  switch (provider) {
851
889
  case "openai":
852
890
  return {
853
891
  kind: "openai",
854
- name: parsed.name,
855
- graderTarget: parsed.grader_target ?? parsed.judge_target,
856
- workers: parsed.workers,
857
- providerBatching,
892
+ ...base,
858
893
  config: resolveOpenAIConfig(parsed, env)
859
894
  };
860
895
  case "openrouter":
861
896
  return {
862
897
  kind: "openrouter",
863
- name: parsed.name,
864
- graderTarget: parsed.grader_target ?? parsed.judge_target,
865
- workers: parsed.workers,
866
- providerBatching,
898
+ ...base,
867
899
  config: resolveOpenRouterConfig(parsed, env)
868
900
  };
869
901
  case "azure":
870
902
  case "azure-openai":
871
903
  return {
872
904
  kind: "azure",
873
- name: parsed.name,
874
- graderTarget: parsed.grader_target ?? parsed.judge_target,
875
- workers: parsed.workers,
876
- providerBatching,
905
+ ...base,
877
906
  config: resolveAzureConfig(parsed, env)
878
907
  };
879
908
  case "anthropic":
880
909
  return {
881
910
  kind: "anthropic",
882
- name: parsed.name,
883
- graderTarget: parsed.grader_target ?? parsed.judge_target,
884
- workers: parsed.workers,
885
- providerBatching,
911
+ ...base,
886
912
  config: resolveAnthropicConfig(parsed, env)
887
913
  };
888
914
  case "gemini":
@@ -890,68 +916,47 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
890
916
  case "google-gemini":
891
917
  return {
892
918
  kind: "gemini",
893
- name: parsed.name,
894
- graderTarget: parsed.grader_target ?? parsed.judge_target,
895
- workers: parsed.workers,
896
- providerBatching,
919
+ ...base,
897
920
  config: resolveGeminiConfig(parsed, env)
898
921
  };
899
922
  case "codex":
900
923
  case "codex-cli":
901
924
  return {
902
925
  kind: "codex",
903
- name: parsed.name,
904
- graderTarget: parsed.grader_target ?? parsed.judge_target,
905
- workers: parsed.workers,
906
- providerBatching,
926
+ ...base,
907
927
  config: resolveCodexConfig(parsed, env, evalFilePath)
908
928
  };
909
929
  case "copilot-sdk":
910
930
  case "copilot_sdk":
911
931
  return {
912
932
  kind: "copilot-sdk",
913
- name: parsed.name,
914
- graderTarget: parsed.grader_target ?? parsed.judge_target,
915
- workers: parsed.workers,
916
- providerBatching,
933
+ ...base,
917
934
  config: resolveCopilotSdkConfig(parsed, env, evalFilePath)
918
935
  };
919
936
  case "copilot":
920
937
  case "copilot-cli":
921
938
  return {
922
939
  kind: "copilot-cli",
923
- name: parsed.name,
924
- graderTarget: parsed.grader_target ?? parsed.judge_target,
925
- workers: parsed.workers,
926
- providerBatching,
940
+ ...base,
927
941
  config: resolveCopilotCliConfig(parsed, env, evalFilePath)
928
942
  };
929
943
  case "copilot-log":
930
944
  return {
931
945
  kind: "copilot-log",
932
- name: parsed.name,
933
- graderTarget: parsed.grader_target ?? parsed.judge_target,
934
- workers: parsed.workers,
935
- providerBatching,
946
+ ...base,
936
947
  config: resolveCopilotLogConfig(parsed, env)
937
948
  };
938
949
  case "pi":
939
950
  case "pi-coding-agent":
940
951
  return {
941
952
  kind: "pi-coding-agent",
942
- name: parsed.name,
943
- graderTarget: parsed.grader_target ?? parsed.judge_target,
944
- workers: parsed.workers,
945
- providerBatching,
953
+ ...base,
946
954
  config: resolvePiCodingAgentConfig(parsed, env, evalFilePath)
947
955
  };
948
956
  case "pi-cli":
949
957
  return {
950
958
  kind: "pi-cli",
951
- name: parsed.name,
952
- graderTarget: parsed.grader_target ?? parsed.judge_target,
953
- workers: parsed.workers,
954
- providerBatching,
959
+ ...base,
955
960
  config: resolvePiCliConfig(parsed, env, evalFilePath)
956
961
  };
957
962
  case "claude":
@@ -959,38 +964,26 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
959
964
  case "claude-cli":
960
965
  return {
961
966
  kind: "claude-cli",
962
- name: parsed.name,
963
- graderTarget: parsed.grader_target ?? parsed.judge_target,
964
- workers: parsed.workers,
965
- providerBatching,
967
+ ...base,
966
968
  config: resolveClaudeConfig(parsed, env, evalFilePath)
967
969
  };
968
970
  case "claude-sdk":
969
971
  return {
970
972
  kind: "claude-sdk",
971
- name: parsed.name,
972
- graderTarget: parsed.grader_target ?? parsed.judge_target,
973
- workers: parsed.workers,
974
- providerBatching,
973
+ ...base,
975
974
  config: resolveClaudeConfig(parsed, env, evalFilePath)
976
975
  };
977
976
  case "mock":
978
977
  return {
979
978
  kind: "mock",
980
- name: parsed.name,
981
- graderTarget: parsed.grader_target ?? parsed.judge_target,
982
- workers: parsed.workers,
983
- providerBatching,
979
+ ...base,
984
980
  config: resolveMockConfig(parsed)
985
981
  };
986
982
  case "vscode":
987
983
  case "vscode-insiders":
988
984
  return {
989
985
  kind: provider,
990
- name: parsed.name,
991
- graderTarget: parsed.grader_target ?? parsed.judge_target,
992
- workers: parsed.workers,
993
- providerBatching,
986
+ ...base,
994
987
  config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders", evalFilePath)
995
988
  };
996
989
  case "agentv": {
@@ -1003,29 +996,21 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
1003
996
  const temperature = typeof parsed.temperature === "number" ? parsed.temperature : 0;
1004
997
  return {
1005
998
  kind: "agentv",
1006
- name: parsed.name,
1007
- graderTarget: parsed.grader_target ?? parsed.judge_target,
999
+ ...base,
1008
1000
  workers: typeof parsed.workers === "number" ? parsed.workers : void 0,
1009
- providerBatching,
1010
1001
  config: { model, temperature }
1011
1002
  };
1012
1003
  }
1013
1004
  case "cli":
1014
1005
  return {
1015
1006
  kind: "cli",
1016
- name: parsed.name,
1017
- graderTarget: parsed.grader_target ?? parsed.judge_target,
1018
- workers: parsed.workers,
1019
- providerBatching,
1007
+ ...base,
1020
1008
  config: resolveCliConfig(parsed, env, evalFilePath)
1021
1009
  };
1022
1010
  default:
1023
1011
  return {
1024
1012
  kind: "cli",
1025
- name: parsed.name,
1026
- graderTarget: parsed.grader_target ?? parsed.judge_target,
1027
- workers: parsed.workers,
1028
- providerBatching,
1013
+ ...base,
1029
1014
  config: resolveDiscoveredProviderConfig(parsed, provider, env, evalFilePath)
1030
1015
  };
1031
1016
  }
@@ -1653,8 +1638,8 @@ function resolveCliConfig(target, env, evalFilePath) {
1653
1638
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
1654
1639
  if (!parseResult.success) {
1655
1640
  const firstError = parseResult.error.errors[0];
1656
- const path47 = firstError?.path.join(".") || "";
1657
- const prefix = path47 ? `${target.name} ${path47}: ` : `${target.name}: `;
1641
+ const path48 = firstError?.path.join(".") || "";
1642
+ const prefix = path48 ? `${target.name} ${path48}: ` : `${target.name}: `;
1658
1643
  throw new Error(`${prefix}${firstError?.message}`);
1659
1644
  }
1660
1645
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -1897,6 +1882,82 @@ function resolveOptionalNumberArray(source, description) {
1897
1882
  }
1898
1883
  return resolved.length > 0 ? resolved : void 0;
1899
1884
  }
1885
+ var AGENT_PROVIDER_KINDS = [
1886
+ "codex",
1887
+ "copilot-sdk",
1888
+ "copilot-cli",
1889
+ "pi-coding-agent",
1890
+ "pi-cli",
1891
+ "claude",
1892
+ "claude-cli",
1893
+ "claude-sdk",
1894
+ "vscode",
1895
+ "vscode-insiders"
1896
+ ];
1897
+ var KNOWN_PROVIDERS = [
1898
+ "openai",
1899
+ "openrouter",
1900
+ "azure",
1901
+ "anthropic",
1902
+ "gemini",
1903
+ "codex",
1904
+ "copilot-sdk",
1905
+ "copilot-cli",
1906
+ "copilot-log",
1907
+ "pi-coding-agent",
1908
+ "pi-cli",
1909
+ "claude",
1910
+ "claude-cli",
1911
+ "claude-sdk",
1912
+ "cli",
1913
+ "mock",
1914
+ "vscode",
1915
+ "vscode-insiders",
1916
+ "agentv"
1917
+ ];
1918
+ var PROVIDER_ALIASES = [
1919
+ "azure-openai",
1920
+ // alias for "azure"
1921
+ "google",
1922
+ // alias for "gemini"
1923
+ "google-gemini",
1924
+ // alias for "gemini"
1925
+ "codex-cli",
1926
+ // alias for "codex"
1927
+ "copilot",
1928
+ // alias for "copilot-cli" (default copilot experience)
1929
+ "copilot_sdk",
1930
+ // alias for "copilot-sdk" (underscore variant)
1931
+ "pi",
1932
+ // alias for "pi-coding-agent"
1933
+ "claude-code",
1934
+ // alias for "claude" (legacy)
1935
+ "bedrock",
1936
+ // legacy/future support
1937
+ "vertex"
1938
+ // legacy/future support
1939
+ ];
1940
+ function extractLastAssistantContent(messages) {
1941
+ if (!messages || messages.length === 0) {
1942
+ return "";
1943
+ }
1944
+ for (let i = messages.length - 1; i >= 0; i--) {
1945
+ const msg = messages[i];
1946
+ if (msg.role === "assistant" && msg.content !== void 0) {
1947
+ if (typeof msg.content === "string") {
1948
+ return msg.content;
1949
+ }
1950
+ if (isContentArray(msg.content)) {
1951
+ return getTextContent(msg.content);
1952
+ }
1953
+ return JSON.stringify(msg.content);
1954
+ }
1955
+ }
1956
+ return "";
1957
+ }
1958
+ function isAgentProvider(provider) {
1959
+ return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
1960
+ }
1900
1961
  var ENV_VAR_PATTERN = /\$\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*\}\}/g;
1901
1962
  function interpolateEnv(value, env) {
1902
1963
  if (typeof value === "string") {
@@ -2026,79 +2087,6 @@ async function expandFileReferences(tests, evalFileDir) {
2026
2087
  }
2027
2088
  return expanded;
2028
2089
  }
2029
- var AGENT_PROVIDER_KINDS = [
2030
- "codex",
2031
- "copilot-sdk",
2032
- "copilot-cli",
2033
- "pi-coding-agent",
2034
- "pi-cli",
2035
- "claude",
2036
- "claude-cli",
2037
- "claude-sdk",
2038
- "vscode",
2039
- "vscode-insiders"
2040
- ];
2041
- var KNOWN_PROVIDERS = [
2042
- "openai",
2043
- "openrouter",
2044
- "azure",
2045
- "anthropic",
2046
- "gemini",
2047
- "codex",
2048
- "copilot-sdk",
2049
- "copilot-cli",
2050
- "copilot-log",
2051
- "pi-coding-agent",
2052
- "pi-cli",
2053
- "claude",
2054
- "claude-cli",
2055
- "claude-sdk",
2056
- "cli",
2057
- "mock",
2058
- "vscode",
2059
- "vscode-insiders",
2060
- "agentv"
2061
- ];
2062
- var PROVIDER_ALIASES = [
2063
- "azure-openai",
2064
- // alias for "azure"
2065
- "google",
2066
- // alias for "gemini"
2067
- "google-gemini",
2068
- // alias for "gemini"
2069
- "codex-cli",
2070
- // alias for "codex"
2071
- "copilot",
2072
- // alias for "copilot-cli" (default copilot experience)
2073
- "copilot_sdk",
2074
- // alias for "copilot-sdk" (underscore variant)
2075
- "pi",
2076
- // alias for "pi-coding-agent"
2077
- "claude-code",
2078
- // alias for "claude" (legacy)
2079
- "bedrock",
2080
- // legacy/future support
2081
- "vertex"
2082
- // legacy/future support
2083
- ];
2084
- function extractLastAssistantContent(messages) {
2085
- if (!messages || messages.length === 0) {
2086
- return "";
2087
- }
2088
- for (let i = messages.length - 1; i >= 0; i--) {
2089
- const msg = messages[i];
2090
- if (msg.role === "assistant" && msg.content !== void 0) {
2091
- if (typeof msg.content === "string") {
2092
- return msg.content;
2093
- }
2094
- return JSON.stringify(msg.content);
2095
- }
2096
- }
2097
- return "";
2098
- }
2099
- function isAgentProvider(provider) {
2100
- return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
2101
- }
2102
2090
 
2103
2091
  // ../../packages/core/dist/index.js
2104
2092
  import { readFile as readFile6 } from "node:fs/promises";
@@ -6734,7 +6722,7 @@ function createOpenRouter(options = {}) {
6734
6722
  );
6735
6723
  const createChatModel = (modelId, settings = {}) => new OpenRouterChatLanguageModel(modelId, settings, {
6736
6724
  provider: "openrouter.chat",
6737
- url: ({ path: path47 }) => `${baseURL}${path47}`,
6725
+ url: ({ path: path48 }) => `${baseURL}${path48}`,
6738
6726
  headers: getHeaders,
6739
6727
  compatibility,
6740
6728
  fetch: options.fetch,
@@ -6742,7 +6730,7 @@ function createOpenRouter(options = {}) {
6742
6730
  });
6743
6731
  const createCompletionModel = (modelId, settings = {}) => new OpenRouterCompletionLanguageModel(modelId, settings, {
6744
6732
  provider: "openrouter.completion",
6745
- url: ({ path: path47 }) => `${baseURL}${path47}`,
6733
+ url: ({ path: path48 }) => `${baseURL}${path48}`,
6746
6734
  headers: getHeaders,
6747
6735
  compatibility,
6748
6736
  fetch: options.fetch,
@@ -6750,14 +6738,14 @@ function createOpenRouter(options = {}) {
6750
6738
  });
6751
6739
  const createEmbeddingModel = (modelId, settings = {}) => new OpenRouterEmbeddingModel(modelId, settings, {
6752
6740
  provider: "openrouter.embedding",
6753
- url: ({ path: path47 }) => `${baseURL}${path47}`,
6741
+ url: ({ path: path48 }) => `${baseURL}${path48}`,
6754
6742
  headers: getHeaders,
6755
6743
  fetch: options.fetch,
6756
6744
  extraBody: options.extraBody
6757
6745
  });
6758
6746
  const createImageModel = (modelId, settings = {}) => new OpenRouterImageModel(modelId, settings, {
6759
6747
  provider: "openrouter.image",
6760
- url: ({ path: path47 }) => `${baseURL}${path47}`,
6748
+ url: ({ path: path48 }) => `${baseURL}${path48}`,
6761
6749
  headers: getHeaders,
6762
6750
  fetch: options.fetch,
6763
6751
  extraBody: options.extraBody
@@ -14350,6 +14338,7 @@ import { existsSync as existsSync4 } from "node:fs";
14350
14338
  import path45 from "node:path";
14351
14339
  import { mkdir as mkdir15, readFile as readFile13, writeFile as writeFile8 } from "node:fs/promises";
14352
14340
  import path46 from "node:path";
14341
+ import path47 from "node:path";
14353
14342
  function computeTraceSummary(messages) {
14354
14343
  const toolCallCounts = {};
14355
14344
  const toolDurations = {};
@@ -14979,15 +14968,23 @@ var TEMPLATE_VARIABLES = {
14979
14968
  INPUT: "input",
14980
14969
  OUTPUT: "output",
14981
14970
  FILE_CHANGES: "file_changes",
14971
+ /** @deprecated Use INPUT instead — resolves to the same text value. */
14982
14972
  INPUT_TEXT: "input_text",
14973
+ /** @deprecated Use OUTPUT instead — resolves to the same text value. */
14983
14974
  OUTPUT_TEXT: "output_text",
14975
+ /** @deprecated Use EXPECTED_OUTPUT instead — resolves to the same text value. */
14984
14976
  EXPECTED_OUTPUT_TEXT: "expected_output_text"
14985
14977
  };
14986
14978
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
14987
14979
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
14988
- TEMPLATE_VARIABLES.OUTPUT_TEXT,
14980
+ TEMPLATE_VARIABLES.OUTPUT,
14989
14981
  TEMPLATE_VARIABLES.EXPECTED_OUTPUT
14990
14982
  ]);
14983
+ var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
14984
+ [TEMPLATE_VARIABLES.INPUT_TEXT, TEMPLATE_VARIABLES.INPUT],
14985
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT, TEMPLATE_VARIABLES.OUTPUT],
14986
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
14987
+ ]);
14991
14988
  var ANSI_YELLOW22 = "\x1B[33m";
14992
14989
  var ANSI_RESET3 = "\x1B[0m";
14993
14990
  async function validateCustomPromptContent(promptPath) {
@@ -15007,16 +15004,29 @@ function validateTemplateVariables(content, source) {
15007
15004
  }
15008
15005
  match = variablePattern.exec(content);
15009
15006
  }
15010
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
15011
- const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
15007
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
15008
+ const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT);
15012
15009
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
15013
15010
  if (!hasRequiredFields) {
15014
15011
  throw new Error(
15015
15012
  `Missing required fields. Must include at least one of:
15016
- - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
15013
+ - {{ ${TEMPLATE_VARIABLES.OUTPUT} }}
15017
15014
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
15018
15015
  );
15019
15016
  }
15017
+ const deprecatedUsed = [];
15018
+ for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
15019
+ if (foundVariables.has(deprecated)) {
15020
+ deprecatedUsed.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
15021
+ }
15022
+ }
15023
+ if (deprecatedUsed.length > 0) {
15024
+ console.warn(
15025
+ `${ANSI_YELLOW22}Warning: Template at ${source} uses deprecated variable names:
15026
+ ${deprecatedUsed.join("\n ")}
15027
+ These still work but will be removed in a future version.${ANSI_RESET3}`
15028
+ );
15029
+ }
15020
15030
  if (invalidVariables.length > 0) {
15021
15031
  const warningMessage = `${ANSI_YELLOW22}Warning: Custom evaluator template at ${source}
15022
15032
  Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
@@ -16418,6 +16428,19 @@ function hasVisibleContent(segments) {
16418
16428
  function asString2(value) {
16419
16429
  return typeof value === "string" ? value : void 0;
16420
16430
  }
16431
+ var IMAGE_MEDIA_TYPES = {
16432
+ ".png": "image/png",
16433
+ ".jpg": "image/jpeg",
16434
+ ".jpeg": "image/jpeg",
16435
+ ".gif": "image/gif",
16436
+ ".webp": "image/webp",
16437
+ ".svg": "image/svg+xml",
16438
+ ".bmp": "image/bmp"
16439
+ };
16440
+ function detectImageMediaType(filePath) {
16441
+ const ext = path5.extname(filePath).toLowerCase();
16442
+ return IMAGE_MEDIA_TYPES[ext];
16443
+ }
16421
16444
  var ANSI_YELLOW4 = "\x1B[33m";
16422
16445
  var ANSI_RESET5 = "\x1B[0m";
16423
16446
  async function processMessages(options) {
@@ -16483,6 +16506,47 @@ async function processMessages(options) {
16483
16506
  }
16484
16507
  continue;
16485
16508
  }
16509
+ if (segmentType === "image") {
16510
+ const rawValue = asString3(rawSegment.value);
16511
+ if (!rawValue) {
16512
+ continue;
16513
+ }
16514
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference22(
16515
+ rawValue,
16516
+ searchRoots
16517
+ );
16518
+ if (!resolvedPath) {
16519
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
16520
+ const context2 = messageType === "input" ? "" : " in expected_output";
16521
+ logWarning3(`Image file not found${context2}: ${displayPath}`, attempts);
16522
+ continue;
16523
+ }
16524
+ const mediaType = detectImageMediaType(resolvedPath);
16525
+ if (!mediaType) {
16526
+ logWarning3(
16527
+ `Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
16528
+ );
16529
+ continue;
16530
+ }
16531
+ try {
16532
+ const imageBuffer = await readFile4(resolvedPath);
16533
+ const base64 = imageBuffer.toString("base64");
16534
+ processedContent.push({
16535
+ type: "image",
16536
+ media_type: mediaType,
16537
+ source: `data:${mediaType};base64,${base64}`
16538
+ });
16539
+ if (verbose) {
16540
+ const label = messageType === "input" ? "[Image]" : "[Expected Output Image]";
16541
+ console.log(` ${label} Found: ${displayPath}`);
16542
+ console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
16543
+ }
16544
+ } catch (error) {
16545
+ const context2 = messageType === "input" ? "" : " expected output";
16546
+ logWarning3(`Could not read${context2} image ${resolvedPath}: ${error.message}`);
16547
+ }
16548
+ continue;
16549
+ }
16486
16550
  const clonedSegment = cloneJsonObject(rawSegment);
16487
16551
  processedContent.push(clonedSegment);
16488
16552
  const inlineValue = clonedSegment.value;
@@ -16560,6 +16624,46 @@ async function processExpectedMessages(options) {
16560
16624
  }
16561
16625
  continue;
16562
16626
  }
16627
+ if (segmentType === "image") {
16628
+ const rawValue = asString3(rawSegment.value);
16629
+ if (!rawValue) {
16630
+ continue;
16631
+ }
16632
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference22(
16633
+ rawValue,
16634
+ searchRoots
16635
+ );
16636
+ if (!resolvedPath) {
16637
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
16638
+ logWarning3(`Image file not found in expected_output: ${displayPath}`, attempts);
16639
+ continue;
16640
+ }
16641
+ const mediaType = detectImageMediaType(resolvedPath);
16642
+ if (!mediaType) {
16643
+ logWarning3(
16644
+ `Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
16645
+ );
16646
+ continue;
16647
+ }
16648
+ try {
16649
+ const imageBuffer = await readFile4(resolvedPath);
16650
+ const base64 = imageBuffer.toString("base64");
16651
+ processedContent.push({
16652
+ type: "image",
16653
+ media_type: mediaType,
16654
+ source: `data:${mediaType};base64,${base64}`
16655
+ });
16656
+ if (verbose) {
16657
+ console.log(` [Expected Output Image] Found: ${displayPath}`);
16658
+ console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
16659
+ }
16660
+ } catch (error) {
16661
+ logWarning3(
16662
+ `Could not read expected output image ${resolvedPath}: ${error.message}`
16663
+ );
16664
+ }
16665
+ continue;
16666
+ }
16563
16667
  processedContent.push(cloneJsonObject(rawSegment));
16564
16668
  }
16565
16669
  segment.content = processedContent;
@@ -16802,7 +16906,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
16802
16906
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
16803
16907
  const testCase = {
16804
16908
  id,
16805
- eval_set: evalSetName,
16909
+ dataset: evalSetName,
16806
16910
  conversation_id: conversationId,
16807
16911
  question,
16808
16912
  input: inputMessages,
@@ -17066,7 +17170,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
17066
17170
  }
17067
17171
  const suite = interpolated;
17068
17172
  const evalSetNameFromSuite = asString5(suite.name)?.trim();
17069
- const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
17173
+ const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
17070
17174
  const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
17071
17175
  const rawTestcases = resolveTests(suite);
17072
17176
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
@@ -17187,7 +17291,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
17187
17291
  const caseTargets = extractTargetsFromTestCase(evalcase);
17188
17292
  const testCase = {
17189
17293
  id,
17190
- eval_set: evalSetName,
17294
+ dataset: evalSetName,
17295
+ category: options?.category,
17191
17296
  conversation_id: conversationId,
17192
17297
  question,
17193
17298
  input: inputMessages,
@@ -18090,6 +18195,47 @@ async function withRetry(fn, retryConfig, signal) {
18090
18195
  }
18091
18196
  throw lastError;
18092
18197
  }
18198
+ function toContentArray(content) {
18199
+ if (!Array.isArray(content)) return void 0;
18200
+ let hasNonText = false;
18201
+ const blocks = [];
18202
+ for (const part of content) {
18203
+ if (!part || typeof part !== "object") continue;
18204
+ const p = part;
18205
+ if (p.type === "text" && typeof p.text === "string") {
18206
+ blocks.push({ type: "text", text: p.text });
18207
+ } else if (p.type === "image" && typeof p.source === "object" && p.source !== null) {
18208
+ const src = p.source;
18209
+ const mediaType = typeof p.media_type === "string" ? p.media_type : typeof src.media_type === "string" ? src.media_type : "application/octet-stream";
18210
+ const data = typeof src.data === "string" && src.data !== "" ? `data:${mediaType};base64,${src.data}` : typeof p.url === "string" && p.url !== "" ? p.url : "";
18211
+ if (!data) continue;
18212
+ blocks.push({ type: "image", media_type: mediaType, source: data });
18213
+ hasNonText = true;
18214
+ } else if (p.type === "tool_use") {
18215
+ } else if (p.type === "tool_result") {
18216
+ }
18217
+ }
18218
+ return hasNonText && blocks.length > 0 ? blocks : void 0;
18219
+ }
18220
+ function extractTextContent2(content) {
18221
+ if (typeof content === "string") {
18222
+ return content;
18223
+ }
18224
+ if (!Array.isArray(content)) {
18225
+ return void 0;
18226
+ }
18227
+ const textParts = [];
18228
+ for (const part of content) {
18229
+ if (!part || typeof part !== "object") {
18230
+ continue;
18231
+ }
18232
+ const p = part;
18233
+ if (p.type === "text" && typeof p.text === "string") {
18234
+ textParts.push(p.text);
18235
+ }
18236
+ }
18237
+ return textParts.length > 0 ? textParts.join("\n") : void 0;
18238
+ }
18093
18239
  var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeLogs");
18094
18240
  var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeLogSubscribers");
18095
18241
  function getClaudeLogStore() {
@@ -18249,11 +18395,12 @@ var ClaudeCliProvider = class {
18249
18395
  if (betaMessage && typeof betaMessage === "object") {
18250
18396
  const msg = betaMessage;
18251
18397
  const content = msg.content;
18398
+ const structuredContent = toContentArray(content);
18252
18399
  const textContent = extractTextContent2(content);
18253
18400
  const toolCalls = extractToolCalls(content);
18254
18401
  const outputMsg = {
18255
18402
  role: "assistant",
18256
- content: textContent,
18403
+ content: structuredContent ?? textContent,
18257
18404
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0
18258
18405
  };
18259
18406
  output.push(outputMsg);
@@ -18592,25 +18739,6 @@ function summarizeEvent(event) {
18592
18739
  return void 0;
18593
18740
  }
18594
18741
  }
18595
- function extractTextContent2(content) {
18596
- if (typeof content === "string") {
18597
- return content;
18598
- }
18599
- if (!Array.isArray(content)) {
18600
- return void 0;
18601
- }
18602
- const textParts = [];
18603
- for (const part of content) {
18604
- if (!part || typeof part !== "object") {
18605
- continue;
18606
- }
18607
- const p = part;
18608
- if (p.type === "text" && typeof p.text === "string") {
18609
- textParts.push(p.text);
18610
- }
18611
- }
18612
- return textParts.length > 0 ? textParts.join("\n") : void 0;
18613
- }
18614
18742
  function extractToolCalls(content) {
18615
18743
  if (!Array.isArray(content)) {
18616
18744
  return [];
@@ -18777,11 +18905,12 @@ var ClaudeSdkProvider = class {
18777
18905
  if (betaMessage && typeof betaMessage === "object") {
18778
18906
  const msg = betaMessage;
18779
18907
  const content = msg.content;
18780
- const textContent = extractTextContent22(content);
18908
+ const structuredContent = toContentArray(content);
18909
+ const textContent = extractTextContent2(content);
18781
18910
  const toolCalls = extractToolCalls2(content);
18782
18911
  const outputMsg = {
18783
18912
  role: "assistant",
18784
- content: textContent,
18913
+ content: structuredContent ?? textContent,
18785
18914
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0
18786
18915
  };
18787
18916
  output.push(outputMsg);
@@ -18899,25 +19028,6 @@ var ClaudeSdkProvider = class {
18899
19028
  }
18900
19029
  }
18901
19030
  };
18902
- function extractTextContent22(content) {
18903
- if (typeof content === "string") {
18904
- return content;
18905
- }
18906
- if (!Array.isArray(content)) {
18907
- return void 0;
18908
- }
18909
- const textParts = [];
18910
- for (const part of content) {
18911
- if (!part || typeof part !== "object") {
18912
- continue;
18913
- }
18914
- const p = part;
18915
- if (p.type === "text" && typeof p.text === "string") {
18916
- textParts.push(p.text);
18917
- }
18918
- }
18919
- return textParts.length > 0 ? textParts.join("\n") : void 0;
18920
- }
18921
19031
  function extractToolCalls2(content) {
18922
19032
  if (!Array.isArray(content)) {
18923
19033
  return [];
@@ -19133,7 +19243,7 @@ function convertMessages(messages) {
19133
19243
  return messages.map((msg) => ({
19134
19244
  role: msg.role,
19135
19245
  name: msg.name,
19136
- content: msg.content,
19246
+ content: isContentArray(msg.content) ? msg.content : typeof msg.content === "string" ? msg.content : void 0,
19137
19247
  toolCalls: msg.tool_calls?.map((tc) => ({
19138
19248
  tool: tc.tool,
19139
19249
  input: tc.input,
@@ -21319,6 +21429,35 @@ function extractPiTextContent(content) {
21319
21429
  }
21320
21430
  return textParts.length > 0 ? textParts.join("\n") : void 0;
21321
21431
  }
21432
+ function toPiContentArray(content) {
21433
+ if (!Array.isArray(content)) return void 0;
21434
+ let hasNonText = false;
21435
+ const blocks = [];
21436
+ for (const part of content) {
21437
+ if (!part || typeof part !== "object") continue;
21438
+ const p = part;
21439
+ if (p.type === "text" && typeof p.text === "string") {
21440
+ blocks.push({ type: "text", text: p.text });
21441
+ } else if (p.type === "image") {
21442
+ const mediaType = typeof p.media_type === "string" ? p.media_type : "application/octet-stream";
21443
+ let source = "";
21444
+ if (typeof p.source === "object" && p.source !== null) {
21445
+ const src = p.source;
21446
+ const srcMediaType = typeof src.media_type === "string" ? src.media_type : mediaType;
21447
+ source = typeof src.data === "string" ? `data:${srcMediaType};base64,${src.data}` : "";
21448
+ }
21449
+ if (!source && typeof p.url === "string") {
21450
+ source = p.url;
21451
+ }
21452
+ if (source) {
21453
+ blocks.push({ type: "image", media_type: mediaType, source });
21454
+ hasNonText = true;
21455
+ }
21456
+ } else if (p.type === "tool_use" || p.type === "tool_result") {
21457
+ }
21458
+ }
21459
+ return hasNonText && blocks.length > 0 ? blocks : void 0;
21460
+ }
21322
21461
  function toFiniteNumber(value) {
21323
21462
  if (typeof value === "number" && Number.isFinite(value)) return value;
21324
21463
  return void 0;
@@ -22478,7 +22617,8 @@ function convertAgentMessage(message, toolTrackers, completedToolResults) {
22478
22617
  }
22479
22618
  const msg = message;
22480
22619
  const role = typeof msg.role === "string" ? msg.role : "unknown";
22481
- const content = extractPiTextContent(msg.content);
22620
+ const structuredContent = toPiContentArray(msg.content);
22621
+ const content = structuredContent ?? extractPiTextContent(msg.content);
22482
22622
  const toolCalls = extractToolCalls4(msg.content, toolTrackers, completedToolResults);
22483
22623
  const startTimeVal = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
22484
22624
  let msgTokenUsage;
@@ -24233,13 +24373,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
24233
24373
  async function execShellWithStdin(command, stdinPayload, options = {}) {
24234
24374
  const { mkdir: mkdir16, readFile: readFile14, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
24235
24375
  const { tmpdir: tmpdir3 } = await import("node:os");
24236
- const path47 = await import("node:path");
24376
+ const path48 = await import("node:path");
24237
24377
  const { randomUUID: randomUUID10 } = await import("node:crypto");
24238
- const dir = path47.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
24378
+ const dir = path48.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
24239
24379
  await mkdir16(dir, { recursive: true });
24240
- const stdinPath = path47.join(dir, "stdin.txt");
24241
- const stdoutPath = path47.join(dir, "stdout.txt");
24242
- const stderrPath = path47.join(dir, "stderr.txt");
24380
+ const stdinPath = path48.join(dir, "stdin.txt");
24381
+ const stdoutPath = path48.join(dir, "stdout.txt");
24382
+ const stderrPath = path48.join(dir, "stderr.txt");
24243
24383
  await writeFile9(stdinPath, stdinPayload, "utf8");
24244
24384
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
24245
24385
  const { spawn: spawn5 } = await import("node:child_process");
@@ -24547,6 +24687,56 @@ function toCamelCaseDeep(obj) {
24547
24687
  return obj;
24548
24688
  }
24549
24689
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
24690
+ var DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;
24691
+ async function materializeContentForGrader(messages, getWorkDir) {
24692
+ if (!messages || messages.length === 0) return messages ?? null;
24693
+ let hasAnyImage = false;
24694
+ for (const msg of messages) {
24695
+ if (isContentArray(msg.content)) {
24696
+ for (const block of msg.content) {
24697
+ if (block.type === "image") {
24698
+ hasAnyImage = true;
24699
+ break;
24700
+ }
24701
+ }
24702
+ }
24703
+ if (hasAnyImage) break;
24704
+ }
24705
+ if (!hasAnyImage) return messages;
24706
+ let counter = 0;
24707
+ const result = [];
24708
+ for (const msg of messages) {
24709
+ if (!isContentArray(msg.content)) {
24710
+ result.push(msg);
24711
+ continue;
24712
+ }
24713
+ if (!msg.content.some((b) => b.type === "image")) {
24714
+ result.push(msg);
24715
+ continue;
24716
+ }
24717
+ const blocks = [];
24718
+ for (const block of msg.content) {
24719
+ if (block.type !== "image") {
24720
+ blocks.push({ ...block });
24721
+ continue;
24722
+ }
24723
+ const img = block;
24724
+ const match = DATA_URI_RE.exec(img.source);
24725
+ if (match) {
24726
+ const [, mediaType, base64Data] = match;
24727
+ const ext = mediaType.split("/")[1] === "jpeg" ? "jpg" : mediaType.split("/")[1] ?? "bin";
24728
+ const dir = await getWorkDir();
24729
+ const filePath = join(dir, `img-${counter++}.${ext}`);
24730
+ await writeFile6(filePath, Buffer.from(base64Data, "base64"));
24731
+ blocks.push({ type: "image", media_type: img.media_type, path: filePath });
24732
+ } else {
24733
+ blocks.push({ type: "image", media_type: img.media_type, path: img.source });
24734
+ }
24735
+ }
24736
+ result.push({ ...msg, content: blocks });
24737
+ }
24738
+ return result;
24739
+ }
24550
24740
  var CodeEvaluator = class {
24551
24741
  kind = "code-grader";
24552
24742
  command;
@@ -24562,7 +24752,18 @@ var CodeEvaluator = class {
24562
24752
  this.target = options.target;
24563
24753
  }
24564
24754
  async evaluate(context2) {
24565
- let outputForPayload = context2.output ?? null;
24755
+ let imageTmpDir;
24756
+ const getImageDir = async () => {
24757
+ if (!imageTmpDir) {
24758
+ imageTmpDir = await mkdtemp2(join(tmpdir2(), "agentv-img-"));
24759
+ }
24760
+ return imageTmpDir;
24761
+ };
24762
+ const materializedOutput = await materializeContentForGrader(
24763
+ context2.output,
24764
+ getImageDir
24765
+ );
24766
+ let outputForPayload = materializedOutput;
24566
24767
  let outputPath;
24567
24768
  if (outputForPayload) {
24568
24769
  const serialized = JSON.stringify(outputForPayload);
@@ -24575,12 +24776,17 @@ var CodeEvaluator = class {
24575
24776
  }
24576
24777
  const payload = {
24577
24778
  criteria: context2.evalCase.criteria,
24578
- expectedOutput: context2.evalCase.expected_output,
24579
- outputText: context2.candidate,
24779
+ expectedOutput: await materializeContentForGrader(
24780
+ context2.evalCase.expected_output,
24781
+ getImageDir
24782
+ ),
24580
24783
  output: outputForPayload,
24581
24784
  outputPath,
24582
24785
  inputFiles: context2.evalCase.file_paths,
24583
- input: context2.evalCase.input,
24786
+ input: await materializeContentForGrader(
24787
+ context2.evalCase.input,
24788
+ getImageDir
24789
+ ),
24584
24790
  trace: context2.trace ?? null,
24585
24791
  tokenUsage: context2.tokenUsage ?? null,
24586
24792
  costUsd: context2.costUsd ?? null,
@@ -24589,9 +24795,7 @@ var CodeEvaluator = class {
24589
24795
  endTime: context2.endTime ?? null,
24590
24796
  fileChanges: context2.fileChanges ?? null,
24591
24797
  workspacePath: context2.workspacePath ?? null,
24592
- config: this.config ?? null,
24593
- inputText: context2.evalCase.question,
24594
- expectedOutputText: context2.evalCase.reference_answer ?? ""
24798
+ config: this.config ?? null
24595
24799
  };
24596
24800
  const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
24597
24801
  let proxyEnv;
@@ -24681,6 +24885,10 @@ var CodeEvaluator = class {
24681
24885
  await rm3(dirname(outputPath), { recursive: true, force: true }).catch(() => {
24682
24886
  });
24683
24887
  }
24888
+ if (imageTmpDir) {
24889
+ await rm3(imageTmpDir, { recursive: true, force: true }).catch(() => {
24890
+ });
24891
+ }
24684
24892
  }
24685
24893
  }
24686
24894
  };
@@ -24749,13 +24957,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
24749
24957
  {{${TEMPLATE_VARIABLES.CRITERIA}}}
24750
24958
 
24751
24959
  [[ ## question ## ]]
24752
- {{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
24960
+ {{${TEMPLATE_VARIABLES.INPUT}}}
24753
24961
 
24754
24962
  [[ ## reference_answer ## ]]
24755
- {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
24963
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT}}}
24756
24964
 
24757
24965
  [[ ## answer ## ]]
24758
- {{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
24966
+ {{${TEMPLATE_VARIABLES.OUTPUT}}}`;
24759
24967
  var freeformEvaluationSchema = external_exports2.object({
24760
24968
  score: external_exports2.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
24761
24969
  assertions: external_exports2.array(
@@ -24827,21 +25035,19 @@ var LlmGraderEvaluator = class {
24827
25035
  async evaluateFreeform(context2, graderProvider) {
24828
25036
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
24829
25037
  const variables = {
24830
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context2.evalCase.input, null, 2),
24831
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(
24832
- context2.evalCase.expected_output,
24833
- null,
24834
- 2
24835
- ),
24836
- [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
25038
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
25039
+ [TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
25040
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
24837
25041
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
24838
25042
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
25043
+ // Deprecated aliases — same values as the primary variables above
24839
25044
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
24840
25045
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
24841
25046
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
24842
25047
  };
24843
25048
  const systemPrompt = buildOutputSchema();
24844
25049
  const evaluatorTemplate = context2.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
25050
+ warnDeprecatedTemplateVars(evaluatorTemplate);
24845
25051
  let userPrompt = substituteVariables(evaluatorTemplate, variables);
24846
25052
  if (context2.fileChanges && !context2.evaluatorTemplateOverride && !this.evaluatorTemplate) {
24847
25053
  userPrompt += `
@@ -24853,13 +25059,15 @@ ${context2.fileChanges}`;
24853
25059
  userPrompt,
24854
25060
  systemPrompt
24855
25061
  };
25062
+ const images = context2.output ? extractImageBlocks(context2.output) : [];
24856
25063
  try {
24857
25064
  const { data, tokenUsage } = await this.runWithRetry({
24858
25065
  context: context2,
24859
25066
  graderProvider,
24860
25067
  systemPrompt,
24861
25068
  userPrompt,
24862
- schema: freeformEvaluationSchema
25069
+ schema: freeformEvaluationSchema,
25070
+ images
24863
25071
  });
24864
25072
  const score = clampScore(data.score);
24865
25073
  const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
@@ -24903,13 +25111,15 @@ ${context2.fileChanges}`;
24903
25111
  userPrompt: prompt,
24904
25112
  systemPrompt
24905
25113
  };
25114
+ const images = context2.output ? extractImageBlocks(context2.output) : [];
24906
25115
  try {
24907
25116
  const { data, tokenUsage } = await this.runWithRetry({
24908
25117
  context: context2,
24909
25118
  graderProvider,
24910
25119
  systemPrompt,
24911
25120
  userPrompt: prompt,
24912
- schema: rubricEvaluationSchema
25121
+ schema: rubricEvaluationSchema,
25122
+ images
24913
25123
  });
24914
25124
  const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
24915
25125
  return {
@@ -24946,13 +25156,15 @@ ${context2.fileChanges}`;
24946
25156
  userPrompt: prompt,
24947
25157
  systemPrompt
24948
25158
  };
25159
+ const images = context2.output ? extractImageBlocks(context2.output) : [];
24949
25160
  try {
24950
25161
  const { data, tokenUsage } = await this.runWithRetry({
24951
25162
  context: context2,
24952
25163
  graderProvider,
24953
25164
  systemPrompt,
24954
25165
  userPrompt: prompt,
24955
- schema: scoreRangeEvaluationSchema
25166
+ schema: scoreRangeEvaluationSchema,
25167
+ images
24956
25168
  });
24957
25169
  const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
24958
25170
  return {
@@ -25159,12 +25371,17 @@ ${context2.fileChanges}`;
25159
25371
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
25160
25372
  const variables = {
25161
25373
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
25374
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
25375
+ [TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
25376
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
25377
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
25378
+ // Deprecated aliases
25162
25379
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
25163
25380
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
25164
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
25165
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
25381
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
25166
25382
  };
25167
25383
  if (this.evaluatorTemplate) {
25384
+ warnDeprecatedTemplateVars(this.evaluatorTemplate);
25168
25385
  return substituteVariables(this.evaluatorTemplate, variables);
25169
25386
  }
25170
25387
  const config = context2.evaluator;
@@ -25215,11 +25432,16 @@ ${context2.fileChanges}`;
25215
25432
  if (this.evaluatorTemplate) {
25216
25433
  const variables = {
25217
25434
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
25435
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
25436
+ [TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
25437
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
25438
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
25439
+ // Deprecated aliases
25218
25440
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
25219
25441
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
25220
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
25221
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
25442
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
25222
25443
  };
25444
+ warnDeprecatedTemplateVars(this.evaluatorTemplate);
25223
25445
  const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
25224
25446
  const outputSchema2 = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
25225
25447
  return `${customPrompt}
@@ -25390,18 +25612,35 @@ ${outputSchema2}`;
25390
25612
  // LLM mode retry logic
25391
25613
  // ---------------------------------------------------------------------------
25392
25614
  async runWithRetry(options) {
25393
- const { context: context2, graderProvider, systemPrompt, userPrompt, schema } = options;
25615
+ const { context: context2, graderProvider, systemPrompt, userPrompt, schema, images } = options;
25394
25616
  let lastError;
25395
25617
  for (let attempt = 1; attempt <= 3; attempt++) {
25396
25618
  try {
25397
25619
  const model = graderProvider.asLanguageModel?.();
25398
25620
  if (model) {
25399
- const result = await generateText({
25621
+ const modelOptions = {
25622
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
25623
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
25624
+ };
25625
+ const hasImages = images && images.length > 0;
25626
+ const result = hasImages ? await generateText({
25627
+ model,
25628
+ system: systemPrompt,
25629
+ messages: [
25630
+ {
25631
+ role: "user",
25632
+ content: [
25633
+ { type: "text", text: userPrompt },
25634
+ ...toAiSdkImageParts(images)
25635
+ ]
25636
+ }
25637
+ ],
25638
+ ...modelOptions
25639
+ }) : await generateText({
25400
25640
  model,
25401
25641
  system: systemPrompt,
25402
25642
  prompt: userPrompt,
25403
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
25404
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
25643
+ ...modelOptions
25405
25644
  });
25406
25645
  const data2 = schema.parse(parseJsonFromText(result.text));
25407
25646
  const rawUsage = result.usage;
@@ -25461,6 +25700,26 @@ function substituteVariables(template, variables) {
25461
25700
  return variables[varName] ?? match;
25462
25701
  });
25463
25702
  }
25703
+ var ANSI_YELLOW7 = "\x1B[33m";
25704
+ var ANSI_RESET8 = "\x1B[0m";
25705
+ var warnedTemplateStrings = /* @__PURE__ */ new Set();
25706
+ function warnDeprecatedTemplateVars(template) {
25707
+ if (warnedTemplateStrings.has(template)) return;
25708
+ const used = [];
25709
+ for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
25710
+ if (new RegExp(`\\{\\{\\s*${deprecated}\\s*\\}\\}`).test(template)) {
25711
+ used.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
25712
+ }
25713
+ }
25714
+ if (used.length > 0) {
25715
+ warnedTemplateStrings.add(template);
25716
+ console.warn(
25717
+ `${ANSI_YELLOW7}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
25718
+ ${used.join("\n ")}
25719
+ Update your custom evaluator template to use the new names.${ANSI_RESET8}`
25720
+ );
25721
+ }
25722
+ }
25464
25723
  function calculateRubricScore(result, rubrics) {
25465
25724
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
25466
25725
  const assertions = [];
@@ -25555,6 +25814,26 @@ function calculateScoreRangeResult(result, rubrics) {
25555
25814
  }
25556
25815
  };
25557
25816
  }
25817
+ function extractImageBlocks(messages) {
25818
+ const images = [];
25819
+ for (const msg of messages) {
25820
+ if (msg.role !== "assistant") continue;
25821
+ if (!isContentArray(msg.content)) continue;
25822
+ for (const block of msg.content) {
25823
+ if (block.type === "image") {
25824
+ images.push(block);
25825
+ }
25826
+ }
25827
+ }
25828
+ return images;
25829
+ }
25830
+ function toAiSdkImageParts(images) {
25831
+ return images.map((img) => ({
25832
+ type: "image",
25833
+ image: img.source,
25834
+ mediaType: img.media_type || void 0
25835
+ }));
25836
+ }
25558
25837
  function resolveSandboxed(basePath, relativePath) {
25559
25838
  const resolved = path35.resolve(basePath, relativePath);
25560
25839
  if (!resolved.startsWith(basePath + path35.sep) && resolved !== basePath) {
@@ -26288,115 +26567,115 @@ var FieldAccuracyEvaluator = class {
26288
26567
  * Evaluate a single field against the expected value.
26289
26568
  */
26290
26569
  evaluateField(fieldConfig, candidateData, expectedData) {
26291
- const { path: path47, match, required = true, weight = 1 } = fieldConfig;
26292
- const candidateValue = resolvePath(candidateData, path47);
26293
- const expectedValue = resolvePath(expectedData, path47);
26570
+ const { path: path48, match, required = true, weight = 1 } = fieldConfig;
26571
+ const candidateValue = resolvePath(candidateData, path48);
26572
+ const expectedValue = resolvePath(expectedData, path48);
26294
26573
  if (expectedValue === void 0) {
26295
26574
  return {
26296
- path: path47,
26575
+ path: path48,
26297
26576
  score: 1,
26298
26577
  // No expected value means no comparison needed
26299
26578
  weight,
26300
26579
  hit: true,
26301
- message: `${path47}: no expected value`
26580
+ message: `${path48}: no expected value`
26302
26581
  };
26303
26582
  }
26304
26583
  if (candidateValue === void 0) {
26305
26584
  if (required) {
26306
26585
  return {
26307
- path: path47,
26586
+ path: path48,
26308
26587
  score: 0,
26309
26588
  weight,
26310
26589
  hit: false,
26311
- message: `${path47} (required, missing)`
26590
+ message: `${path48} (required, missing)`
26312
26591
  };
26313
26592
  }
26314
26593
  return {
26315
- path: path47,
26594
+ path: path48,
26316
26595
  score: 1,
26317
26596
  // Don't penalize missing optional fields
26318
26597
  weight: 0,
26319
26598
  // Zero weight means it won't affect the score
26320
26599
  hit: true,
26321
- message: `${path47}: optional field missing`
26600
+ message: `${path48}: optional field missing`
26322
26601
  };
26323
26602
  }
26324
26603
  switch (match) {
26325
26604
  case "exact":
26326
- return this.compareExact(path47, candidateValue, expectedValue, weight);
26605
+ return this.compareExact(path48, candidateValue, expectedValue, weight);
26327
26606
  case "numeric_tolerance":
26328
26607
  return this.compareNumericTolerance(
26329
- path47,
26608
+ path48,
26330
26609
  candidateValue,
26331
26610
  expectedValue,
26332
26611
  fieldConfig,
26333
26612
  weight
26334
26613
  );
26335
26614
  case "date":
26336
- return this.compareDate(path47, candidateValue, expectedValue, fieldConfig, weight);
26615
+ return this.compareDate(path48, candidateValue, expectedValue, fieldConfig, weight);
26337
26616
  default:
26338
26617
  return {
26339
- path: path47,
26618
+ path: path48,
26340
26619
  score: 0,
26341
26620
  weight,
26342
26621
  hit: false,
26343
- message: `${path47}: unknown match type "${match}"`
26622
+ message: `${path48}: unknown match type "${match}"`
26344
26623
  };
26345
26624
  }
26346
26625
  }
26347
26626
  /**
26348
26627
  * Exact equality comparison.
26349
26628
  */
26350
- compareExact(path47, candidateValue, expectedValue, weight) {
26629
+ compareExact(path48, candidateValue, expectedValue, weight) {
26351
26630
  if (deepEqual(candidateValue, expectedValue)) {
26352
26631
  return {
26353
- path: path47,
26632
+ path: path48,
26354
26633
  score: 1,
26355
26634
  weight,
26356
26635
  hit: true,
26357
- message: path47
26636
+ message: path48
26358
26637
  };
26359
26638
  }
26360
26639
  if (typeof candidateValue !== typeof expectedValue) {
26361
26640
  return {
26362
- path: path47,
26641
+ path: path48,
26363
26642
  score: 0,
26364
26643
  weight,
26365
26644
  hit: false,
26366
- message: `${path47} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
26645
+ message: `${path48} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
26367
26646
  };
26368
26647
  }
26369
26648
  return {
26370
- path: path47,
26649
+ path: path48,
26371
26650
  score: 0,
26372
26651
  weight,
26373
26652
  hit: false,
26374
- message: `${path47} (value mismatch)`
26653
+ message: `${path48} (value mismatch)`
26375
26654
  };
26376
26655
  }
26377
26656
  /**
26378
26657
  * Numeric comparison with absolute or relative tolerance.
26379
26658
  */
26380
- compareNumericTolerance(path47, candidateValue, expectedValue, fieldConfig, weight) {
26659
+ compareNumericTolerance(path48, candidateValue, expectedValue, fieldConfig, weight) {
26381
26660
  const { tolerance = 0, relative = false } = fieldConfig;
26382
26661
  const candidateNum = toNumber(candidateValue);
26383
26662
  const expectedNum = toNumber(expectedValue);
26384
26663
  if (candidateNum === null || expectedNum === null) {
26385
26664
  return {
26386
- path: path47,
26665
+ path: path48,
26387
26666
  score: 0,
26388
26667
  weight,
26389
26668
  hit: false,
26390
- message: `${path47} (non-numeric value)`
26669
+ message: `${path48} (non-numeric value)`
26391
26670
  };
26392
26671
  }
26393
26672
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
26394
26673
  return {
26395
- path: path47,
26674
+ path: path48,
26396
26675
  score: 0,
26397
26676
  weight,
26398
26677
  hit: false,
26399
- message: `${path47} (invalid numeric value)`
26678
+ message: `${path48} (invalid numeric value)`
26400
26679
  };
26401
26680
  }
26402
26681
  const diff = Math.abs(candidateNum - expectedNum);
@@ -26409,61 +26688,61 @@ var FieldAccuracyEvaluator = class {
26409
26688
  }
26410
26689
  if (withinTolerance) {
26411
26690
  return {
26412
- path: path47,
26691
+ path: path48,
26413
26692
  score: 1,
26414
26693
  weight,
26415
26694
  hit: true,
26416
- message: `${path47} (within tolerance: diff=${diff.toFixed(2)})`
26695
+ message: `${path48} (within tolerance: diff=${diff.toFixed(2)})`
26417
26696
  };
26418
26697
  }
26419
26698
  return {
26420
- path: path47,
26699
+ path: path48,
26421
26700
  score: 0,
26422
26701
  weight,
26423
26702
  hit: false,
26424
- message: `${path47} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
26703
+ message: `${path48} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
26425
26704
  };
26426
26705
  }
26427
26706
  /**
26428
26707
  * Date comparison with format normalization.
26429
26708
  */
26430
- compareDate(path47, candidateValue, expectedValue, fieldConfig, weight) {
26709
+ compareDate(path48, candidateValue, expectedValue, fieldConfig, weight) {
26431
26710
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
26432
26711
  const candidateDate = parseDate(String(candidateValue), formats);
26433
26712
  const expectedDate = parseDate(String(expectedValue), formats);
26434
26713
  if (candidateDate === null) {
26435
26714
  return {
26436
- path: path47,
26715
+ path: path48,
26437
26716
  score: 0,
26438
26717
  weight,
26439
26718
  hit: false,
26440
- message: `${path47} (unparseable candidate date)`
26719
+ message: `${path48} (unparseable candidate date)`
26441
26720
  };
26442
26721
  }
26443
26722
  if (expectedDate === null) {
26444
26723
  return {
26445
- path: path47,
26724
+ path: path48,
26446
26725
  score: 0,
26447
26726
  weight,
26448
26727
  hit: false,
26449
- message: `${path47} (unparseable expected date)`
26728
+ message: `${path48} (unparseable expected date)`
26450
26729
  };
26451
26730
  }
26452
26731
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
26453
26732
  return {
26454
- path: path47,
26733
+ path: path48,
26455
26734
  score: 1,
26456
26735
  weight,
26457
26736
  hit: true,
26458
- message: path47
26737
+ message: path48
26459
26738
  };
26460
26739
  }
26461
26740
  return {
26462
- path: path47,
26741
+ path: path48,
26463
26742
  score: 0,
26464
26743
  weight,
26465
26744
  hit: false,
26466
- message: `${path47} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
26745
+ message: `${path48} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
26467
26746
  };
26468
26747
  }
26469
26748
  /**
@@ -26496,11 +26775,11 @@ var FieldAccuracyEvaluator = class {
26496
26775
  };
26497
26776
  }
26498
26777
  };
26499
- function resolvePath(obj, path47) {
26500
- if (!path47 || !obj) {
26778
+ function resolvePath(obj, path48) {
26779
+ if (!path48 || !obj) {
26501
26780
  return void 0;
26502
26781
  }
26503
- const parts = path47.split(/\.|\[|\]/).filter((p) => p.length > 0);
26782
+ const parts = path48.split(/\.|\[|\]/).filter((p) => p.length > 0);
26504
26783
  let current = obj;
26505
26784
  for (const part of parts) {
26506
26785
  if (current === null || current === void 0) {
@@ -26786,11 +27065,12 @@ function assembleLlmGraderPrompt(input) {
26786
27065
  function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
26787
27066
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
26788
27067
  const variables = {
26789
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2),
26790
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
26791
- [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
27068
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
27069
+ [TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(),
27070
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? "").trim(),
26792
27071
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
26793
27072
  [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
27073
+ // Deprecated aliases
26794
27074
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
26795
27075
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
26796
27076
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -26973,8 +27253,8 @@ var TokenUsageEvaluator = class {
26973
27253
  };
26974
27254
  }
26975
27255
  };
26976
- function getNestedValue(obj, path47) {
26977
- const parts = path47.split(".");
27256
+ function getNestedValue(obj, path48) {
27257
+ const parts = path48.split(".");
26978
27258
  let current = obj;
26979
27259
  for (const part of parts) {
26980
27260
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -27824,16 +28104,13 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
27824
28104
  const payload = {
27825
28105
  criteria: context2.evalCase.criteria,
27826
28106
  expectedOutput: context2.evalCase.expected_output,
27827
- outputText: context2.candidate,
27828
28107
  output: context2.output ?? null,
27829
28108
  inputFiles: context2.evalCase.file_paths,
27830
28109
  input: context2.evalCase.input,
27831
28110
  trace: context2.trace ?? null,
27832
28111
  fileChanges: context2.fileChanges ?? null,
27833
28112
  workspacePath: context2.workspacePath ?? null,
27834
- config: config ?? context2.config ?? null,
27835
- inputText: context2.evalCase.question,
27836
- expectedOutputText: context2.evalCase.reference_answer ?? ""
28113
+ config: config ?? context2.config ?? null
27837
28114
  };
27838
28115
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
27839
28116
  const scriptPath = script[script.length - 1];
@@ -29469,7 +29746,8 @@ async function runEvaluation(options) {
29469
29746
  const budgetResult = {
29470
29747
  timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
29471
29748
  testId: evalCase.id,
29472
- eval_set: evalCase.eval_set,
29749
+ dataset: evalCase.dataset,
29750
+ category: evalCase.category,
29473
29751
  score: 0,
29474
29752
  assertions: [],
29475
29753
  output: [],
@@ -29505,7 +29783,8 @@ async function runEvaluation(options) {
29505
29783
  const haltResult = {
29506
29784
  timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
29507
29785
  testId: evalCase.id,
29508
- eval_set: evalCase.eval_set,
29786
+ dataset: evalCase.dataset,
29787
+ category: evalCase.category,
29509
29788
  score: 0,
29510
29789
  assertions: [],
29511
29790
  output: [],
@@ -30504,7 +30783,8 @@ async function evaluateCandidate(options) {
30504
30783
  return {
30505
30784
  timestamp: completedAt.toISOString(),
30506
30785
  testId: evalCase.id,
30507
- eval_set: evalCase.eval_set,
30786
+ dataset: evalCase.dataset,
30787
+ category: evalCase.category,
30508
30788
  conversationId: evalCase.conversation_id,
30509
30789
  score: score.score,
30510
30790
  assertions: score.assertions,
@@ -30854,7 +31134,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
30854
31134
  return {
30855
31135
  timestamp: timestamp.toISOString(),
30856
31136
  testId: evalCase.id,
30857
- eval_set: evalCase.eval_set,
31137
+ dataset: evalCase.dataset,
31138
+ category: evalCase.category,
30858
31139
  conversationId: evalCase.conversation_id,
30859
31140
  score: 0,
30860
31141
  assertions: [{ text: `Error: ${message}`, passed: false }],
@@ -31405,6 +31686,15 @@ function trimBaselineResult(result) {
31405
31686
  }
31406
31687
  return trimmed;
31407
31688
  }
31689
/** Category label used when a path has no usable directory component. */
var DEFAULT_CATEGORY = "Uncategorized";

/**
 * Derive a category label from a relative file path.
 *
 * The path is split on the platform separator (`path47.sep`), the final
 * segment (the file name) is dropped, and every segment that is exactly
 * "evals" is removed. The remaining segments are joined with "/"
 * regardless of platform.
 *
 * @param {string} relativePath - Path to split into segments.
 * @returns {string} The joined directory segments, or `DEFAULT_CATEGORY`
 *   when nothing is left after dropping the file name and "evals" segments.
 */
function deriveCategory(relativePath) {
  const segments = relativePath.split(path47.sep);

  // The last segment is the file itself; only the leading directories matter.
  segments.pop();

  const folders = [];
  for (const segment of segments) {
    // "evals" directories are treated as structural noise, not as a category.
    if (segment !== "evals") {
      folders.push(segment);
    }
  }

  if (folders.length === 0) {
    return DEFAULT_CATEGORY;
  }
  return folders.join("/");
}
31408
31698
  var OTEL_BACKEND_PRESETS = {
31409
31699
  langfuse: {
31410
31700
  name: "langfuse",
@@ -31527,7 +31817,7 @@ var OtelTraceExporter = class {
31527
31817
  rootSpan.setAttribute("gen_ai.system", "agentv");
31528
31818
  rootSpan.setAttribute("agentv.test_id", result.testId);
31529
31819
  rootSpan.setAttribute("agentv.target", result.target);
31530
- if (result.eval_set) rootSpan.setAttribute("agentv.eval_set", result.eval_set);
31820
+ if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
31531
31821
  rootSpan.setAttribute("agentv.score", result.score);
31532
31822
  if (captureContent && result.output.length > 0) {
31533
31823
  const lastMsg = result.output[result.output.length - 1];
@@ -31736,7 +32026,7 @@ var OtelStreamingObserver = class {
31736
32026
  this.rootSpan.setAttribute("gen_ai.system", "agentv");
31737
32027
  this.rootSpan.setAttribute("agentv.test_id", testId);
31738
32028
  this.rootSpan.setAttribute("agentv.target", target);
31739
- if (evalSet) this.rootSpan.setAttribute("agentv.eval_set", evalSet);
32029
+ if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
31740
32030
  this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
31741
32031
  }
31742
32032
  /** Create and immediately export a tool span */
@@ -31907,6 +32197,9 @@ function createAgentKernel() {
31907
32197
  }
31908
32198
 
31909
32199
  export {
32200
+ isContent,
32201
+ isContentArray,
32202
+ getTextContent,
31910
32203
  TEST_MESSAGE_ROLES,
31911
32204
  isTestMessageRole,
31912
32205
  isJsonObject,
@@ -31922,11 +32215,13 @@ export {
31922
32215
  buildSearchRoots,
31923
32216
  resolveFileReference,
31924
32217
  CLI_PLACEHOLDERS,
32218
+ COMMON_TARGET_SETTINGS,
31925
32219
  resolveTargetDefinition,
31926
- interpolateEnv,
31927
- loadCasesFromFile,
31928
32220
  KNOWN_PROVIDERS,
31929
32221
  PROVIDER_ALIASES,
32222
+ extractLastAssistantContent,
32223
+ interpolateEnv,
32224
+ loadCasesFromFile,
31930
32225
  computeTraceSummary,
31931
32226
  DEFAULT_EXPLORATION_TOOLS,
31932
32227
  explorationRatio,
@@ -32002,6 +32297,7 @@ export {
32002
32297
  substituteVariables,
32003
32298
  calculateRubricScore,
32004
32299
  buildScoreRangeOutputSchema,
32300
+ extractImageBlocks,
32005
32301
  CompositeEvaluator,
32006
32302
  CostEvaluator,
32007
32303
  ExecutionMetricsEvaluator,
@@ -32051,9 +32347,11 @@ export {
32051
32347
  shouldEnableCache,
32052
32348
  shouldSkipCacheForTemperature,
32053
32349
  trimBaselineResult,
32350
+ DEFAULT_CATEGORY,
32351
+ deriveCategory,
32054
32352
  OTEL_BACKEND_PRESETS,
32055
32353
  OtelTraceExporter,
32056
32354
  OtelStreamingObserver,
32057
32355
  createAgentKernel
32058
32356
  };
32059
- //# sourceMappingURL=chunk-ELQEFMGO.js.map
32357
+ //# sourceMappingURL=chunk-XEAW7OQT.js.map