agentv 3.14.5 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
301
301
  }
302
302
  });
303
303
 
304
- // ../../packages/core/dist/chunk-HP5PFOVK.js
304
+ // ../../packages/core/dist/chunk-PXYYRDHH.js
305
305
  import { constants } from "node:fs";
306
306
  import { access, readFile } from "node:fs/promises";
307
307
  import path from "node:path";
@@ -419,11 +419,32 @@ __export(external_exports2, {
419
419
  void: () => voidType
420
420
  });
421
421
 
422
- // ../../packages/core/dist/chunk-HP5PFOVK.js
422
+ // ../../packages/core/dist/chunk-PXYYRDHH.js
423
423
  import { readFile as readFile2 } from "node:fs/promises";
424
424
  import path3 from "node:path";
425
425
  import fg from "fast-glob";
426
426
  import { parse as parseYaml } from "yaml";
427
/** Discriminator values accepted for a structured content block. */
var CONTENT_TYPES = /* @__PURE__ */ new Set(["text", "image", "file"]);

/**
 * Type guard for a single structured content block.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} True when `value` is a non-null object whose `type`
 *   is a string listed in CONTENT_TYPES.
 */
function isContent(value) {
  const isObject = typeof value === "object" && Boolean(value);
  if (!isObject) {
    return false;
  }
  const kind = value.type;
  if (typeof kind !== "string") {
    return false;
  }
  return CONTENT_TYPES.has(kind);
}
433
/**
 * Type guard for a non-empty array made up solely of content blocks.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} False for non-arrays and for empty arrays; otherwise
 *   true only when every element passes `isContent`.
 */
function isContentArray(value) {
  if (!Array.isArray(value)) {
    return false;
  }
  if (value.length === 0) {
    return false;
  }
  return value.every(isContent);
}
436
/**
 * Flatten message content into plain text.
 *
 * @param {unknown} content - A string, an array of content blocks, or a
 *   nullish / unsupported value.
 * @returns {string} The string itself; for an array, the `text` of every
 *   `type: "text"` block joined with "\n"; otherwise "".
 */
function getTextContent(content) {
  if (content == null) return "";
  if (typeof content === "string") return content;
  if (!Array.isArray(content)) return "";
  const parts = [];
  for (const block of content) {
    // Skip malformed entries (null, primitives) instead of throwing on
    // `block.type`; the rest of this function is already tolerant of
    // unexpected input.
    if (!block || typeof block !== "object") continue;
    // Only string text is collected so a missing `text` field does not
    // contribute a stray empty segment to the joined output.
    if (block.type === "text" && typeof block.text === "string") {
      parts.push(block.text);
    }
  }
  return parts.join("\n");
}
427
448
  var TEST_MESSAGE_ROLE_VALUES = ["system", "user", "assistant", "tool"];
428
449
  var TEST_MESSAGE_ROLES = TEST_MESSAGE_ROLE_VALUES;
429
450
  var TEST_MESSAGE_ROLE_SET = new Set(TEST_MESSAGE_ROLE_VALUES);
@@ -776,6 +797,12 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
776
797
  "FILES",
777
798
  "OUTPUT_FILE"
778
799
  ]);
800
/**
 * Target setting keys listed here in both snake_case and camelCase
 * spellings, one pair per row.
 */
var COMMON_TARGET_SETTINGS = [
  "provider_batching", "providerBatching",
  "subagent_mode_allowed", "subagentModeAllowed"
];
779
806
  var BASE_TARGET_SCHEMA = external_exports2.object({
780
807
  name: external_exports2.string().min(1, "target name is required"),
781
808
  provider: external_exports2.string().min(1, "provider is required"),
@@ -784,7 +811,8 @@ var BASE_TARGET_SCHEMA = external_exports2.object({
784
811
  // backward compat
785
812
  workers: external_exports2.number().int().min(1).optional(),
786
813
  workspace_template: external_exports2.string().optional(),
787
- workspaceTemplate: external_exports2.string().optional()
814
+ workspaceTemplate: external_exports2.string().optional(),
815
+ subagent_mode_allowed: external_exports2.boolean().optional()
788
816
  }).passthrough();
789
817
  var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
790
818
  var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
@@ -847,42 +875,40 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
847
875
  const providerBatching = resolveOptionalBoolean(
848
876
  parsed.provider_batching ?? parsed.providerBatching
849
877
  );
878
+ const subagentModeAllowed = resolveOptionalBoolean(
879
+ parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
880
+ );
881
+ const base = {
882
+ name: parsed.name,
883
+ graderTarget: parsed.grader_target ?? parsed.judge_target,
884
+ workers: parsed.workers,
885
+ providerBatching,
886
+ subagentModeAllowed
887
+ };
850
888
  switch (provider) {
851
889
  case "openai":
852
890
  return {
853
891
  kind: "openai",
854
- name: parsed.name,
855
- graderTarget: parsed.grader_target ?? parsed.judge_target,
856
- workers: parsed.workers,
857
- providerBatching,
892
+ ...base,
858
893
  config: resolveOpenAIConfig(parsed, env)
859
894
  };
860
895
  case "openrouter":
861
896
  return {
862
897
  kind: "openrouter",
863
- name: parsed.name,
864
- graderTarget: parsed.grader_target ?? parsed.judge_target,
865
- workers: parsed.workers,
866
- providerBatching,
898
+ ...base,
867
899
  config: resolveOpenRouterConfig(parsed, env)
868
900
  };
869
901
  case "azure":
870
902
  case "azure-openai":
871
903
  return {
872
904
  kind: "azure",
873
- name: parsed.name,
874
- graderTarget: parsed.grader_target ?? parsed.judge_target,
875
- workers: parsed.workers,
876
- providerBatching,
905
+ ...base,
877
906
  config: resolveAzureConfig(parsed, env)
878
907
  };
879
908
  case "anthropic":
880
909
  return {
881
910
  kind: "anthropic",
882
- name: parsed.name,
883
- graderTarget: parsed.grader_target ?? parsed.judge_target,
884
- workers: parsed.workers,
885
- providerBatching,
911
+ ...base,
886
912
  config: resolveAnthropicConfig(parsed, env)
887
913
  };
888
914
  case "gemini":
@@ -890,68 +916,47 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
890
916
  case "google-gemini":
891
917
  return {
892
918
  kind: "gemini",
893
- name: parsed.name,
894
- graderTarget: parsed.grader_target ?? parsed.judge_target,
895
- workers: parsed.workers,
896
- providerBatching,
919
+ ...base,
897
920
  config: resolveGeminiConfig(parsed, env)
898
921
  };
899
922
  case "codex":
900
923
  case "codex-cli":
901
924
  return {
902
925
  kind: "codex",
903
- name: parsed.name,
904
- graderTarget: parsed.grader_target ?? parsed.judge_target,
905
- workers: parsed.workers,
906
- providerBatching,
926
+ ...base,
907
927
  config: resolveCodexConfig(parsed, env, evalFilePath)
908
928
  };
909
929
  case "copilot-sdk":
910
930
  case "copilot_sdk":
911
931
  return {
912
932
  kind: "copilot-sdk",
913
- name: parsed.name,
914
- graderTarget: parsed.grader_target ?? parsed.judge_target,
915
- workers: parsed.workers,
916
- providerBatching,
933
+ ...base,
917
934
  config: resolveCopilotSdkConfig(parsed, env, evalFilePath)
918
935
  };
919
936
  case "copilot":
920
937
  case "copilot-cli":
921
938
  return {
922
939
  kind: "copilot-cli",
923
- name: parsed.name,
924
- graderTarget: parsed.grader_target ?? parsed.judge_target,
925
- workers: parsed.workers,
926
- providerBatching,
940
+ ...base,
927
941
  config: resolveCopilotCliConfig(parsed, env, evalFilePath)
928
942
  };
929
943
  case "copilot-log":
930
944
  return {
931
945
  kind: "copilot-log",
932
- name: parsed.name,
933
- graderTarget: parsed.grader_target ?? parsed.judge_target,
934
- workers: parsed.workers,
935
- providerBatching,
946
+ ...base,
936
947
  config: resolveCopilotLogConfig(parsed, env)
937
948
  };
938
949
  case "pi":
939
950
  case "pi-coding-agent":
940
951
  return {
941
952
  kind: "pi-coding-agent",
942
- name: parsed.name,
943
- graderTarget: parsed.grader_target ?? parsed.judge_target,
944
- workers: parsed.workers,
945
- providerBatching,
953
+ ...base,
946
954
  config: resolvePiCodingAgentConfig(parsed, env, evalFilePath)
947
955
  };
948
956
  case "pi-cli":
949
957
  return {
950
958
  kind: "pi-cli",
951
- name: parsed.name,
952
- graderTarget: parsed.grader_target ?? parsed.judge_target,
953
- workers: parsed.workers,
954
- providerBatching,
959
+ ...base,
955
960
  config: resolvePiCliConfig(parsed, env, evalFilePath)
956
961
  };
957
962
  case "claude":
@@ -959,38 +964,26 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
959
964
  case "claude-cli":
960
965
  return {
961
966
  kind: "claude-cli",
962
- name: parsed.name,
963
- graderTarget: parsed.grader_target ?? parsed.judge_target,
964
- workers: parsed.workers,
965
- providerBatching,
967
+ ...base,
966
968
  config: resolveClaudeConfig(parsed, env, evalFilePath)
967
969
  };
968
970
  case "claude-sdk":
969
971
  return {
970
972
  kind: "claude-sdk",
971
- name: parsed.name,
972
- graderTarget: parsed.grader_target ?? parsed.judge_target,
973
- workers: parsed.workers,
974
- providerBatching,
973
+ ...base,
975
974
  config: resolveClaudeConfig(parsed, env, evalFilePath)
976
975
  };
977
976
  case "mock":
978
977
  return {
979
978
  kind: "mock",
980
- name: parsed.name,
981
- graderTarget: parsed.grader_target ?? parsed.judge_target,
982
- workers: parsed.workers,
983
- providerBatching,
979
+ ...base,
984
980
  config: resolveMockConfig(parsed)
985
981
  };
986
982
  case "vscode":
987
983
  case "vscode-insiders":
988
984
  return {
989
985
  kind: provider,
990
- name: parsed.name,
991
- graderTarget: parsed.grader_target ?? parsed.judge_target,
992
- workers: parsed.workers,
993
- providerBatching,
986
+ ...base,
994
987
  config: resolveVSCodeConfig(parsed, env, provider === "vscode-insiders", evalFilePath)
995
988
  };
996
989
  case "agentv": {
@@ -1003,29 +996,21 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
1003
996
  const temperature = typeof parsed.temperature === "number" ? parsed.temperature : 0;
1004
997
  return {
1005
998
  kind: "agentv",
1006
- name: parsed.name,
1007
- graderTarget: parsed.grader_target ?? parsed.judge_target,
999
+ ...base,
1008
1000
  workers: typeof parsed.workers === "number" ? parsed.workers : void 0,
1009
- providerBatching,
1010
1001
  config: { model, temperature }
1011
1002
  };
1012
1003
  }
1013
1004
  case "cli":
1014
1005
  return {
1015
1006
  kind: "cli",
1016
- name: parsed.name,
1017
- graderTarget: parsed.grader_target ?? parsed.judge_target,
1018
- workers: parsed.workers,
1019
- providerBatching,
1007
+ ...base,
1020
1008
  config: resolveCliConfig(parsed, env, evalFilePath)
1021
1009
  };
1022
1010
  default:
1023
1011
  return {
1024
1012
  kind: "cli",
1025
- name: parsed.name,
1026
- graderTarget: parsed.grader_target ?? parsed.judge_target,
1027
- workers: parsed.workers,
1028
- providerBatching,
1013
+ ...base,
1029
1014
  config: resolveDiscoveredProviderConfig(parsed, provider, env, evalFilePath)
1030
1015
  };
1031
1016
  }
@@ -1653,8 +1638,8 @@ function resolveCliConfig(target, env, evalFilePath) {
1653
1638
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
1654
1639
  if (!parseResult.success) {
1655
1640
  const firstError = parseResult.error.errors[0];
1656
- const path47 = firstError?.path.join(".") || "";
1657
- const prefix = path47 ? `${target.name} ${path47}: ` : `${target.name}: `;
1641
+ const path48 = firstError?.path.join(".") || "";
1642
+ const prefix = path48 ? `${target.name} ${path48}: ` : `${target.name}: `;
1658
1643
  throw new Error(`${prefix}${firstError?.message}`);
1659
1644
  }
1660
1645
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -1897,6 +1882,82 @@ function resolveOptionalNumberArray(source, description) {
1897
1882
  }
1898
1883
  return resolved.length > 0 ? resolved : void 0;
1899
1884
  }
1885
/** Provider kinds that `isAgentProvider` treats as agent providers. */
var AGENT_PROVIDER_KINDS = [
  "codex",
  "copilot-sdk", "copilot-cli",
  "pi-coding-agent", "pi-cli",
  "claude", "claude-cli", "claude-sdk",
  "vscode", "vscode-insiders"
];

/** Canonical provider identifiers. */
var KNOWN_PROVIDERS = [
  "openai", "openrouter", "azure", "anthropic", "gemini",
  "codex",
  "copilot-sdk", "copilot-cli", "copilot-log",
  "pi-coding-agent", "pi-cli",
  "claude", "claude-cli", "claude-sdk",
  "cli", "mock",
  "vscode", "vscode-insiders",
  "agentv"
];

/** Alternate provider spellings; the note above each entry names its target. */
var PROVIDER_ALIASES = [
  // alias for "azure"
  "azure-openai",
  // alias for "gemini"
  "google",
  // alias for "gemini"
  "google-gemini",
  // alias for "codex"
  "codex-cli",
  // alias for "copilot-cli" (default copilot experience)
  "copilot",
  // alias for "copilot-sdk" (underscore variant)
  "copilot_sdk",
  // alias for "pi-coding-agent"
  "pi",
  // alias for "claude" (legacy)
  "claude-code",
  // legacy/future support
  "bedrock",
  // legacy/future support
  "vertex"
];
1940
/**
 * Return the content of the most recent assistant message as a string.
 *
 * Walks `messages` from the end and stops at the first entry whose role is
 * "assistant" and whose `content` is not `undefined`.
 *
 * @param {Array<{role: string, content?: unknown}>|null|undefined} messages
 * @returns {string} String content as-is; a content-block array reduced via
 *   `getTextContent`; any other content JSON-serialised; "" when no
 *   qualifying message exists.
 */
function extractLastAssistantContent(messages) {
  if (!messages || messages.length === 0) {
    return "";
  }
  let idx = messages.length;
  while (idx-- > 0) {
    const candidate = messages[idx];
    if (candidate.role !== "assistant" || candidate.content === void 0) {
      continue;
    }
    const body = candidate.content;
    if (typeof body === "string") {
      return body;
    }
    return isContentArray(body) ? getTextContent(body) : JSON.stringify(body);
  }
  return "";
}
1958
/**
 * Report whether a provider's `kind` is listed in AGENT_PROVIDER_KINDS.
 *
 * @param {{kind: string}|null|undefined} provider
 * @returns {boolean} False for a missing provider.
 */
function isAgentProvider(provider) {
  if (!provider) {
    return false;
  }
  return AGENT_PROVIDER_KINDS.includes(provider.kind);
}
1900
1961
  var ENV_VAR_PATTERN = /\$\{\{\s*([A-Za-z_][A-Za-z0-9_]*)\s*\}\}/g;
1901
1962
  function interpolateEnv(value, env) {
1902
1963
  if (typeof value === "string") {
@@ -2026,79 +2087,6 @@ async function expandFileReferences(tests, evalFileDir) {
2026
2087
  }
2027
2088
  return expanded;
2028
2089
  }
2029
- var AGENT_PROVIDER_KINDS = [
2030
- "codex",
2031
- "copilot-sdk",
2032
- "copilot-cli",
2033
- "pi-coding-agent",
2034
- "pi-cli",
2035
- "claude",
2036
- "claude-cli",
2037
- "claude-sdk",
2038
- "vscode",
2039
- "vscode-insiders"
2040
- ];
2041
- var KNOWN_PROVIDERS = [
2042
- "openai",
2043
- "openrouter",
2044
- "azure",
2045
- "anthropic",
2046
- "gemini",
2047
- "codex",
2048
- "copilot-sdk",
2049
- "copilot-cli",
2050
- "copilot-log",
2051
- "pi-coding-agent",
2052
- "pi-cli",
2053
- "claude",
2054
- "claude-cli",
2055
- "claude-sdk",
2056
- "cli",
2057
- "mock",
2058
- "vscode",
2059
- "vscode-insiders",
2060
- "agentv"
2061
- ];
2062
- var PROVIDER_ALIASES = [
2063
- "azure-openai",
2064
- // alias for "azure"
2065
- "google",
2066
- // alias for "gemini"
2067
- "google-gemini",
2068
- // alias for "gemini"
2069
- "codex-cli",
2070
- // alias for "codex"
2071
- "copilot",
2072
- // alias for "copilot-cli" (default copilot experience)
2073
- "copilot_sdk",
2074
- // alias for "copilot-sdk" (underscore variant)
2075
- "pi",
2076
- // alias for "pi-coding-agent"
2077
- "claude-code",
2078
- // alias for "claude" (legacy)
2079
- "bedrock",
2080
- // legacy/future support
2081
- "vertex"
2082
- // legacy/future support
2083
- ];
2084
- function extractLastAssistantContent(messages) {
2085
- if (!messages || messages.length === 0) {
2086
- return "";
2087
- }
2088
- for (let i = messages.length - 1; i >= 0; i--) {
2089
- const msg = messages[i];
2090
- if (msg.role === "assistant" && msg.content !== void 0) {
2091
- if (typeof msg.content === "string") {
2092
- return msg.content;
2093
- }
2094
- return JSON.stringify(msg.content);
2095
- }
2096
- }
2097
- return "";
2098
- }
2099
- function isAgentProvider(provider) {
2100
- return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
2101
- }
2102
2090
 
2103
2091
  // ../../packages/core/dist/index.js
2104
2092
  import { readFile as readFile6 } from "node:fs/promises";
@@ -6734,7 +6722,7 @@ function createOpenRouter(options = {}) {
6734
6722
  );
6735
6723
  const createChatModel = (modelId, settings = {}) => new OpenRouterChatLanguageModel(modelId, settings, {
6736
6724
  provider: "openrouter.chat",
6737
- url: ({ path: path47 }) => `${baseURL}${path47}`,
6725
+ url: ({ path: path48 }) => `${baseURL}${path48}`,
6738
6726
  headers: getHeaders,
6739
6727
  compatibility,
6740
6728
  fetch: options.fetch,
@@ -6742,7 +6730,7 @@ function createOpenRouter(options = {}) {
6742
6730
  });
6743
6731
  const createCompletionModel = (modelId, settings = {}) => new OpenRouterCompletionLanguageModel(modelId, settings, {
6744
6732
  provider: "openrouter.completion",
6745
- url: ({ path: path47 }) => `${baseURL}${path47}`,
6733
+ url: ({ path: path48 }) => `${baseURL}${path48}`,
6746
6734
  headers: getHeaders,
6747
6735
  compatibility,
6748
6736
  fetch: options.fetch,
@@ -6750,14 +6738,14 @@ function createOpenRouter(options = {}) {
6750
6738
  });
6751
6739
  const createEmbeddingModel = (modelId, settings = {}) => new OpenRouterEmbeddingModel(modelId, settings, {
6752
6740
  provider: "openrouter.embedding",
6753
- url: ({ path: path47 }) => `${baseURL}${path47}`,
6741
+ url: ({ path: path48 }) => `${baseURL}${path48}`,
6754
6742
  headers: getHeaders,
6755
6743
  fetch: options.fetch,
6756
6744
  extraBody: options.extraBody
6757
6745
  });
6758
6746
  const createImageModel = (modelId, settings = {}) => new OpenRouterImageModel(modelId, settings, {
6759
6747
  provider: "openrouter.image",
6760
- url: ({ path: path47 }) => `${baseURL}${path47}`,
6748
+ url: ({ path: path48 }) => `${baseURL}${path48}`,
6761
6749
  headers: getHeaders,
6762
6750
  fetch: options.fetch,
6763
6751
  extraBody: options.extraBody
@@ -14350,6 +14338,7 @@ import { existsSync as existsSync4 } from "node:fs";
14350
14338
  import path45 from "node:path";
14351
14339
  import { mkdir as mkdir15, readFile as readFile13, writeFile as writeFile8 } from "node:fs/promises";
14352
14340
  import path46 from "node:path";
14341
+ import path47 from "node:path";
14353
14342
  function computeTraceSummary(messages) {
14354
14343
  const toolCallCounts = {};
14355
14344
  const toolDurations = {};
@@ -14979,15 +14968,23 @@ var TEMPLATE_VARIABLES = {
14979
14968
  INPUT: "input",
14980
14969
  OUTPUT: "output",
14981
14970
  FILE_CHANGES: "file_changes",
14971
+ /** @deprecated Use INPUT instead — resolves to the same text value. */
14982
14972
  INPUT_TEXT: "input_text",
14973
+ /** @deprecated Use OUTPUT instead — resolves to the same text value. */
14983
14974
  OUTPUT_TEXT: "output_text",
14975
+ /** @deprecated Use EXPECTED_OUTPUT instead — resolves to the same text value. */
14984
14976
  EXPECTED_OUTPUT_TEXT: "expected_output_text"
14985
14977
  };
14986
14978
  var VALID_TEMPLATE_VARIABLES = new Set(Object.values(TEMPLATE_VARIABLES));
14987
14979
  var REQUIRED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Set([
14988
- TEMPLATE_VARIABLES.OUTPUT_TEXT,
14980
+ TEMPLATE_VARIABLES.OUTPUT,
14989
14981
  TEMPLATE_VARIABLES.EXPECTED_OUTPUT
14990
14982
  ]);
14983
+ var DEPRECATED_TEMPLATE_VARIABLES = /* @__PURE__ */ new Map([
14984
+ [TEMPLATE_VARIABLES.INPUT_TEXT, TEMPLATE_VARIABLES.INPUT],
14985
+ [TEMPLATE_VARIABLES.OUTPUT_TEXT, TEMPLATE_VARIABLES.OUTPUT],
14986
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT, TEMPLATE_VARIABLES.EXPECTED_OUTPUT]
14987
+ ]);
14991
14988
  var ANSI_YELLOW22 = "\x1B[33m";
14992
14989
  var ANSI_RESET3 = "\x1B[0m";
14993
14990
  async function validateCustomPromptContent(promptPath) {
@@ -15007,16 +15004,29 @@ function validateTemplateVariables(content, source) {
15007
15004
  }
15008
15005
  match = variablePattern.exec(content);
15009
15006
  }
15010
- const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
15007
+ const hasCandidateAnswer = foundVariables.has(TEMPLATE_VARIABLES.OUTPUT) || foundVariables.has(TEMPLATE_VARIABLES.OUTPUT_TEXT);
15011
15008
  const hasExpectedOutput = foundVariables.has(TEMPLATE_VARIABLES.EXPECTED_OUTPUT);
15012
15009
  const hasRequiredFields = hasCandidateAnswer || hasExpectedOutput;
15013
15010
  if (!hasRequiredFields) {
15014
15011
  throw new Error(
15015
15012
  `Missing required fields. Must include at least one of:
15016
- - {{ ${TEMPLATE_VARIABLES.OUTPUT_TEXT} }}
15013
+ - {{ ${TEMPLATE_VARIABLES.OUTPUT} }}
15017
15014
  - {{ ${TEMPLATE_VARIABLES.EXPECTED_OUTPUT} }}`
15018
15015
  );
15019
15016
  }
15017
+ const deprecatedUsed = [];
15018
+ for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
15019
+ if (foundVariables.has(deprecated)) {
15020
+ deprecatedUsed.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
15021
+ }
15022
+ }
15023
+ if (deprecatedUsed.length > 0) {
15024
+ console.warn(
15025
+ `${ANSI_YELLOW22}Warning: Template at ${source} uses deprecated variable names:
15026
+ ${deprecatedUsed.join("\n ")}
15027
+ These still work but will be removed in a future version.${ANSI_RESET3}`
15028
+ );
15029
+ }
15020
15030
  if (invalidVariables.length > 0) {
15021
15031
  const warningMessage = `${ANSI_YELLOW22}Warning: Custom evaluator template at ${source}
15022
15032
  Contains invalid variables: ${invalidVariables.map((v) => `{{ ${v} }}`).join(", ")}
@@ -16418,6 +16428,19 @@ function hasVisibleContent(segments) {
16418
16428
  function asString2(value) {
16419
16429
  return typeof value === "string" ? value : void 0;
16420
16430
  }
16431
/**
 * Lower-case file extension (leading dot included) mapped to an image MIME type.
 * Key order is kept as-is because the keys are joined into a warning message.
 */
var IMAGE_MEDIA_TYPES = {
  ".png": "image/png",
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".gif": "image/gif",
  ".webp": "image/webp",
  ".svg": "image/svg+xml",
  ".bmp": "image/bmp"
};

/**
 * Look up the image MIME type for a file path by its extension.
 *
 * @param {string} filePath - Path whose extension is matched case-insensitively.
 * @returns {string|undefined} The MIME type, or undefined when the
 *   extension is not in IMAGE_MEDIA_TYPES.
 */
function detectImageMediaType(filePath) {
  const extension = path5.extname(filePath);
  return IMAGE_MEDIA_TYPES[extension.toLowerCase()];
}
16421
16444
  var ANSI_YELLOW4 = "\x1B[33m";
16422
16445
  var ANSI_RESET5 = "\x1B[0m";
16423
16446
  async function processMessages(options) {
@@ -16483,6 +16506,47 @@ async function processMessages(options) {
16483
16506
  }
16484
16507
  continue;
16485
16508
  }
16509
+ if (segmentType === "image") {
16510
+ const rawValue = asString3(rawSegment.value);
16511
+ if (!rawValue) {
16512
+ continue;
16513
+ }
16514
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference22(
16515
+ rawValue,
16516
+ searchRoots
16517
+ );
16518
+ if (!resolvedPath) {
16519
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
16520
+ const context2 = messageType === "input" ? "" : " in expected_output";
16521
+ logWarning3(`Image file not found${context2}: ${displayPath}`, attempts);
16522
+ continue;
16523
+ }
16524
+ const mediaType = detectImageMediaType(resolvedPath);
16525
+ if (!mediaType) {
16526
+ logWarning3(
16527
+ `Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
16528
+ );
16529
+ continue;
16530
+ }
16531
+ try {
16532
+ const imageBuffer = await readFile4(resolvedPath);
16533
+ const base64 = imageBuffer.toString("base64");
16534
+ processedContent.push({
16535
+ type: "image",
16536
+ media_type: mediaType,
16537
+ source: `data:${mediaType};base64,${base64}`
16538
+ });
16539
+ if (verbose) {
16540
+ const label = messageType === "input" ? "[Image]" : "[Expected Output Image]";
16541
+ console.log(` ${label} Found: ${displayPath}`);
16542
+ console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
16543
+ }
16544
+ } catch (error) {
16545
+ const context2 = messageType === "input" ? "" : " expected output";
16546
+ logWarning3(`Could not read${context2} image ${resolvedPath}: ${error.message}`);
16547
+ }
16548
+ continue;
16549
+ }
16486
16550
  const clonedSegment = cloneJsonObject(rawSegment);
16487
16551
  processedContent.push(clonedSegment);
16488
16552
  const inlineValue = clonedSegment.value;
@@ -16560,6 +16624,46 @@ async function processExpectedMessages(options) {
16560
16624
  }
16561
16625
  continue;
16562
16626
  }
16627
+ if (segmentType === "image") {
16628
+ const rawValue = asString3(rawSegment.value);
16629
+ if (!rawValue) {
16630
+ continue;
16631
+ }
16632
+ const { displayPath, resolvedPath, attempted } = await resolveFileReference22(
16633
+ rawValue,
16634
+ searchRoots
16635
+ );
16636
+ if (!resolvedPath) {
16637
+ const attempts = attempted.length ? [" Tried:", ...attempted.map((candidate) => ` ${candidate}`)] : void 0;
16638
+ logWarning3(`Image file not found in expected_output: ${displayPath}`, attempts);
16639
+ continue;
16640
+ }
16641
+ const mediaType = detectImageMediaType(resolvedPath);
16642
+ if (!mediaType) {
16643
+ logWarning3(
16644
+ `Unsupported image extension for ${displayPath}. Supported: ${Object.keys(IMAGE_MEDIA_TYPES).join(", ")}`
16645
+ );
16646
+ continue;
16647
+ }
16648
+ try {
16649
+ const imageBuffer = await readFile4(resolvedPath);
16650
+ const base64 = imageBuffer.toString("base64");
16651
+ processedContent.push({
16652
+ type: "image",
16653
+ media_type: mediaType,
16654
+ source: `data:${mediaType};base64,${base64}`
16655
+ });
16656
+ if (verbose) {
16657
+ console.log(` [Expected Output Image] Found: ${displayPath}`);
16658
+ console.log(` Resolved to: ${resolvedPath} (${mediaType})`);
16659
+ }
16660
+ } catch (error) {
16661
+ logWarning3(
16662
+ `Could not read expected output image ${resolvedPath}: ${error.message}`
16663
+ );
16664
+ }
16665
+ continue;
16666
+ }
16563
16667
  processedContent.push(cloneJsonObject(rawSegment));
16564
16668
  }
16565
16669
  segment.content = processedContent;
@@ -16802,7 +16906,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
16802
16906
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
16803
16907
  const testCase = {
16804
16908
  id,
16805
- eval_set: evalSetName,
16909
+ dataset: evalSetName,
16806
16910
  conversation_id: conversationId,
16807
16911
  question,
16808
16912
  input: inputMessages,
@@ -17066,7 +17170,7 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
17066
17170
  }
17067
17171
  const suite = interpolated;
17068
17172
  const evalSetNameFromSuite = asString5(suite.name)?.trim();
17069
- const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.ya?ml$/i, "") || "eval";
17173
+ const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
17070
17174
  const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
17071
17175
  const rawTestcases = resolveTests(suite);
17072
17176
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
@@ -17187,7 +17291,8 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
17187
17291
  const caseTargets = extractTargetsFromTestCase(evalcase);
17188
17292
  const testCase = {
17189
17293
  id,
17190
- eval_set: evalSetName,
17294
+ dataset: evalSetName,
17295
+ category: options?.category,
17191
17296
  conversation_id: conversationId,
17192
17297
  question,
17193
17298
  input: inputMessages,
@@ -18090,6 +18195,46 @@ async function withRetry(fn, retryConfig, signal) {
18090
18195
  }
18091
18196
  throw lastError;
18092
18197
  }
18198
/**
 * Convert provider message content into a structured content-block array.
 *
 * Text parts become `{ type: "text", text }`. Image parts become
 * `{ type: "image", media_type, source }`, where `source` is a base64 data
 * URI built from `part.source.data`, or `part.url` when no base64 data is
 * present. All other part types (including tool_use and tool_result) are
 * ignored here.
 *
 * @param {unknown} content - Raw message content.
 * @returns {Array<object>|undefined} The block array, but only when at least
 *   one image block was produced; otherwise undefined so the caller can
 *   fall back to plain text.
 */
function toContentArray(content) {
  if (!Array.isArray(content)) return void 0;
  const blocks = [];
  let hasNonText = false;
  for (const part of content) {
    if (!part || typeof part !== "object") continue;
    if (part.type === "text" && typeof part.text === "string") {
      blocks.push({ type: "text", text: part.text });
      continue;
    }
    if (part.type !== "image") continue;
    const src = typeof part.source === "object" && part.source !== null ? part.source : void 0;
    let mediaType = "application/octet-stream";
    if (typeof part.media_type === "string") {
      mediaType = part.media_type;
    } else if (typeof src?.media_type === "string") {
      mediaType = src.media_type;
    }
    let source = "";
    if (typeof src?.data === "string") {
      source = `data:${mediaType};base64,${src.data}`;
    } else if (typeof part.url === "string") {
      // A URL-only image is accepted even without a `source` object.
      source = part.url;
    }
    // An image with nothing to point at is dropped: emitting an empty
    // `source` would flag the message as multimodal with unusable data.
    if (!source) continue;
    blocks.push({ type: "image", media_type: mediaType, source });
    hasNonText = true;
  }
  return hasNonText ? blocks : void 0;
}
18219
/**
 * Pull the plain text out of message content.
 *
 * @param {unknown} content - A string or an array of content parts.
 * @returns {string|undefined} A string input unchanged; for an array, the
 *   `text` of each `type: "text"` part joined with "\n"; undefined when the
 *   input is neither, or when the array holds no such part.
 */
function extractTextContent2(content) {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return void 0;
  }
  const isTextPart = (part) =>
    Boolean(part) && typeof part === "object" && part.type === "text" && typeof part.text === "string";
  const texts = content.filter(isTextPart).map((part) => part.text);
  return texts.length === 0 ? void 0 : texts.join("\n");
}
18093
18238
  var GLOBAL_LOGS_KEY = Symbol.for("agentv.claudeLogs");
18094
18239
  var GLOBAL_SUBSCRIBERS_KEY = Symbol.for("agentv.claudeLogSubscribers");
18095
18240
  function getClaudeLogStore() {
@@ -18249,11 +18394,12 @@ var ClaudeCliProvider = class {
18249
18394
  if (betaMessage && typeof betaMessage === "object") {
18250
18395
  const msg = betaMessage;
18251
18396
  const content = msg.content;
18397
+ const structuredContent = toContentArray(content);
18252
18398
  const textContent = extractTextContent2(content);
18253
18399
  const toolCalls = extractToolCalls(content);
18254
18400
  const outputMsg = {
18255
18401
  role: "assistant",
18256
- content: textContent,
18402
+ content: structuredContent ?? textContent,
18257
18403
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0
18258
18404
  };
18259
18405
  output.push(outputMsg);
@@ -18592,25 +18738,6 @@ function summarizeEvent(event) {
18592
18738
  return void 0;
18593
18739
  }
18594
18740
  }
18595
- function extractTextContent2(content) {
18596
- if (typeof content === "string") {
18597
- return content;
18598
- }
18599
- if (!Array.isArray(content)) {
18600
- return void 0;
18601
- }
18602
- const textParts = [];
18603
- for (const part of content) {
18604
- if (!part || typeof part !== "object") {
18605
- continue;
18606
- }
18607
- const p = part;
18608
- if (p.type === "text" && typeof p.text === "string") {
18609
- textParts.push(p.text);
18610
- }
18611
- }
18612
- return textParts.length > 0 ? textParts.join("\n") : void 0;
18613
- }
18614
18741
  function extractToolCalls(content) {
18615
18742
  if (!Array.isArray(content)) {
18616
18743
  return [];
@@ -18777,11 +18904,12 @@ var ClaudeSdkProvider = class {
18777
18904
  if (betaMessage && typeof betaMessage === "object") {
18778
18905
  const msg = betaMessage;
18779
18906
  const content = msg.content;
18780
- const textContent = extractTextContent22(content);
18907
+ const structuredContent = toContentArray(content);
18908
+ const textContent = extractTextContent2(content);
18781
18909
  const toolCalls = extractToolCalls2(content);
18782
18910
  const outputMsg = {
18783
18911
  role: "assistant",
18784
- content: textContent,
18912
+ content: structuredContent ?? textContent,
18785
18913
  toolCalls: toolCalls.length > 0 ? toolCalls : void 0
18786
18914
  };
18787
18915
  output.push(outputMsg);
@@ -18899,25 +19027,6 @@ var ClaudeSdkProvider = class {
18899
19027
  }
18900
19028
  }
18901
19029
  };
18902
- function extractTextContent22(content) {
18903
- if (typeof content === "string") {
18904
- return content;
18905
- }
18906
- if (!Array.isArray(content)) {
18907
- return void 0;
18908
- }
18909
- const textParts = [];
18910
- for (const part of content) {
18911
- if (!part || typeof part !== "object") {
18912
- continue;
18913
- }
18914
- const p = part;
18915
- if (p.type === "text" && typeof p.text === "string") {
18916
- textParts.push(p.text);
18917
- }
18918
- }
18919
- return textParts.length > 0 ? textParts.join("\n") : void 0;
18920
- }
18921
19030
  function extractToolCalls2(content) {
18922
19031
  if (!Array.isArray(content)) {
18923
19032
  return [];
@@ -19133,7 +19242,7 @@ function convertMessages(messages) {
19133
19242
  return messages.map((msg) => ({
19134
19243
  role: msg.role,
19135
19244
  name: msg.name,
19136
- content: msg.content,
19245
+ content: isContentArray(msg.content) ? msg.content : typeof msg.content === "string" ? msg.content : void 0,
19137
19246
  toolCalls: msg.tool_calls?.map((tc) => ({
19138
19247
  tool: tc.tool,
19139
19248
  input: tc.input,
@@ -21319,6 +21428,35 @@ function extractPiTextContent(content) {
21319
21428
  }
21320
21429
  return textParts.length > 0 ? textParts.join("\n") : void 0;
21321
21430
  }
21431
/**
 * Converts Pi-style message content into normalized content blocks.
 *
 * - Text parts with a string `text` become `{ type: "text", text }`.
 * - Image parts become `{ type: "image", media_type, source }`, where `source`
 *   is a base64 data URI built from `part.source.data` or, when that is not
 *   available, the string `part.url`. Image parts with neither are dropped.
 * - Every other part (including `tool_use` / `tool_result`) is ignored here.
 *
 * The reported `media_type` always matches the one embedded in the data URI:
 * the nested `source.media_type` wins over the outer `media_type` when the
 * payload comes from `source.data`.
 *
 * @param {unknown} content - Raw message content.
 * @returns {Array<object> | undefined} The blocks, or `undefined` when
 *   `content` is not an array or no image block was produced (so callers can
 *   fall back to a plain-text representation).
 */
function toPiContentArray(content) {
  if (!Array.isArray(content)) return void 0;
  const blocks = [];
  let hasImage = false;
  for (const part of content) {
    if (!part || typeof part !== "object") continue;
    if (part.type === "text") {
      if (typeof part.text === "string") {
        blocks.push({ type: "text", text: part.text });
      }
      continue;
    }
    if (part.type !== "image") continue;
    let mediaType = typeof part.media_type === "string" ? part.media_type : "application/octet-stream";
    let source = "";
    const nested = part.source;
    if (typeof nested === "object" && nested !== null && typeof nested.data === "string") {
      // Keep the block's media type in sync with the one written into the URI.
      if (typeof nested.media_type === "string") {
        mediaType = nested.media_type;
      }
      source = `data:${mediaType};base64,${nested.data}`;
    }
    if (!source && typeof part.url === "string") {
      source = part.url;
    }
    if (source) {
      blocks.push({ type: "image", media_type: mediaType, source });
      hasImage = true;
    }
  }
  return hasImage && blocks.length > 0 ? blocks : void 0;
}
21322
21460
  function toFiniteNumber(value) {
21323
21461
  if (typeof value === "number" && Number.isFinite(value)) return value;
21324
21462
  return void 0;
@@ -22478,7 +22616,8 @@ function convertAgentMessage(message, toolTrackers, completedToolResults) {
22478
22616
  }
22479
22617
  const msg = message;
22480
22618
  const role = typeof msg.role === "string" ? msg.role : "unknown";
22481
- const content = extractPiTextContent(msg.content);
22619
+ const structuredContent = toPiContentArray(msg.content);
22620
+ const content = structuredContent ?? extractPiTextContent(msg.content);
22482
22621
  const toolCalls = extractToolCalls4(msg.content, toolTrackers, completedToolResults);
22483
22622
  const startTimeVal = typeof msg.timestamp === "number" ? new Date(msg.timestamp).toISOString() : typeof msg.timestamp === "string" ? msg.timestamp : void 0;
22484
22623
  let msgTokenUsage;
@@ -24233,13 +24372,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
24233
24372
  async function execShellWithStdin(command, stdinPayload, options = {}) {
24234
24373
  const { mkdir: mkdir16, readFile: readFile14, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
24235
24374
  const { tmpdir: tmpdir3 } = await import("node:os");
24236
- const path47 = await import("node:path");
24375
+ const path48 = await import("node:path");
24237
24376
  const { randomUUID: randomUUID10 } = await import("node:crypto");
24238
- const dir = path47.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
24377
+ const dir = path48.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
24239
24378
  await mkdir16(dir, { recursive: true });
24240
- const stdinPath = path47.join(dir, "stdin.txt");
24241
- const stdoutPath = path47.join(dir, "stdout.txt");
24242
- const stderrPath = path47.join(dir, "stderr.txt");
24379
+ const stdinPath = path48.join(dir, "stdin.txt");
24380
+ const stdoutPath = path48.join(dir, "stdout.txt");
24381
+ const stderrPath = path48.join(dir, "stderr.txt");
24243
24382
  await writeFile9(stdinPath, stdinPayload, "utf8");
24244
24383
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
24245
24384
  const { spawn: spawn5 } = await import("node:child_process");
@@ -24547,6 +24686,56 @@ function toCamelCaseDeep(obj) {
24547
24686
  return obj;
24548
24687
  }
24549
24688
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
24689
// Matches a base64 data URI; captures the media type and the base64 payload.
var DATA_URI_RE = /^data:([^;]+);base64,(.+)$/s;

/**
 * Picks a file extension for a media type taken from a data URI.
 *
 * The media type originates from message content, so the subtype is reduced
 * to characters that cannot act as a path separator before it is used in a
 * file name. `jpeg` maps to `jpg`; an empty result falls back to `bin`.
 *
 * @param {string} mediaType - e.g. `image/png`.
 * @returns {string} Extension without the leading dot.
 */
function imageExtensionFor(mediaType) {
  const subtype = mediaType.split("/")[1] ?? "";
  if (subtype === "jpeg") return "jpg";
  const safe = subtype.replace(/[^A-Za-z0-9.+_-]/g, "");
  return safe.length > 0 ? safe : "bin";
}

/**
 * Rewrites image blocks so a grader receives file paths instead of inline data.
 *
 * Messages without image blocks are passed through by reference. For a message
 * that has at least one image block, a copy is produced whose image blocks are
 * `{ type: "image", media_type, path }`:
 * - a base64 data-URI source is decoded and written to `img-<n>.<ext>` inside
 *   the directory returned by `getWorkDir`, and `path` is that file;
 * - any other source is passed through unchanged as `path`.
 *
 * @param {Array<object> | null | undefined} messages - Messages to process.
 * @param {() => Promise<string>} getWorkDir - Called only when a file must be
 *   written; resolves to the directory to write into.
 * @returns {Promise<Array<object> | null>} `null` for null/undefined input, the
 *   same array when nothing needs rewriting, otherwise a new array.
 */
async function materializeContentForGrader(messages, getWorkDir) {
  if (!messages || messages.length === 0) return messages ?? null;
  const hasImage = (msg) => isContentArray(msg.content) && msg.content.some((b) => b.type === "image");
  if (!messages.some(hasImage)) return messages;
  let counter = 0;
  const result = [];
  for (const msg of messages) {
    if (!hasImage(msg)) {
      result.push(msg);
      continue;
    }
    const blocks = [];
    for (const block of msg.content) {
      if (block.type !== "image") {
        blocks.push({ ...block });
        continue;
      }
      const match = DATA_URI_RE.exec(block.source);
      if (!match) {
        blocks.push({ type: "image", media_type: block.media_type, path: block.source });
        continue;
      }
      const [, mediaType, base64Data] = match;
      const dir = await getWorkDir();
      const filePath = join(dir, `img-${counter++}.${imageExtensionFor(mediaType)}`);
      await writeFile6(filePath, Buffer.from(base64Data, "base64"));
      blocks.push({ type: "image", media_type: block.media_type, path: filePath });
    }
    result.push({ ...msg, content: blocks });
  }
  return result;
}
24550
24739
  var CodeEvaluator = class {
24551
24740
  kind = "code-grader";
24552
24741
  command;
@@ -24562,7 +24751,18 @@ var CodeEvaluator = class {
24562
24751
  this.target = options.target;
24563
24752
  }
24564
24753
  async evaluate(context2) {
24565
- let outputForPayload = context2.output ?? null;
24754
+ let imageTmpDir;
24755
+ const getImageDir = async () => {
24756
+ if (!imageTmpDir) {
24757
+ imageTmpDir = await mkdtemp2(join(tmpdir2(), "agentv-img-"));
24758
+ }
24759
+ return imageTmpDir;
24760
+ };
24761
+ const materializedOutput = await materializeContentForGrader(
24762
+ context2.output,
24763
+ getImageDir
24764
+ );
24765
+ let outputForPayload = materializedOutput;
24566
24766
  let outputPath;
24567
24767
  if (outputForPayload) {
24568
24768
  const serialized = JSON.stringify(outputForPayload);
@@ -24575,12 +24775,17 @@ var CodeEvaluator = class {
24575
24775
  }
24576
24776
  const payload = {
24577
24777
  criteria: context2.evalCase.criteria,
24578
- expectedOutput: context2.evalCase.expected_output,
24579
- outputText: context2.candidate,
24778
+ expectedOutput: await materializeContentForGrader(
24779
+ context2.evalCase.expected_output,
24780
+ getImageDir
24781
+ ),
24580
24782
  output: outputForPayload,
24581
24783
  outputPath,
24582
24784
  inputFiles: context2.evalCase.file_paths,
24583
- input: context2.evalCase.input,
24785
+ input: await materializeContentForGrader(
24786
+ context2.evalCase.input,
24787
+ getImageDir
24788
+ ),
24584
24789
  trace: context2.trace ?? null,
24585
24790
  tokenUsage: context2.tokenUsage ?? null,
24586
24791
  costUsd: context2.costUsd ?? null,
@@ -24589,9 +24794,7 @@ var CodeEvaluator = class {
24589
24794
  endTime: context2.endTime ?? null,
24590
24795
  fileChanges: context2.fileChanges ?? null,
24591
24796
  workspacePath: context2.workspacePath ?? null,
24592
- config: this.config ?? null,
24593
- inputText: context2.evalCase.question,
24594
- expectedOutputText: context2.evalCase.reference_answer ?? ""
24797
+ config: this.config ?? null
24595
24798
  };
24596
24799
  const inputPayload = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
24597
24800
  let proxyEnv;
@@ -24681,6 +24884,10 @@ var CodeEvaluator = class {
24681
24884
  await rm3(dirname(outputPath), { recursive: true, force: true }).catch(() => {
24682
24885
  });
24683
24886
  }
24887
+ if (imageTmpDir) {
24888
+ await rm3(imageTmpDir, { recursive: true, force: true }).catch(() => {
24889
+ });
24890
+ }
24684
24891
  }
24685
24892
  }
24686
24893
  };
@@ -24749,13 +24956,13 @@ Be concise and focused in your evaluation. Provide succinct, specific feedback r
24749
24956
  {{${TEMPLATE_VARIABLES.CRITERIA}}}
24750
24957
 
24751
24958
  [[ ## question ## ]]
24752
- {{${TEMPLATE_VARIABLES.INPUT_TEXT}}}
24959
+ {{${TEMPLATE_VARIABLES.INPUT}}}
24753
24960
 
24754
24961
  [[ ## reference_answer ## ]]
24755
- {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT}}}
24962
+ {{${TEMPLATE_VARIABLES.EXPECTED_OUTPUT}}}
24756
24963
 
24757
24964
  [[ ## answer ## ]]
24758
- {{${TEMPLATE_VARIABLES.OUTPUT_TEXT}}}`;
24965
+ {{${TEMPLATE_VARIABLES.OUTPUT}}}`;
24759
24966
  var freeformEvaluationSchema = external_exports2.object({
24760
24967
  score: external_exports2.number().min(0).max(1).describe("Score between 0.0 and 1.0"),
24761
24968
  assertions: external_exports2.array(
@@ -24827,21 +25034,19 @@ var LlmGraderEvaluator = class {
24827
25034
  async evaluateFreeform(context2, graderProvider) {
24828
25035
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
24829
25036
  const variables = {
24830
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(context2.evalCase.input, null, 2),
24831
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(
24832
- context2.evalCase.expected_output,
24833
- null,
24834
- 2
24835
- ),
24836
- [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify(context2.output ?? [], null, 2),
25037
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
25038
+ [TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
25039
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
24837
25040
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
24838
25041
  [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
25042
+ // Deprecated aliases — same values as the primary variables above
24839
25043
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
24840
25044
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
24841
25045
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
24842
25046
  };
24843
25047
  const systemPrompt = buildOutputSchema();
24844
25048
  const evaluatorTemplate = context2.evaluatorTemplateOverride ?? this.evaluatorTemplate ?? DEFAULT_EVALUATOR_TEMPLATE;
25049
+ warnDeprecatedTemplateVars(evaluatorTemplate);
24845
25050
  let userPrompt = substituteVariables(evaluatorTemplate, variables);
24846
25051
  if (context2.fileChanges && !context2.evaluatorTemplateOverride && !this.evaluatorTemplate) {
24847
25052
  userPrompt += `
@@ -24853,13 +25058,15 @@ ${context2.fileChanges}`;
24853
25058
  userPrompt,
24854
25059
  systemPrompt
24855
25060
  };
25061
+ const images = context2.output ? extractImageBlocks(context2.output) : [];
24856
25062
  try {
24857
25063
  const { data, tokenUsage } = await this.runWithRetry({
24858
25064
  context: context2,
24859
25065
  graderProvider,
24860
25066
  systemPrompt,
24861
25067
  userPrompt,
24862
- schema: freeformEvaluationSchema
25068
+ schema: freeformEvaluationSchema,
25069
+ images
24863
25070
  });
24864
25071
  const score = clampScore(data.score);
24865
25072
  const assertions = Array.isArray(data.assertions) ? data.assertions.slice(0, 8) : [];
@@ -24903,13 +25110,15 @@ ${context2.fileChanges}`;
24903
25110
  userPrompt: prompt,
24904
25111
  systemPrompt
24905
25112
  };
25113
+ const images = context2.output ? extractImageBlocks(context2.output) : [];
24906
25114
  try {
24907
25115
  const { data, tokenUsage } = await this.runWithRetry({
24908
25116
  context: context2,
24909
25117
  graderProvider,
24910
25118
  systemPrompt,
24911
25119
  userPrompt: prompt,
24912
- schema: rubricEvaluationSchema
25120
+ schema: rubricEvaluationSchema,
25121
+ images
24913
25122
  });
24914
25123
  const { score, verdict, assertions } = calculateRubricScore(data, rubrics);
24915
25124
  return {
@@ -24946,13 +25155,15 @@ ${context2.fileChanges}`;
24946
25155
  userPrompt: prompt,
24947
25156
  systemPrompt
24948
25157
  };
25158
+ const images = context2.output ? extractImageBlocks(context2.output) : [];
24949
25159
  try {
24950
25160
  const { data, tokenUsage } = await this.runWithRetry({
24951
25161
  context: context2,
24952
25162
  graderProvider,
24953
25163
  systemPrompt,
24954
25164
  userPrompt: prompt,
24955
- schema: scoreRangeEvaluationSchema
25165
+ schema: scoreRangeEvaluationSchema,
25166
+ images
24956
25167
  });
24957
25168
  const { score, verdict, assertions, details } = calculateScoreRangeResult(data, rubrics);
24958
25169
  return {
@@ -25159,12 +25370,17 @@ ${context2.fileChanges}`;
25159
25370
  const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
25160
25371
  const variables = {
25161
25372
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
25373
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
25374
+ [TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
25375
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
25376
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
25377
+ // Deprecated aliases
25162
25378
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
25163
25379
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
25164
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
25165
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
25380
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
25166
25381
  };
25167
25382
  if (this.evaluatorTemplate) {
25383
+ warnDeprecatedTemplateVars(this.evaluatorTemplate);
25168
25384
  return substituteVariables(this.evaluatorTemplate, variables);
25169
25385
  }
25170
25386
  const config = context2.evaluator;
@@ -25215,11 +25431,16 @@ ${context2.fileChanges}`;
25215
25431
  if (this.evaluatorTemplate) {
25216
25432
  const variables = {
25217
25433
  [TEMPLATE_VARIABLES.CRITERIA]: context2.evalCase.criteria.trim(),
25434
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
25435
+ [TEMPLATE_VARIABLES.OUTPUT]: context2.candidate.trim(),
25436
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context2.evalCase.reference_answer ?? "").trim(),
25437
+ [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? "",
25438
+ // Deprecated aliases
25218
25439
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
25219
25440
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context2.candidate.trim(),
25220
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim(),
25221
- [TEMPLATE_VARIABLES.FILE_CHANGES]: context2.fileChanges ?? ""
25441
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (context2.evalCase.reference_answer ?? "").trim()
25222
25442
  };
25443
+ warnDeprecatedTemplateVars(this.evaluatorTemplate);
25223
25444
  const customPrompt = substituteVariables(this.evaluatorTemplate, variables);
25224
25445
  const outputSchema2 = rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema();
25225
25446
  return `${customPrompt}
@@ -25390,18 +25611,35 @@ ${outputSchema2}`;
25390
25611
  // LLM mode retry logic
25391
25612
  // ---------------------------------------------------------------------------
25392
25613
  async runWithRetry(options) {
25393
- const { context: context2, graderProvider, systemPrompt, userPrompt, schema } = options;
25614
+ const { context: context2, graderProvider, systemPrompt, userPrompt, schema, images } = options;
25394
25615
  let lastError;
25395
25616
  for (let attempt = 1; attempt <= 3; attempt++) {
25396
25617
  try {
25397
25618
  const model = graderProvider.asLanguageModel?.();
25398
25619
  if (model) {
25399
- const result = await generateText({
25620
+ const modelOptions = {
25621
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
25622
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
25623
+ };
25624
+ const hasImages = images && images.length > 0;
25625
+ const result = hasImages ? await generateText({
25626
+ model,
25627
+ system: systemPrompt,
25628
+ messages: [
25629
+ {
25630
+ role: "user",
25631
+ content: [
25632
+ { type: "text", text: userPrompt },
25633
+ ...toAiSdkImageParts(images)
25634
+ ]
25635
+ }
25636
+ ],
25637
+ ...modelOptions
25638
+ }) : await generateText({
25400
25639
  model,
25401
25640
  system: systemPrompt,
25402
25641
  prompt: userPrompt,
25403
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
25404
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
25642
+ ...modelOptions
25405
25643
  });
25406
25644
  const data2 = schema.parse(parseJsonFromText(result.text));
25407
25645
  const rawUsage = result.usage;
@@ -25461,6 +25699,26 @@ function substituteVariables(template, variables) {
25461
25699
  return variables[varName] ?? match;
25462
25700
  });
25463
25701
  }
25702
var ANSI_YELLOW7 = "\x1B[33m";
var ANSI_RESET8 = "\x1B[0m";
// Templates that have already been inspected, so each distinct template is
// scanned (and, if needed, warned about) at most once per process.
var warnedTemplateStrings = /* @__PURE__ */ new Set();

/**
 * Prints a one-time console warning when an evaluator template references
 * deprecated `{{ variable }}` names.
 *
 * Every inspected template is remembered, including ones that use no
 * deprecated names, so repeated evaluations with the same template do not
 * rebuild and re-run the detection regexes.
 *
 * @param {string} template - Evaluator template text to inspect.
 * @returns {void}
 */
function warnDeprecatedTemplateVars(template) {
  if (warnedTemplateStrings.has(template)) return;
  warnedTemplateStrings.add(template);
  const used = [];
  for (const [deprecated, replacement] of DEPRECATED_TEMPLATE_VARIABLES) {
    if (new RegExp(`\\{\\{\\s*${deprecated}\\s*\\}\\}`).test(template)) {
      used.push(`{{ ${deprecated} }} \u2192 {{ ${replacement} }}`);
    }
  }
  if (used.length === 0) return;
  console.warn(
    `${ANSI_YELLOW7}\u26A0 Deprecated template variables detected (they still work but will be removed in a future version):
${used.join("\n ")}
Update your custom evaluator template to use the new names.${ANSI_RESET8}`
  );
}
25464
25722
  function calculateRubricScore(result, rubrics) {
25465
25723
  const rubricMap = new Map(rubrics.map((rubric) => [rubric.id, rubric]));
25466
25724
  const assertions = [];
@@ -25555,6 +25813,26 @@ function calculateScoreRangeResult(result, rubrics) {
25555
25813
  }
25556
25814
  };
25557
25815
  }
25816
/**
 * Gathers the image blocks carried by assistant messages, in encounter order.
 *
 * Messages from any other role, and assistant messages whose content is not a
 * content-block array, contribute nothing.
 *
 * @param {Iterable<object>} messages - Messages to scan.
 * @returns {Array<object>} The image blocks (same object references).
 */
function extractImageBlocks(messages) {
  const found = [];
  for (const message of messages) {
    const isAssistantWithBlocks = message.role === "assistant" && isContentArray(message.content);
    if (isAssistantWithBlocks) {
      found.push(...message.content.filter((part) => part.type === "image"));
    }
  }
  return found;
}
25829
/**
 * Maps image content blocks to image parts for a multimodal user message.
 *
 * Each part carries the block's `source` as `image`; `mediaType` is the
 * block's `media_type`, or `undefined` when that value is falsy.
 *
 * @param {Array<object>} images - Image blocks with `source` and `media_type`.
 * @returns {Array<{type: "image", image: *, mediaType: *}>} One part per block.
 */
function toAiSdkImageParts(images) {
  return images.map(function toPart(image) {
    const declaredType = image.media_type;
    return {
      type: "image",
      image: image.source,
      mediaType: declaredType ? declaredType : void 0
    };
  });
}
25558
25836
  function resolveSandboxed(basePath, relativePath) {
25559
25837
  const resolved = path35.resolve(basePath, relativePath);
25560
25838
  if (!resolved.startsWith(basePath + path35.sep) && resolved !== basePath) {
@@ -26288,115 +26566,115 @@ var FieldAccuracyEvaluator = class {
26288
26566
  * Evaluate a single field against the expected value.
26289
26567
  */
26290
26568
  evaluateField(fieldConfig, candidateData, expectedData) {
26291
- const { path: path47, match, required = true, weight = 1 } = fieldConfig;
26292
- const candidateValue = resolvePath(candidateData, path47);
26293
- const expectedValue = resolvePath(expectedData, path47);
26569
+ const { path: path48, match, required = true, weight = 1 } = fieldConfig;
26570
+ const candidateValue = resolvePath(candidateData, path48);
26571
+ const expectedValue = resolvePath(expectedData, path48);
26294
26572
  if (expectedValue === void 0) {
26295
26573
  return {
26296
- path: path47,
26574
+ path: path48,
26297
26575
  score: 1,
26298
26576
  // No expected value means no comparison needed
26299
26577
  weight,
26300
26578
  hit: true,
26301
- message: `${path47}: no expected value`
26579
+ message: `${path48}: no expected value`
26302
26580
  };
26303
26581
  }
26304
26582
  if (candidateValue === void 0) {
26305
26583
  if (required) {
26306
26584
  return {
26307
- path: path47,
26585
+ path: path48,
26308
26586
  score: 0,
26309
26587
  weight,
26310
26588
  hit: false,
26311
- message: `${path47} (required, missing)`
26589
+ message: `${path48} (required, missing)`
26312
26590
  };
26313
26591
  }
26314
26592
  return {
26315
- path: path47,
26593
+ path: path48,
26316
26594
  score: 1,
26317
26595
  // Don't penalize missing optional fields
26318
26596
  weight: 0,
26319
26597
  // Zero weight means it won't affect the score
26320
26598
  hit: true,
26321
- message: `${path47}: optional field missing`
26599
+ message: `${path48}: optional field missing`
26322
26600
  };
26323
26601
  }
26324
26602
  switch (match) {
26325
26603
  case "exact":
26326
- return this.compareExact(path47, candidateValue, expectedValue, weight);
26604
+ return this.compareExact(path48, candidateValue, expectedValue, weight);
26327
26605
  case "numeric_tolerance":
26328
26606
  return this.compareNumericTolerance(
26329
- path47,
26607
+ path48,
26330
26608
  candidateValue,
26331
26609
  expectedValue,
26332
26610
  fieldConfig,
26333
26611
  weight
26334
26612
  );
26335
26613
  case "date":
26336
- return this.compareDate(path47, candidateValue, expectedValue, fieldConfig, weight);
26614
+ return this.compareDate(path48, candidateValue, expectedValue, fieldConfig, weight);
26337
26615
  default:
26338
26616
  return {
26339
- path: path47,
26617
+ path: path48,
26340
26618
  score: 0,
26341
26619
  weight,
26342
26620
  hit: false,
26343
- message: `${path47}: unknown match type "${match}"`
26621
+ message: `${path48}: unknown match type "${match}"`
26344
26622
  };
26345
26623
  }
26346
26624
  }
26347
26625
  /**
26348
26626
  * Exact equality comparison.
26349
26627
  */
26350
- compareExact(path47, candidateValue, expectedValue, weight) {
26628
+ compareExact(path48, candidateValue, expectedValue, weight) {
26351
26629
  if (deepEqual(candidateValue, expectedValue)) {
26352
26630
  return {
26353
- path: path47,
26631
+ path: path48,
26354
26632
  score: 1,
26355
26633
  weight,
26356
26634
  hit: true,
26357
- message: path47
26635
+ message: path48
26358
26636
  };
26359
26637
  }
26360
26638
  if (typeof candidateValue !== typeof expectedValue) {
26361
26639
  return {
26362
- path: path47,
26640
+ path: path48,
26363
26641
  score: 0,
26364
26642
  weight,
26365
26643
  hit: false,
26366
- message: `${path47} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
26644
+ message: `${path48} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
26367
26645
  };
26368
26646
  }
26369
26647
  return {
26370
- path: path47,
26648
+ path: path48,
26371
26649
  score: 0,
26372
26650
  weight,
26373
26651
  hit: false,
26374
- message: `${path47} (value mismatch)`
26652
+ message: `${path48} (value mismatch)`
26375
26653
  };
26376
26654
  }
26377
26655
  /**
26378
26656
  * Numeric comparison with absolute or relative tolerance.
26379
26657
  */
26380
- compareNumericTolerance(path47, candidateValue, expectedValue, fieldConfig, weight) {
26658
+ compareNumericTolerance(path48, candidateValue, expectedValue, fieldConfig, weight) {
26381
26659
  const { tolerance = 0, relative = false } = fieldConfig;
26382
26660
  const candidateNum = toNumber(candidateValue);
26383
26661
  const expectedNum = toNumber(expectedValue);
26384
26662
  if (candidateNum === null || expectedNum === null) {
26385
26663
  return {
26386
- path: path47,
26664
+ path: path48,
26387
26665
  score: 0,
26388
26666
  weight,
26389
26667
  hit: false,
26390
- message: `${path47} (non-numeric value)`
26668
+ message: `${path48} (non-numeric value)`
26391
26669
  };
26392
26670
  }
26393
26671
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
26394
26672
  return {
26395
- path: path47,
26673
+ path: path48,
26396
26674
  score: 0,
26397
26675
  weight,
26398
26676
  hit: false,
26399
- message: `${path47} (invalid numeric value)`
26677
+ message: `${path48} (invalid numeric value)`
26400
26678
  };
26401
26679
  }
26402
26680
  const diff = Math.abs(candidateNum - expectedNum);
@@ -26409,61 +26687,61 @@ var FieldAccuracyEvaluator = class {
26409
26687
  }
26410
26688
  if (withinTolerance) {
26411
26689
  return {
26412
- path: path47,
26690
+ path: path48,
26413
26691
  score: 1,
26414
26692
  weight,
26415
26693
  hit: true,
26416
- message: `${path47} (within tolerance: diff=${diff.toFixed(2)})`
26694
+ message: `${path48} (within tolerance: diff=${diff.toFixed(2)})`
26417
26695
  };
26418
26696
  }
26419
26697
  return {
26420
- path: path47,
26698
+ path: path48,
26421
26699
  score: 0,
26422
26700
  weight,
26423
26701
  hit: false,
26424
- message: `${path47} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
26702
+ message: `${path48} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
26425
26703
  };
26426
26704
  }
26427
26705
  /**
26428
26706
  * Date comparison with format normalization.
26429
26707
  */
26430
- compareDate(path47, candidateValue, expectedValue, fieldConfig, weight) {
26708
+ compareDate(path48, candidateValue, expectedValue, fieldConfig, weight) {
26431
26709
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
26432
26710
  const candidateDate = parseDate(String(candidateValue), formats);
26433
26711
  const expectedDate = parseDate(String(expectedValue), formats);
26434
26712
  if (candidateDate === null) {
26435
26713
  return {
26436
- path: path47,
26714
+ path: path48,
26437
26715
  score: 0,
26438
26716
  weight,
26439
26717
  hit: false,
26440
- message: `${path47} (unparseable candidate date)`
26718
+ message: `${path48} (unparseable candidate date)`
26441
26719
  };
26442
26720
  }
26443
26721
  if (expectedDate === null) {
26444
26722
  return {
26445
- path: path47,
26723
+ path: path48,
26446
26724
  score: 0,
26447
26725
  weight,
26448
26726
  hit: false,
26449
- message: `${path47} (unparseable expected date)`
26727
+ message: `${path48} (unparseable expected date)`
26450
26728
  };
26451
26729
  }
26452
26730
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
26453
26731
  return {
26454
- path: path47,
26732
+ path: path48,
26455
26733
  score: 1,
26456
26734
  weight,
26457
26735
  hit: true,
26458
- message: path47
26736
+ message: path48
26459
26737
  };
26460
26738
  }
26461
26739
  return {
26462
- path: path47,
26740
+ path: path48,
26463
26741
  score: 0,
26464
26742
  weight,
26465
26743
  hit: false,
26466
- message: `${path47} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
26744
+ message: `${path48} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
26467
26745
  };
26468
26746
  }
26469
26747
  /**
@@ -26496,11 +26774,11 @@ var FieldAccuracyEvaluator = class {
26496
26774
  };
26497
26775
  }
26498
26776
  };
26499
- function resolvePath(obj, path47) {
26500
- if (!path47 || !obj) {
26777
+ function resolvePath(obj, path48) {
26778
+ if (!path48 || !obj) {
26501
26779
  return void 0;
26502
26780
  }
26503
- const parts = path47.split(/\.|\[|\]/).filter((p) => p.length > 0);
26781
+ const parts = path48.split(/\.|\[|\]/).filter((p) => p.length > 0);
26504
26782
  let current = obj;
26505
26783
  for (const part of parts) {
26506
26784
  if (current === null || current === void 0) {
@@ -26786,11 +27064,12 @@ function assembleLlmGraderPrompt(input) {
26786
27064
  function assembleFreeform(evalCase, candidate, promptInputs, fileChanges, evaluatorTemplateOverride) {
26787
27065
  const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 ? promptInputs.question : evalCase.question;
26788
27066
  const variables = {
26789
- [TEMPLATE_VARIABLES.INPUT]: JSON.stringify(evalCase.input, null, 2),
26790
- [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: JSON.stringify(evalCase.expected_output, null, 2),
26791
- [TEMPLATE_VARIABLES.OUTPUT]: JSON.stringify([], null, 2),
27067
+ [TEMPLATE_VARIABLES.INPUT]: formattedQuestion.trim(),
27068
+ [TEMPLATE_VARIABLES.OUTPUT]: candidate.trim(),
27069
+ [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? "").trim(),
26792
27070
  [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(),
26793
27071
  [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? "",
27072
+ // Deprecated aliases
26794
27073
  [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(),
26795
27074
  [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(),
26796
27075
  [TEMPLATE_VARIABLES.EXPECTED_OUTPUT_TEXT]: (evalCase.reference_answer ?? "").trim()
@@ -26973,8 +27252,8 @@ var TokenUsageEvaluator = class {
26973
27252
  };
26974
27253
  }
26975
27254
  };
26976
- function getNestedValue(obj, path47) {
26977
- const parts = path47.split(".");
27255
+ function getNestedValue(obj, path48) {
27256
+ const parts = path48.split(".");
26978
27257
  let current = obj;
26979
27258
  for (const part of parts) {
26980
27259
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -27824,16 +28103,13 @@ async function executePromptTemplate(script, context2, config, timeoutMs) {
27824
28103
  const payload = {
27825
28104
  criteria: context2.evalCase.criteria,
27826
28105
  expectedOutput: context2.evalCase.expected_output,
27827
- outputText: context2.candidate,
27828
28106
  output: context2.output ?? null,
27829
28107
  inputFiles: context2.evalCase.file_paths,
27830
28108
  input: context2.evalCase.input,
27831
28109
  trace: context2.trace ?? null,
27832
28110
  fileChanges: context2.fileChanges ?? null,
27833
28111
  workspacePath: context2.workspacePath ?? null,
27834
- config: config ?? context2.config ?? null,
27835
- inputText: context2.evalCase.question,
27836
- expectedOutputText: context2.evalCase.reference_answer ?? ""
28112
+ config: config ?? context2.config ?? null
27837
28113
  };
27838
28114
  const inputJson = JSON.stringify(toSnakeCaseDeep(payload), null, 2);
27839
28115
  const scriptPath = script[script.length - 1];
@@ -29469,7 +29745,8 @@ async function runEvaluation(options) {
29469
29745
  const budgetResult = {
29470
29746
  timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
29471
29747
  testId: evalCase.id,
29472
- eval_set: evalCase.eval_set,
29748
+ dataset: evalCase.dataset,
29749
+ category: evalCase.category,
29473
29750
  score: 0,
29474
29751
  assertions: [],
29475
29752
  output: [],
@@ -29505,7 +29782,8 @@ async function runEvaluation(options) {
29505
29782
  const haltResult = {
29506
29783
  timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
29507
29784
  testId: evalCase.id,
29508
- eval_set: evalCase.eval_set,
29785
+ dataset: evalCase.dataset,
29786
+ category: evalCase.category,
29509
29787
  score: 0,
29510
29788
  assertions: [],
29511
29789
  output: [],
@@ -30504,7 +30782,8 @@ async function evaluateCandidate(options) {
30504
30782
  return {
30505
30783
  timestamp: completedAt.toISOString(),
30506
30784
  testId: evalCase.id,
30507
- eval_set: evalCase.eval_set,
30785
+ dataset: evalCase.dataset,
30786
+ category: evalCase.category,
30508
30787
  conversationId: evalCase.conversation_id,
30509
30788
  score: score.score,
30510
30789
  assertions: score.assertions,
@@ -30854,7 +31133,8 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
30854
31133
  return {
30855
31134
  timestamp: timestamp.toISOString(),
30856
31135
  testId: evalCase.id,
30857
- eval_set: evalCase.eval_set,
31136
+ dataset: evalCase.dataset,
31137
+ category: evalCase.category,
30858
31138
  conversationId: evalCase.conversation_id,
30859
31139
  score: 0,
30860
31140
  assertions: [{ text: `Error: ${message}`, passed: false }],
@@ -31405,6 +31685,15 @@ function trimBaselineResult(result) {
31405
31685
  }
31406
31686
  return trimmed;
31407
31687
  }
31688
/** Category label returned when a path has no directory that can serve as a category. */
var DEFAULT_CATEGORY = "Uncategorized";

/**
 * Derive a category label from the directory portion of a relative file path.
 *
 * The last path segment (the file name) is discarded; every remaining directory
 * segment except those named "evals" is joined with "/" to form the category.
 * When nothing remains, DEFAULT_CATEGORY is returned.
 *
 * @param {string} relativePath - File path to categorise. Presumably relative to
 *   the eval discovery root; confirm against the caller.
 * @returns {string} A "/"-joined directory chain, or DEFAULT_CATEGORY.
 */
function deriveCategory(relativePath) {
  // normalize() collapses "./" prefixes and repeated separators and, on Windows,
  // rewrites "/" as "\", so splitting on path47.sep sees every segment boundary
  // regardless of which separator style the caller used.
  const parts = path47.normalize(relativePath).split(path47.sep);
  if (parts.length <= 1) {
    return DEFAULT_CATEGORY;
  }
  // Drop the file name, then ignore "evals" folders plus the empty / "." segments
  // that a leading separator or a bare "./" would otherwise leak into the label.
  const dirs = parts
    .slice(0, -1)
    .filter((d) => d !== "" && d !== "." && d !== "evals");
  return dirs.length > 0 ? dirs.join("/") : DEFAULT_CATEGORY;
}
31408
31697
  var OTEL_BACKEND_PRESETS = {
31409
31698
  langfuse: {
31410
31699
  name: "langfuse",
@@ -31527,7 +31816,7 @@ var OtelTraceExporter = class {
31527
31816
  rootSpan.setAttribute("gen_ai.system", "agentv");
31528
31817
  rootSpan.setAttribute("agentv.test_id", result.testId);
31529
31818
  rootSpan.setAttribute("agentv.target", result.target);
31530
- if (result.eval_set) rootSpan.setAttribute("agentv.eval_set", result.eval_set);
31819
+ if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
31531
31820
  rootSpan.setAttribute("agentv.score", result.score);
31532
31821
  if (captureContent && result.output.length > 0) {
31533
31822
  const lastMsg = result.output[result.output.length - 1];
@@ -31736,7 +32025,7 @@ var OtelStreamingObserver = class {
31736
32025
  this.rootSpan.setAttribute("gen_ai.system", "agentv");
31737
32026
  this.rootSpan.setAttribute("agentv.test_id", testId);
31738
32027
  this.rootSpan.setAttribute("agentv.target", target);
31739
- if (evalSet) this.rootSpan.setAttribute("agentv.eval_set", evalSet);
32028
+ if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
31740
32029
  this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
31741
32030
  }
31742
32031
  /** Create and immediately export a tool span */
@@ -31907,6 +32196,9 @@ function createAgentKernel() {
31907
32196
  }
31908
32197
 
31909
32198
  export {
32199
+ isContent,
32200
+ isContentArray,
32201
+ getTextContent,
31910
32202
  TEST_MESSAGE_ROLES,
31911
32203
  isTestMessageRole,
31912
32204
  isJsonObject,
@@ -31922,11 +32214,13 @@ export {
31922
32214
  buildSearchRoots,
31923
32215
  resolveFileReference,
31924
32216
  CLI_PLACEHOLDERS,
32217
+ COMMON_TARGET_SETTINGS,
31925
32218
  resolveTargetDefinition,
31926
- interpolateEnv,
31927
- loadCasesFromFile,
31928
32219
  KNOWN_PROVIDERS,
31929
32220
  PROVIDER_ALIASES,
32221
+ extractLastAssistantContent,
32222
+ interpolateEnv,
32223
+ loadCasesFromFile,
31930
32224
  computeTraceSummary,
31931
32225
  DEFAULT_EXPLORATION_TOOLS,
31932
32226
  explorationRatio,
@@ -32002,6 +32296,7 @@ export {
32002
32296
  substituteVariables,
32003
32297
  calculateRubricScore,
32004
32298
  buildScoreRangeOutputSchema,
32299
+ extractImageBlocks,
32005
32300
  CompositeEvaluator,
32006
32301
  CostEvaluator,
32007
32302
  ExecutionMetricsEvaluator,
@@ -32051,9 +32346,11 @@ export {
32051
32346
  shouldEnableCache,
32052
32347
  shouldSkipCacheForTemperature,
32053
32348
  trimBaselineResult,
32349
+ DEFAULT_CATEGORY,
32350
+ deriveCategory,
32054
32351
  OTEL_BACKEND_PRESETS,
32055
32352
  OtelTraceExporter,
32056
32353
  OtelStreamingObserver,
32057
32354
  createAgentKernel
32058
32355
  };
32059
- //# sourceMappingURL=chunk-ELQEFMGO.js.map
32356
+ //# sourceMappingURL=chunk-OXBBWZOY.js.map