agentv 0.7.5 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -590,7 +590,7 @@ import fg from "fast-glob";
590
590
  import { stat as stat3 } from "node:fs/promises";
591
591
  import path15 from "node:path";
592
592
 
593
- // ../../packages/core/dist/chunk-7XM7HYRS.js
593
+ // ../../packages/core/dist/chunk-YQBJAT5I.js
594
594
  import { constants } from "node:fs";
595
595
  import { access, readFile } from "node:fs/promises";
596
596
  import path from "node:path";
@@ -4636,7 +4636,7 @@ var coerce = {
4636
4636
  };
4637
4637
  var NEVER = INVALID;
4638
4638
 
4639
- // ../../packages/core/dist/chunk-7XM7HYRS.js
4639
+ // ../../packages/core/dist/chunk-YQBJAT5I.js
4640
4640
  async function fileExists(filePath) {
4641
4641
  try {
4642
4642
  await access(filePath, constants.F_OK);
@@ -4747,10 +4747,9 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set(["PROMPT", "GUIDELINES", "EVAL_ID
4747
4747
  var BASE_TARGET_SCHEMA = external_exports.object({
4748
4748
  name: external_exports.string().min(1, "target name is required"),
4749
4749
  provider: external_exports.string().min(1, "provider is required"),
4750
- settings: external_exports.record(external_exports.unknown()).optional(),
4751
4750
  judge_target: external_exports.string().optional(),
4752
4751
  workers: external_exports.number().int().min(1).optional()
4753
- });
4752
+ }).passthrough();
4754
4753
  var DEFAULT_AZURE_API_VERSION = "2024-10-01-preview";
4755
4754
  function normalizeAzureApiVersion(value) {
4756
4755
  if (!value) {
@@ -4763,11 +4762,43 @@ function normalizeAzureApiVersion(value) {
4763
4762
  const withoutPrefix = trimmed.replace(/^api[-_]?version\s*=\s*/i, "").trim();
4764
4763
  return withoutPrefix.length > 0 ? withoutPrefix : DEFAULT_AZURE_API_VERSION;
4765
4764
  }
4765
+ function resolveRetryConfig(target) {
4766
+ const maxRetries = resolveOptionalNumber(
4767
+ target.max_retries ?? target.maxRetries,
4768
+ `${target.name} max retries`
4769
+ );
4770
+ const initialDelayMs = resolveOptionalNumber(
4771
+ target.retry_initial_delay_ms ?? target.retryInitialDelayMs,
4772
+ `${target.name} retry initial delay`
4773
+ );
4774
+ const maxDelayMs = resolveOptionalNumber(
4775
+ target.retry_max_delay_ms ?? target.retryMaxDelayMs,
4776
+ `${target.name} retry max delay`
4777
+ );
4778
+ const backoffFactor = resolveOptionalNumber(
4779
+ target.retry_backoff_factor ?? target.retryBackoffFactor,
4780
+ `${target.name} retry backoff factor`
4781
+ );
4782
+ const retryableStatusCodes = resolveOptionalNumberArray(
4783
+ target.retry_status_codes ?? target.retryStatusCodes,
4784
+ `${target.name} retry status codes`
4785
+ );
4786
+ if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
4787
+ return void 0;
4788
+ }
4789
+ return {
4790
+ maxRetries,
4791
+ initialDelayMs,
4792
+ maxDelayMs,
4793
+ backoffFactor,
4794
+ retryableStatusCodes
4795
+ };
4796
+ }
4766
4797
  function resolveTargetDefinition(definition, env = process.env) {
4767
4798
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
4768
4799
  const provider = parsed.provider.toLowerCase();
4769
4800
  const providerBatching = resolveOptionalBoolean(
4770
- parsed.settings?.provider_batching ?? parsed.settings?.providerBatching
4801
+ parsed.provider_batching ?? parsed.providerBatching
4771
4802
  );
4772
4803
  switch (provider) {
4773
4804
  case "azure":
@@ -4843,13 +4874,12 @@ function resolveTargetDefinition(definition, env = process.env) {
4843
4874
  }
4844
4875
  }
4845
4876
  function resolveAzureConfig(target, env) {
4846
- const settings = target.settings ?? {};
4847
- const endpointSource = settings.endpoint ?? settings.resource ?? settings.resourceName;
4848
- const apiKeySource = settings.api_key ?? settings.apiKey;
4849
- const deploymentSource = settings.deployment ?? settings.deploymentName ?? settings.model;
4850
- const versionSource = settings.version ?? settings.api_version;
4851
- const temperatureSource = settings.temperature;
4852
- const maxTokensSource = settings.max_output_tokens ?? settings.maxTokens;
4877
+ const endpointSource = target.endpoint ?? target.resource ?? target.resourceName;
4878
+ const apiKeySource = target.api_key ?? target.apiKey;
4879
+ const deploymentSource = target.deployment ?? target.deploymentName ?? target.model;
4880
+ const versionSource = target.version ?? target.api_version;
4881
+ const temperatureSource = target.temperature;
4882
+ const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
4853
4883
  const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
4854
4884
  const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
4855
4885
  const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
@@ -4861,58 +4891,61 @@ function resolveAzureConfig(target, env) {
4861
4891
  maxTokensSource,
4862
4892
  `${target.name} max output tokens`
4863
4893
  );
4894
+ const retry = resolveRetryConfig(target);
4864
4895
  return {
4865
4896
  resourceName,
4866
4897
  deploymentName,
4867
4898
  apiKey,
4868
4899
  version,
4869
4900
  temperature,
4870
- maxOutputTokens
4901
+ maxOutputTokens,
4902
+ retry
4871
4903
  };
4872
4904
  }
4873
4905
  function resolveAnthropicConfig(target, env) {
4874
- const settings = target.settings ?? {};
4875
- const apiKeySource = settings.api_key ?? settings.apiKey;
4876
- const modelSource = settings.model ?? settings.deployment ?? settings.variant;
4877
- const temperatureSource = settings.temperature;
4878
- const maxTokensSource = settings.max_output_tokens ?? settings.maxTokens;
4879
- const thinkingBudgetSource = settings.thinking_budget ?? settings.thinkingBudget;
4906
+ const apiKeySource = target.api_key ?? target.apiKey;
4907
+ const modelSource = target.model ?? target.deployment ?? target.variant;
4908
+ const temperatureSource = target.temperature;
4909
+ const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
4910
+ const thinkingBudgetSource = target.thinking_budget ?? target.thinkingBudget;
4880
4911
  const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
4881
4912
  const model = resolveString(modelSource, env, `${target.name} Anthropic model`);
4913
+ const retry = resolveRetryConfig(target);
4882
4914
  return {
4883
4915
  apiKey,
4884
4916
  model,
4885
4917
  temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
4886
4918
  maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
4887
- thinkingBudget: resolveOptionalNumber(thinkingBudgetSource, `${target.name} thinking budget`)
4919
+ thinkingBudget: resolveOptionalNumber(thinkingBudgetSource, `${target.name} thinking budget`),
4920
+ retry
4888
4921
  };
4889
4922
  }
4890
4923
  function resolveGeminiConfig(target, env) {
4891
- const settings = target.settings ?? {};
4892
- const apiKeySource = settings.api_key ?? settings.apiKey;
4893
- const modelSource = settings.model ?? settings.deployment ?? settings.variant;
4894
- const temperatureSource = settings.temperature;
4895
- const maxTokensSource = settings.max_output_tokens ?? settings.maxTokens;
4924
+ const apiKeySource = target.api_key ?? target.apiKey;
4925
+ const modelSource = target.model ?? target.deployment ?? target.variant;
4926
+ const temperatureSource = target.temperature;
4927
+ const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
4896
4928
  const apiKey = resolveString(apiKeySource, env, `${target.name} Google API key`);
4897
4929
  const model = resolveOptionalString(modelSource, env, `${target.name} Gemini model`, {
4898
4930
  allowLiteral: true,
4899
4931
  optionalEnv: true
4900
4932
  }) ?? "gemini-2.5-flash";
4933
+ const retry = resolveRetryConfig(target);
4901
4934
  return {
4902
4935
  apiKey,
4903
4936
  model,
4904
4937
  temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
4905
- maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`)
4938
+ maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
4939
+ retry
4906
4940
  };
4907
4941
  }
4908
4942
  function resolveCodexConfig(target, env) {
4909
- const settings = target.settings ?? {};
4910
- const executableSource = settings.executable ?? settings.command ?? settings.binary;
4911
- const argsSource = settings.args ?? settings.arguments;
4912
- const cwdSource = settings.cwd;
4913
- const timeoutSource = settings.timeout_seconds ?? settings.timeoutSeconds;
4914
- const logDirSource = settings.log_dir ?? settings.logDir ?? settings.log_directory ?? settings.logDirectory;
4915
- const logFormatSource = settings.log_format ?? settings.logFormat ?? settings.log_output_format ?? settings.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
4943
+ const executableSource = target.executable ?? target.command ?? target.binary;
4944
+ const argsSource = target.args ?? target.arguments;
4945
+ const cwdSource = target.cwd;
4946
+ const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
4947
+ const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
4948
+ const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
4916
4949
  const executable = resolveOptionalString(executableSource, env, `${target.name} codex executable`, {
4917
4950
  allowLiteral: true,
4918
4951
  optionalEnv: true
@@ -4951,21 +4984,19 @@ function normalizeCodexLogFormat(value) {
4951
4984
  throw new Error("codex log format must be 'summary' or 'json'");
4952
4985
  }
4953
4986
  function resolveMockConfig(target) {
4954
- const settings = target.settings ?? {};
4955
- const response = typeof settings.response === "string" ? settings.response : void 0;
4987
+ const response = typeof target.response === "string" ? target.response : void 0;
4956
4988
  return { response };
4957
4989
  }
4958
4990
  function resolveVSCodeConfig(target, env, insiders) {
4959
- const settings = target.settings ?? {};
4960
- const workspaceTemplateEnvVar = resolveOptionalLiteralString(settings.workspace_template ?? settings.workspaceTemplate);
4991
+ const workspaceTemplateEnvVar = resolveOptionalLiteralString(target.workspace_template ?? target.workspaceTemplate);
4961
4992
  const workspaceTemplate = workspaceTemplateEnvVar ? resolveOptionalString(workspaceTemplateEnvVar, env, `${target.name} workspace template path`, {
4962
4993
  allowLiteral: false,
4963
4994
  optionalEnv: true
4964
4995
  }) : void 0;
4965
- const commandSource = settings.vscode_cmd ?? settings.command;
4966
- const waitSource = settings.wait;
4967
- const dryRunSource = settings.dry_run ?? settings.dryRun;
4968
- const subagentRootSource = settings.subagent_root ?? settings.subagentRoot;
4996
+ const commandSource = target.vscode_cmd ?? target.command;
4997
+ const waitSource = target.wait;
4998
+ const dryRunSource = target.dry_run ?? target.dryRun;
4999
+ const subagentRootSource = target.subagent_root ?? target.subagentRoot;
4969
5000
  const defaultCommand = insiders ? "code-insiders" : "code";
4970
5001
  const command = resolveOptionalLiteralString(commandSource) ?? defaultCommand;
4971
5002
  return {
@@ -4980,18 +5011,16 @@ function resolveVSCodeConfig(target, env, insiders) {
4980
5011
  };
4981
5012
  }
4982
5013
  function resolveCliConfig(target, env) {
4983
- const settings = target.settings ?? {};
4984
- const commandTemplateSource = settings.command_template ?? settings.commandTemplate;
5014
+ const commandTemplateSource = target.command_template ?? target.commandTemplate;
4985
5015
  const filesFormat = resolveOptionalLiteralString(
4986
- settings.files_format ?? settings.filesFormat ?? settings.attachments_format ?? settings.attachmentsFormat
5016
+ target.files_format ?? target.filesFormat ?? target.attachments_format ?? target.attachmentsFormat
4987
5017
  );
4988
- const cwd = resolveOptionalString(settings.cwd, env, `${target.name} working directory`, {
5018
+ const cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
4989
5019
  allowLiteral: true,
4990
5020
  optionalEnv: true
4991
5021
  });
4992
- const envOverrides = resolveEnvOverrides(settings.env, env, target.name);
4993
- const timeoutMs = resolveTimeoutMs(settings.timeout_seconds ?? settings.timeoutSeconds, `${target.name} timeout`);
4994
- const healthcheck = resolveCliHealthcheck(settings.healthcheck, env, target.name);
5022
+ const timeoutMs = resolveTimeoutMs(target.timeout_seconds ?? target.timeoutSeconds, `${target.name} timeout`);
5023
+ const healthcheck = resolveCliHealthcheck(target.healthcheck, env, target.name);
4995
5024
  const commandTemplate = resolveString(
4996
5025
  commandTemplateSource,
4997
5026
  env,
@@ -5003,29 +5032,10 @@ function resolveCliConfig(target, env) {
5003
5032
  commandTemplate,
5004
5033
  filesFormat,
5005
5034
  cwd,
5006
- env: envOverrides,
5007
5035
  timeoutMs,
5008
5036
  healthcheck
5009
5037
  };
5010
5038
  }
5011
- function resolveEnvOverrides(source2, env, targetName) {
5012
- if (source2 === void 0 || source2 === null) {
5013
- return void 0;
5014
- }
5015
- if (typeof source2 !== "object" || Array.isArray(source2)) {
5016
- throw new Error(`${targetName} env overrides must be an object map of strings`);
5017
- }
5018
- const entries = Object.entries(source2);
5019
- const resolved = {};
5020
- for (const [key2, value] of entries) {
5021
- if (typeof value !== "string") {
5022
- throw new Error(`${targetName} env override '${key2}' must be a string`);
5023
- }
5024
- const resolvedValue = resolveString(value, env, `${targetName} env override '${key2}'`);
5025
- resolved[key2] = resolvedValue;
5026
- }
5027
- return Object.keys(resolved).length > 0 ? resolved : void 0;
5028
- }
5029
5039
  function resolveTimeoutMs(source2, description) {
5030
5040
  const seconds = resolveOptionalNumber(source2, `${description} (seconds)`);
5031
5041
  if (seconds === void 0) {
@@ -5221,6 +5231,26 @@ function resolveOptionalStringArray(source2, env, description) {
5221
5231
  }
5222
5232
  return resolved.length > 0 ? resolved : void 0;
5223
5233
  }
5234
+ function resolveOptionalNumberArray(source2, description) {
5235
+ if (source2 === void 0 || source2 === null) {
5236
+ return void 0;
5237
+ }
5238
+ if (!Array.isArray(source2)) {
5239
+ throw new Error(`${description} must be an array of numbers`);
5240
+ }
5241
+ if (source2.length === 0) {
5242
+ return void 0;
5243
+ }
5244
+ const resolved = [];
5245
+ for (let i6 = 0; i6 < source2.length; i6++) {
5246
+ const item = source2[i6];
5247
+ if (typeof item !== "number" || !Number.isFinite(item)) {
5248
+ throw new Error(`${description}[${i6}] must be a number`);
5249
+ }
5250
+ resolved.push(item);
5251
+ }
5252
+ return resolved.length > 0 ? resolved : void 0;
5253
+ }
5224
5254
  var AGENT_PROVIDER_KINDS = [
5225
5255
  "codex",
5226
5256
  "vscode",
@@ -5252,7 +5282,7 @@ var PROVIDER_ALIASES = [
5252
5282
  "vertex"
5253
5283
  // legacy/future support
5254
5284
  ];
5255
- var TARGETS_SCHEMA_V2 = "agentv-targets-v2.1";
5285
+ var TARGETS_SCHEMA_V2 = "agentv-targets-v2.2";
5256
5286
  function isAgentProvider(provider) {
5257
5287
  return provider ? AGENT_PROVIDER_KINDS.includes(provider.kind) : false;
5258
5288
  }
@@ -11917,14 +11947,11 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
11917
11947
  logWarning(`Skipping incomplete eval case: ${id ?? "unknown"}`);
11918
11948
  continue;
11919
11949
  }
11920
- if (!Array.isArray(expectedMessagesValue)) {
11921
- logWarning(`Eval case '${id}' missing expected_messages array`);
11922
- continue;
11923
- }
11950
+ const hasExpectedMessages = Array.isArray(expectedMessagesValue) && expectedMessagesValue.length > 0;
11924
11951
  const inputMessages = inputMessagesValue.filter((msg) => isTestMessage(msg));
11925
- const expectedMessages = expectedMessagesValue.filter((msg) => isTestMessage(msg));
11926
- if (expectedMessages.length === 0) {
11927
- logWarning(`No expected message found for eval case: ${id}`);
11952
+ const expectedMessages = hasExpectedMessages ? expectedMessagesValue.filter((msg) => isTestMessage(msg)) : [];
11953
+ if (hasExpectedMessages && expectedMessages.length === 0) {
11954
+ logWarning(`No valid expected message found for eval case: ${id}`);
11928
11955
  continue;
11929
11956
  }
11930
11957
  if (expectedMessages.length > 1) {
@@ -11942,17 +11969,17 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
11942
11969
  messageType: "input",
11943
11970
  verbose
11944
11971
  });
11945
- const outputSegments = await processMessages({
11972
+ const outputSegments = hasExpectedMessages ? await processMessages({
11946
11973
  messages: expectedMessages,
11947
11974
  searchRoots,
11948
11975
  repoRootPath,
11949
11976
  guidelinePatterns,
11950
11977
  messageType: "output",
11951
11978
  verbose
11952
- });
11979
+ }) : [];
11953
11980
  const codeSnippets = extractCodeBlocks(inputSegments);
11954
11981
  const expectedContent = expectedMessages[0]?.content;
11955
- const referenceAnswer = await resolveAssistantContent(expectedContent, searchRoots, verbose);
11982
+ const referenceAnswer = expectedContent ? await resolveAssistantContent(expectedContent, searchRoots, verbose) : "";
11956
11983
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
11957
11984
  const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
11958
11985
  const evaluators = await parseEvaluators(evalcase, searchRoots, id ?? "unknown");
@@ -11971,6 +11998,7 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
11971
11998
  dataset: datasetName,
11972
11999
  conversation_id: conversationId,
11973
12000
  question,
12001
+ input_messages: inputMessages,
11974
12002
  input_segments: inputSegments,
11975
12003
  output_segments: outputSegments,
11976
12004
  reference_answer: referenceAnswer,
@@ -11998,6 +12026,54 @@ Please add '$schema: ${SCHEMA_EVAL_V2}' at the top of the file.`;
11998
12026
  }
11999
12027
  return results;
12000
12028
  }
12029
+ function needsRoleMarkers(messages, processedSegmentsByMessage) {
12030
+ if (messages.some((msg) => msg.role === "assistant" || msg.role === "tool")) {
12031
+ return true;
12032
+ }
12033
+ let messagesWithContent = 0;
12034
+ for (const segments of processedSegmentsByMessage) {
12035
+ if (hasVisibleContent(segments)) {
12036
+ messagesWithContent++;
12037
+ }
12038
+ }
12039
+ return messagesWithContent > 1;
12040
+ }
12041
+ function hasVisibleContent(segments) {
12042
+ return segments.some((segment) => {
12043
+ const type = asString(segment.type);
12044
+ if (type === "text") {
12045
+ const value = asString(segment.value);
12046
+ return value !== void 0 && value.trim().length > 0;
12047
+ }
12048
+ if (type === "guideline_ref") {
12049
+ return false;
12050
+ }
12051
+ if (type === "file") {
12052
+ const text = asString(segment.text);
12053
+ return text !== void 0 && text.trim().length > 0;
12054
+ }
12055
+ return false;
12056
+ });
12057
+ }
12058
+ function formatSegment(segment) {
12059
+ const type = asString(segment.type);
12060
+ if (type === "text") {
12061
+ return asString(segment.value);
12062
+ }
12063
+ if (type === "guideline_ref") {
12064
+ const refPath = asString(segment.path);
12065
+ return refPath ? `<Attached: ${refPath}>` : void 0;
12066
+ }
12067
+ if (type === "file") {
12068
+ const text = asString(segment.text);
12069
+ const filePath = asString(segment.path);
12070
+ if (text && filePath) {
12071
+ return `=== ${filePath} ===
12072
+ ${text}`;
12073
+ }
12074
+ }
12075
+ return void 0;
12076
+ }
12001
12077
  async function buildPromptInputs(testCase) {
12002
12078
  const guidelineContents = [];
12003
12079
  for (const rawPath of testCase.guideline_paths) {
@@ -12014,36 +12090,168 @@ ${content}`);
12014
12090
  logWarning(`Could not read guideline file ${absolutePath}: ${error.message}`);
12015
12091
  }
12016
12092
  }
12017
- const questionParts = [];
12093
+ const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
12094
+ const segmentsByMessage = [];
12095
+ const fileContentsByPath = /* @__PURE__ */ new Map();
12018
12096
  for (const segment of testCase.input_segments) {
12019
- const typeValue = segment.type;
12020
- if (typeof typeValue === "string" && typeValue === "file") {
12021
- const pathValue = segment.path;
12022
- const textValue = segment.text;
12023
- const label = typeof pathValue === "string" ? pathValue : "file";
12024
- const body = typeof textValue === "string" ? textValue : "";
12025
- questionParts.push(`=== ${label} ===
12026
- ${body}`);
12027
- continue;
12097
+ if (segment.type === "file" && typeof segment.path === "string" && typeof segment.text === "string") {
12098
+ fileContentsByPath.set(segment.path, segment.text);
12028
12099
  }
12029
- if (typeof typeValue === "string" && typeValue === "text") {
12030
- const value = segment.value;
12031
- if (typeof value === "string") {
12032
- questionParts.push(value);
12100
+ }
12101
+ for (const message of testCase.input_messages) {
12102
+ const messageSegments = [];
12103
+ if (typeof message.content === "string") {
12104
+ if (message.content.trim().length > 0) {
12105
+ messageSegments.push({ type: "text", value: message.content });
12106
+ }
12107
+ } else if (Array.isArray(message.content)) {
12108
+ for (const segment of message.content) {
12109
+ if (typeof segment === "string") {
12110
+ if (segment.trim().length > 0) {
12111
+ messageSegments.push({ type: "text", value: segment });
12112
+ }
12113
+ } else if (isJsonObject(segment)) {
12114
+ const type = asString(segment.type);
12115
+ if (type === "file") {
12116
+ const value = asString(segment.value);
12117
+ if (!value) continue;
12118
+ if (testCase.guideline_patterns && isGuidelineFile(value, testCase.guideline_patterns)) {
12119
+ messageSegments.push({ type: "guideline_ref", path: value });
12120
+ continue;
12121
+ }
12122
+ const fileText = fileContentsByPath.get(value);
12123
+ if (fileText !== void 0) {
12124
+ messageSegments.push({ type: "file", text: fileText, path: value });
12125
+ }
12126
+ } else if (type === "text") {
12127
+ const textValue = asString(segment.value);
12128
+ if (textValue && textValue.trim().length > 0) {
12129
+ messageSegments.push({ type: "text", value: textValue });
12130
+ }
12131
+ }
12132
+ }
12133
+ }
12134
+ }
12135
+ segmentsByMessage.push(messageSegments);
12136
+ }
12137
+ const useRoleMarkers = needsRoleMarkers(testCase.input_messages, segmentsByMessage);
12138
+ let question;
12139
+ if (useRoleMarkers) {
12140
+ const messageParts = [];
12141
+ for (let i6 = 0; i6 < testCase.input_messages.length; i6++) {
12142
+ const message = testCase.input_messages[i6];
12143
+ const segments = segmentsByMessage[i6];
12144
+ if (!hasVisibleContent(segments)) {
12145
+ continue;
12146
+ }
12147
+ const roleLabel = message.role.charAt(0).toUpperCase() + message.role.slice(1);
12148
+ const contentParts = [];
12149
+ for (const segment of segments) {
12150
+ const formattedContent = formatSegment(segment);
12151
+ if (formattedContent) {
12152
+ contentParts.push(formattedContent);
12153
+ }
12154
+ }
12155
+ if (contentParts.length > 0) {
12156
+ const messageContent = contentParts.join("\n");
12157
+ messageParts.push(`@[${roleLabel}]:
12158
+ ${messageContent}`);
12159
+ }
12160
+ }
12161
+ question = messageParts.join("\n\n");
12162
+ } else {
12163
+ const questionParts = [];
12164
+ for (const segment of testCase.input_segments) {
12165
+ const formattedContent = formatSegment(segment);
12166
+ if (formattedContent) {
12167
+ questionParts.push(formattedContent);
12168
+ }
12169
+ }
12170
+ if (testCase.code_snippets.length > 0) {
12171
+ questionParts.push(testCase.code_snippets.join("\n"));
12172
+ }
12173
+ question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
12174
+ }
12175
+ const chatPrompt = useRoleMarkers ? buildChatPromptFromSegments({
12176
+ messages: testCase.input_messages,
12177
+ segmentsByMessage,
12178
+ guidelinePatterns: testCase.guideline_patterns,
12179
+ guidelineContent: guidelines
12180
+ }) : void 0;
12181
+ return { question, guidelines, chatPrompt };
12182
+ }
12183
+ function buildChatPromptFromSegments(options) {
12184
+ const { messages, segmentsByMessage, guidelinePatterns, guidelineContent, systemPrompt } = options;
12185
+ if (messages.length === 0) {
12186
+ return void 0;
12187
+ }
12188
+ const systemSegments = [];
12189
+ if (systemPrompt && systemPrompt.trim().length > 0) {
12190
+ systemSegments.push(systemPrompt.trim());
12191
+ }
12192
+ if (guidelineContent && guidelineContent.trim().length > 0) {
12193
+ systemSegments.push(`[[ ## Guidelines ## ]]
12194
+
12195
+ ${guidelineContent.trim()}`);
12196
+ }
12197
+ let startIndex = 0;
12198
+ while (startIndex < messages.length && messages[startIndex].role === "system") {
12199
+ const segments = segmentsByMessage[startIndex];
12200
+ const contentParts = [];
12201
+ for (const segment of segments) {
12202
+ const formatted = formatSegment(segment);
12203
+ if (formatted) {
12204
+ contentParts.push(formatted);
12033
12205
  }
12034
- continue;
12035
12206
  }
12036
- const genericValue = segment.value;
12037
- if (typeof genericValue === "string") {
12038
- questionParts.push(genericValue);
12207
+ if (contentParts.length > 0) {
12208
+ systemSegments.push(contentParts.join("\n"));
12039
12209
  }
12210
+ startIndex += 1;
12040
12211
  }
12041
- if (testCase.code_snippets.length > 0) {
12042
- questionParts.push(testCase.code_snippets.join("\n"));
12212
+ const chatPrompt = [];
12213
+ if (systemSegments.length > 0) {
12214
+ chatPrompt.push({
12215
+ role: "system",
12216
+ content: systemSegments.join("\n\n")
12217
+ });
12043
12218
  }
12044
- const question = questionParts.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
12045
- const guidelines = guidelineContents.map((part) => part.trim()).filter((part) => part.length > 0).join("\n\n");
12046
- return { question, guidelines };
12219
+ for (let i6 = startIndex; i6 < messages.length; i6++) {
12220
+ const message = messages[i6];
12221
+ const segments = segmentsByMessage[i6];
12222
+ const contentParts = [];
12223
+ let role = message.role;
12224
+ let name;
12225
+ if (role === "system") {
12226
+ role = "assistant";
12227
+ contentParts.push("@[System]:");
12228
+ } else if (role === "tool") {
12229
+ role = "function";
12230
+ name = "tool";
12231
+ }
12232
+ for (const segment of segments) {
12233
+ if (segment.type === "guideline_ref") {
12234
+ continue;
12235
+ }
12236
+ const formatted = formatSegment(segment);
12237
+ if (formatted) {
12238
+ const isGuidelineRef = segment.type === "file" && typeof segment.path === "string" && guidelinePatterns && isGuidelineFile(segment.path, guidelinePatterns);
12239
+ if (isGuidelineRef) {
12240
+ continue;
12241
+ }
12242
+ contentParts.push(formatted);
12243
+ }
12244
+ }
12245
+ if (contentParts.length === 0) {
12246
+ continue;
12247
+ }
12248
+ chatPrompt.push({
12249
+ role,
12250
+ content: contentParts.join("\n"),
12251
+ ...name ? { name } : {}
12252
+ });
12253
+ }
12254
+ return chatPrompt.length > 0 ? chatPrompt : void 0;
12047
12255
  }
12048
12256
  async function fileExists2(absolutePath) {
12049
12257
  try {
@@ -12237,21 +12445,14 @@ ${detailBlock}${ANSI_RESET}`);
12237
12445
  var DEFAULT_SYSTEM_PROMPT = "You are a careful assistant. Follow all provided instructions and do not fabricate results.";
12238
12446
  function buildChatPrompt(request) {
12239
12447
  if (request.chatPrompt) {
12240
- return request.chatPrompt;
12241
- }
12242
- const systemSegments = [];
12243
- const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
12244
- if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
12245
- systemSegments.push(metadataSystemPrompt.trim());
12246
- } else {
12247
- systemSegments.push(DEFAULT_SYSTEM_PROMPT);
12248
- }
12249
- if (request.guidelines && request.guidelines.trim().length > 0) {
12250
- systemSegments.push(`[[ ## Guidelines ## ]]
12251
-
12252
- ${request.guidelines.trim()}`);
12448
+ const hasSystemMessage = request.chatPrompt.some((message) => message.role === "system");
12449
+ if (hasSystemMessage) {
12450
+ return request.chatPrompt;
12451
+ }
12452
+ const systemContent2 = resolveSystemContent(request);
12453
+ return [{ role: "system", content: systemContent2 }, ...request.chatPrompt];
12253
12454
  }
12254
- const systemContent = systemSegments.join("\n\n");
12455
+ const systemContent = resolveSystemContent(request);
12255
12456
  const userContent = request.question.trim();
12256
12457
  const prompt = [
12257
12458
  {
@@ -12265,6 +12466,21 @@ ${request.guidelines.trim()}`);
12265
12466
  ];
12266
12467
  return prompt;
12267
12468
  }
12469
+ function resolveSystemContent(request) {
12470
+ const systemSegments = [];
12471
+ const metadataSystemPrompt = typeof request.metadata?.systemPrompt === "string" ? request.metadata.systemPrompt : void 0;
12472
+ if (metadataSystemPrompt && metadataSystemPrompt.trim().length > 0) {
12473
+ systemSegments.push(metadataSystemPrompt.trim());
12474
+ } else {
12475
+ systemSegments.push(DEFAULT_SYSTEM_PROMPT);
12476
+ }
12477
+ if (request.guidelines && request.guidelines.trim().length > 0) {
12478
+ systemSegments.push(`[[ ## Guidelines ## ]]
12479
+
12480
+ ${request.guidelines.trim()}`);
12481
+ }
12482
+ return systemSegments.join("\n\n");
12483
+ }
12268
12484
  function extractModelConfig(request, defaults) {
12269
12485
  const temperature = request.temperature ?? defaults.temperature;
12270
12486
  const maxTokens = request.maxOutputTokens ?? defaults.maxOutputTokens;
@@ -12308,6 +12524,67 @@ function ensureChatResponse(result) {
12308
12524
  }
12309
12525
  return result;
12310
12526
  }
12527
+ function isRetryableError(error, retryableStatusCodes) {
12528
+ if (!error || typeof error !== "object") {
12529
+ return false;
12530
+ }
12531
+ if ("status" in error && typeof error.status === "number") {
12532
+ return retryableStatusCodes.includes(error.status);
12533
+ }
12534
+ if ("message" in error && typeof error.message === "string") {
12535
+ const match = error.message.match(/HTTP (\d{3})/);
12536
+ if (match) {
12537
+ const status = Number.parseInt(match[1], 10);
12538
+ return retryableStatusCodes.includes(status);
12539
+ }
12540
+ }
12541
+ if ("name" in error && error.name === "AxAIServiceNetworkError") {
12542
+ return true;
12543
+ }
12544
+ return false;
12545
+ }
12546
+ function calculateRetryDelay(attempt, config) {
12547
+ const delay = Math.min(
12548
+ config.maxDelayMs,
12549
+ config.initialDelayMs * config.backoffFactor ** attempt
12550
+ );
12551
+ return delay * (0.75 + Math.random() * 0.5);
12552
+ }
12553
+ async function sleep2(ms) {
12554
+ return new Promise((resolve) => setTimeout(resolve, ms));
12555
+ }
12556
+ async function withRetry(fn, retryConfig, signal) {
12557
+ const config = {
12558
+ maxRetries: retryConfig?.maxRetries ?? 3,
12559
+ initialDelayMs: retryConfig?.initialDelayMs ?? 1e3,
12560
+ maxDelayMs: retryConfig?.maxDelayMs ?? 6e4,
12561
+ backoffFactor: retryConfig?.backoffFactor ?? 2,
12562
+ retryableStatusCodes: retryConfig?.retryableStatusCodes ?? [500, 408, 429, 502, 503, 504]
12563
+ };
12564
+ let lastError;
12565
+ for (let attempt = 0; attempt <= config.maxRetries; attempt++) {
12566
+ if (signal?.aborted) {
12567
+ throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
12568
+ }
12569
+ try {
12570
+ return await fn();
12571
+ } catch (error) {
12572
+ lastError = error;
12573
+ if (attempt >= config.maxRetries) {
12574
+ break;
12575
+ }
12576
+ if (!isRetryableError(error, config.retryableStatusCodes)) {
12577
+ throw error;
12578
+ }
12579
+ const delay = calculateRetryDelay(attempt, config);
12580
+ await sleep2(delay);
12581
+ if (signal?.aborted) {
12582
+ throw new Error(`Request aborted: ${signal.reason ?? "Unknown reason"}`);
12583
+ }
12584
+ }
12585
+ }
12586
+ throw lastError;
12587
+ }
12311
12588
  var AzureProvider = class {
12312
12589
  constructor(targetName, config) {
12313
12590
  this.config = config;
@@ -12317,6 +12594,7 @@ var AzureProvider = class {
12317
12594
  temperature: config.temperature,
12318
12595
  maxOutputTokens: config.maxOutputTokens
12319
12596
  };
12597
+ this.retryConfig = config.retry;
12320
12598
  this.ai = Wn.create({
12321
12599
  name: "azure-openai",
12322
12600
  apiKey: config.apiKey,
@@ -12333,16 +12611,21 @@ var AzureProvider = class {
12333
12611
  targetName;
12334
12612
  ai;
12335
12613
  defaults;
12614
+ retryConfig;
12336
12615
  async invoke(request) {
12337
12616
  const chatPrompt = buildChatPrompt(request);
12338
12617
  const modelConfig = extractModelConfig(request, this.defaults);
12339
- const response = await this.ai.chat(
12340
- {
12341
- chatPrompt,
12342
- model: this.config.deploymentName,
12343
- ...modelConfig ? { modelConfig } : {}
12344
- },
12345
- request.signal ? { abortSignal: request.signal } : void 0
12618
+ const response = await withRetry(
12619
+ async () => await this.ai.chat(
12620
+ {
12621
+ chatPrompt,
12622
+ model: this.config.deploymentName,
12623
+ ...modelConfig ? { modelConfig } : {}
12624
+ },
12625
+ request.signal ? { abortSignal: request.signal } : void 0
12626
+ ),
12627
+ this.retryConfig,
12628
+ request.signal
12346
12629
  );
12347
12630
  return mapResponse(ensureChatResponse(response));
12348
12631
  }
@@ -12360,6 +12643,7 @@ var AnthropicProvider = class {
12360
12643
  maxOutputTokens: config.maxOutputTokens,
12361
12644
  thinkingBudget: config.thinkingBudget
12362
12645
  };
12646
+ this.retryConfig = config.retry;
12363
12647
  this.ai = Wn.create({
12364
12648
  name: "anthropic",
12365
12649
  apiKey: config.apiKey
@@ -12370,16 +12654,21 @@ var AnthropicProvider = class {
12370
12654
  targetName;
12371
12655
  ai;
12372
12656
  defaults;
12657
+ retryConfig;
12373
12658
  async invoke(request) {
12374
12659
  const chatPrompt = buildChatPrompt(request);
12375
12660
  const modelConfig = extractModelConfig(request, this.defaults);
12376
- const response = await this.ai.chat(
12377
- {
12378
- chatPrompt,
12379
- model: this.config.model,
12380
- ...modelConfig ? { modelConfig } : {}
12381
- },
12382
- request.signal ? { abortSignal: request.signal } : void 0
12661
+ const response = await withRetry(
12662
+ async () => await this.ai.chat(
12663
+ {
12664
+ chatPrompt,
12665
+ model: this.config.model,
12666
+ ...modelConfig ? { modelConfig } : {}
12667
+ },
12668
+ request.signal ? { abortSignal: request.signal } : void 0
12669
+ ),
12670
+ this.retryConfig,
12671
+ request.signal
12383
12672
  );
12384
12673
  return mapResponse(ensureChatResponse(response));
12385
12674
  }
@@ -12396,6 +12685,7 @@ var GeminiProvider = class {
12396
12685
  temperature: config.temperature,
12397
12686
  maxOutputTokens: config.maxOutputTokens
12398
12687
  };
12688
+ this.retryConfig = config.retry;
12399
12689
  this.ai = Wn.create({
12400
12690
  name: "google-gemini",
12401
12691
  apiKey: config.apiKey
@@ -12406,16 +12696,21 @@ var GeminiProvider = class {
12406
12696
  targetName;
12407
12697
  ai;
12408
12698
  defaults;
12699
+ retryConfig;
12409
12700
  async invoke(request) {
12410
12701
  const chatPrompt = buildChatPrompt(request);
12411
12702
  const modelConfig = extractModelConfig(request, this.defaults);
12412
- const response = await this.ai.chat(
12413
- {
12414
- chatPrompt,
12415
- model: this.config.model,
12416
- ...modelConfig ? { modelConfig } : {}
12417
- },
12418
- request.signal ? { abortSignal: request.signal } : void 0
12703
+ const response = await withRetry(
12704
+ async () => await this.ai.chat(
12705
+ {
12706
+ chatPrompt,
12707
+ model: this.config.model,
12708
+ ...modelConfig ? { modelConfig } : {}
12709
+ },
12710
+ request.signal ? { abortSignal: request.signal } : void 0
12711
+ ),
12712
+ this.retryConfig,
12713
+ request.signal
12419
12714
  );
12420
12715
  return mapResponse(ensureChatResponse(response));
12421
12716
  }
@@ -12478,10 +12773,9 @@ var CliProvider = class {
12478
12773
  const outputFilePath = generateOutputFilePath(request.evalCaseId);
12479
12774
  const templateValues = buildTemplateValues(request, this.config, outputFilePath);
12480
12775
  const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
12481
- const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
12482
12776
  const result = await this.runCommand(renderedCommand, {
12483
12777
  cwd: this.config.cwd,
12484
- env,
12778
+ env: process.env,
12485
12779
  timeoutMs: this.config.timeoutMs,
12486
12780
  signal: request.signal
12487
12781
  });
@@ -12570,10 +12864,9 @@ var CliProvider = class {
12570
12864
  generateOutputFilePath("healthcheck")
12571
12865
  )
12572
12866
  );
12573
- const env = this.config.env ? { ...process.env, ...this.config.env } : process.env;
12574
12867
  const result = await this.runCommand(renderedCommand, {
12575
12868
  cwd: healthcheck.cwd ?? this.config.cwd,
12576
- env,
12869
+ env: process.env,
12577
12870
  timeoutMs,
12578
12871
  signal
12579
12872
  });
@@ -13771,20 +14064,13 @@ function assertTargetDefinition(value, index, filePath) {
13771
14064
  }
13772
14065
  const name = value.name;
13773
14066
  const provider = value.provider;
13774
- const settings = value.settings;
13775
- const judgeTarget = value.judge_target;
13776
14067
  if (typeof name !== "string" || name.trim().length === 0) {
13777
14068
  throw new Error(`targets.yaml entry at index ${index} in ${filePath} is missing a valid 'name'`);
13778
14069
  }
13779
14070
  if (typeof provider !== "string" || provider.trim().length === 0) {
13780
14071
  throw new Error(`targets.yaml entry '${name}' in ${filePath} is missing a valid 'provider'`);
13781
14072
  }
13782
- return {
13783
- name,
13784
- provider,
13785
- settings: isRecord(settings) ? settings : void 0,
13786
- judge_target: typeof judgeTarget === "string" ? judgeTarget : void 0
13787
- };
14073
+ return value;
13788
14074
  }
13789
14075
  async function fileExists3(filePath) {
13790
14076
  try {
@@ -13855,19 +14141,21 @@ var LlmJudgeEvaluator = class {
13855
14141
  return this.evaluateWithPrompt(context2, judgeProvider);
13856
14142
  }
13857
14143
  async evaluateWithPrompt(context2, judgeProvider) {
13858
- let prompt = buildQualityPrompt(context2.evalCase, context2.candidate);
13859
- let systemPrompt = context2.systemPrompt ?? this.customPrompt ?? QUALITY_SYSTEM_PROMPT;
14144
+ const hasReferenceAnswer = hasNonEmptyReferenceAnswer(context2.evalCase);
14145
+ const formattedQuestion = context2.promptInputs.question && context2.promptInputs.question.trim().length > 0 ? context2.promptInputs.question : context2.evalCase.question;
14146
+ let prompt = buildQualityPrompt(context2.evalCase, context2.candidate, formattedQuestion);
14147
+ let systemPrompt = context2.systemPrompt ?? this.customPrompt ?? buildSystemPrompt(hasReferenceAnswer);
13860
14148
  if (systemPrompt && hasTemplateVariables(systemPrompt)) {
13861
14149
  const variables = {
13862
14150
  input_messages: JSON.stringify(context2.evalCase.input_segments, null, 2),
13863
14151
  output_messages: JSON.stringify(context2.evalCase.output_segments, null, 2),
13864
14152
  candidate_answer: context2.candidate,
13865
- reference_answer: context2.evalCase.reference_answer,
14153
+ reference_answer: context2.evalCase.reference_answer ?? "",
13866
14154
  expected_outcome: context2.evalCase.expected_outcome,
13867
- question: context2.evalCase.question
14155
+ question: formattedQuestion
13868
14156
  };
13869
14157
  prompt = substituteVariables(systemPrompt, variables);
13870
- systemPrompt = QUALITY_SYSTEM_PROMPT;
14158
+ systemPrompt = buildSystemPrompt(hasReferenceAnswer);
13871
14159
  }
13872
14160
  const metadata = {
13873
14161
  ...systemPrompt !== void 0 ? { systemPrompt } : {},
@@ -13905,38 +14193,51 @@ var LlmJudgeEvaluator = class {
13905
14193
  };
13906
14194
  }
13907
14195
  };
13908
- var QUALITY_SYSTEM_PROMPT = [
13909
- "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
13910
- "",
13911
- "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but it should capture the key points and follow the same spirit.",
13912
- "",
13913
- "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
13914
- "",
13915
- "You must respond with a single JSON object matching this schema:",
13916
- "",
13917
- "{",
13918
- ' "score": <number between 0.0 and 1.0>,',
13919
- ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
13920
- ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
13921
- ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
13922
- "}"
13923
- ].join("\n");
13924
- function buildQualityPrompt(evalCase, candidate) {
14196
+ function buildSystemPrompt(hasReferenceAnswer) {
14197
+ const basePrompt = [
14198
+ "You are an expert evaluator. Your goal is to grade the candidate_answer based on how well it achieves the expected_outcome for the original task.",
14199
+ ""
14200
+ ];
14201
+ if (hasReferenceAnswer) {
14202
+ basePrompt.push(
14203
+ "Use the reference_answer as a gold standard for a high-quality response. The candidate_answer does not need to match it verbatim, but should capture the key points and follow the same spirit.",
14204
+ ""
14205
+ );
14206
+ }
14207
+ basePrompt.push(
14208
+ "Be concise and focused in your evaluation. Provide succinct, specific feedback rather than verbose explanations.",
14209
+ "",
14210
+ "You must respond with a single JSON object matching this schema:",
14211
+ "",
14212
+ "{",
14213
+ ' "score": <number between 0.0 and 1.0>,',
14214
+ ' "hits": [<array of strings, max 4 items, brief specific achievements>],',
14215
+ ' "misses": [<array of strings, max 4 items, brief specific failures or omissions, empty if none>],',
14216
+ ' "reasoning": "<string, concise explanation for the score, 1-2 sentences max>"',
14217
+ "}"
14218
+ );
14219
+ return basePrompt.join("\n");
14220
+ }
14221
+ function buildQualityPrompt(evalCase, candidate, question) {
13925
14222
  const parts = [
13926
14223
  "[[ ## expected_outcome ## ]]",
13927
14224
  evalCase.expected_outcome.trim(),
13928
14225
  "",
13929
14226
  "[[ ## question ## ]]",
13930
- evalCase.question.trim(),
13931
- "",
13932
- "[[ ## reference_answer ## ]]",
13933
- evalCase.reference_answer.trim(),
13934
- "",
13935
- "[[ ## candidate_answer ## ]]",
13936
- candidate.trim(),
13937
- "",
13938
- "Respond with a single JSON object matching the schema described in the system prompt."
14227
+ question.trim(),
14228
+ ""
13939
14229
  ];
14230
+ if (hasNonEmptyReferenceAnswer(evalCase)) {
14231
+ parts.push(
14232
+ "[[ ## reference_answer ## ]]",
14233
+ evalCase.reference_answer.trim(),
14234
+ ""
14235
+ );
14236
+ }
14237
+ parts.push(
14238
+ "[[ ## candidate_answer ## ]]",
14239
+ candidate.trim()
14240
+ );
13940
14241
  return parts.join("\n");
13941
14242
  }
13942
14243
  function clampScore(value) {
@@ -14019,6 +14320,9 @@ function extractJsonBlob(text) {
14019
14320
  function isNonEmptyString(value) {
14020
14321
  return typeof value === "string" && value.trim().length > 0;
14021
14322
  }
14323
+ function hasNonEmptyReferenceAnswer(evalCase) {
14324
+ return evalCase.reference_answer !== void 0 && evalCase.reference_answer.trim().length > 0;
14325
+ }
14022
14326
  var CodeEvaluator = class {
14023
14327
  kind = "code";
14024
14328
  script;
@@ -14405,10 +14709,11 @@ async function runEvaluation(options) {
14405
14709
  await onProgress({
14406
14710
  workerId,
14407
14711
  evalId: evalCase.id,
14408
- status: "completed",
14712
+ status: result.error ? "failed" : "completed",
14409
14713
  startedAt: 0,
14410
14714
  // Not used for completed status
14411
- completedAt: Date.now()
14715
+ completedAt: Date.now(),
14716
+ error: result.error
14412
14717
  });
14413
14718
  }
14414
14719
  if (onResult) {
@@ -14665,11 +14970,27 @@ async function evaluateCandidate(options) {
14665
14970
  agentTimeoutMs
14666
14971
  });
14667
14972
  const completedAt = nowFn();
14668
- const rawRequest = {
14669
- question: promptInputs.question,
14670
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
14671
- guideline_paths: evalCase.guideline_paths
14672
- };
14973
+ let agentProviderRequest;
14974
+ let lmProviderRequest;
14975
+ if (isAgentProvider(provider)) {
14976
+ agentProviderRequest = {
14977
+ question: promptInputs.question,
14978
+ guideline_paths: evalCase.guideline_paths
14979
+ };
14980
+ } else {
14981
+ if (promptInputs.chatPrompt) {
14982
+ lmProviderRequest = {
14983
+ chat_prompt: promptInputs.chatPrompt,
14984
+ guideline_paths: evalCase.guideline_paths
14985
+ };
14986
+ } else {
14987
+ lmProviderRequest = {
14988
+ question: promptInputs.question,
14989
+ guidelines: promptInputs.guidelines,
14990
+ guideline_paths: evalCase.guideline_paths
14991
+ };
14992
+ }
14993
+ }
14673
14994
  return {
14674
14995
  eval_id: evalCase.id,
14675
14996
  dataset: evalCase.dataset,
@@ -14683,7 +15004,8 @@ async function evaluateCandidate(options) {
14683
15004
  timestamp: completedAt.toISOString(),
14684
15005
  reasoning: score.reasoning,
14685
15006
  raw_aspects: score.rawAspects,
14686
- raw_request: rawRequest,
15007
+ agent_provider_request: agentProviderRequest,
15008
+ lm_provider_request: lmProviderRequest,
14687
15009
  evaluator_raw_request: evaluatorResults ? void 0 : score.evaluatorRawRequest,
14688
15010
  evaluator_results: evaluatorResults
14689
15011
  };
@@ -14912,6 +15234,7 @@ async function invokeProvider(provider, options) {
14912
15234
  question: promptInputs.question,
14913
15235
  guidelines: promptInputs.guidelines,
14914
15236
  guideline_patterns: evalCase.guideline_patterns,
15237
+ chatPrompt: promptInputs.chatPrompt,
14915
15238
  inputFiles: evalCase.file_paths,
14916
15239
  evalCaseId: evalCase.id,
14917
15240
  attempt,
@@ -14928,12 +15251,30 @@ async function invokeProvider(provider, options) {
14928
15251
  }
14929
15252
  function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs, provider) {
14930
15253
  const message = error instanceof Error ? error.message : String(error);
14931
- const rawRequest = {
14932
- question: promptInputs.question,
14933
- ...isAgentProvider(provider) ? {} : { guidelines: promptInputs.guidelines },
14934
- guideline_paths: evalCase.guideline_paths,
14935
- error: message
14936
- };
15254
+ let agentProviderRequest;
15255
+ let lmProviderRequest;
15256
+ if (isAgentProvider(provider)) {
15257
+ agentProviderRequest = {
15258
+ question: promptInputs.question,
15259
+ guideline_paths: evalCase.guideline_paths,
15260
+ error: message
15261
+ };
15262
+ } else {
15263
+ if (promptInputs.chatPrompt) {
15264
+ lmProviderRequest = {
15265
+ chat_prompt: promptInputs.chatPrompt,
15266
+ guideline_paths: evalCase.guideline_paths,
15267
+ error: message
15268
+ };
15269
+ } else {
15270
+ lmProviderRequest = {
15271
+ question: promptInputs.question,
15272
+ guidelines: promptInputs.guidelines,
15273
+ guideline_paths: evalCase.guideline_paths,
15274
+ error: message
15275
+ };
15276
+ }
15277
+ }
14937
15278
  return {
14938
15279
  eval_id: evalCase.id,
14939
15280
  dataset: evalCase.dataset,
@@ -14946,7 +15287,9 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
14946
15287
  target: targetName,
14947
15288
  timestamp: timestamp.toISOString(),
14948
15289
  raw_aspects: [],
14949
- raw_request: rawRequest
15290
+ agent_provider_request: agentProviderRequest,
15291
+ lm_provider_request: lmProviderRequest,
15292
+ error: message
14950
15293
  };
14951
15294
  }
14952
15295
  function createCacheKey(provider, target, evalCase, promptInputs) {
@@ -14957,6 +15300,9 @@ function createCacheKey(provider, target, evalCase, promptInputs) {
14957
15300
  hash.update(promptInputs.question);
14958
15301
  hash.update(promptInputs.guidelines);
14959
15302
  hash.update(promptInputs.systemMessage ?? "");
15303
+ if (promptInputs.chatPrompt) {
15304
+ hash.update(JSON.stringify(promptInputs.chatPrompt));
15305
+ }
14960
15306
  return hash.digest("hex");
14961
15307
  }
14962
15308
  function isTimeoutLike(error) {
@@ -15384,8 +15730,6 @@ import { stripVTControlCharacters } from "node:util";
15384
15730
  var ESC = "\x1B[";
15385
15731
  var CLEAR_LINE = `${ESC}K`;
15386
15732
  var MOVE_CURSOR_UP = `${ESC}1A`;
15387
- var SYNC_START = `${ESC}?2026h`;
15388
- var SYNC_END = `${ESC}?2026l`;
15389
15733
  var ProgressDisplay = class {
15390
15734
  workers = /* @__PURE__ */ new Map();
15391
15735
  maxWorkers;
@@ -15624,6 +15968,8 @@ function buildHistogram(values) {
15624
15968
  function calculateEvaluationSummary(results) {
15625
15969
  const scores = results.map((result) => result.score);
15626
15970
  const total = results.length;
15971
+ const errors = results.filter((result) => result.error !== void 0).map((result) => ({ evalId: result.eval_id, error: result.error }));
15972
+ const errorCount = errors.length;
15627
15973
  if (total === 0) {
15628
15974
  return {
15629
15975
  total: 0,
@@ -15634,7 +15980,9 @@ function calculateEvaluationSummary(results) {
15634
15980
  standardDeviation: void 0,
15635
15981
  histogram: buildHistogram([]),
15636
15982
  topResults: [],
15637
- bottomResults: []
15983
+ bottomResults: [],
15984
+ errorCount: 0,
15985
+ errors: []
15638
15986
  };
15639
15987
  }
15640
15988
  const mean = computeMean(scores);
@@ -15655,7 +16003,9 @@ function calculateEvaluationSummary(results) {
15655
16003
  standardDeviation,
15656
16004
  histogram,
15657
16005
  topResults,
15658
- bottomResults
16006
+ bottomResults,
16007
+ errorCount,
16008
+ errors
15659
16009
  };
15660
16010
  }
15661
16011
  function formatScore(value) {
@@ -15666,10 +16016,25 @@ function formatEvaluationSummary(summary) {
15666
16016
  return "\nNo results to summarize";
15667
16017
  }
15668
16018
  const lines = [];
16019
+ if (summary.errorCount > 0) {
16020
+ lines.push("\n==================================================");
16021
+ lines.push("ERRORS");
16022
+ lines.push("==================================================");
16023
+ summary.errors.forEach((error) => {
16024
+ lines.push(`
16025
+ \u274C ${error.evalId}`);
16026
+ lines.push(` ${error.error}`);
16027
+ });
16028
+ lines.push("");
16029
+ }
15669
16030
  lines.push("\n==================================================");
15670
16031
  lines.push("EVALUATION SUMMARY");
15671
16032
  lines.push("==================================================");
15672
16033
  lines.push(`Total eval cases: ${summary.total}`);
16034
+ if (summary.errorCount > 0) {
16035
+ lines.push(`Failed: ${summary.errorCount}`);
16036
+ lines.push(`Passed: ${summary.total - summary.errorCount}`);
16037
+ }
15673
16038
  lines.push(`Mean score: ${formatScore(summary.mean)}`);
15674
16039
  lines.push(`Median score: ${formatScore(summary.median)}`);
15675
16040
  lines.push(`Min score: ${formatScore(summary.min)}`);
@@ -15708,7 +16073,7 @@ import { readFile as readFile5 } from "node:fs/promises";
15708
16073
  import path33 from "node:path";
15709
16074
  import { parse as parse5 } from "yaml";
15710
16075
  var SCHEMA_EVAL_V22 = "agentv-eval-v2";
15711
- var SCHEMA_TARGETS_V2 = "agentv-targets-v2.1";
16076
+ var SCHEMA_TARGETS_V2 = "agentv-targets-v2.2";
15712
16077
  var SCHEMA_CONFIG_V22 = "agentv-config-v2";
15713
16078
  async function detectFileType(filePath) {
15714
16079
  try {
@@ -15840,14 +16205,14 @@ async function validateEvalFile(filePath) {
15840
16205
  validateMessages(inputMessages, `${location}.input_messages`, absolutePath, errors);
15841
16206
  }
15842
16207
  const expectedMessages = evalCase["expected_messages"];
15843
- if (!Array.isArray(expectedMessages)) {
16208
+ if (expectedMessages !== void 0 && !Array.isArray(expectedMessages)) {
15844
16209
  errors.push({
15845
16210
  severity: "error",
15846
16211
  filePath: absolutePath,
15847
16212
  location: `${location}.expected_messages`,
15848
- message: "Missing or invalid 'expected_messages' field (must be an array)"
16213
+ message: "Invalid 'expected_messages' field (must be an array if provided)"
15849
16214
  });
15850
- } else {
16215
+ } else if (Array.isArray(expectedMessages)) {
15851
16216
  validateMessages(expectedMessages, `${location}.expected_messages`, absolutePath, errors);
15852
16217
  }
15853
16218
  }
@@ -15883,11 +16248,13 @@ function validateMessages(messages, location, filePath, errors) {
15883
16248
  }
15884
16249
  const content = message["content"];
15885
16250
  if (typeof content === "string") {
16251
+ validateContentForRoleMarkers(content, `${msgLocation}.content`, filePath, errors);
15886
16252
  } else if (Array.isArray(content)) {
15887
16253
  for (let j2 = 0; j2 < content.length; j2++) {
15888
16254
  const contentItem = content[j2];
15889
16255
  const contentLocation = `${msgLocation}.content[${j2}]`;
15890
16256
  if (typeof contentItem === "string") {
16257
+ validateContentForRoleMarkers(contentItem, contentLocation, filePath, errors);
15891
16258
  } else if (isObject(contentItem)) {
15892
16259
  const type = contentItem["type"];
15893
16260
  if (typeof type !== "string") {
@@ -15907,6 +16274,8 @@ function validateMessages(messages, location, filePath, errors) {
15907
16274
  location: `${contentLocation}.value`,
15908
16275
  message: "Content with type 'text' must have a 'value' field"
15909
16276
  });
16277
+ } else {
16278
+ validateContentForRoleMarkers(value, `${contentLocation}.value`, filePath, errors);
15910
16279
  }
15911
16280
  }
15912
16281
  } else {
@@ -15928,6 +16297,19 @@ function validateMessages(messages, location, filePath, errors) {
15928
16297
  }
15929
16298
  }
15930
16299
  }
16300
+ function validateContentForRoleMarkers(content, location, filePath, errors) {
16301
+ const markers = ["@[System]:", "@[User]:", "@[Assistant]:", "@[Tool]:"];
16302
+ for (const marker of markers) {
16303
+ if (content.toLowerCase().includes(marker.toLowerCase())) {
16304
+ errors.push({
16305
+ severity: "warning",
16306
+ filePath,
16307
+ location,
16308
+ message: `Content contains potential role marker '${marker}'. This may confuse agentic providers or cause prompt injection.`
16309
+ });
16310
+ }
16311
+ }
16312
+ }
15931
16313
  function isObject2(value) {
15932
16314
  return typeof value === "object" && value !== null && !Array.isArray(value);
15933
16315
  }
@@ -15935,8 +16317,21 @@ var COMMON_SETTINGS = /* @__PURE__ */ new Set([
15935
16317
  "provider_batching",
15936
16318
  "providerBatching"
15937
16319
  ]);
16320
+ var RETRY_SETTINGS = /* @__PURE__ */ new Set([
16321
+ "max_retries",
16322
+ "maxRetries",
16323
+ "retry_initial_delay_ms",
16324
+ "retryInitialDelayMs",
16325
+ "retry_max_delay_ms",
16326
+ "retryMaxDelayMs",
16327
+ "retry_backoff_factor",
16328
+ "retryBackoffFactor",
16329
+ "retry_status_codes",
16330
+ "retryStatusCodes"
16331
+ ]);
15938
16332
  var AZURE_SETTINGS = /* @__PURE__ */ new Set([
15939
16333
  ...COMMON_SETTINGS,
16334
+ ...RETRY_SETTINGS,
15940
16335
  "endpoint",
15941
16336
  "resource",
15942
16337
  "resourceName",
@@ -15953,6 +16348,7 @@ var AZURE_SETTINGS = /* @__PURE__ */ new Set([
15953
16348
  ]);
15954
16349
  var ANTHROPIC_SETTINGS = /* @__PURE__ */ new Set([
15955
16350
  ...COMMON_SETTINGS,
16351
+ ...RETRY_SETTINGS,
15956
16352
  "api_key",
15957
16353
  "apiKey",
15958
16354
  "model",
@@ -15966,6 +16362,7 @@ var ANTHROPIC_SETTINGS = /* @__PURE__ */ new Set([
15966
16362
  ]);
15967
16363
  var GEMINI_SETTINGS = /* @__PURE__ */ new Set([
15968
16364
  ...COMMON_SETTINGS,
16365
+ ...RETRY_SETTINGS,
15969
16366
  "api_key",
15970
16367
  "apiKey",
15971
16368
  "model",
@@ -16053,13 +16450,14 @@ function getKnownSettings(provider) {
16053
16450
  return null;
16054
16451
  }
16055
16452
  }
16056
- function validateUnknownSettings(settings, provider, absolutePath, location, errors) {
16453
+ function validateUnknownSettings(target, provider, absolutePath, location, errors) {
16057
16454
  const knownSettings = getKnownSettings(provider);
16058
16455
  if (!knownSettings) {
16059
16456
  return;
16060
16457
  }
16061
- for (const key2 of Object.keys(settings)) {
16062
- if (!knownSettings.has(key2)) {
16458
+ const baseFields = /* @__PURE__ */ new Set(["name", "provider", "judge_target", "workers", "$schema", "targets"]);
16459
+ for (const key2 of Object.keys(target)) {
16460
+ if (!baseFields.has(key2) && !knownSettings.has(key2)) {
16063
16461
  errors.push({
16064
16462
  severity: "warning",
16065
16463
  filePath: absolutePath,
@@ -16089,17 +16487,8 @@ async function validateTargetsFile(filePath) {
16089
16487
  errors
16090
16488
  };
16091
16489
  }
16092
- function validateCliSettings(settings, absolutePath2, location, errors2) {
16093
- if (!isObject2(settings)) {
16094
- errors2.push({
16095
- severity: "error",
16096
- filePath: absolutePath2,
16097
- location,
16098
- message: "CLI provider requires a 'settings' object"
16099
- });
16100
- return;
16101
- }
16102
- const commandTemplate = settings["command_template"] ?? settings["commandTemplate"];
16490
+ function validateCliSettings(target, absolutePath2, location, errors2) {
16491
+ const commandTemplate = target["command_template"] ?? target["commandTemplate"];
16103
16492
  if (typeof commandTemplate !== "string" || commandTemplate.trim().length === 0) {
16104
16493
  errors2.push({
16105
16494
  severity: "error",
@@ -16110,7 +16499,7 @@ async function validateTargetsFile(filePath) {
16110
16499
  } else {
16111
16500
  recordUnknownPlaceholders(commandTemplate, absolutePath2, `${location}.commandTemplate`, errors2);
16112
16501
  }
16113
- const attachmentsFormat = settings["attachments_format"] ?? settings["attachmentsFormat"];
16502
+ const attachmentsFormat = target["attachments_format"] ?? target["attachmentsFormat"];
16114
16503
  if (attachmentsFormat !== void 0 && typeof attachmentsFormat !== "string") {
16115
16504
  errors2.push({
16116
16505
  severity: "error",
@@ -16119,7 +16508,7 @@ async function validateTargetsFile(filePath) {
16119
16508
  message: "'attachmentsFormat' must be a string when provided"
16120
16509
  });
16121
16510
  }
16122
- const filesFormat = settings["files_format"] ?? settings["filesFormat"];
16511
+ const filesFormat = target["files_format"] ?? target["filesFormat"];
16123
16512
  if (filesFormat !== void 0 && typeof filesFormat !== "string") {
16124
16513
  errors2.push({
16125
16514
  severity: "error",
@@ -16128,7 +16517,7 @@ async function validateTargetsFile(filePath) {
16128
16517
  message: "'filesFormat' must be a string when provided"
16129
16518
  });
16130
16519
  }
16131
- const cwd = settings["cwd"];
16520
+ const cwd = target["cwd"];
16132
16521
  if (cwd !== void 0 && typeof cwd !== "string") {
16133
16522
  errors2.push({
16134
16523
  severity: "error",
@@ -16137,7 +16526,7 @@ async function validateTargetsFile(filePath) {
16137
16526
  message: "'cwd' must be a string when provided"
16138
16527
  });
16139
16528
  }
16140
- const timeoutSeconds = settings["timeout_seconds"] ?? settings["timeoutSeconds"];
16529
+ const timeoutSeconds = target["timeout_seconds"] ?? target["timeoutSeconds"];
16141
16530
  if (timeoutSeconds !== void 0) {
16142
16531
  const numericTimeout = Number(timeoutSeconds);
16143
16532
  if (!Number.isFinite(numericTimeout) || numericTimeout <= 0) {
@@ -16149,29 +16538,7 @@ async function validateTargetsFile(filePath) {
16149
16538
  });
16150
16539
  }
16151
16540
  }
16152
- const envOverrides = settings["env"];
16153
- if (envOverrides !== void 0) {
16154
- if (!isObject2(envOverrides)) {
16155
- errors2.push({
16156
- severity: "error",
16157
- filePath: absolutePath2,
16158
- location: `${location}.env`,
16159
- message: "'env' must be an object with string values"
16160
- });
16161
- } else {
16162
- for (const [key2, value] of Object.entries(envOverrides)) {
16163
- if (typeof value !== "string" || value.trim().length === 0) {
16164
- errors2.push({
16165
- severity: "error",
16166
- filePath: absolutePath2,
16167
- location: `${location}.env.${key2}`,
16168
- message: `Environment override '${key2}' must be a non-empty string`
16169
- });
16170
- }
16171
- }
16172
- }
16173
- }
16174
- const healthcheck = settings["healthcheck"];
16541
+ const healthcheck = target["healthcheck"];
16175
16542
  if (healthcheck !== void 0) {
16176
16543
  validateCliHealthcheck(healthcheck, absolutePath2, `${location}.healthcheck`, errors2);
16177
16544
  }
@@ -16342,20 +16709,11 @@ async function validateTargetsFile(filePath) {
16342
16709
  message: `Unknown provider '${provider}'. Known providers: ${knownProviders.join(", ")}`
16343
16710
  });
16344
16711
  }
16345
- const settings = target["settings"];
16346
- if (providerValue !== "cli" && settings !== void 0 && !isObject2(settings)) {
16347
- errors.push({
16348
- severity: "error",
16349
- filePath: absolutePath,
16350
- location: `${location}.settings`,
16351
- message: "Invalid 'settings' field (must be an object)"
16352
- });
16353
- }
16354
16712
  if (providerValue === "cli") {
16355
- validateCliSettings(settings, absolutePath, `${location}.settings`, errors);
16713
+ validateCliSettings(target, absolutePath, location, errors);
16356
16714
  }
16357
- if (settings !== void 0 && isObject2(settings) && typeof provider === "string") {
16358
- validateUnknownSettings(settings, provider, absolutePath, `${location}.settings`, errors);
16715
+ if (typeof provider === "string") {
16716
+ validateUnknownSettings(target, provider, absolutePath, location, errors);
16359
16717
  }
16360
16718
  const judgeTarget = target["judge_target"];
16361
16719
  if (judgeTarget !== void 0 && typeof judgeTarget !== "string") {
@@ -17566,4 +17924,4 @@ export {
17566
17924
  createProgram,
17567
17925
  runCli
17568
17926
  };
17569
- //# sourceMappingURL=chunk-J3LVKRRT.js.map
17927
+ //# sourceMappingURL=chunk-J5HK75TC.js.map