agentv 4.6.1 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
301
301
  }
302
302
  });
303
303
 
304
- // ../../packages/core/dist/chunk-ZK4GG7PR.js
304
+ // ../../packages/core/dist/chunk-75RFVESM.js
305
305
  import { constants } from "node:fs";
306
306
  import { access, readFile } from "node:fs/promises";
307
307
  import path from "node:path";
@@ -419,7 +419,7 @@ __export(external_exports2, {
419
419
  void: () => voidType
420
420
  });
421
421
 
422
- // ../../packages/core/dist/chunk-ZK4GG7PR.js
422
+ // ../../packages/core/dist/chunk-75RFVESM.js
423
423
  import { readFile as readFile2 } from "node:fs/promises";
424
424
  import path3 from "node:path";
425
425
  import fg from "fast-glob";
@@ -633,15 +633,13 @@ async function resolveFileReference(rawValue, searchRoots) {
633
633
  }
634
634
  var CliHealthcheckHttpInputSchema = external_exports2.object({
635
635
  url: external_exports2.string().min(1, "healthcheck URL is required"),
636
- timeout_seconds: external_exports2.number().positive().optional(),
637
- timeoutSeconds: external_exports2.number().positive().optional()
638
- });
636
+ timeout_seconds: external_exports2.number().positive().optional()
637
+ }).passthrough();
639
638
  var CliHealthcheckCommandInputSchema = external_exports2.object({
640
639
  command: external_exports2.string().min(1, "healthcheck command is required"),
641
640
  cwd: external_exports2.string().optional(),
642
- timeout_seconds: external_exports2.number().positive().optional(),
643
- timeoutSeconds: external_exports2.number().positive().optional()
644
- });
641
+ timeout_seconds: external_exports2.number().positive().optional()
642
+ }).passthrough();
645
643
  var CliHealthcheckInputSchema = external_exports2.union([
646
644
  CliHealthcheckHttpInputSchema,
647
645
  CliHealthcheckCommandInputSchema
@@ -653,36 +651,28 @@ var CliTargetInputSchema = external_exports2.object({
653
651
  command: external_exports2.string(),
654
652
  // Files format - optional
655
653
  files_format: external_exports2.string().optional(),
656
- filesFormat: external_exports2.string().optional(),
657
654
  attachments_format: external_exports2.string().optional(),
658
- attachmentsFormat: external_exports2.string().optional(),
659
655
  // Working directory - optional
660
656
  cwd: external_exports2.string().optional(),
661
657
  // Workspace template directory - optional (mutually exclusive with cwd)
662
658
  workspace_template: external_exports2.string().optional(),
663
- workspaceTemplate: external_exports2.string().optional(),
664
659
  // Timeout in seconds - optional
665
660
  timeout_seconds: external_exports2.number().positive().optional(),
666
- timeoutSeconds: external_exports2.number().positive().optional(),
667
661
  // Healthcheck configuration - optional
668
662
  healthcheck: CliHealthcheckInputSchema.optional(),
669
663
  // Verbose mode - optional
670
664
  verbose: external_exports2.boolean().optional(),
671
665
  cli_verbose: external_exports2.boolean().optional(),
672
- cliVerbose: external_exports2.boolean().optional(),
673
666
  // Keep temp files - optional
674
667
  keep_temp_files: external_exports2.boolean().optional(),
675
- keepTempFiles: external_exports2.boolean().optional(),
676
668
  keep_output_files: external_exports2.boolean().optional(),
677
- keepOutputFiles: external_exports2.boolean().optional(),
678
669
  // Common target fields
679
670
  grader_target: external_exports2.string().optional(),
680
671
  judge_target: external_exports2.string().optional(),
681
672
  // backward compat
682
673
  workers: external_exports2.number().int().min(1).optional(),
683
- provider_batching: external_exports2.boolean().optional(),
684
- providerBatching: external_exports2.boolean().optional()
685
- });
674
+ provider_batching: external_exports2.boolean().optional()
675
+ }).passthrough();
686
676
  var CliHealthcheckHttpSchema = external_exports2.object({
687
677
  url: external_exports2.string().min(1),
688
678
  timeoutMs: external_exports2.number().positive().optional()
@@ -707,7 +697,7 @@ var CliTargetConfigSchema = external_exports2.object({
707
697
  keepTempFiles: external_exports2.boolean().optional()
708
698
  }).strict();
709
699
  function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
710
- const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
700
+ const timeoutSeconds = input.timeout_seconds;
711
701
  const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
712
702
  if ("url" in input && input.url) {
713
703
  const url = resolveString(input.url, env, `${targetName} healthcheck URL`);
@@ -741,9 +731,9 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
741
731
  function normalizeCliTargetInput(input, env, evalFilePath) {
742
732
  const targetName = input.name;
743
733
  const command = resolveString(input.command, env, `${targetName} CLI command`, true);
744
- const filesFormatSource = input.files_format ?? input.filesFormat ?? input.attachments_format ?? input.attachmentsFormat;
734
+ const filesFormatSource = input.files_format ?? input.attachments_format;
745
735
  const filesFormat = resolveOptionalLiteralString(filesFormatSource);
746
- const workspaceTemplateSource = input.workspace_template ?? input.workspaceTemplate;
736
+ const workspaceTemplateSource = input.workspace_template;
747
737
  let workspaceTemplate = resolveOptionalString(
748
738
  workspaceTemplateSource,
749
739
  env,
@@ -771,12 +761,10 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
771
761
  if (!cwd && !workspaceTemplate && evalFilePath) {
772
762
  cwd = path2.dirname(path2.resolve(evalFilePath));
773
763
  }
774
- const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
764
+ const timeoutSeconds = input.timeout_seconds;
775
765
  const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
776
- const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose ?? input.cliVerbose);
777
- const keepTempFiles = resolveOptionalBoolean(
778
- input.keep_temp_files ?? input.keepTempFiles ?? input.keep_output_files ?? input.keepOutputFiles
779
- );
766
+ const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose);
767
+ const keepTempFiles = resolveOptionalBoolean(input.keep_temp_files ?? input.keep_output_files);
780
768
  const healthcheck = input.healthcheck ? normalizeCliHealthcheck(input.healthcheck, env, targetName, evalFilePath) : void 0;
781
769
  return {
782
770
  command,
@@ -797,14 +785,104 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
797
785
  "FILES",
798
786
  "OUTPUT_FILE"
799
787
  ]);
788
+ var DEPRECATED_TARGET_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
789
+ ["providerBatching", "provider_batching"],
790
+ ["subagentModeAllowed", "subagent_mode_allowed"],
791
+ ["fallbackTargets", "fallback_targets"],
792
+ ["resourceName", "endpoint"],
793
+ ["baseUrl", "base_url"],
794
+ ["apiKey", "api_key"],
795
+ ["deploymentName", "model"],
796
+ ["thinkingBudget", "thinking_budget"],
797
+ ["maxTokens", "max_output_tokens"],
798
+ ["apiFormat", "api_format"],
799
+ ["timeoutSeconds", "timeout_seconds"],
800
+ ["logDir", "log_dir"],
801
+ ["logDirectory", "log_directory"],
802
+ ["logFormat", "log_format"],
803
+ ["logOutputFormat", "log_output_format"],
804
+ ["systemPrompt", "system_prompt"],
805
+ ["maxTurns", "max_turns"],
806
+ ["maxBudgetUsd", "max_budget_usd"],
807
+ ["dryRun", "dry_run"],
808
+ ["subagentRoot", "subagent_root"],
809
+ ["filesFormat", "files_format"],
810
+ ["attachmentsFormat", "attachments_format"],
811
+ ["cliUrl", "cli_url"],
812
+ ["cliPath", "cli_path"],
813
+ ["githubToken", "github_token"],
814
+ ["sessionDir", "session_dir"],
815
+ ["sessionId", "session_id"],
816
+ ["sessionStateDir", "session_state_dir"],
817
+ ["maxRetries", "max_retries"],
818
+ ["retryInitialDelayMs", "retry_initial_delay_ms"],
819
+ ["retryMaxDelayMs", "retry_max_delay_ms"],
820
+ ["retryBackoffFactor", "retry_backoff_factor"],
821
+ ["retryStatusCodes", "retry_status_codes"]
822
+ ]);
823
+ var DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
824
+ ["timeoutSeconds", "timeout_seconds"]
825
+ ]);
826
+ function collectDeprecatedCamelCaseWarnings(value, location, aliases) {
827
+ if (typeof value !== "object" || value === null || Array.isArray(value)) {
828
+ return [];
829
+ }
830
+ const warnings = [];
831
+ for (const [camelCaseField, snakeCaseField] of aliases) {
832
+ if (Object.prototype.hasOwnProperty.call(value, camelCaseField)) {
833
+ warnings.push({
834
+ location: `${location}.${camelCaseField}`,
835
+ message: `camelCase field '${camelCaseField}' is no longer supported in targets.yaml. Use '${snakeCaseField}' instead.`
836
+ });
837
+ }
838
+ }
839
+ return warnings;
840
+ }
841
+ function assertNoDeprecatedCamelCaseTargetFields(definition) {
842
+ if (Object.prototype.hasOwnProperty.call(definition, "workspaceTemplate")) {
843
+ throw new Error(
844
+ `${definition.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
845
+ );
846
+ }
847
+ const warning = findDeprecatedCamelCaseTargetWarnings(
848
+ definition,
849
+ `target "${definition.name}"`
850
+ )[0];
851
+ if (!warning) {
852
+ return;
853
+ }
854
+ const fieldMatch = warning.message.match(/field '([^']+)'/);
855
+ const replacementMatch = warning.message.match(/Use '([^']+)' instead/);
856
+ const field = fieldMatch?.[1] ?? "unknown";
857
+ const replacement = replacementMatch?.[1] ?? "snake_case";
858
+ throw new Error(
859
+ `${warning.location}: camelCase field '${field}' is no longer supported in targets.yaml. Use '${replacement}' instead.`
860
+ );
861
+ }
862
+ function findDeprecatedCamelCaseTargetWarnings(target, location) {
863
+ const warnings = collectDeprecatedCamelCaseWarnings(
864
+ target,
865
+ location,
866
+ DEPRECATED_TARGET_CAMEL_CASE_FIELDS
867
+ );
868
+ if (typeof target !== "object" || target === null || Array.isArray(target)) {
869
+ return warnings;
870
+ }
871
+ const healthcheck = target.healthcheck;
872
+ warnings.push(
873
+ ...collectDeprecatedCamelCaseWarnings(
874
+ healthcheck,
875
+ `${location}.healthcheck`,
876
+ DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS
877
+ )
878
+ );
879
+ return warnings;
880
+ }
800
881
  var COMMON_TARGET_SETTINGS = [
801
882
  "use_target",
802
883
  "provider_batching",
803
- "providerBatching",
804
884
  "subagent_mode_allowed",
805
- "subagentModeAllowed",
806
- "fallback_targets",
807
- "fallbackTargets"
885
+ "fallback_targets"
808
886
  ];
809
887
  var USE_TARGET_ENV_PATTERN = /^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i;
810
888
  var BASE_TARGET_SCHEMA = external_exports2.object({
@@ -816,43 +894,40 @@ var BASE_TARGET_SCHEMA = external_exports2.object({
816
894
  // backward compat
817
895
  workers: external_exports2.number().int().min(1).optional(),
818
896
  workspace_template: external_exports2.string().optional(),
819
- workspaceTemplate: external_exports2.string().optional(),
820
897
  subagent_mode_allowed: external_exports2.boolean().optional(),
821
- fallback_targets: external_exports2.array(external_exports2.string().min(1)).optional(),
822
- fallbackTargets: external_exports2.array(external_exports2.string().min(1)).optional()
898
+ fallback_targets: external_exports2.array(external_exports2.string().min(1)).optional()
823
899
  }).passthrough();
824
900
  var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
901
+ var DEFAULT_AZURE_RESPONSES_API_VERSION = "v1";
825
902
  var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
826
- function normalizeAzureApiVersion(value) {
903
+ function normalizeAzureApiVersion(value, apiFormat) {
904
+ const defaultVersion = apiFormat === "responses" ? DEFAULT_AZURE_RESPONSES_API_VERSION : DEFAULT_AZURE_API_VERSION;
827
905
  if (!value) {
828
- return DEFAULT_AZURE_API_VERSION;
906
+ return defaultVersion;
829
907
  }
830
908
  const trimmed = value.trim();
831
909
  if (trimmed.length === 0) {
832
- return DEFAULT_AZURE_API_VERSION;
910
+ return defaultVersion;
833
911
  }
834
912
  const withoutPrefix = trimmed.replace(/^api[-_]?version\s*=\s*/i, "").trim();
835
- return withoutPrefix.length > 0 ? withoutPrefix : DEFAULT_AZURE_API_VERSION;
913
+ return withoutPrefix.length > 0 ? withoutPrefix : defaultVersion;
836
914
  }
837
915
  function resolveRetryConfig(target) {
838
- const maxRetries = resolveOptionalNumber(
839
- target.max_retries ?? target.maxRetries,
840
- `${target.name} max retries`
841
- );
916
+ const maxRetries = resolveOptionalNumber(target.max_retries, `${target.name} max retries`);
842
917
  const initialDelayMs = resolveOptionalNumber(
843
- target.retry_initial_delay_ms ?? target.retryInitialDelayMs,
918
+ target.retry_initial_delay_ms,
844
919
  `${target.name} retry initial delay`
845
920
  );
846
921
  const maxDelayMs = resolveOptionalNumber(
847
- target.retry_max_delay_ms ?? target.retryMaxDelayMs,
922
+ target.retry_max_delay_ms,
848
923
  `${target.name} retry max delay`
849
924
  );
850
925
  const backoffFactor = resolveOptionalNumber(
851
- target.retry_backoff_factor ?? target.retryBackoffFactor,
926
+ target.retry_backoff_factor,
852
927
  `${target.name} retry backoff factor`
853
928
  );
854
929
  const retryableStatusCodes = resolveOptionalNumberArray(
855
- target.retry_status_codes ?? target.retryStatusCodes,
930
+ target.retry_status_codes,
856
931
  `${target.name} retry status codes`
857
932
  );
858
933
  if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
@@ -912,9 +987,10 @@ function resolveDelegatedTargetDefinition(name21, definitions, env = process.env
912
987
  `Target "${name21}" exceeded the maximum use_target resolution depth (10). Check for a delegation loop or overly deep alias chain.`
913
988
  );
914
989
  }
915
- function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
990
+ function resolveTargetDefinition(definition, env = process.env, evalFilePath, options) {
991
+ assertNoDeprecatedCamelCaseTargetFields(definition);
916
992
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
917
- if (parsed.workspace_template !== void 0 || parsed.workspaceTemplate !== void 0) {
993
+ if (parsed.workspace_template !== void 0) {
918
994
  throw new Error(
919
995
  `${parsed.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
920
996
  );
@@ -930,13 +1006,9 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
930
1006
  `${parsed.name} provider`,
931
1007
  true
932
1008
  ).toLowerCase();
933
- const providerBatching = resolveOptionalBoolean(
934
- parsed.provider_batching ?? parsed.providerBatching
935
- );
936
- const subagentModeAllowed = resolveOptionalBoolean(
937
- parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
938
- );
939
- const fallbackTargets = parsed.fallback_targets ?? parsed.fallbackTargets;
1009
+ const providerBatching = resolveOptionalBoolean(parsed.provider_batching);
1010
+ const subagentModeAllowed = resolveOptionalBoolean(parsed.subagent_mode_allowed);
1011
+ const fallbackTargets = parsed.fallback_targets;
940
1012
  const base = {
941
1013
  name: parsed.name,
942
1014
  graderTarget: parsed.grader_target ?? parsed.judge_target,
@@ -1086,20 +1158,22 @@ function normalizeOpenAIBaseUrl(value) {
1086
1158
  return trimmed.endsWith("/v1") ? trimmed : `${trimmed}/v1`;
1087
1159
  }
1088
1160
  function resolveAzureConfig(target, env) {
1089
- const endpointSource = target.endpoint ?? target.resource ?? target.resourceName;
1090
- const apiKeySource = target.api_key ?? target.apiKey;
1091
- const deploymentSource = target.deployment ?? target.deploymentName ?? target.model;
1161
+ const endpointSource = target.endpoint ?? target.resource;
1162
+ const apiKeySource = target.api_key;
1163
+ const deploymentSource = target.deployment ?? target.model;
1092
1164
  const versionSource = target.version ?? target.api_version;
1093
1165
  const temperatureSource = target.temperature;
1094
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
1166
+ const maxTokensSource = target.max_output_tokens;
1095
1167
  const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
1096
1168
  const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
1097
1169
  const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
1170
+ const apiFormat = resolveApiFormat(target, env, target.name);
1098
1171
  const version = normalizeAzureApiVersion(
1099
1172
  resolveOptionalString(versionSource, env, `${target.name} api version`, {
1100
1173
  allowLiteral: true,
1101
1174
  optionalEnv: true
1102
- })
1175
+ }),
1176
+ apiFormat
1103
1177
  );
1104
1178
  const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
1105
1179
  const maxOutputTokens = resolveOptionalNumber(
@@ -1112,13 +1186,17 @@ function resolveAzureConfig(target, env) {
1112
1186
  deploymentName,
1113
1187
  apiKey,
1114
1188
  version,
1189
+ apiFormat,
1115
1190
  temperature,
1116
1191
  maxOutputTokens,
1117
1192
  retry
1118
1193
  };
1119
1194
  }
1120
- function resolveApiFormat(target, targetName) {
1121
- const raw = target.api_format ?? target.apiFormat;
1195
+ function resolveApiFormat(target, env, targetName) {
1196
+ const raw = resolveOptionalString(target.api_format, env, `${targetName} api format`, {
1197
+ allowLiteral: true,
1198
+ optionalEnv: true
1199
+ });
1122
1200
  if (raw === void 0) return void 0;
1123
1201
  if (raw === "chat" || raw === "responses") return raw;
1124
1202
  throw new Error(
@@ -1126,11 +1204,11 @@ function resolveApiFormat(target, targetName) {
1126
1204
  );
1127
1205
  }
1128
1206
  function resolveOpenAIConfig(target, env) {
1129
- const endpointSource = target.endpoint ?? target.base_url ?? target.baseUrl;
1130
- const apiKeySource = target.api_key ?? target.apiKey;
1207
+ const endpointSource = target.endpoint ?? target.base_url;
1208
+ const apiKeySource = target.api_key;
1131
1209
  const modelSource = target.model ?? target.deployment ?? target.variant;
1132
1210
  const temperatureSource = target.temperature;
1133
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
1211
+ const maxTokensSource = target.max_output_tokens;
1134
1212
  const baseURL = normalizeOpenAIBaseUrl(
1135
1213
  resolveOptionalString(endpointSource, env, `${target.name} endpoint`, {
1136
1214
  allowLiteral: true,
@@ -1144,17 +1222,17 @@ function resolveOpenAIConfig(target, env) {
1144
1222
  baseURL,
1145
1223
  apiKey,
1146
1224
  model,
1147
- apiFormat: resolveApiFormat(target, target.name),
1225
+ apiFormat: resolveApiFormat(target, env, target.name),
1148
1226
  temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
1149
1227
  maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
1150
1228
  retry
1151
1229
  };
1152
1230
  }
1153
1231
  function resolveOpenRouterConfig(target, env) {
1154
- const apiKeySource = target.api_key ?? target.apiKey;
1232
+ const apiKeySource = target.api_key;
1155
1233
  const modelSource = target.model ?? target.deployment ?? target.variant;
1156
1234
  const temperatureSource = target.temperature;
1157
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
1235
+ const maxTokensSource = target.max_output_tokens;
1158
1236
  const retry = resolveRetryConfig(target);
1159
1237
  return {
1160
1238
  apiKey: resolveString(apiKeySource, env, `${target.name} OpenRouter api key`),
@@ -1165,11 +1243,11 @@ function resolveOpenRouterConfig(target, env) {
1165
1243
  };
1166
1244
  }
1167
1245
  function resolveAnthropicConfig(target, env) {
1168
- const apiKeySource = target.api_key ?? target.apiKey;
1246
+ const apiKeySource = target.api_key;
1169
1247
  const modelSource = target.model ?? target.deployment ?? target.variant;
1170
1248
  const temperatureSource = target.temperature;
1171
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
1172
- const thinkingBudgetSource = target.thinking_budget ?? target.thinkingBudget;
1249
+ const maxTokensSource = target.max_output_tokens;
1250
+ const thinkingBudgetSource = target.thinking_budget;
1173
1251
  const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
1174
1252
  const model = resolveString(modelSource, env, `${target.name} Anthropic model`);
1175
1253
  const retry = resolveRetryConfig(target);
@@ -1183,10 +1261,10 @@ function resolveAnthropicConfig(target, env) {
1183
1261
  };
1184
1262
  }
1185
1263
  function resolveGeminiConfig(target, env) {
1186
- const apiKeySource = target.api_key ?? target.apiKey;
1264
+ const apiKeySource = target.api_key;
1187
1265
  const modelSource = target.model ?? target.deployment ?? target.variant;
1188
1266
  const temperatureSource = target.temperature;
1189
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
1267
+ const maxTokensSource = target.max_output_tokens;
1190
1268
  const apiKey = resolveString(apiKeySource, env, `${target.name} Google API key`);
1191
1269
  const model = resolveOptionalString(modelSource, env, `${target.name} Gemini model`, {
1192
1270
  allowLiteral: true,
@@ -1206,11 +1284,11 @@ function resolveCodexConfig(target, env, evalFilePath) {
1206
1284
  const executableSource = target.executable ?? target.command ?? target.binary;
1207
1285
  const argsSource = target.args ?? target.arguments;
1208
1286
  const cwdSource = target.cwd;
1209
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
1210
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
1211
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
1212
- const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
1213
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
1287
+ const workspaceTemplateSource = target.workspace_template;
1288
+ const timeoutSource = target.timeout_seconds;
1289
+ const logDirSource = target.log_dir ?? target.log_directory;
1290
+ const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT;
1291
+ const systemPromptSource = target.system_prompt;
1214
1292
  const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, {
1215
1293
  allowLiteral: true,
1216
1294
  optionalEnv: true
@@ -1274,16 +1352,16 @@ function normalizeCodexLogFormat(value) {
1274
1352
  throw new Error("codex log format must be 'summary' or 'json'");
1275
1353
  }
1276
1354
  function resolveCopilotSdkConfig(target, env, evalFilePath) {
1277
- const cliUrlSource = target.cli_url ?? target.cliUrl;
1278
- const cliPathSource = target.cli_path ?? target.cliPath;
1279
- const githubTokenSource = target.github_token ?? target.githubToken;
1355
+ const cliUrlSource = target.cli_url;
1356
+ const cliPathSource = target.cli_path;
1357
+ const githubTokenSource = target.github_token;
1280
1358
  const modelSource = target.model;
1281
1359
  const cwdSource = target.cwd;
1282
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
1283
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
1284
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
1285
- const logFormatSource = target.log_format ?? target.logFormat;
1286
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
1360
+ const workspaceTemplateSource = target.workspace_template;
1361
+ const timeoutSource = target.timeout_seconds;
1362
+ const logDirSource = target.log_dir ?? target.log_directory;
1363
+ const logFormatSource = target.log_format;
1364
+ const systemPromptSource = target.system_prompt;
1287
1365
  const cliUrl = resolveOptionalString(cliUrlSource, env, `${target.name} copilot-sdk cli URL`, {
1288
1366
  allowLiteral: true,
1289
1367
  optionalEnv: true
@@ -1356,11 +1434,11 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
1356
1434
  const modelSource = target.model;
1357
1435
  const argsSource = target.args ?? target.arguments;
1358
1436
  const cwdSource = target.cwd;
1359
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
1360
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
1361
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
1362
- const logFormatSource = target.log_format ?? target.logFormat;
1363
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
1437
+ const workspaceTemplateSource = target.workspace_template;
1438
+ const timeoutSource = target.timeout_seconds;
1439
+ const logDirSource = target.log_dir ?? target.log_directory;
1440
+ const logFormatSource = target.log_format;
1441
+ const systemPromptSource = target.system_prompt;
1364
1442
  const executable = resolveOptionalString(executableSource, env, `${target.name} copilot-cli executable`, {
1365
1443
  allowLiteral: true,
1366
1444
  optionalEnv: true
@@ -1424,16 +1502,16 @@ function normalizeCopilotLogFormat(value) {
1424
1502
  }
1425
1503
  function resolvePiCodingAgentConfig(target, env, evalFilePath) {
1426
1504
  const subproviderSource = target.subprovider;
1427
- const modelSource = target.model ?? target.pi_model ?? target.piModel;
1428
- const apiKeySource = target.api_key ?? target.apiKey;
1429
- const toolsSource = target.tools ?? target.pi_tools ?? target.piTools;
1430
- const thinkingSource = target.thinking ?? target.pi_thinking ?? target.piThinking;
1505
+ const modelSource = target.model ?? target.pi_model;
1506
+ const apiKeySource = target.api_key;
1507
+ const toolsSource = target.tools ?? target.pi_tools;
1508
+ const thinkingSource = target.thinking ?? target.pi_thinking;
1431
1509
  const cwdSource = target.cwd;
1432
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
1433
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
1434
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
1435
- const logFormatSource = target.log_format ?? target.logFormat;
1436
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
1510
+ const workspaceTemplateSource = target.workspace_template;
1511
+ const timeoutSource = target.timeout_seconds;
1512
+ const logDirSource = target.log_dir ?? target.log_directory;
1513
+ const logFormatSource = target.log_format;
1514
+ const systemPromptSource = target.system_prompt;
1437
1515
  const subprovider = resolveOptionalString(
1438
1516
  subproviderSource,
1439
1517
  env,
@@ -1451,7 +1529,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
1451
1529
  allowLiteral: false,
1452
1530
  optionalEnv: true
1453
1531
  });
1454
- const baseUrlSource = target.base_url ?? target.baseUrl ?? target.endpoint;
1532
+ const baseUrlSource = target.base_url ?? target.endpoint;
1455
1533
  const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi base url`, {
1456
1534
  allowLiteral: true,
1457
1535
  optionalEnv: true
@@ -1510,16 +1588,16 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
1510
1588
  function resolvePiCliConfig(target, env, evalFilePath) {
1511
1589
  const executableSource = target.executable ?? target.command ?? target.binary;
1512
1590
  const subproviderSource = target.subprovider;
1513
- const modelSource = target.model ?? target.pi_model ?? target.piModel;
1514
- const apiKeySource = target.api_key ?? target.apiKey;
1515
- const toolsSource = target.tools ?? target.pi_tools ?? target.piTools;
1516
- const thinkingSource = target.thinking ?? target.pi_thinking ?? target.piThinking;
1591
+ const modelSource = target.model ?? target.pi_model;
1592
+ const apiKeySource = target.api_key;
1593
+ const toolsSource = target.tools ?? target.pi_tools;
1594
+ const thinkingSource = target.thinking ?? target.pi_thinking;
1517
1595
  const cwdSource = target.cwd;
1518
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
1519
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
1520
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
1521
- const logFormatSource = target.log_format ?? target.logFormat;
1522
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
1596
+ const workspaceTemplateSource = target.workspace_template;
1597
+ const timeoutSource = target.timeout_seconds;
1598
+ const logDirSource = target.log_dir ?? target.log_directory;
1599
+ const logFormatSource = target.log_format;
1600
+ const systemPromptSource = target.system_prompt;
1523
1601
  const executable = resolveOptionalString(executableSource, env, `${target.name} pi-cli executable`, {
1524
1602
  allowLiteral: true,
1525
1603
  optionalEnv: true
@@ -1538,7 +1616,7 @@ function resolvePiCliConfig(target, env, evalFilePath) {
1538
1616
  allowLiteral: false,
1539
1617
  optionalEnv: true
1540
1618
  });
1541
- const baseUrlSource = target.base_url ?? target.baseUrl ?? target.endpoint;
1619
+ const baseUrlSource = target.base_url ?? target.endpoint;
1542
1620
  const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi-cli base url`, {
1543
1621
  allowLiteral: true,
1544
1622
  optionalEnv: true
@@ -1596,11 +1674,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
1596
1674
  function resolveClaudeConfig(target, env, evalFilePath) {
1597
1675
  const modelSource = target.model;
1598
1676
  const cwdSource = target.cwd;
1599
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
1600
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
1601
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
1602
- const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_LOG_FORMAT;
1603
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
1677
+ const workspaceTemplateSource = target.workspace_template;
1678
+ const timeoutSource = target.timeout_seconds;
1679
+ const logDirSource = target.log_dir ?? target.log_directory;
1680
+ const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CLAUDE_LOG_FORMAT;
1681
+ const systemPromptSource = target.system_prompt;
1604
1682
  const model = resolveOptionalString(modelSource, env, `${target.name} claude model`, {
1605
1683
  allowLiteral: true,
1606
1684
  optionalEnv: true
@@ -1633,8 +1711,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
1633
1711
  });
1634
1712
  const logFormat = normalizeClaudeLogFormat(logFormatSource);
1635
1713
  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
1636
- const maxTurns = typeof target.max_turns === "number" ? target.max_turns : typeof target.maxTurns === "number" ? target.maxTurns : void 0;
1637
- const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd : typeof target.maxBudgetUsd === "number" ? target.maxBudgetUsd : void 0;
1714
+ const maxTurns = typeof target.max_turns === "number" ? target.max_turns : void 0;
1715
+ const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd : void 0;
1638
1716
  return {
1639
1717
  model,
1640
1718
  systemPrompt,
@@ -1665,9 +1743,7 @@ function resolveMockConfig(target) {
1665
1743
  return { response };
1666
1744
  }
1667
1745
  function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
1668
- const workspaceTemplateEnvVar = resolveOptionalLiteralString(
1669
- target.workspace_template ?? target.workspaceTemplate
1670
- );
1746
+ const workspaceTemplateEnvVar = resolveOptionalLiteralString(target.workspace_template);
1671
1747
  let workspaceTemplate = workspaceTemplateEnvVar ? resolveOptionalString(
1672
1748
  workspaceTemplateEnvVar,
1673
1749
  env,
@@ -1682,9 +1758,9 @@ function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
1682
1758
  }
1683
1759
  const executableSource = target.executable;
1684
1760
  const waitSource = target.wait;
1685
- const dryRunSource = target.dry_run ?? target.dryRun;
1686
- const subagentRootSource = target.subagent_root ?? target.subagentRoot;
1687
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
1761
+ const dryRunSource = target.dry_run;
1762
+ const subagentRootSource = target.subagent_root;
1763
+ const timeoutSource = target.timeout_seconds;
1688
1764
  const defaultCommand = insiders ? "code-insiders" : "code";
1689
1765
  const executable = resolveOptionalString(executableSource, env, `${target.name} vscode executable`, {
1690
1766
  allowLiteral: true,
@@ -1719,8 +1795,8 @@ function resolveCliConfig(target, env, evalFilePath) {
1719
1795
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
1720
1796
  if (!parseResult.success) {
1721
1797
  const firstError = parseResult.error.errors[0];
1722
- const path49 = firstError?.path.join(".") || "";
1723
- const prefix = path49 ? `${target.name} ${path49}: ` : `${target.name}: `;
1798
+ const path410 = firstError?.path.join(".") || "";
1799
+ const prefix = path410 ? `${target.name} ${path410}: ` : `${target.name}: `;
1724
1800
  throw new Error(`${prefix}${firstError?.message}`);
1725
1801
  }
1726
1802
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -1735,7 +1811,7 @@ function resolveCliConfig(target, env, evalFilePath) {
1735
1811
  }
1736
1812
  function resolveDiscoveredProviderConfig(target, providerKind, env, evalFilePath) {
1737
1813
  const command = target.command ? resolveString(target.command, env, `${target.name} command`, true) : `bun run .agentv/providers/${providerKind}.ts {PROMPT}`;
1738
- const timeoutSeconds = target.timeout_seconds ?? target.timeoutSeconds;
1814
+ const timeoutSeconds = target.timeout_seconds;
1739
1815
  const timeoutMs = resolveTimeoutMs(timeoutSeconds, `${target.name} timeout`);
1740
1816
  let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
1741
1817
  allowLiteral: true,
@@ -1799,10 +1875,10 @@ function resolveDiscover(value, targetName) {
1799
1875
  throw new Error(`Target "${targetName}": discover must be "latest" (got "${String(value)}")`);
1800
1876
  }
1801
1877
  function resolveCopilotLogConfig(target, env) {
1802
- const sessionDirSource = target.session_dir ?? target.sessionDir;
1803
- const sessionIdSource = target.session_id ?? target.sessionId;
1878
+ const sessionDirSource = target.session_dir;
1879
+ const sessionIdSource = target.session_id;
1804
1880
  const discoverSource = target.discover;
1805
- const sessionStateDirSource = target.session_state_dir ?? target.sessionStateDir;
1881
+ const sessionStateDirSource = target.session_state_dir;
1806
1882
  const cwdSource = target.cwd;
1807
1883
  return {
1808
1884
  sessionDir: resolveOptionalString(
@@ -1975,6 +2051,15 @@ var AGENT_PROVIDER_KINDS = [
1975
2051
  "vscode",
1976
2052
  "vscode-insiders"
1977
2053
  ];
2054
+ var LLM_GRADER_CAPABLE_KINDS = [
2055
+ "openai",
2056
+ "openrouter",
2057
+ "azure",
2058
+ "anthropic",
2059
+ "gemini",
2060
+ "agentv",
2061
+ "mock"
2062
+ ];
1978
2063
  var KNOWN_PROVIDERS = [
1979
2064
  "openai",
1980
2065
  "openrouter",
@@ -1994,7 +2079,8 @@ var KNOWN_PROVIDERS = [
1994
2079
  "mock",
1995
2080
  "vscode",
1996
2081
  "vscode-insiders",
1997
- "agentv"
2082
+ "agentv",
2083
+ "transcript"
1998
2084
  ];
1999
2085
  var PROVIDER_ALIASES = [
2000
2086
  "azure-openai",
@@ -6803,7 +6889,7 @@ function createOpenRouter(options = {}) {
6803
6889
  );
6804
6890
  const createChatModel = (modelId, settings = {}) => new OpenRouterChatLanguageModel(modelId, settings, {
6805
6891
  provider: "openrouter.chat",
6806
- url: ({ path: path49 }) => `${baseURL}${path49}`,
6892
+ url: ({ path: path50 }) => `${baseURL}${path50}`,
6807
6893
  headers: getHeaders,
6808
6894
  compatibility,
6809
6895
  fetch: options.fetch,
@@ -6811,7 +6897,7 @@ function createOpenRouter(options = {}) {
6811
6897
  });
6812
6898
  const createCompletionModel = (modelId, settings = {}) => new OpenRouterCompletionLanguageModel(modelId, settings, {
6813
6899
  provider: "openrouter.completion",
6814
- url: ({ path: path49 }) => `${baseURL}${path49}`,
6900
+ url: ({ path: path50 }) => `${baseURL}${path50}`,
6815
6901
  headers: getHeaders,
6816
6902
  compatibility,
6817
6903
  fetch: options.fetch,
@@ -6819,14 +6905,14 @@ function createOpenRouter(options = {}) {
6819
6905
  });
6820
6906
  const createEmbeddingModel = (modelId, settings = {}) => new OpenRouterEmbeddingModel(modelId, settings, {
6821
6907
  provider: "openrouter.embedding",
6822
- url: ({ path: path49 }) => `${baseURL}${path49}`,
6908
+ url: ({ path: path50 }) => `${baseURL}${path50}`,
6823
6909
  headers: getHeaders,
6824
6910
  fetch: options.fetch,
6825
6911
  extraBody: options.extraBody
6826
6912
  });
6827
6913
  const createImageModel = (modelId, settings = {}) => new OpenRouterImageModel(modelId, settings, {
6828
6914
  provider: "openrouter.image",
6829
- url: ({ path: path49 }) => `${baseURL}${path49}`,
6915
+ url: ({ path: path50 }) => `${baseURL}${path50}`,
6830
6916
  headers: getHeaders,
6831
6917
  fetch: options.fetch,
6832
6918
  extraBody: options.extraBody
@@ -14345,11 +14431,13 @@ import { tmpdir } from "node:os";
14345
14431
  import path19 from "node:path";
14346
14432
  import { execSync as execSync2 } from "node:child_process";
14347
14433
  import { randomUUID as randomUUID8 } from "node:crypto";
14348
- import { accessSync as accessSync2, createWriteStream as createWriteStream6 } from "node:fs";
14434
+ import { accessSync as accessSync2, createWriteStream as createWriteStream6, mkdirSync } from "node:fs";
14349
14435
  import { mkdir as mkdir7 } from "node:fs/promises";
14350
- import path20 from "node:path";
14436
+ import path21 from "node:path";
14351
14437
  import { createInterface } from "node:readline";
14352
- import { fileURLToPath as fileURLToPath3 } from "node:url";
14438
+ import { fileURLToPath as fileURLToPath3, pathToFileURL } from "node:url";
14439
+ import os2 from "node:os";
14440
+ import path20 from "node:path";
14353
14441
  import { exec as exec2 } from "node:child_process";
14354
14442
  import { constants as constants3, access as access3, stat as stat5 } from "node:fs/promises";
14355
14443
  import path322 from "node:path";
@@ -14358,18 +14446,16 @@ import { stat as stat4, writeFile as writeFile4 } from "node:fs/promises";
14358
14446
  import path30 from "node:path";
14359
14447
  import { constants as constants22 } from "node:fs";
14360
14448
  import { access as access22, mkdir as mkdir8, readdir as readdir2, rm as rm2, stat as stat2 } from "node:fs/promises";
14361
- import path21 from "node:path";
14362
14449
  import path222 from "node:path";
14363
14450
  import path23 from "node:path";
14364
- import { readFile as readFile9 } from "node:fs/promises";
14365
14451
  import path24 from "node:path";
14452
+ import { readFile as readFile9 } from "node:fs/promises";
14453
+ import path25 from "node:path";
14366
14454
  import { exec, spawn as spawn4 } from "node:child_process";
14367
14455
  import { mkdir as mkdir9, writeFile as writeFile2 } from "node:fs/promises";
14368
14456
  import path27 from "node:path";
14369
14457
  import { promisify as promisify2 } from "node:util";
14370
14458
  import path26 from "node:path";
14371
- import os2 from "node:os";
14372
- import path25 from "node:path";
14373
14459
  import { copyFile, mkdir as mkdir10, readFile as readFile10, readdir as readdir3, stat as stat3, writeFile as writeFile3 } from "node:fs/promises";
14374
14460
  import path29 from "node:path";
14375
14461
  import path28 from "node:path";
@@ -14420,12 +14506,15 @@ import { existsSync as existsSync5 } from "node:fs";
14420
14506
  import path45 from "node:path";
14421
14507
  import { mkdir as mkdir15, readFile as readFile13, writeFile as writeFile8 } from "node:fs/promises";
14422
14508
  import path46 from "node:path";
14423
- import { existsSync as existsSync6, mkdirSync, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
14509
+ import { existsSync as existsSync6, mkdirSync as mkdirSync2, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
14424
14510
  import path47 from "node:path";
14425
14511
  import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
14426
14512
  import { readdir as readdir8, stat as stat9 } from "node:fs/promises";
14427
14513
  import { homedir as homedir3 } from "node:os";
14428
14514
  import path48 from "node:path";
14515
+ import { readdir as readdir9, stat as stat10 } from "node:fs/promises";
14516
+ import { homedir as homedir4 } from "node:os";
14517
+ import path49 from "node:path";
14429
14518
  import { readFile as readFile14 } from "node:fs/promises";
14430
14519
  function computeTraceSummary(messages) {
14431
14520
  const toolCallCounts = {};
@@ -15213,8 +15302,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15213
15302
  const negate = rawEvaluator.negate === true ? true : void 0;
15214
15303
  if (isCustomType) {
15215
15304
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15216
- const required2 = parseRequired(rawEvaluator.required);
15217
- const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "negate"]);
15305
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15306
+ rawEvaluator.required,
15307
+ rawEvaluator.min_score,
15308
+ name21,
15309
+ evalId
15310
+ );
15311
+ const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "min_score", "negate"]);
15218
15312
  const config2 = {};
15219
15313
  for (const [key, value] of Object.entries(rawEvaluator)) {
15220
15314
  if (!knownProps2.has(key) && value !== void 0) {
@@ -15226,6 +15320,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15226
15320
  type: customTypeName,
15227
15321
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15228
15322
  ...required2 !== void 0 ? { required: required2 } : {},
15323
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15229
15324
  ...negate !== void 0 ? { negate } : {},
15230
15325
  ...Object.keys(config2).length > 0 ? { config: config2 } : {}
15231
15326
  });
@@ -15295,7 +15390,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15295
15390
  );
15296
15391
  }
15297
15392
  }
15298
- const required2 = parseRequired(rawEvaluator.required);
15393
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15394
+ rawEvaluator.required,
15395
+ rawEvaluator.min_score,
15396
+ name21,
15397
+ evalId
15398
+ );
15299
15399
  const knownProps2 = /* @__PURE__ */ new Set([
15300
15400
  "name",
15301
15401
  "type",
@@ -15321,6 +15421,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15321
15421
  resolvedCwd,
15322
15422
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15323
15423
  ...required2 !== void 0 ? { required: required2 } : {},
15424
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15324
15425
  ...negate !== void 0 ? { negate } : {},
15325
15426
  ...Object.keys(config2).length > 0 ? { config: config2 } : {},
15326
15427
  ...targetConfig !== void 0 ? { target: targetConfig } : {}
@@ -15449,7 +15550,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15449
15550
  };
15450
15551
  }
15451
15552
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15452
- const required2 = parseRequired(rawEvaluator.required);
15553
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15554
+ rawEvaluator.required,
15555
+ rawEvaluator.min_score,
15556
+ name21,
15557
+ evalId
15558
+ );
15453
15559
  evaluators.push({
15454
15560
  name: name21,
15455
15561
  type: "composite",
@@ -15457,6 +15563,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15457
15563
  aggregator,
15458
15564
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15459
15565
  ...required2 !== void 0 ? { required: required2 } : {},
15566
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15460
15567
  ...negate !== void 0 ? { negate } : {}
15461
15568
  });
15462
15569
  continue;
@@ -15567,7 +15674,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15567
15674
  continue;
15568
15675
  }
15569
15676
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15570
- const required2 = parseRequired(rawEvaluator.required);
15677
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15678
+ rawEvaluator.required,
15679
+ rawEvaluator.min_score,
15680
+ name21,
15681
+ evalId
15682
+ );
15571
15683
  const config2 = {
15572
15684
  name: name21,
15573
15685
  type: "tool-trajectory",
@@ -15576,6 +15688,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15576
15688
  ...expected ? { expected } : {},
15577
15689
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15578
15690
  ...required2 !== void 0 ? { required: required2 } : {},
15691
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15579
15692
  ...negate !== void 0 ? { negate } : {},
15580
15693
  ...argsMatch2 !== void 0 ? { argsMatch: argsMatch2 } : {}
15581
15694
  };
@@ -15638,7 +15751,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15638
15751
  const aggregation = asString(rawEvaluator.aggregation);
15639
15752
  const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
15640
15753
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15641
- const required2 = parseRequired(rawEvaluator.required);
15754
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15755
+ rawEvaluator.required,
15756
+ rawEvaluator.min_score,
15757
+ name21,
15758
+ evalId
15759
+ );
15642
15760
  evaluators.push({
15643
15761
  name: name21,
15644
15762
  type: "field-accuracy",
@@ -15646,6 +15764,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15646
15764
  ...validAggregation ? { aggregation: validAggregation } : {},
15647
15765
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15648
15766
  ...required2 !== void 0 ? { required: required2 } : {},
15767
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15649
15768
  ...negate !== void 0 ? { negate } : {}
15650
15769
  });
15651
15770
  continue;
@@ -15659,13 +15778,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15659
15778
  continue;
15660
15779
  }
15661
15780
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15662
- const required2 = parseRequired(rawEvaluator.required);
15781
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15782
+ rawEvaluator.required,
15783
+ rawEvaluator.min_score,
15784
+ name21,
15785
+ evalId
15786
+ );
15663
15787
  evaluators.push({
15664
15788
  name: name21,
15665
15789
  type: "latency",
15666
15790
  threshold,
15667
15791
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15668
15792
  ...required2 !== void 0 ? { required: required2 } : {},
15793
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15669
15794
  ...negate !== void 0 ? { negate } : {}
15670
15795
  });
15671
15796
  continue;
@@ -15679,13 +15804,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15679
15804
  continue;
15680
15805
  }
15681
15806
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15682
- const required2 = parseRequired(rawEvaluator.required);
15807
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15808
+ rawEvaluator.required,
15809
+ rawEvaluator.min_score,
15810
+ name21,
15811
+ evalId
15812
+ );
15683
15813
  evaluators.push({
15684
15814
  name: name21,
15685
15815
  type: "cost",
15686
15816
  budget,
15687
15817
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15688
15818
  ...required2 !== void 0 ? { required: required2 } : {},
15819
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15689
15820
  ...negate !== void 0 ? { negate } : {}
15690
15821
  });
15691
15822
  continue;
@@ -15717,13 +15848,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15717
15848
  continue;
15718
15849
  }
15719
15850
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15720
- const required2 = parseRequired(rawEvaluator.required);
15851
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15852
+ rawEvaluator.required,
15853
+ rawEvaluator.min_score,
15854
+ name21,
15855
+ evalId
15856
+ );
15721
15857
  evaluators.push({
15722
15858
  name: name21,
15723
15859
  type: "token-usage",
15724
15860
  ...validLimits,
15725
15861
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15726
15862
  ...required2 !== void 0 ? { required: required2 } : {},
15863
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15727
15864
  ...negate !== void 0 ? { negate } : {}
15728
15865
  });
15729
15866
  continue;
@@ -15769,13 +15906,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15769
15906
  continue;
15770
15907
  }
15771
15908
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15772
- const required2 = parseRequired(rawEvaluator.required);
15909
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15910
+ rawEvaluator.required,
15911
+ rawEvaluator.min_score,
15912
+ name21,
15913
+ evalId
15914
+ );
15773
15915
  evaluators.push({
15774
15916
  name: name21,
15775
15917
  type: "execution-metrics",
15776
15918
  ...validThresholds,
15777
15919
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15778
15920
  ...required2 !== void 0 ? { required: required2 } : {},
15921
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15779
15922
  ...negate !== void 0 ? { negate } : {}
15780
15923
  });
15781
15924
  continue;
@@ -15789,7 +15932,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15789
15932
  const rawShouldTrigger = rawEvaluator.should_trigger;
15790
15933
  const shouldTrigger = typeof rawShouldTrigger === "boolean" ? rawShouldTrigger : void 0;
15791
15934
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15792
- const required2 = parseRequired(rawEvaluator.required);
15935
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15936
+ rawEvaluator.required,
15937
+ rawEvaluator.min_score,
15938
+ name21,
15939
+ evalId
15940
+ );
15793
15941
  evaluators.push({
15794
15942
  name: name21,
15795
15943
  type: "skill-trigger",
@@ -15797,6 +15945,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15797
15945
  ...shouldTrigger !== void 0 ? { should_trigger: shouldTrigger } : {},
15798
15946
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15799
15947
  ...required2 !== void 0 ? { required: required2 } : {},
15948
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15800
15949
  ...negate !== void 0 ? { negate } : {}
15801
15950
  });
15802
15951
  continue;
@@ -15808,13 +15957,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15808
15957
  continue;
15809
15958
  }
15810
15959
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15811
- const required2 = parseRequired(rawEvaluator.required);
15960
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15961
+ rawEvaluator.required,
15962
+ rawEvaluator.min_score,
15963
+ name21,
15964
+ evalId
15965
+ );
15812
15966
  evaluators.push({
15813
15967
  name: name21,
15814
15968
  type: "contains",
15815
15969
  value,
15816
15970
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15817
15971
  ...required2 !== void 0 ? { required: required2 } : {},
15972
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15818
15973
  ...negate !== void 0 ? { negate } : {}
15819
15974
  });
15820
15975
  continue;
@@ -15828,13 +15983,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15828
15983
  continue;
15829
15984
  }
15830
15985
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15831
- const required2 = parseRequired(rawEvaluator.required);
15986
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15987
+ rawEvaluator.required,
15988
+ rawEvaluator.min_score,
15989
+ name21,
15990
+ evalId
15991
+ );
15832
15992
  evaluators.push({
15833
15993
  name: name21,
15834
15994
  type: typeValue,
15835
15995
  value,
15836
15996
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15837
15997
  ...required2 !== void 0 ? { required: required2 } : {},
15998
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15838
15999
  ...negate !== void 0 ? { negate } : {}
15839
16000
  });
15840
16001
  continue;
@@ -15846,13 +16007,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15846
16007
  continue;
15847
16008
  }
15848
16009
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15849
- const required2 = parseRequired(rawEvaluator.required);
16010
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16011
+ rawEvaluator.required,
16012
+ rawEvaluator.min_score,
16013
+ name21,
16014
+ evalId
16015
+ );
15850
16016
  evaluators.push({
15851
16017
  name: name21,
15852
16018
  type: "icontains",
15853
16019
  value,
15854
16020
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15855
16021
  ...required2 !== void 0 ? { required: required2 } : {},
16022
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15856
16023
  ...negate !== void 0 ? { negate } : {}
15857
16024
  });
15858
16025
  continue;
@@ -15866,13 +16033,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15866
16033
  continue;
15867
16034
  }
15868
16035
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15869
- const required2 = parseRequired(rawEvaluator.required);
16036
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16037
+ rawEvaluator.required,
16038
+ rawEvaluator.min_score,
16039
+ name21,
16040
+ evalId
16041
+ );
15870
16042
  evaluators.push({
15871
16043
  name: name21,
15872
16044
  type: typeValue,
15873
16045
  value,
15874
16046
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15875
16047
  ...required2 !== void 0 ? { required: required2 } : {},
16048
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15876
16049
  ...negate !== void 0 ? { negate } : {}
15877
16050
  });
15878
16051
  continue;
@@ -15884,13 +16057,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15884
16057
  continue;
15885
16058
  }
15886
16059
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15887
- const required2 = parseRequired(rawEvaluator.required);
16060
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16061
+ rawEvaluator.required,
16062
+ rawEvaluator.min_score,
16063
+ name21,
16064
+ evalId
16065
+ );
15888
16066
  evaluators.push({
15889
16067
  name: name21,
15890
16068
  type: typeValue,
15891
16069
  value,
15892
16070
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15893
16071
  ...required2 !== void 0 ? { required: required2 } : {},
16072
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15894
16073
  ...negate !== void 0 ? { negate } : {}
15895
16074
  });
15896
16075
  continue;
@@ -15903,7 +16082,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15903
16082
  }
15904
16083
  const flags = asString(rawEvaluator.flags);
15905
16084
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15906
- const required2 = parseRequired(rawEvaluator.required);
16085
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16086
+ rawEvaluator.required,
16087
+ rawEvaluator.min_score,
16088
+ name21,
16089
+ evalId
16090
+ );
15907
16091
  evaluators.push({
15908
16092
  name: name21,
15909
16093
  type: "regex",
@@ -15911,18 +16095,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15911
16095
  ...flags !== void 0 ? { flags } : {},
15912
16096
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15913
16097
  ...required2 !== void 0 ? { required: required2 } : {},
16098
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15914
16099
  ...negate !== void 0 ? { negate } : {}
15915
16100
  });
15916
16101
  continue;
15917
16102
  }
15918
16103
  if (typeValue === "is-json") {
15919
16104
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15920
- const required2 = parseRequired(rawEvaluator.required);
16105
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16106
+ rawEvaluator.required,
16107
+ rawEvaluator.min_score,
16108
+ name21,
16109
+ evalId
16110
+ );
15921
16111
  evaluators.push({
15922
16112
  name: name21,
15923
16113
  type: "is-json",
15924
16114
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15925
16115
  ...required2 !== void 0 ? { required: required2 } : {},
16116
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15926
16117
  ...negate !== void 0 ? { negate } : {}
15927
16118
  });
15928
16119
  continue;
@@ -15934,13 +16125,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15934
16125
  continue;
15935
16126
  }
15936
16127
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15937
- const required2 = parseRequired(rawEvaluator.required);
16128
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16129
+ rawEvaluator.required,
16130
+ rawEvaluator.min_score,
16131
+ name21,
16132
+ evalId
16133
+ );
15938
16134
  evaluators.push({
15939
16135
  name: name21,
15940
16136
  type: "equals",
15941
16137
  value,
15942
16138
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15943
16139
  ...required2 !== void 0 ? { required: required2 } : {},
16140
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15944
16141
  ...negate !== void 0 ? { negate } : {}
15945
16142
  });
15946
16143
  continue;
@@ -15976,7 +16173,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15976
16173
  continue;
15977
16174
  }
15978
16175
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15979
- const required2 = parseRequired(rawEvaluator.required);
16176
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16177
+ rawEvaluator.required,
16178
+ rawEvaluator.min_score,
16179
+ name21,
16180
+ evalId
16181
+ );
15980
16182
  evaluators.push({
15981
16183
  name: name21,
15982
16184
  type: "llm-grader",
@@ -15984,6 +16186,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15984
16186
  ...graderTargetName ? { target: graderTargetName } : {},
15985
16187
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15986
16188
  ...required2 !== void 0 ? { required: required2 } : {},
16189
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15987
16190
  ...negate !== void 0 ? { negate } : {}
15988
16191
  });
15989
16192
  continue;
@@ -16053,7 +16256,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
16053
16256
  continue;
16054
16257
  }
16055
16258
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
16056
- const required2 = parseRequired(rawEvaluator.required);
16259
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16260
+ rawEvaluator.required,
16261
+ rawEvaluator.min_score,
16262
+ name21,
16263
+ evalId
16264
+ );
16057
16265
  evaluators.push({
16058
16266
  name: name21,
16059
16267
  type: "llm-grader",
@@ -16061,12 +16269,18 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
16061
16269
  ...graderTargetName ? { target: graderTargetName } : {},
16062
16270
  ...weight2 !== void 0 ? { weight: weight2 } : {},
16063
16271
  ...required2 !== void 0 ? { required: required2 } : {},
16272
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
16064
16273
  ...negate !== void 0 ? { negate } : {}
16065
16274
  });
16066
16275
  continue;
16067
16276
  }
16068
16277
  const weight = validateWeight(rawEvaluator.weight, name21, evalId);
16069
- const required = parseRequired(rawEvaluator.required);
16278
+ const { required, min_score } = parseRequiredAndMinScore(
16279
+ rawEvaluator.required,
16280
+ rawEvaluator.min_score,
16281
+ name21,
16282
+ evalId
16283
+ );
16070
16284
  const knownProps = /* @__PURE__ */ new Set([
16071
16285
  "name",
16072
16286
  "type",
@@ -16077,6 +16291,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
16077
16291
  "weight",
16078
16292
  "config",
16079
16293
  "required",
16294
+ "min_score",
16080
16295
  "negate",
16081
16296
  "max_steps",
16082
16297
  "maxSteps",
@@ -16106,6 +16321,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
16106
16321
  ...graderTargetName ? { target: graderTargetName } : {},
16107
16322
  ...weight !== void 0 ? { weight } : {},
16108
16323
  ...required !== void 0 ? { required } : {},
16324
+ ...min_score !== void 0 ? { min_score } : {},
16109
16325
  ...negate !== void 0 ? { negate } : {},
16110
16326
  ...finalConfig ? { config: finalConfig } : {},
16111
16327
  ...llmMaxSteps !== void 0 ? { max_steps: llmMaxSteps } : {},
@@ -16237,10 +16453,23 @@ ${detailBlock}${ANSI_RESET4}`);
16237
16453
  console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET4}`);
16238
16454
  }
16239
16455
  }
16240
- function parseRequired(value) {
16241
- if (value === true) return true;
16242
- if (typeof value === "number" && value > 0 && value <= 1) return value;
16243
- return void 0;
16456
+ function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
16457
+ const result = {};
16458
+ if (typeof rawMinScore === "number" && rawMinScore > 0 && rawMinScore <= 1) {
16459
+ result.min_score = rawMinScore;
16460
+ }
16461
+ if (rawRequired === true) {
16462
+ result.required = true;
16463
+ } else if (typeof rawRequired === "number" && rawRequired > 0 && rawRequired <= 1) {
16464
+ if (result.min_score === void 0) {
16465
+ result.min_score = rawRequired;
16466
+ }
16467
+ result.required = rawRequired;
16468
+ logWarning2(
16469
+ `Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
16470
+ );
16471
+ }
16472
+ return result;
16244
16473
  }
16245
16474
  function validateWeight(rawWeight, evaluatorName, evalId) {
16246
16475
  if (rawWeight === void 0) {
@@ -16283,16 +16512,30 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
16283
16512
  const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
16284
16513
  const expectedOutcome = asString(rawRubric.outcome) ?? "";
16285
16514
  const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
16515
+ let minScore;
16286
16516
  let requiredMinScore;
16287
16517
  let required;
16288
- if (typeof rawRubric.required_min_score === "number") {
16289
- const minScore = rawRubric.required_min_score;
16290
- if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
16518
+ if (typeof rawRubric.min_score === "number") {
16519
+ const ms = rawRubric.min_score;
16520
+ if (ms <= 0 || ms > 1) {
16291
16521
  throw new Error(
16292
- `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
16522
+ `Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`
16293
16523
  );
16294
16524
  }
16295
- requiredMinScore = minScore;
16525
+ minScore = ms;
16526
+ requiredMinScore = Math.round(ms * 10);
16527
+ } else if (typeof rawRubric.required_min_score === "number") {
16528
+ const rms = rawRubric.required_min_score;
16529
+ if (!Number.isInteger(rms) || rms < 0 || rms > 10) {
16530
+ throw new Error(
16531
+ `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`
16532
+ );
16533
+ }
16534
+ requiredMinScore = rms;
16535
+ minScore = rms / 10;
16536
+ logWarning2(
16537
+ `Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. Use 'min_score: ${rms / 10}' (0-1 scale) instead.`
16538
+ );
16296
16539
  }
16297
16540
  if (typeof rawRubric.required === "boolean") {
16298
16541
  required = rawRubric.required;
@@ -16312,6 +16555,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
16312
16555
  weight,
16313
16556
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
16314
16557
  ...required !== void 0 ? { required } : {},
16558
+ ...minScore !== void 0 ? { min_score: minScore } : {},
16315
16559
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
16316
16560
  score_ranges: scoreRanges
16317
16561
  });
@@ -16328,6 +16572,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
16328
16572
  weight,
16329
16573
  // Default to required: true if not specified (backward compatibility)
16330
16574
  required: required ?? true,
16575
+ ...minScore !== void 0 ? { min_score: minScore } : {},
16331
16576
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
16332
16577
  });
16333
16578
  }
@@ -16456,12 +16701,22 @@ function parseInlineRubrics(rawRubrics) {
16456
16701
  id: asString(rubric.id) ?? `rubric-${index + 1}`,
16457
16702
  weight: typeof rubric.weight === "number" ? rubric.weight : 1
16458
16703
  };
16704
+ let inlineMinScore;
16705
+ let inlineRequiredMinScore;
16706
+ if (typeof rubric.min_score === "number") {
16707
+ inlineMinScore = rubric.min_score;
16708
+ inlineRequiredMinScore = Math.round(inlineMinScore * 10);
16709
+ } else if (typeof rubric.required_min_score === "number") {
16710
+ inlineRequiredMinScore = rubric.required_min_score;
16711
+ inlineMinScore = inlineRequiredMinScore / 10;
16712
+ }
16459
16713
  if (scoreRanges && scoreRanges.length > 0) {
16460
16714
  return {
16461
16715
  ...baseRubric,
16462
16716
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
16463
16717
  ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
16464
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
16718
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
16719
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {},
16465
16720
  score_ranges: scoreRanges
16466
16721
  };
16467
16722
  }
@@ -16469,7 +16724,8 @@ function parseInlineRubrics(rawRubrics) {
16469
16724
  ...baseRubric,
16470
16725
  outcome: expectedOutcome,
16471
16726
  required: typeof rubric.required === "boolean" ? rubric.required : true,
16472
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
16727
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
16728
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {}
16473
16729
  };
16474
16730
  }).filter((r) => r.outcome && r.outcome.length > 0 || "score_ranges" in r && r.score_ranges);
16475
16731
  if (rubricItems.length === 0) {
@@ -16851,6 +17107,9 @@ function resolveExpectedMessages(raw) {
16851
17107
  var ANSI_YELLOW5 = "\x1B[33m";
16852
17108
  var ANSI_RED2 = "\x1B[31m";
16853
17109
  var ANSI_RESET6 = "\x1B[0m";
17110
+ function matchesFilter(id, filter2) {
17111
+ return typeof filter2 === "string" ? micromatch.isMatch(id, filter2) : filter2.some((pattern) => micromatch.isMatch(id, pattern));
17112
+ }
16854
17113
  function detectFormat(filePath) {
16855
17114
  const ext = path6.extname(filePath).toLowerCase();
16856
17115
  if (ext === ".jsonl") return "jsonl";
@@ -16918,40 +17177,40 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
16918
17177
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
16919
17178
  const rawFile = await readFile5(absoluteTestPath, "utf8");
16920
17179
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
16921
- const fallbackEvalSet = path6.basename(absoluteTestPath, ".jsonl") || "eval";
16922
- const evalSetName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackEvalSet;
17180
+ const fallbackSuiteName = path6.basename(absoluteTestPath, ".jsonl") || "eval";
17181
+ const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
16923
17182
  const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
16924
17183
  const globalExecution = sidecar.execution;
16925
17184
  if (verbose) {
16926
17185
  console.log(`
16927
- [JSONL Dataset: ${evalFilePath}]`);
17186
+ [JSONL Suite: ${evalFilePath}]`);
16928
17187
  console.log(` Cases: ${rawCases.length}`);
16929
- console.log(` Eval set: ${evalSetName}`);
17188
+ console.log(` Suite: ${suiteName}`);
16930
17189
  if (sidecar.description) {
16931
17190
  console.log(` Description: ${sidecar.description}`);
16932
17191
  }
16933
17192
  }
16934
17193
  const results = [];
16935
17194
  for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
16936
- const evalcase = rawCases[lineIndex];
17195
+ const testCaseConfig = rawCases[lineIndex];
16937
17196
  const lineNumber = lineIndex + 1;
16938
- const id = asString4(evalcase.id);
16939
- if (filterPattern && (!id || !micromatch.isMatch(id, filterPattern))) {
17197
+ const id = asString4(testCaseConfig.id);
17198
+ if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
16940
17199
  continue;
16941
17200
  }
16942
- const conversationId = asString4(evalcase.conversation_id);
16943
- let outcome = asString4(evalcase.criteria);
16944
- if (!outcome && evalcase.expected_outcome !== void 0) {
16945
- outcome = asString4(evalcase.expected_outcome);
17201
+ const conversationId = asString4(testCaseConfig.conversation_id);
17202
+ let outcome = asString4(testCaseConfig.criteria);
17203
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
17204
+ outcome = asString4(testCaseConfig.expected_outcome);
16946
17205
  if (outcome) {
16947
17206
  logWarning4(
16948
- `Test '${asString4(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
17207
+ `Test '${asString4(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
16949
17208
  );
16950
17209
  }
16951
17210
  }
16952
- const rawInputMessages = resolveInputMessages(evalcase);
16953
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
16954
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
17211
+ const rawInputMessages = resolveInputMessages(testCaseConfig);
17212
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
17213
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== void 0;
16955
17214
  if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
16956
17215
  logError2(
16957
17216
  `Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
@@ -16988,18 +17247,23 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
16988
17247
  }
16989
17248
  }
16990
17249
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
16991
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
17250
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
16992
17251
  const mergedExecution = caseExecution ?? globalExecution;
16993
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
17252
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
16994
17253
  let evaluators;
16995
17254
  try {
16996
- evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
17255
+ evaluators = await parseEvaluators(
17256
+ testCaseConfig,
17257
+ mergedExecution,
17258
+ searchRoots,
17259
+ id ?? "unknown"
17260
+ );
16997
17261
  } catch (error) {
16998
17262
  const message = error instanceof Error ? error.message : String(error);
16999
17263
  logError2(`Skipping test '${id}' at line ${lineNumber}: ${message}`);
17000
17264
  continue;
17001
17265
  }
17002
- const inlineRubrics = evalcase.rubrics;
17266
+ const inlineRubrics = testCaseConfig.rubrics;
17003
17267
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
17004
17268
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
17005
17269
  if (rubricEvaluator) {
@@ -17010,7 +17274,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
17010
17274
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
17011
17275
  const testCase = {
17012
17276
  id,
17013
- dataset: evalSetName,
17277
+ suite: suiteName,
17014
17278
  conversation_id: conversationId,
17015
17279
  question,
17016
17280
  input: inputMessages,
@@ -17018,7 +17282,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
17018
17282
  reference_answer: referenceAnswer,
17019
17283
  file_paths: userFilePaths,
17020
17284
  criteria: outcome ?? "",
17021
- evaluator: evalCaseEvaluatorKind,
17285
+ evaluator: testCaseEvaluatorKind,
17022
17286
  assertions: evaluators
17023
17287
  };
17024
17288
  results.push(testCase);
@@ -17194,6 +17458,9 @@ function buildChatPromptFromSegments(options) {
17194
17458
  var ANSI_YELLOW6 = "\x1B[33m";
17195
17459
  var ANSI_RED3 = "\x1B[31m";
17196
17460
  var ANSI_RESET7 = "\x1B[0m";
17461
+ function matchesFilter2(id, filter2) {
17462
+ return typeof filter2 === "string" ? micromatch2.isMatch(id, filter2) : filter2.some((pattern) => micromatch2.isMatch(id, pattern));
17463
+ }
17197
17464
  function resolveTests(suite) {
17198
17465
  if (suite.tests !== void 0) return suite.tests;
17199
17466
  if (suite.eval_cases !== void 0) {
@@ -17273,18 +17540,18 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
17273
17540
  throw new Error(`Invalid test file format: ${evalFilePath}`);
17274
17541
  }
17275
17542
  const suite = interpolated;
17276
- const evalSetNameFromSuite = asString5(suite.name)?.trim();
17277
- const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
17278
- const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
17279
- const rawTestcases = resolveTests(suite);
17543
+ const suiteNameFromFile = asString5(suite.name)?.trim();
17544
+ const fallbackSuiteName = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
17545
+ const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
17546
+ const rawTestCases = resolveTests(suite);
17280
17547
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
17281
17548
  const evalFileDir = path7.dirname(absoluteTestPath);
17282
- let expandedTestcases;
17283
- if (typeof rawTestcases === "string") {
17284
- const externalPath = path7.resolve(evalFileDir, rawTestcases);
17285
- expandedTestcases = await loadCasesFromFile(externalPath);
17286
- } else if (Array.isArray(rawTestcases)) {
17287
- expandedTestcases = await expandFileReferences(rawTestcases, evalFileDir);
17549
+ let expandedTestCases;
17550
+ if (typeof rawTestCases === "string") {
17551
+ const externalPath = path7.resolve(evalFileDir, rawTestCases);
17552
+ expandedTestCases = await loadCasesFromFile(externalPath);
17553
+ } else if (Array.isArray(rawTestCases)) {
17554
+ expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
17288
17555
  } else {
17289
17556
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
17290
17557
  }
@@ -17299,32 +17566,33 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
17299
17566
  }
17300
17567
  const globalExecution = suiteAssertions !== void 0 ? { ...rawGlobalExecution ?? {}, assertions: suiteAssertions } : rawGlobalExecution;
17301
17568
  const results = [];
17302
- for (const rawEvalcase of expandedTestcases) {
17303
- if (!isJsonObject(rawEvalcase)) {
17569
+ for (const rawTestCase of expandedTestCases) {
17570
+ if (!isJsonObject(rawTestCase)) {
17304
17571
  logWarning5("Skipping invalid test entry (expected object)");
17305
17572
  continue;
17306
17573
  }
17307
- const evalcase = rawEvalcase;
17308
- const id = asString5(evalcase.id);
17309
- if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
17574
+ const testCaseConfig = rawTestCase;
17575
+ const id = asString5(testCaseConfig.id);
17576
+ if (filterPattern && (!id || !matchesFilter2(id, filterPattern))) {
17310
17577
  continue;
17311
17578
  }
17312
- const conversationId = asString5(evalcase.conversation_id);
17313
- let outcome = asString5(evalcase.criteria);
17314
- if (!outcome && evalcase.expected_outcome !== void 0) {
17315
- outcome = asString5(evalcase.expected_outcome);
17579
+ const conversationId = asString5(testCaseConfig.conversation_id);
17580
+ let outcome = asString5(testCaseConfig.criteria);
17581
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
17582
+ outcome = asString5(testCaseConfig.expected_outcome);
17316
17583
  if (outcome) {
17317
17584
  logWarning5(
17318
- `Test '${asString5(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
17585
+ `Test '${asString5(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
17319
17586
  );
17320
17587
  }
17321
17588
  }
17322
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
17589
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
17323
17590
  const skipDefaults = caseExecution?.skip_defaults === true;
17591
+ const caseThreshold = typeof caseExecution?.threshold === "number" && caseExecution.threshold >= 0 && caseExecution.threshold <= 1 ? caseExecution.threshold : void 0;
17324
17592
  const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
17325
- const testInputMessages = resolveInputMessages(evalcase, effectiveSuiteInputFiles);
17326
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
17327
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assertions !== void 0 || evalcase.assert !== void 0;
17593
+ const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
17594
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
17595
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
17328
17596
  if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
17329
17597
  logError3(
17330
17598
  `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
@@ -17371,16 +17639,21 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
17371
17639
  }
17372
17640
  }
17373
17641
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
17374
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
17642
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
17375
17643
  let evaluators;
17376
17644
  try {
17377
- evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
17645
+ evaluators = await parseEvaluators(
17646
+ testCaseConfig,
17647
+ globalExecution,
17648
+ searchRoots,
17649
+ id ?? "unknown"
17650
+ );
17378
17651
  } catch (error) {
17379
17652
  const message = error instanceof Error ? error.message : String(error);
17380
17653
  logError3(`Skipping test '${id}': ${message}`);
17381
17654
  continue;
17382
17655
  }
17383
- const inlineRubrics = evalcase.rubrics;
17656
+ const inlineRubrics = testCaseConfig.rubrics;
17384
17657
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
17385
17658
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
17386
17659
  if (rubricEvaluator) {
@@ -17389,13 +17662,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
17389
17662
  }
17390
17663
  warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
17391
17664
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
17392
- const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
17665
+ const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
17393
17666
  const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
17394
- const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
17395
- const caseTargets = extractTargetsFromTestCase(evalcase);
17667
+ const metadata = isJsonObject(testCaseConfig.metadata) ? testCaseConfig.metadata : void 0;
17668
+ const caseTargets = extractTargetsFromTestCase(testCaseConfig);
17396
17669
  const testCase = {
17397
17670
  id,
17398
- dataset: evalSetName,
17671
+ suite: suiteName,
17399
17672
  category: options?.category,
17400
17673
  conversation_id: conversationId,
17401
17674
  question,
@@ -17404,11 +17677,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
17404
17677
  reference_answer: referenceAnswer,
17405
17678
  file_paths: userFilePaths,
17406
17679
  criteria: outcome ?? "",
17407
- evaluator: evalCaseEvaluatorKind,
17680
+ evaluator: testCaseEvaluatorKind,
17408
17681
  assertions: evaluators,
17409
17682
  workspace: mergedWorkspace,
17410
17683
  metadata,
17411
- targets: caseTargets
17684
+ targets: caseTargets,
17685
+ ...caseThreshold !== void 0 ? { threshold: caseThreshold } : {}
17412
17686
  };
17413
17687
  results.push(testCase);
17414
17688
  }
@@ -17939,7 +18213,7 @@ var AzureProvider = class {
17939
18213
  };
17940
18214
  this.retryConfig = config.retry;
17941
18215
  const azure = createAzure(buildAzureOptions(config));
17942
- this.model = azure.chat(config.deploymentName);
18216
+ this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
17943
18217
  }
17944
18218
  id;
17945
18219
  kind = "azure";
@@ -18065,7 +18339,9 @@ function buildAzureOptions(config) {
18065
18339
  const options = {
18066
18340
  apiKey: config.apiKey,
18067
18341
  apiVersion: config.version,
18068
- useDeploymentBasedUrls: true
18342
+ // Chat completions still use deployment-scoped Azure URLs for compatibility
18343
+ // with existing deployments. Responses API should use the SDK's v1 path.
18344
+ useDeploymentBasedUrls: config.apiFormat !== "responses"
18069
18345
  };
18070
18346
  const baseURL = normalizeAzureBaseUrl(config.resourceName);
18071
18347
  if (baseURL) {
@@ -21575,6 +21851,22 @@ function extractAzureResourceName(baseUrl) {
21575
21851
  if (urlMatch) return urlMatch[1];
21576
21852
  return baseUrl;
21577
21853
  }
21854
+ function normalizeAzureSdkBaseUrl(baseUrl) {
21855
+ const trimmed = baseUrl.trim().replace(/\/+$/, "");
21856
+ if (!trimmed) {
21857
+ return trimmed;
21858
+ }
21859
+ if (!/^https?:\/\//i.test(trimmed)) {
21860
+ return `https://${trimmed}.openai.azure.com/openai/v1`;
21861
+ }
21862
+ if (/\/openai\/v1$/i.test(trimmed)) {
21863
+ return trimmed;
21864
+ }
21865
+ if (/\/openai$/i.test(trimmed)) {
21866
+ return `${trimmed}/v1`;
21867
+ }
21868
+ return `${trimmed}/openai/v1`;
21869
+ }
21578
21870
  function extractPiTextContent(content) {
21579
21871
  if (typeof content === "string") {
21580
21872
  return content;
@@ -22397,6 +22689,30 @@ async function defaultPiRunner(options) {
22397
22689
  });
22398
22690
  });
22399
22691
  }
22692
+ var logged = false;
22693
+ function getAgentvHome() {
22694
+ const envHome = process.env.AGENTV_HOME;
22695
+ if (envHome && envHome !== "undefined") {
22696
+ if (!logged) {
22697
+ logged = true;
22698
+ console.warn(`Using AGENTV_HOME: ${envHome}`);
22699
+ }
22700
+ return envHome;
22701
+ }
22702
+ return path20.join(os2.homedir(), ".agentv");
22703
+ }
22704
+ function getWorkspacesRoot() {
22705
+ return path20.join(getAgentvHome(), "workspaces");
22706
+ }
22707
+ function getSubagentsRoot() {
22708
+ return path20.join(getAgentvHome(), "subagents");
22709
+ }
22710
+ function getTraceStateRoot() {
22711
+ return path20.join(getAgentvHome(), "trace-state");
22712
+ }
22713
+ function getWorkspacePoolRoot() {
22714
+ return path20.join(getAgentvHome(), "workspace-pool");
22715
+ }
22400
22716
  var piCodingAgentModule = null;
22401
22717
  var piAiModule = null;
22402
22718
  var loadingPromise = null;
@@ -22414,46 +22730,126 @@ async function promptInstall() {
22414
22730
  rl.close();
22415
22731
  }
22416
22732
  }
22417
- function findAgentvRoot() {
22418
- const thisFile = fileURLToPath3(import.meta.url);
22419
- let dir = path20.dirname(thisFile);
22420
- for (let i = 0; i < 10; i++) {
22733
+ function findManagedSdkInstallRoot() {
22734
+ return path21.join(getAgentvHome(), "deps", "pi-sdk");
22735
+ }
22736
+ function resolveGlobalNpmRoot() {
22737
+ try {
22738
+ const root = execSync2("npm root -g", {
22739
+ encoding: "utf-8",
22740
+ stdio: ["ignore", "pipe", "ignore"]
22741
+ }).trim();
22742
+ return root.length > 0 ? root : void 0;
22743
+ } catch {
22744
+ return void 0;
22745
+ }
22746
+ }
22747
+ function buildGlobalModuleEntry(moduleName, globalNpmRoot) {
22748
+ return path21.join(globalNpmRoot, ...moduleName.split("/"), "dist", "index.js");
22749
+ }
22750
+ function findAccessiblePath(paths) {
22751
+ for (const candidate of paths) {
22421
22752
  try {
22422
- const pkg = path20.join(dir, "package.json");
22423
- accessSync2(pkg);
22424
- return dir;
22753
+ accessSync2(candidate);
22754
+ return candidate;
22425
22755
  } catch {
22426
- const parent = path20.dirname(dir);
22427
- if (parent === dir) break;
22428
- dir = parent;
22429
22756
  }
22430
22757
  }
22431
- return path20.dirname(thisFile);
22758
+ return void 0;
22432
22759
  }
22433
- async function doLoadSdkModules() {
22760
+ async function tryImportLocalSdkModules() {
22434
22761
  try {
22435
22762
  [piCodingAgentModule, piAiModule] = await Promise.all([
22436
22763
  import("@mariozechner/pi-coding-agent"),
22437
22764
  import("@mariozechner/pi-ai")
22438
22765
  ]);
22766
+ return true;
22439
22767
  } catch {
22440
- if (await promptInstall()) {
22441
- const installDir = findAgentvRoot();
22442
- console.error(`Installing @mariozechner/pi-coding-agent into ${installDir}...`);
22443
- execSync2("bun add @mariozechner/pi-coding-agent", {
22444
- cwd: installDir,
22445
- stdio: "inherit"
22446
- });
22447
- [piCodingAgentModule, piAiModule] = await Promise.all([
22448
- import("@mariozechner/pi-coding-agent"),
22449
- import("@mariozechner/pi-ai")
22450
- ]);
22451
- } else {
22452
- throw new Error(
22453
- "pi-coding-agent SDK is not installed. Install it with:\n bun add @mariozechner/pi-coding-agent"
22454
- );
22768
+ return false;
22769
+ }
22770
+ }
22771
+ async function tryImportManagedSdkModules() {
22772
+ const managedRoot = findManagedSdkInstallRoot();
22773
+ const piCodingAgentEntry = findAccessiblePath([
22774
+ path21.join(managedRoot, "node_modules", "@mariozechner", "pi-coding-agent", "dist", "index.js")
22775
+ ]);
22776
+ const piAiEntry = findAccessiblePath([
22777
+ path21.join(managedRoot, "node_modules", "@mariozechner", "pi-ai", "dist", "index.js"),
22778
+ path21.join(
22779
+ managedRoot,
22780
+ "node_modules",
22781
+ "@mariozechner",
22782
+ "pi-coding-agent",
22783
+ "node_modules",
22784
+ "@mariozechner",
22785
+ "pi-ai",
22786
+ "dist",
22787
+ "index.js"
22788
+ )
22789
+ ]);
22790
+ if (!piCodingAgentEntry || !piAiEntry) return false;
22791
+ try {
22792
+ [piCodingAgentModule, piAiModule] = await Promise.all([
22793
+ import(pathToFileURL(piCodingAgentEntry).href),
22794
+ import(pathToFileURL(piAiEntry).href)
22795
+ ]);
22796
+ return true;
22797
+ } catch {
22798
+ return false;
22799
+ }
22800
+ }
22801
+ async function tryImportGlobalSdkModules() {
22802
+ const globalNpmRoot = resolveGlobalNpmRoot();
22803
+ if (!globalNpmRoot) return false;
22804
+ const piCodingAgentEntry = findAccessiblePath([
22805
+ buildGlobalModuleEntry("@mariozechner/pi-coding-agent", globalNpmRoot)
22806
+ ]);
22807
+ const piAiEntry = findAccessiblePath([
22808
+ buildGlobalModuleEntry("@mariozechner/pi-ai", globalNpmRoot),
22809
+ path21.join(
22810
+ globalNpmRoot,
22811
+ "@mariozechner",
22812
+ "pi-coding-agent",
22813
+ "node_modules",
22814
+ "@mariozechner",
22815
+ "pi-ai",
22816
+ "dist",
22817
+ "index.js"
22818
+ )
22819
+ ]);
22820
+ if (!piCodingAgentEntry || !piAiEntry) return false;
22821
+ try {
22822
+ [piCodingAgentModule, piAiModule] = await Promise.all([
22823
+ import(pathToFileURL(piCodingAgentEntry).href),
22824
+ import(pathToFileURL(piAiEntry).href)
22825
+ ]);
22826
+ return true;
22827
+ } catch {
22828
+ return false;
22829
+ }
22830
+ }
22831
+ function installSdkModules(installDir) {
22832
+ console.error(`Installing @mariozechner/pi-coding-agent into ${installDir} via npm...`);
22833
+ mkdirSync(installDir, { recursive: true });
22834
+ execSync2("npm install --no-save --no-package-lock @mariozechner/pi-coding-agent", {
22835
+ cwd: installDir,
22836
+ stdio: "inherit"
22837
+ });
22838
+ }
22839
+ async function doLoadSdkModules() {
22840
+ if (await tryImportLocalSdkModules() || await tryImportManagedSdkModules() || await tryImportGlobalSdkModules()) {
22841
+ return;
22842
+ }
22843
+ if (await promptInstall()) {
22844
+ const installDir = findManagedSdkInstallRoot();
22845
+ installSdkModules(installDir);
22846
+ if (await tryImportManagedSdkModules()) {
22847
+ return;
22455
22848
  }
22456
22849
  }
22850
+ throw new Error(
22851
+ "pi-coding-agent SDK is not installed. Install it with:\n npm install @mariozechner/pi-coding-agent"
22852
+ );
22457
22853
  }
22458
22854
  async function loadSdkModules() {
22459
22855
  if (!piCodingAgentModule || !piAiModule) {
@@ -22510,12 +22906,16 @@ var PiCodingAgentProvider = class {
22510
22906
  try {
22511
22907
  const cwd = this.resolveCwd(request.cwd);
22512
22908
  const rawProvider = this.config.subprovider ?? "google";
22513
- const hasBaseUrl = !!this.config.baseUrl;
22909
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(rawProvider, this.config.baseUrl);
22910
+ const hasBaseUrl = !!normalizedBaseUrl;
22514
22911
  const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
22515
22912
  const modelId = this.config.model ?? "gemini-2.5-flash";
22516
22913
  this.setApiKeyEnv(rawProvider, hasBaseUrl);
22517
- this.setBaseUrlEnv(rawProvider, hasBaseUrl);
22914
+ this.setBaseUrlEnv(rawProvider, normalizedBaseUrl, hasBaseUrl);
22518
22915
  let model = sdk.getModel(providerName, modelId);
22916
+ if (model && normalizedBaseUrl) {
22917
+ model = { ...model, baseUrl: normalizedBaseUrl };
22918
+ }
22519
22919
  if (!model) {
22520
22920
  const envProvider = providerName.replace(/-responses$/, "");
22521
22921
  model = {
@@ -22523,7 +22923,7 @@ var PiCodingAgentProvider = class {
22523
22923
  name: modelId,
22524
22924
  api: providerName,
22525
22925
  provider: envProvider,
22526
- baseUrl: this.config.baseUrl ?? "",
22926
+ baseUrl: normalizedBaseUrl ?? "",
22527
22927
  reasoning: false,
22528
22928
  input: ["text"],
22529
22929
  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
@@ -22690,19 +23090,27 @@ ${fileList}`;
22690
23090
  }
22691
23091
  }
22692
23092
  /** Maps config baseUrl to the provider-specific env var the SDK reads. */
22693
- setBaseUrlEnv(providerName, hasBaseUrl = false) {
22694
- if (!this.config.baseUrl) return;
23093
+ setBaseUrlEnv(providerName, baseUrl = this.config.baseUrl, hasBaseUrl = false) {
23094
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(providerName, baseUrl);
23095
+ if (!normalizedBaseUrl) return;
22695
23096
  const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
22696
23097
  if (envKey) {
22697
- process.env[envKey] = this.config.baseUrl;
23098
+ process.env[envKey] = normalizedBaseUrl;
22698
23099
  }
22699
23100
  }
23101
+ normalizeSdkBaseUrl(providerName, baseUrl) {
23102
+ if (!baseUrl) return void 0;
23103
+ if (providerName.toLowerCase() === "azure") {
23104
+ return normalizeAzureSdkBaseUrl(baseUrl);
23105
+ }
23106
+ return baseUrl;
23107
+ }
22700
23108
  resolveCwd(cwdOverride) {
22701
23109
  if (cwdOverride) {
22702
- return path20.resolve(cwdOverride);
23110
+ return path21.resolve(cwdOverride);
22703
23111
  }
22704
23112
  if (this.config.cwd) {
22705
- return path20.resolve(this.config.cwd);
23113
+ return path21.resolve(this.config.cwd);
22706
23114
  }
22707
23115
  return process.cwd();
22708
23116
  }
@@ -22721,9 +23129,9 @@ ${fileList}`;
22721
23129
  }
22722
23130
  resolveLogDirectory() {
22723
23131
  if (this.config.logDir) {
22724
- return path20.resolve(this.config.logDir);
23132
+ return path21.resolve(this.config.logDir);
22725
23133
  }
22726
- return path20.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
23134
+ return path21.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
22727
23135
  }
22728
23136
  async createStreamLogger(request) {
22729
23137
  const logDir = this.resolveLogDirectory();
@@ -22737,7 +23145,7 @@ ${fileList}`;
22737
23145
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
22738
23146
  return void 0;
22739
23147
  }
22740
- const filePath = path20.join(logDir, buildLogFilename6(request, this.targetName));
23148
+ const filePath = path21.join(logDir, buildLogFilename6(request, this.targetName));
22741
23149
  try {
22742
23150
  const logger = await PiStreamLogger2.create({
22743
23151
  filePath,
@@ -22961,7 +23369,7 @@ async function readDirEntries(target) {
22961
23369
  const entries = await readdir2(target, { withFileTypes: true });
22962
23370
  return entries.map((entry) => ({
22963
23371
  name: entry.name,
22964
- absolutePath: path21.join(target, entry.name),
23372
+ absolutePath: path222.join(target, entry.name),
22965
23373
  isDirectory: entry.isDirectory()
22966
23374
  }));
22967
23375
  }
@@ -22975,7 +23383,7 @@ async function removeIfExists(target) {
22975
23383
  }
22976
23384
  }
22977
23385
  function pathToFileUri2(filePath) {
22978
- const absolutePath = path222.isAbsolute(filePath) ? filePath : path222.resolve(filePath);
23386
+ const absolutePath = path23.isAbsolute(filePath) ? filePath : path23.resolve(filePath);
22979
23387
  const normalizedPath = absolutePath.replace(/\\/g, "/");
22980
23388
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
22981
23389
  return `file:///${normalizedPath}`;
@@ -23067,8 +23475,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
23067
23475
  });
23068
23476
  }
23069
23477
  function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
23070
- const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path23.basename(file)}`).join("\n");
23071
- const responseList = responseFiles.map((file) => `"${path23.basename(file)}"`).join(", ");
23478
+ const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path24.basename(file)}`).join("\n");
23479
+ const responseList = responseFiles.map((file) => `"${path24.basename(file)}"`).join(", ");
23072
23480
  return renderTemplate2(templateContent, {
23073
23481
  requestFiles: requestLines,
23074
23482
  responseList
@@ -23128,7 +23536,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
23128
23536
  }
23129
23537
  async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false, timeoutMs = DEFAULT_TIMEOUT_MS) {
23130
23538
  if (!silent) {
23131
- const fileList = responseFilesFinal.map((file) => path24.basename(file)).join(", ");
23539
+ const fileList = responseFilesFinal.map((file) => path25.basename(file)).join(", ");
23132
23540
  console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
23133
23541
  }
23134
23542
  const deadline = Date.now() + timeoutMs;
@@ -23137,7 +23545,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
23137
23545
  while (pending.size > 0) {
23138
23546
  if (Date.now() >= deadline) {
23139
23547
  if (!silent) {
23140
- const remaining = [...pending].map((f) => path24.basename(f)).join(", ");
23548
+ const remaining = [...pending].map((f) => path25.basename(f)).join(", ");
23141
23549
  console.error(
23142
23550
  `error: timed out after ${Math.round(timeoutMs / 1e3)}s waiting for batch responses. Still pending: ${remaining}`
23143
23551
  );
@@ -23184,30 +23592,6 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
23184
23592
  }
23185
23593
  return true;
23186
23594
  }
23187
- var logged = false;
23188
- function getAgentvHome() {
23189
- const envHome = process.env.AGENTV_HOME;
23190
- if (envHome && envHome !== "undefined") {
23191
- if (!logged) {
23192
- logged = true;
23193
- console.warn(`Using AGENTV_HOME: ${envHome}`);
23194
- }
23195
- return envHome;
23196
- }
23197
- return path25.join(os2.homedir(), ".agentv");
23198
- }
23199
- function getWorkspacesRoot() {
23200
- return path25.join(getAgentvHome(), "workspaces");
23201
- }
23202
- function getSubagentsRoot() {
23203
- return path25.join(getAgentvHome(), "subagents");
23204
- }
23205
- function getTraceStateRoot() {
23206
- return path25.join(getAgentvHome(), "trace-state");
23207
- }
23208
- function getWorkspacePoolRoot() {
23209
- return path25.join(getAgentvHome(), "workspace-pool");
23210
- }
23211
23595
  var DEFAULT_LOCK_NAME = "subagent.lock";
23212
23596
  var DEFAULT_ALIVE_FILENAME = ".alive";
23213
23597
  function getDefaultSubagentRoot(vscodeCmd = "code") {
@@ -24428,9 +24812,10 @@ function resolveAndCreateProvider(definition, env = process.env) {
24428
24812
  const resolved = resolveTargetDefinition(definition, env);
24429
24813
  return createProvider(resolved);
24430
24814
  }
24431
- var PASS_THRESHOLD = 0.8;
24432
- function scoreToVerdict(score) {
24433
- return score >= PASS_THRESHOLD ? "pass" : "fail";
24815
+ var DEFAULT_THRESHOLD = 0.8;
24816
+ var PASS_THRESHOLD = DEFAULT_THRESHOLD;
24817
+ function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
24818
+ return score >= threshold ? "pass" : "fail";
24434
24819
  }
24435
24820
  function clampScore(value) {
24436
24821
  if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -24612,13 +24997,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
24612
24997
  async function execShellWithStdin(command, stdinPayload, options = {}) {
24613
24998
  const { mkdir: mkdir16, readFile: readFile15, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
24614
24999
  const { tmpdir: tmpdir3 } = await import("node:os");
24615
- const path49 = await import("node:path");
25000
+ const path50 = await import("node:path");
24616
25001
  const { randomUUID: randomUUID10 } = await import("node:crypto");
24617
- const dir = path49.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
25002
+ const dir = path50.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
24618
25003
  await mkdir16(dir, { recursive: true });
24619
- const stdinPath = path49.join(dir, "stdin.txt");
24620
- const stdoutPath = path49.join(dir, "stdout.txt");
24621
- const stderrPath = path49.join(dir, "stderr.txt");
25004
+ const stdinPath = path50.join(dir, "stdin.txt");
25005
+ const stdoutPath = path50.join(dir, "stdout.txt");
25006
+ const stderrPath = path50.join(dir, "stderr.txt");
24622
25007
  await writeFile9(stdinPath, stdinPayload, "utf8");
24623
25008
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
24624
25009
  const { spawn: spawn5 } = await import("node:child_process");
@@ -25799,7 +26184,7 @@ ${outputSchema2}`;
25799
26184
  parts.push("[[ ## scoring_criteria ## ]]");
25800
26185
  for (const rubric of rubrics) {
25801
26186
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
25802
- const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
26187
+ const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
25803
26188
  parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
25804
26189
  if (rubric.outcome) {
25805
26190
  parts.push(`Description: ${rubric.outcome}`);
@@ -25853,54 +26238,106 @@ ${outputSchema2}`;
25853
26238
  async runWithRetry(options) {
25854
26239
  const { context: context2, graderProvider, systemPrompt, userPrompt, schema, images } = options;
25855
26240
  let lastError;
26241
+ let lastInvalidResponse;
26242
+ let shouldAttemptStructureFix = false;
25856
26243
  for (let attempt = 1; attempt <= 3; attempt++) {
25857
26244
  try {
25858
- const model = graderProvider.asLanguageModel?.();
25859
- if (model) {
25860
- const modelOptions = {
25861
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
25862
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
25863
- };
25864
- const hasImages = images && images.length > 0;
25865
- const result = hasImages ? await generateText({
25866
- model,
25867
- system: systemPrompt,
25868
- messages: [
25869
- {
25870
- role: "user",
25871
- content: [
25872
- { type: "text", text: userPrompt },
25873
- ...toAiSdkImageParts(images)
25874
- ]
25875
- }
25876
- ],
25877
- ...modelOptions
25878
- }) : await generateText({
25879
- model,
25880
- system: systemPrompt,
25881
- prompt: userPrompt,
25882
- ...modelOptions
25883
- });
25884
- const data2 = schema.parse(parseJsonFromText(result.text));
25885
- const rawUsage = result.usage;
25886
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
25887
- return { data: data2, tokenUsage };
26245
+ const result = await this.generateStructuredResponse({
26246
+ context: context2,
26247
+ graderProvider,
26248
+ systemPrompt,
26249
+ userPrompt,
26250
+ images
26251
+ });
26252
+ const canRepairResponse = result.text.trim().length > 0;
26253
+ lastInvalidResponse = canRepairResponse ? result : void 0;
26254
+ let data;
26255
+ try {
26256
+ data = schema.parse(parseJsonFromText(result.text));
26257
+ } catch (e) {
26258
+ lastError = e instanceof Error ? e : new Error(String(e));
26259
+ shouldAttemptStructureFix = canRepairResponse;
26260
+ continue;
25888
26261
  }
25889
- const response = await graderProvider.invoke({
25890
- question: userPrompt,
26262
+ return {
26263
+ data,
26264
+ providerResponse: result.providerResponse,
26265
+ tokenUsage: result.tokenUsage
26266
+ };
26267
+ } catch (e) {
26268
+ lastError = e instanceof Error ? e : new Error(String(e));
26269
+ }
26270
+ }
26271
+ if (shouldAttemptStructureFix && lastInvalidResponse) {
26272
+ try {
26273
+ const repaired = await this.generateStructuredResponse({
26274
+ context: context2,
26275
+ graderProvider,
25891
26276
  systemPrompt,
25892
- evalCaseId: context2.evalCase.id,
25893
- attempt: context2.attempt,
25894
- maxOutputTokens: this.maxOutputTokens,
25895
- temperature: this.temperature
26277
+ userPrompt: buildStructureRepairPrompt({
26278
+ validationError: lastError?.message ?? "Schema validation failed",
26279
+ invalidResponse: lastInvalidResponse.text
26280
+ })
25896
26281
  });
25897
- const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
25898
- return { data, providerResponse: response, tokenUsage: response.tokenUsage };
26282
+ const data = schema.parse(parseJsonFromText(repaired.text));
26283
+ return {
26284
+ data,
26285
+ providerResponse: repaired.providerResponse,
26286
+ tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage)
26287
+ };
25899
26288
  } catch (e) {
25900
26289
  lastError = e instanceof Error ? e : new Error(String(e));
25901
26290
  }
25902
26291
  }
25903
- throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
26292
+ throw new Error(
26293
+ `Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`
26294
+ );
26295
+ }
26296
+ async generateStructuredResponse(options) {
26297
+ const { context: context2, graderProvider, systemPrompt, userPrompt, images } = options;
26298
+ const model = graderProvider.asLanguageModel?.();
26299
+ if (model) {
26300
+ const modelOptions = {
26301
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
26302
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
26303
+ };
26304
+ const hasImages = images && images.length > 0;
26305
+ const result = hasImages ? await generateText({
26306
+ model,
26307
+ system: systemPrompt,
26308
+ messages: [
26309
+ {
26310
+ role: "user",
26311
+ content: [
26312
+ { type: "text", text: userPrompt },
26313
+ ...toAiSdkImageParts(images)
26314
+ ]
26315
+ }
26316
+ ],
26317
+ ...modelOptions
26318
+ }) : await generateText({
26319
+ model,
26320
+ system: systemPrompt,
26321
+ prompt: userPrompt,
26322
+ ...modelOptions
26323
+ });
26324
+ const rawUsage = result.usage;
26325
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
26326
+ return { text: result.text, tokenUsage };
26327
+ }
26328
+ const response = await graderProvider.invoke({
26329
+ question: userPrompt,
26330
+ systemPrompt,
26331
+ evalCaseId: context2.evalCase.id,
26332
+ attempt: context2.attempt,
26333
+ maxOutputTokens: this.maxOutputTokens,
26334
+ temperature: this.temperature
26335
+ });
26336
+ return {
26337
+ text: extractLastAssistantContent(response.output),
26338
+ providerResponse: response,
26339
+ tokenUsage: response.tokenUsage
26340
+ };
25904
26341
  }
25905
26342
  };
25906
26343
  function buildOutputSchema() {
@@ -25920,6 +26357,29 @@ function buildOutputSchema() {
25920
26357
  "}"
25921
26358
  ].join("\n");
25922
26359
  }
26360
+ function buildStructureRepairPrompt(options) {
26361
+ const { validationError, invalidResponse } = options;
26362
+ return [
26363
+ "The following evaluation response has useful grading content but invalid JSON structure.",
26364
+ "Repair it to satisfy the schema in the system prompt.",
26365
+ "Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.",
26366
+ "",
26367
+ "Validation error:",
26368
+ validationError,
26369
+ "",
26370
+ "Invalid response:",
26371
+ invalidResponse
26372
+ ].join("\n");
26373
+ }
26374
+ function sumTokenUsage(first, second) {
26375
+ if (!first && !second) {
26376
+ return void 0;
26377
+ }
26378
+ return {
26379
+ input: (first?.input ?? 0) + (second?.input ?? 0),
26380
+ output: (first?.output ?? 0) + (second?.output ?? 0)
26381
+ };
26382
+ }
25923
26383
  function buildRubricOutputSchema() {
25924
26384
  return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
25925
26385
  You must return a valid JSON object matching this schema:
@@ -26019,19 +26479,21 @@ function calculateScoreRangeResult(result, rubrics) {
26019
26479
  rawScores[rubric.id] = rawScore;
26020
26480
  totalWeight += rubric.weight;
26021
26481
  weightedScoreSum += normalizedScore * rubric.weight;
26022
- let requiredMinScore;
26023
- if (rubric.required_min_score !== void 0) {
26024
- requiredMinScore = rubric.required_min_score;
26482
+ let minScoreThreshold;
26483
+ if (rubric.min_score !== void 0) {
26484
+ minScoreThreshold = rubric.min_score;
26485
+ } else if (rubric.required_min_score !== void 0) {
26486
+ minScoreThreshold = rubric.required_min_score / 10;
26025
26487
  } else if (rubric.required === true) {
26026
- requiredMinScore = 10;
26488
+ minScoreThreshold = 1;
26027
26489
  }
26028
26490
  const matchingRange = rubric.score_ranges?.find(
26029
26491
  (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
26030
26492
  );
26031
26493
  const rangeDescription = matchingRange?.outcome ?? "";
26032
26494
  const criterionLabel = rubric.outcome ?? rubric.id;
26033
- const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
26034
- if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
26495
+ const passed = !(minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) && rawScore >= 7;
26496
+ if (minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) {
26035
26497
  failedRequired = true;
26036
26498
  }
26037
26499
  assertions.push({
@@ -26108,11 +26570,11 @@ function createFilesystemTools(workspacePath) {
26108
26570
  execute: async (input) => {
26109
26571
  try {
26110
26572
  const resolved = resolveSandboxed(workspacePath, input.path);
26111
- const stat10 = await fs2.stat(resolved);
26112
- if (stat10.isDirectory()) {
26573
+ const stat11 = await fs2.stat(resolved);
26574
+ if (stat11.isDirectory()) {
26113
26575
  return { error: `'${input.path}' is a directory, not a file` };
26114
26576
  }
26115
- const buffer = Buffer.alloc(Math.min(stat10.size, MAX_FILE_SIZE));
26577
+ const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
26116
26578
  const fd = await fs2.open(resolved, "r");
26117
26579
  try {
26118
26580
  await fd.read(buffer, 0, buffer.length, 0);
@@ -26120,8 +26582,8 @@ function createFilesystemTools(workspacePath) {
26120
26582
  await fd.close();
26121
26583
  }
26122
26584
  const content = buffer.toString("utf-8");
26123
- const truncated = stat10.size > MAX_FILE_SIZE;
26124
- return { content, truncated, size: stat10.size };
26585
+ const truncated = stat11.size > MAX_FILE_SIZE;
26586
+ return { content, truncated, size: stat11.size };
26125
26587
  } catch (error) {
26126
26588
  return { error: error instanceof Error ? error.message : String(error) };
26127
26589
  }
@@ -26172,8 +26634,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
26172
26634
  const ext = path35.extname(entry.name).toLowerCase();
26173
26635
  if (BINARY_EXTENSIONS.has(ext)) continue;
26174
26636
  try {
26175
- const stat10 = await fs2.stat(fullPath);
26176
- if (stat10.size > MAX_FILE_SIZE) continue;
26637
+ const stat11 = await fs2.stat(fullPath);
26638
+ if (stat11.size > MAX_FILE_SIZE) continue;
26177
26639
  const content = await fs2.readFile(fullPath, "utf-8");
26178
26640
  const lines = content.split("\n");
26179
26641
  for (let i = 0; i < lines.length; i++) {
@@ -26806,115 +27268,115 @@ var FieldAccuracyEvaluator = class {
26806
27268
  * Evaluate a single field against the expected value.
26807
27269
  */
26808
27270
  evaluateField(fieldConfig, candidateData, expectedData) {
26809
- const { path: path49, match, required = true, weight = 1 } = fieldConfig;
26810
- const candidateValue = resolvePath(candidateData, path49);
26811
- const expectedValue = resolvePath(expectedData, path49);
27271
+ const { path: path50, match, required = true, weight = 1 } = fieldConfig;
27272
+ const candidateValue = resolvePath(candidateData, path50);
27273
+ const expectedValue = resolvePath(expectedData, path50);
26812
27274
  if (expectedValue === void 0) {
26813
27275
  return {
26814
- path: path49,
27276
+ path: path50,
26815
27277
  score: 1,
26816
27278
  // No expected value means no comparison needed
26817
27279
  weight,
26818
27280
  hit: true,
26819
- message: `${path49}: no expected value`
27281
+ message: `${path50}: no expected value`
26820
27282
  };
26821
27283
  }
26822
27284
  if (candidateValue === void 0) {
26823
27285
  if (required) {
26824
27286
  return {
26825
- path: path49,
27287
+ path: path50,
26826
27288
  score: 0,
26827
27289
  weight,
26828
27290
  hit: false,
26829
- message: `${path49} (required, missing)`
27291
+ message: `${path50} (required, missing)`
26830
27292
  };
26831
27293
  }
26832
27294
  return {
26833
- path: path49,
27295
+ path: path50,
26834
27296
  score: 1,
26835
27297
  // Don't penalize missing optional fields
26836
27298
  weight: 0,
26837
27299
  // Zero weight means it won't affect the score
26838
27300
  hit: true,
26839
- message: `${path49}: optional field missing`
27301
+ message: `${path50}: optional field missing`
26840
27302
  };
26841
27303
  }
26842
27304
  switch (match) {
26843
27305
  case "exact":
26844
- return this.compareExact(path49, candidateValue, expectedValue, weight);
27306
+ return this.compareExact(path50, candidateValue, expectedValue, weight);
26845
27307
  case "numeric_tolerance":
26846
27308
  return this.compareNumericTolerance(
26847
- path49,
27309
+ path50,
26848
27310
  candidateValue,
26849
27311
  expectedValue,
26850
27312
  fieldConfig,
26851
27313
  weight
26852
27314
  );
26853
27315
  case "date":
26854
- return this.compareDate(path49, candidateValue, expectedValue, fieldConfig, weight);
27316
+ return this.compareDate(path50, candidateValue, expectedValue, fieldConfig, weight);
26855
27317
  default:
26856
27318
  return {
26857
- path: path49,
27319
+ path: path50,
26858
27320
  score: 0,
26859
27321
  weight,
26860
27322
  hit: false,
26861
- message: `${path49}: unknown match type "${match}"`
27323
+ message: `${path50}: unknown match type "${match}"`
26862
27324
  };
26863
27325
  }
26864
27326
  }
26865
27327
  /**
26866
27328
  * Exact equality comparison.
26867
27329
  */
26868
- compareExact(path49, candidateValue, expectedValue, weight) {
27330
+ compareExact(path50, candidateValue, expectedValue, weight) {
26869
27331
  if (deepEqual(candidateValue, expectedValue)) {
26870
27332
  return {
26871
- path: path49,
27333
+ path: path50,
26872
27334
  score: 1,
26873
27335
  weight,
26874
27336
  hit: true,
26875
- message: path49
27337
+ message: path50
26876
27338
  };
26877
27339
  }
26878
27340
  if (typeof candidateValue !== typeof expectedValue) {
26879
27341
  return {
26880
- path: path49,
27342
+ path: path50,
26881
27343
  score: 0,
26882
27344
  weight,
26883
27345
  hit: false,
26884
- message: `${path49} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
27346
+ message: `${path50} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
26885
27347
  };
26886
27348
  }
26887
27349
  return {
26888
- path: path49,
27350
+ path: path50,
26889
27351
  score: 0,
26890
27352
  weight,
26891
27353
  hit: false,
26892
- message: `${path49} (value mismatch)`
27354
+ message: `${path50} (value mismatch)`
26893
27355
  };
26894
27356
  }
26895
27357
  /**
26896
27358
  * Numeric comparison with absolute or relative tolerance.
26897
27359
  */
26898
- compareNumericTolerance(path49, candidateValue, expectedValue, fieldConfig, weight) {
27360
+ compareNumericTolerance(path50, candidateValue, expectedValue, fieldConfig, weight) {
26899
27361
  const { tolerance = 0, relative = false } = fieldConfig;
26900
27362
  const candidateNum = toNumber(candidateValue);
26901
27363
  const expectedNum = toNumber(expectedValue);
26902
27364
  if (candidateNum === null || expectedNum === null) {
26903
27365
  return {
26904
- path: path49,
27366
+ path: path50,
26905
27367
  score: 0,
26906
27368
  weight,
26907
27369
  hit: false,
26908
- message: `${path49} (non-numeric value)`
27370
+ message: `${path50} (non-numeric value)`
26909
27371
  };
26910
27372
  }
26911
27373
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
26912
27374
  return {
26913
- path: path49,
27375
+ path: path50,
26914
27376
  score: 0,
26915
27377
  weight,
26916
27378
  hit: false,
26917
- message: `${path49} (invalid numeric value)`
27379
+ message: `${path50} (invalid numeric value)`
26918
27380
  };
26919
27381
  }
26920
27382
  const diff = Math.abs(candidateNum - expectedNum);
@@ -26927,61 +27389,61 @@ var FieldAccuracyEvaluator = class {
26927
27389
  }
26928
27390
  if (withinTolerance) {
26929
27391
  return {
26930
- path: path49,
27392
+ path: path50,
26931
27393
  score: 1,
26932
27394
  weight,
26933
27395
  hit: true,
26934
- message: `${path49} (within tolerance: diff=${diff.toFixed(2)})`
27396
+ message: `${path50} (within tolerance: diff=${diff.toFixed(2)})`
26935
27397
  };
26936
27398
  }
26937
27399
  return {
26938
- path: path49,
27400
+ path: path50,
26939
27401
  score: 0,
26940
27402
  weight,
26941
27403
  hit: false,
26942
- message: `${path49} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
27404
+ message: `${path50} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
26943
27405
  };
26944
27406
  }
26945
27407
  /**
26946
27408
  * Date comparison with format normalization.
26947
27409
  */
26948
- compareDate(path49, candidateValue, expectedValue, fieldConfig, weight) {
27410
+ compareDate(path50, candidateValue, expectedValue, fieldConfig, weight) {
26949
27411
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
26950
27412
  const candidateDate = parseDate(String(candidateValue), formats);
26951
27413
  const expectedDate = parseDate(String(expectedValue), formats);
26952
27414
  if (candidateDate === null) {
26953
27415
  return {
26954
- path: path49,
27416
+ path: path50,
26955
27417
  score: 0,
26956
27418
  weight,
26957
27419
  hit: false,
26958
- message: `${path49} (unparseable candidate date)`
27420
+ message: `${path50} (unparseable candidate date)`
26959
27421
  };
26960
27422
  }
26961
27423
  if (expectedDate === null) {
26962
27424
  return {
26963
- path: path49,
27425
+ path: path50,
26964
27426
  score: 0,
26965
27427
  weight,
26966
27428
  hit: false,
26967
- message: `${path49} (unparseable expected date)`
27429
+ message: `${path50} (unparseable expected date)`
26968
27430
  };
26969
27431
  }
26970
27432
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
26971
27433
  return {
26972
- path: path49,
27434
+ path: path50,
26973
27435
  score: 1,
26974
27436
  weight,
26975
27437
  hit: true,
26976
- message: path49
27438
+ message: path50
26977
27439
  };
26978
27440
  }
26979
27441
  return {
26980
- path: path49,
27442
+ path: path50,
26981
27443
  score: 0,
26982
27444
  weight,
26983
27445
  hit: false,
26984
- message: `${path49} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
27446
+ message: `${path50} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
26985
27447
  };
26986
27448
  }
26987
27449
  /**
@@ -27014,11 +27476,11 @@ var FieldAccuracyEvaluator = class {
27014
27476
  };
27015
27477
  }
27016
27478
  };
27017
- function resolvePath(obj, path49) {
27018
- if (!path49 || !obj) {
27479
+ function resolvePath(obj, path50) {
27480
+ if (!path50 || !obj) {
27019
27481
  return void 0;
27020
27482
  }
27021
- const parts = path49.split(/\.|\[|\]/).filter((p) => p.length > 0);
27483
+ const parts = path50.split(/\.|\[|\]/).filter((p) => p.length > 0);
27022
27484
  let current = obj;
27023
27485
  for (const part of parts) {
27024
27486
  if (current === null || current === void 0) {
@@ -27500,8 +27962,8 @@ var TokenUsageEvaluator = class {
27500
27962
  };
27501
27963
  }
27502
27964
  };
27503
- function getNestedValue(obj, path49) {
27504
- const parts = path49.split(".");
27965
+ function getNestedValue(obj, path50) {
27966
+ const parts = path50.split(".");
27505
27967
  let current = obj;
27506
27968
  for (const part of parts) {
27507
27969
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -29224,7 +29686,7 @@ var WorkspacePoolManager = class {
29224
29686
  }
29225
29687
  /**
29226
29688
  * Reset an existing slot for reuse:
29227
- * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
29689
+ * 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
29228
29690
  * 2. Re-copy template files (skip repo directories)
29229
29691
  */
29230
29692
  async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
@@ -29237,7 +29699,17 @@ var WorkspacePoolManager = class {
29237
29699
  continue;
29238
29700
  }
29239
29701
  const ref = repo.checkout?.ref ?? "HEAD";
29240
- await git(["reset", "--hard", ref], { cwd: repoDir });
29702
+ const resolve2 = repo.checkout?.resolve ?? "remote";
29703
+ if (resolve2 === "remote") {
29704
+ const fetchArgs = ["fetch", "origin", ref];
29705
+ if (repo.clone?.depth) {
29706
+ fetchArgs.splice(1, 0, "--depth", String(repo.clone.depth));
29707
+ }
29708
+ await git(fetchArgs, { cwd: repoDir });
29709
+ await git(["reset", "--hard", "FETCH_HEAD"], { cwd: repoDir });
29710
+ } else {
29711
+ await git(["reset", "--hard", ref], { cwd: repoDir });
29712
+ }
29241
29713
  const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
29242
29714
  await git(["clean", cleanFlag], { cwd: repoDir });
29243
29715
  }
@@ -29520,7 +29992,7 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
29520
29992
  }
29521
29993
  return result.stdout;
29522
29994
  }
29523
- function classifyQualityStatus(score, threshold = PASS_THRESHOLD) {
29995
+ function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
29524
29996
  return score >= threshold ? "ok" : "quality_failure";
29525
29997
  }
29526
29998
  function buildSkippedEvaluatorError(scores) {
@@ -29612,7 +30084,7 @@ async function runEvaluation(options) {
29612
30084
  const filteredEvalCases = filterEvalCases(evalCases, filter2);
29613
30085
  if (filteredEvalCases.length === 0) {
29614
30086
  if (filter2) {
29615
- throw new Error(`No tests matched filter '${filter2}' in ${evalFilePath}`);
30087
+ throw new Error(`No tests matched filter '${formatFilter(filter2)}' in ${evalFilePath}`);
29616
30088
  }
29617
30089
  return [];
29618
30090
  }
@@ -29664,6 +30136,9 @@ async function runEvaluation(options) {
29664
30136
  const graderName = targetContext.graderTarget ?? targetContext.name;
29665
30137
  const resolvedGrader = resolveTargetByName(graderName);
29666
30138
  if (!resolvedGrader) {
30139
+ if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
30140
+ return void 0;
30141
+ }
29667
30142
  return getOrCreateProvider(targetContext);
29668
30143
  }
29669
30144
  return getOrCreateProvider(resolvedGrader);
@@ -29994,7 +30469,7 @@ async function runEvaluation(options) {
29994
30469
  const budgetResult = {
29995
30470
  timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
29996
30471
  testId: evalCase.id,
29997
- dataset: evalCase.dataset,
30472
+ suite: evalCase.suite,
29998
30473
  category: evalCase.category,
29999
30474
  score: 0,
30000
30475
  assertions: [],
@@ -30031,7 +30506,7 @@ async function runEvaluation(options) {
30031
30506
  const haltResult = {
30032
30507
  timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
30033
30508
  testId: evalCase.id,
30034
- dataset: evalCase.dataset,
30509
+ suite: evalCase.suite,
30035
30510
  category: evalCase.category,
30036
30511
  score: 0,
30037
30512
  assertions: [],
@@ -30343,7 +30818,7 @@ async function runBatchEvaluation(options) {
30343
30818
  targetResolver,
30344
30819
  availableTargets,
30345
30820
  verbose,
30346
- threshold: batchThreshold
30821
+ threshold: evalCase.threshold ?? batchThreshold
30347
30822
  });
30348
30823
  if (providerError) {
30349
30824
  result = {
@@ -30805,8 +31280,9 @@ async function runEvalCase(options) {
30805
31280
  fileChanges,
30806
31281
  workspacePath,
30807
31282
  verbose,
30808
- threshold: caseThreshold
31283
+ threshold: evalCase.threshold ?? caseThreshold
30809
31284
  });
31285
+ const effectiveThreshold = evalCase.threshold ?? caseThreshold;
30810
31286
  const totalDurationMs = Date.now() - caseStartMs;
30811
31287
  const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
30812
31288
  const evalRunTokenUsage = tokenUsage || graderTokens ? {
@@ -30820,7 +31296,7 @@ async function runEvalCase(options) {
30820
31296
  ...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
30821
31297
  };
30822
31298
  const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
30823
- const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, caseThreshold);
31299
+ const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, effectiveThreshold);
30824
31300
  const targetUsedField = targetUsed ? { targetUsed } : {};
30825
31301
  const finalResult = providerError ? {
30826
31302
  ...result,
@@ -31021,7 +31497,8 @@ async function evaluateCandidate(options) {
31021
31497
  targetResolver,
31022
31498
  availableTargets,
31023
31499
  fileChanges,
31024
- workspacePath
31500
+ workspacePath,
31501
+ threshold: evalThreshold
31025
31502
  });
31026
31503
  const completedAt = nowFn();
31027
31504
  let agentRequest;
@@ -31052,7 +31529,7 @@ async function evaluateCandidate(options) {
31052
31529
  return {
31053
31530
  timestamp: completedAt.toISOString(),
31054
31531
  testId: evalCase.id,
31055
- dataset: evalCase.dataset,
31532
+ suite: evalCase.suite,
31056
31533
  category: evalCase.category,
31057
31534
  conversationId: evalCase.conversation_id,
31058
31535
  score: score.score,
@@ -31095,7 +31572,8 @@ async function runEvaluatorsForCase(options) {
31095
31572
  targetResolver,
31096
31573
  availableTargets,
31097
31574
  fileChanges,
31098
- workspacePath
31575
+ workspacePath,
31576
+ threshold
31099
31577
  } = options;
31100
31578
  if (evalCase.assertions && evalCase.assertions.length > 0) {
31101
31579
  return runEvaluatorList({
@@ -31121,7 +31599,8 @@ async function runEvaluatorsForCase(options) {
31121
31599
  targetResolver,
31122
31600
  availableTargets,
31123
31601
  fileChanges,
31124
- workspacePath
31602
+ workspacePath,
31603
+ threshold
31125
31604
  });
31126
31605
  }
31127
31606
  const evaluatorKind = evalCase.evaluator ?? "llm-grader";
@@ -31223,7 +31702,8 @@ async function runEvaluatorList(options) {
31223
31702
  name: evaluatorConfig.name,
31224
31703
  type: evaluatorConfig.type,
31225
31704
  weight,
31226
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
31705
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
31706
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
31227
31707
  });
31228
31708
  scores.push({
31229
31709
  name: evaluatorConfig.name,
@@ -31258,7 +31738,8 @@ async function runEvaluatorList(options) {
31258
31738
  name: evaluatorConfig.name ?? "unknown",
31259
31739
  type: evaluatorConfig.type ?? "llm-grader",
31260
31740
  weight,
31261
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
31741
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
31742
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
31262
31743
  });
31263
31744
  scores.push({
31264
31745
  name: evaluatorConfig.name ?? "unknown",
@@ -31292,9 +31773,10 @@ async function runEvaluatorList(options) {
31292
31773
  }
31293
31774
  }
31294
31775
  }
31776
+ const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD;
31295
31777
  const hasRequiredFailure = scored.some((entry) => {
31296
31778
  if (!entry.required) return false;
31297
- const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
31779
+ const minScore = entry.min_score ?? (typeof entry.required === "number" ? entry.required : effectiveThreshold);
31298
31780
  return entry.score.score < minScore;
31299
31781
  });
31300
31782
  const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
@@ -31305,17 +31787,23 @@ async function runEvaluatorList(options) {
31305
31787
  const expectedAspectCount = assertions.length || 1;
31306
31788
  const score = {
31307
31789
  score: aggregateScore,
31308
- verdict: scoreToVerdict(aggregateScore),
31790
+ verdict: scoreToVerdict(aggregateScore, effectiveThreshold),
31309
31791
  assertions,
31310
31792
  expectedAspectCount
31311
31793
  };
31312
31794
  return { score, scores };
31313
31795
  }
31796
+ function formatFilter(filter2) {
31797
+ return typeof filter2 === "string" ? filter2 : filter2.join(", ");
31798
+ }
31799
+ function matchesFilter3(id, filter2) {
31800
+ return typeof filter2 === "string" ? micromatch3.isMatch(id, filter2) : filter2.some((pattern) => micromatch3.isMatch(id, pattern));
31801
+ }
31314
31802
  function filterEvalCases(evalCases, filter2) {
31315
31803
  if (!filter2) {
31316
31804
  return evalCases;
31317
31805
  }
31318
- return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter2));
31806
+ return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter2));
31319
31807
  }
31320
31808
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
31321
31809
  const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
@@ -31402,7 +31890,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
31402
31890
  return {
31403
31891
  timestamp: timestamp.toISOString(),
31404
31892
  testId: evalCase.id,
31405
- dataset: evalCase.dataset,
31893
+ suite: evalCase.suite,
31406
31894
  category: evalCase.category,
31407
31895
  conversationId: evalCase.conversation_id,
31408
31896
  score: 0,
@@ -31666,6 +32154,7 @@ async function evaluate(config) {
31666
32154
  verbose: config.verbose,
31667
32155
  maxConcurrency: config.workers ?? 3,
31668
32156
  filter: config.filter,
32157
+ threshold: config.threshold,
31669
32158
  evalCases,
31670
32159
  onResult: async (result) => {
31671
32160
  collectedResults.push(result);
@@ -31676,19 +32165,19 @@ async function evaluate(config) {
31676
32165
  const durationMs = Date.now() - startTime;
31677
32166
  return {
31678
32167
  results: allResults,
31679
- summary: computeSummary(allResults, durationMs)
32168
+ summary: computeSummary(allResults, durationMs, config.threshold)
31680
32169
  };
31681
32170
  }
31682
32171
  function mapAssertionType(type) {
31683
32172
  return type.replace(/_/g, "-");
31684
32173
  }
31685
- function computeSummary(results, durationMs) {
32174
+ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
31686
32175
  const total = results.length;
31687
32176
  let passed = 0;
31688
32177
  let scoreSum = 0;
31689
32178
  for (const r of results) {
31690
32179
  scoreSum += r.score;
31691
- if (r.score >= PASS_THRESHOLD) {
32180
+ if (r.score >= threshold) {
31692
32181
  passed++;
31693
32182
  }
31694
32183
  }
@@ -31798,7 +32287,7 @@ var CONFIG_FILE_NAMES = [
31798
32287
  ];
31799
32288
  async function loadTsConfig(projectRoot) {
31800
32289
  const { existsSync: existsSync7 } = await import("node:fs");
31801
- const { pathToFileURL } = await import("node:url");
32290
+ const { pathToFileURL: pathToFileURL2 } = await import("node:url");
31802
32291
  const { join: join2 } = await import("node:path");
31803
32292
  for (const fileName of CONFIG_FILE_NAMES) {
31804
32293
  const filePath = join2(projectRoot, fileName);
@@ -31806,7 +32295,7 @@ async function loadTsConfig(projectRoot) {
31806
32295
  continue;
31807
32296
  }
31808
32297
  try {
31809
- const fileUrl = pathToFileURL(filePath).href;
32298
+ const fileUrl = pathToFileURL2(filePath).href;
31810
32299
  const mod = await import(fileUrl);
31811
32300
  const config = mod.default ?? mod;
31812
32301
  return AgentVConfigSchema.parse(config);
@@ -31953,7 +32442,7 @@ function saveProjectRegistry(registry) {
31953
32442
  const registryPath = getProjectsRegistryPath();
31954
32443
  const dir = path47.dirname(registryPath);
31955
32444
  if (!existsSync6(dir)) {
31956
- mkdirSync(dir, { recursive: true });
32445
+ mkdirSync2(dir, { recursive: true });
31957
32446
  }
31958
32447
  writeFileSync(registryPath, stringifyYaml(registry), "utf-8");
31959
32448
  }
@@ -32213,7 +32702,7 @@ var OtelTraceExporter = class {
32213
32702
  rootSpan.setAttribute("gen_ai.system", "agentv");
32214
32703
  rootSpan.setAttribute("agentv.test_id", result.testId);
32215
32704
  rootSpan.setAttribute("agentv.target", result.target);
32216
- if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
32705
+ if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
32217
32706
  rootSpan.setAttribute("agentv.score", result.score);
32218
32707
  if (captureContent && result.output.length > 0) {
32219
32708
  const lastMsg = result.output[result.output.length - 1];
@@ -32422,7 +32911,7 @@ var OtelStreamingObserver = class {
32422
32911
  this.rootSpan.setAttribute("gen_ai.system", "agentv");
32423
32912
  this.rootSpan.setAttribute("agentv.test_id", testId);
32424
32913
  this.rootSpan.setAttribute("agentv.target", target);
32425
- if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
32914
+ if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
32426
32915
  this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
32427
32916
  }
32428
32917
  /** Create and immediately export a tool span */
@@ -32768,7 +33257,230 @@ function extractToolResultContent(content) {
32768
33257
  }
32769
33258
  return parts.length > 0 ? parts.join("") : void 0;
32770
33259
  }
32771
- var DEFAULT_PROJECTS_DIR = () => path48.join(homedir3(), ".claude", "projects");
33260
+ function parseCodexSession(jsonl) {
33261
+ const messages = [];
33262
+ let sessionId = "";
33263
+ let cwd;
33264
+ let model;
33265
+ let version;
33266
+ let startTimestamp;
33267
+ let endTimestamp;
33268
+ const pendingCalls = /* @__PURE__ */ new Map();
33269
+ const lines = jsonl.split("\n").filter((l) => l.trim().length > 0);
33270
+ for (const line of lines) {
33271
+ let entry;
33272
+ try {
33273
+ entry = JSON.parse(line);
33274
+ } catch {
33275
+ continue;
33276
+ }
33277
+ if (!entry.type) continue;
33278
+ if (entry.timestamp) {
33279
+ if (!startTimestamp) startTimestamp = entry.timestamp;
33280
+ endTimestamp = entry.timestamp;
33281
+ }
33282
+ const payload = entry.payload ?? {};
33283
+ switch (entry.type) {
33284
+ case "session_meta": {
33285
+ sessionId = String(payload.id ?? "");
33286
+ cwd = payload.cwd ? String(payload.cwd) : void 0;
33287
+ version = payload.cli_version ? String(payload.cli_version) : void 0;
33288
+ if (payload.model && !model) {
33289
+ model = String(payload.model);
33290
+ }
33291
+ break;
33292
+ }
33293
+ case "turn_context": {
33294
+ if (payload.model && !model) {
33295
+ model = String(payload.model);
33296
+ }
33297
+ if (payload.cwd && !cwd) {
33298
+ cwd = String(payload.cwd);
33299
+ }
33300
+ break;
33301
+ }
33302
+ case "response_item": {
33303
+ const itemType = String(payload.type ?? "");
33304
+ const role = String(payload.role ?? "");
33305
+ switch (itemType) {
33306
+ case "message": {
33307
+ if (role === "developer") break;
33308
+ const content = extractResponseItemContent(payload.content);
33309
+ if (role === "user" && content) {
33310
+ messages.push({ role: "user", content });
33311
+ } else if (role === "assistant" && content) {
33312
+ messages.push({ role: "assistant", content });
33313
+ }
33314
+ break;
33315
+ }
33316
+ case "function_call": {
33317
+ const toolName = String(payload.name ?? "");
33318
+ const callId = String(payload.call_id ?? "");
33319
+ let input;
33320
+ if (typeof payload.arguments === "string") {
33321
+ try {
33322
+ input = JSON.parse(payload.arguments);
33323
+ } catch {
33324
+ input = payload.arguments;
33325
+ }
33326
+ } else {
33327
+ input = payload.arguments;
33328
+ }
33329
+ const toolCall = { tool: toolName, input, id: callId };
33330
+ const msgIdx = messages.length;
33331
+ messages.push({
33332
+ role: "assistant",
33333
+ toolCalls: [toolCall]
33334
+ });
33335
+ if (callId) {
33336
+ pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
33337
+ }
33338
+ break;
33339
+ }
33340
+ case "custom_tool_call": {
33341
+ const toolName = String(payload.name ?? "");
33342
+ const callId = String(payload.call_id ?? "");
33343
+ let input;
33344
+ if (typeof payload.arguments === "string") {
33345
+ try {
33346
+ input = JSON.parse(payload.arguments);
33347
+ } catch {
33348
+ input = payload.arguments;
33349
+ }
33350
+ } else {
33351
+ input = payload.arguments;
33352
+ }
33353
+ const toolCall = { tool: toolName, input, id: callId };
33354
+ const msgIdx = messages.length;
33355
+ messages.push({
33356
+ role: "assistant",
33357
+ toolCalls: [toolCall]
33358
+ });
33359
+ if (callId) {
33360
+ pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
33361
+ }
33362
+ break;
33363
+ }
33364
+ case "function_call_output":
33365
+ case "custom_tool_call_output": {
33366
+ const callId = String(payload.call_id ?? "");
33367
+ const pending = pendingCalls.get(callId);
33368
+ if (pending) {
33369
+ const existingMsg = messages[pending.msgIdx];
33370
+ const existingCalls = [...existingMsg.toolCalls ?? []];
33371
+ existingCalls[pending.toolIdx] = {
33372
+ ...existingCalls[pending.toolIdx],
33373
+ output: payload.output
33374
+ };
33375
+ messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
33376
+ pendingCalls.delete(callId);
33377
+ }
33378
+ break;
33379
+ }
33380
+ // Skip reasoning blocks (thinking tokens)
33381
+ case "reasoning":
33382
+ break;
33383
+ }
33384
+ break;
33385
+ }
33386
+ }
33387
+ }
33388
+ let durationMs;
33389
+ if (startTimestamp && endTimestamp) {
33390
+ durationMs = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
33391
+ }
33392
+ const source = {
33393
+ provider: "codex",
33394
+ sessionId,
33395
+ cwd,
33396
+ startedAt: startTimestamp,
33397
+ model,
33398
+ version
33399
+ };
33400
+ return {
33401
+ messages,
33402
+ source,
33403
+ // Codex rollout files don't include token counts (only rate limit info)
33404
+ tokenUsage: void 0,
33405
+ durationMs,
33406
+ costUsd: null
33407
+ };
33408
+ }
33409
+ function extractResponseItemContent(content) {
33410
+ if (typeof content === "string") return content;
33411
+ if (!Array.isArray(content)) return void 0;
33412
+ const parts = [];
33413
+ for (const block of content) {
33414
+ if (typeof block === "object" && block !== null) {
33415
+ const b = block;
33416
+ if (typeof b.text === "string") {
33417
+ parts.push(b.text);
33418
+ }
33419
+ }
33420
+ }
33421
+ return parts.length > 0 ? parts.join("") : void 0;
33422
+ }
33423
+ var DEFAULT_SESSIONS_DIR = () => path48.join(homedir3(), ".codex", "sessions");
33424
+ async function discoverCodexSessions(opts) {
33425
+ const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
33426
+ const limit = opts?.latest ? 1 : opts?.limit ?? 10;
33427
+ const sessions = [];
33428
+ let yearDirs;
33429
+ try {
33430
+ yearDirs = await readdir8(sessionsDir);
33431
+ } catch {
33432
+ return [];
33433
+ }
33434
+ for (const year of yearDirs) {
33435
+ const yearPath = path48.join(sessionsDir, year);
33436
+ let monthDirs;
33437
+ try {
33438
+ monthDirs = await readdir8(yearPath);
33439
+ } catch {
33440
+ continue;
33441
+ }
33442
+ for (const month of monthDirs) {
33443
+ const monthPath = path48.join(yearPath, month);
33444
+ let dayDirs;
33445
+ try {
33446
+ dayDirs = await readdir8(monthPath);
33447
+ } catch {
33448
+ continue;
33449
+ }
33450
+ for (const day of dayDirs) {
33451
+ if (opts?.date) {
33452
+ const dirDate = `${year}-${month}-${day}`;
33453
+ if (dirDate !== opts.date) continue;
33454
+ }
33455
+ const dayPath = path48.join(monthPath, day);
33456
+ let files;
33457
+ try {
33458
+ files = await readdir8(dayPath);
33459
+ } catch {
33460
+ continue;
33461
+ }
33462
+ for (const file of files) {
33463
+ if (!file.startsWith("rollout-") || !file.endsWith(".jsonl")) continue;
33464
+ const filePath = path48.join(dayPath, file);
33465
+ const nameWithoutExt = file.replace(/\.jsonl$/, "");
33466
+ const parts = nameWithoutExt.split("-");
33467
+ const sessionId = parts.length >= 6 ? parts.slice(-5).join("-") : nameWithoutExt;
33468
+ let updatedAt;
33469
+ try {
33470
+ const fileStat = await stat9(filePath);
33471
+ updatedAt = fileStat.mtime;
33472
+ } catch {
33473
+ updatedAt = /* @__PURE__ */ new Date(0);
33474
+ }
33475
+ sessions.push({ sessionId, filePath, filename: file, updatedAt });
33476
+ }
33477
+ }
33478
+ }
33479
+ }
33480
+ sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
33481
+ return sessions.slice(0, limit);
33482
+ }
33483
+ var DEFAULT_PROJECTS_DIR = () => path49.join(homedir4(), ".claude", "projects");
32772
33484
  function encodeProjectPath(projectPath) {
32773
33485
  return projectPath.replace(/\//g, "-");
32774
33486
  }
@@ -32777,7 +33489,7 @@ async function discoverClaudeSessions(opts) {
32777
33489
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
32778
33490
  let projectDirs;
32779
33491
  try {
32780
- projectDirs = await readdir8(projectsDir);
33492
+ projectDirs = await readdir9(projectsDir);
32781
33493
  } catch {
32782
33494
  return [];
32783
33495
  }
@@ -32787,10 +33499,10 @@ async function discoverClaudeSessions(opts) {
32787
33499
  }
32788
33500
  const sessions = [];
32789
33501
  for (const projectDir of projectDirs) {
32790
- const dirPath = path48.join(projectsDir, projectDir);
33502
+ const dirPath = path49.join(projectsDir, projectDir);
32791
33503
  let entries;
32792
33504
  try {
32793
- entries = await readdir8(dirPath);
33505
+ entries = await readdir9(dirPath);
32794
33506
  } catch {
32795
33507
  continue;
32796
33508
  }
@@ -32798,10 +33510,10 @@ async function discoverClaudeSessions(opts) {
32798
33510
  if (!entry.endsWith(".jsonl")) continue;
32799
33511
  const sessionId = entry.replace(/\.jsonl$/, "");
32800
33512
  if (opts?.sessionId && sessionId !== opts.sessionId) continue;
32801
- const filePath = path48.join(dirPath, entry);
33513
+ const filePath = path49.join(dirPath, entry);
32802
33514
  let updatedAt;
32803
33515
  try {
32804
- const fileStat = await stat9(filePath);
33516
+ const fileStat = await stat10(filePath);
32805
33517
  updatedAt = fileStat.mtime;
32806
33518
  } catch {
32807
33519
  updatedAt = /* @__PURE__ */ new Date(0);
@@ -32817,9 +33529,82 @@ async function discoverClaudeSessions(opts) {
32817
33529
  sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
32818
33530
  return sessions.slice(0, limit);
32819
33531
  }
33532
+ function toTranscriptJsonLine(entry) {
33533
+ const firstUserMessage = entry.messages.find((m) => m.role === "user");
33534
+ const input = typeof firstUserMessage?.content === "string" ? firstUserMessage.content : "";
33535
+ return {
33536
+ input,
33537
+ output: entry.messages,
33538
+ token_usage: entry.tokenUsage ? {
33539
+ input: entry.tokenUsage.input,
33540
+ output: entry.tokenUsage.output,
33541
+ cached: entry.tokenUsage.cached
33542
+ } : void 0,
33543
+ duration_ms: entry.durationMs,
33544
+ cost_usd: entry.costUsd,
33545
+ source: {
33546
+ provider: entry.source.provider,
33547
+ session_id: entry.source.sessionId,
33548
+ model: entry.source.model,
33549
+ timestamp: entry.source.startedAt,
33550
+ git_branch: entry.source.gitBranch,
33551
+ cwd: entry.source.cwd ?? entry.source.projectPath,
33552
+ version: entry.source.version
33553
+ }
33554
+ };
33555
+ }
33556
+ async function readTranscriptJsonl(filePath) {
33557
+ const text2 = await readFile14(filePath, "utf8");
33558
+ return text2.split("\n").filter((line) => line.trim().length > 0).map((line) => JSON.parse(line));
33559
+ }
32820
33560
  async function readTranscriptFile(filePath) {
32821
33561
  return readFile14(filePath, "utf8");
32822
33562
  }
33563
+ var TranscriptProvider = class _TranscriptProvider {
33564
+ id;
33565
+ kind = "transcript";
33566
+ targetName;
33567
+ lines;
33568
+ cursor = 0;
33569
+ constructor(targetName, lines) {
33570
+ this.targetName = targetName;
33571
+ this.id = `transcript:${targetName}`;
33572
+ this.lines = lines;
33573
+ }
33574
+ /**
33575
+ * Create a TranscriptProvider from a JSONL file path.
33576
+ */
33577
+ static async fromFile(filePath) {
33578
+ const lines = await readTranscriptJsonl(filePath);
33579
+ if (lines.length === 0) {
33580
+ throw new Error(`Transcript file is empty: ${filePath}`);
33581
+ }
33582
+ const providerName = lines[0].source.provider ?? "transcript";
33583
+ return new _TranscriptProvider(providerName, lines);
33584
+ }
33585
+ get lineCount() {
33586
+ return this.lines.length;
33587
+ }
33588
+ async invoke(_request) {
33589
+ if (this.cursor >= this.lines.length) {
33590
+ throw new Error(
33591
+ `Transcript exhausted: ${this.lines.length} line(s) available but ${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`
33592
+ );
33593
+ }
33594
+ const line = this.lines[this.cursor++];
33595
+ return {
33596
+ output: line.output,
33597
+ tokenUsage: line.token_usage ? {
33598
+ input: line.token_usage.input,
33599
+ output: line.token_usage.output,
33600
+ cached: line.token_usage.cached
33601
+ } : void 0,
33602
+ durationMs: line.duration_ms,
33603
+ costUsd: line.cost_usd ?? void 0,
33604
+ startTime: line.source.timestamp
33605
+ };
33606
+ }
33607
+ };
32823
33608
  function createAgentKernel() {
32824
33609
  return { status: "stub" };
32825
33610
  }
@@ -32843,6 +33628,7 @@ export {
32843
33628
  buildSearchRoots,
32844
33629
  resolveFileReference,
32845
33630
  CLI_PLACEHOLDERS,
33631
+ findDeprecatedCamelCaseTargetWarnings,
32846
33632
  COMMON_TARGET_SETTINGS,
32847
33633
  resolveDelegatedTargetDefinition,
32848
33634
  resolveTargetDefinition,
@@ -32887,17 +33673,18 @@ export {
32887
33673
  subscribeToCodexLogEntries,
32888
33674
  consumeCopilotCliLogEntries,
32889
33675
  subscribeToCopilotCliLogEntries,
33676
+ parseCopilotEvents,
32890
33677
  discoverCopilotSessions,
32891
33678
  consumeCopilotSdkLogEntries,
32892
33679
  subscribeToCopilotSdkLogEntries,
32893
33680
  consumePiLogEntries,
32894
33681
  subscribeToPiLogEntries,
32895
- ProviderRegistry,
32896
33682
  getAgentvHome,
32897
33683
  getWorkspacesRoot,
32898
33684
  getSubagentsRoot,
32899
33685
  getTraceStateRoot,
32900
33686
  getWorkspacePoolRoot,
33687
+ ProviderRegistry,
32901
33688
  ensureVSCodeSubagents,
32902
33689
  readTargetDefinitions,
32903
33690
  listTargetNames,
@@ -32905,6 +33692,7 @@ export {
32905
33692
  createBuiltinProviderRegistry,
32906
33693
  createProvider,
32907
33694
  resolveAndCreateProvider,
33695
+ DEFAULT_THRESHOLD,
32908
33696
  PASS_THRESHOLD,
32909
33697
  scoreToVerdict,
32910
33698
  clampScore,
@@ -32992,8 +33780,13 @@ export {
32992
33780
  OtelTraceExporter,
32993
33781
  OtelStreamingObserver,
32994
33782
  parseClaudeSession,
33783
+ parseCodexSession,
33784
+ discoverCodexSessions,
32995
33785
  discoverClaudeSessions,
33786
+ toTranscriptJsonLine,
33787
+ readTranscriptJsonl,
32996
33788
  readTranscriptFile,
33789
+ TranscriptProvider,
32997
33790
  createAgentKernel
32998
33791
  };
32999
- //# sourceMappingURL=chunk-YXXD27OK.js.map
33792
+ //# sourceMappingURL=chunk-I6UE4LHZ.js.map