agentv 4.6.0 → 4.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -301,7 +301,7 @@ var require_dist = __commonJS({
301
301
  }
302
302
  });
303
303
 
304
- // ../../packages/core/dist/chunk-AIQ5FO4G.js
304
+ // ../../packages/core/dist/chunk-75RFVESM.js
305
305
  import { constants } from "node:fs";
306
306
  import { access, readFile } from "node:fs/promises";
307
307
  import path from "node:path";
@@ -419,7 +419,7 @@ __export(external_exports2, {
419
419
  void: () => voidType
420
420
  });
421
421
 
422
- // ../../packages/core/dist/chunk-AIQ5FO4G.js
422
+ // ../../packages/core/dist/chunk-75RFVESM.js
423
423
  import { readFile as readFile2 } from "node:fs/promises";
424
424
  import path3 from "node:path";
425
425
  import fg from "fast-glob";
@@ -633,15 +633,13 @@ async function resolveFileReference(rawValue, searchRoots) {
633
633
  }
634
634
  var CliHealthcheckHttpInputSchema = external_exports2.object({
635
635
  url: external_exports2.string().min(1, "healthcheck URL is required"),
636
- timeout_seconds: external_exports2.number().positive().optional(),
637
- timeoutSeconds: external_exports2.number().positive().optional()
638
- });
636
+ timeout_seconds: external_exports2.number().positive().optional()
637
+ }).passthrough();
639
638
  var CliHealthcheckCommandInputSchema = external_exports2.object({
640
639
  command: external_exports2.string().min(1, "healthcheck command is required"),
641
640
  cwd: external_exports2.string().optional(),
642
- timeout_seconds: external_exports2.number().positive().optional(),
643
- timeoutSeconds: external_exports2.number().positive().optional()
644
- });
641
+ timeout_seconds: external_exports2.number().positive().optional()
642
+ }).passthrough();
645
643
  var CliHealthcheckInputSchema = external_exports2.union([
646
644
  CliHealthcheckHttpInputSchema,
647
645
  CliHealthcheckCommandInputSchema
@@ -653,36 +651,28 @@ var CliTargetInputSchema = external_exports2.object({
653
651
  command: external_exports2.string(),
654
652
  // Files format - optional
655
653
  files_format: external_exports2.string().optional(),
656
- filesFormat: external_exports2.string().optional(),
657
654
  attachments_format: external_exports2.string().optional(),
658
- attachmentsFormat: external_exports2.string().optional(),
659
655
  // Working directory - optional
660
656
  cwd: external_exports2.string().optional(),
661
657
  // Workspace template directory - optional (mutually exclusive with cwd)
662
658
  workspace_template: external_exports2.string().optional(),
663
- workspaceTemplate: external_exports2.string().optional(),
664
659
  // Timeout in seconds - optional
665
660
  timeout_seconds: external_exports2.number().positive().optional(),
666
- timeoutSeconds: external_exports2.number().positive().optional(),
667
661
  // Healthcheck configuration - optional
668
662
  healthcheck: CliHealthcheckInputSchema.optional(),
669
663
  // Verbose mode - optional
670
664
  verbose: external_exports2.boolean().optional(),
671
665
  cli_verbose: external_exports2.boolean().optional(),
672
- cliVerbose: external_exports2.boolean().optional(),
673
666
  // Keep temp files - optional
674
667
  keep_temp_files: external_exports2.boolean().optional(),
675
- keepTempFiles: external_exports2.boolean().optional(),
676
668
  keep_output_files: external_exports2.boolean().optional(),
677
- keepOutputFiles: external_exports2.boolean().optional(),
678
669
  // Common target fields
679
670
  grader_target: external_exports2.string().optional(),
680
671
  judge_target: external_exports2.string().optional(),
681
672
  // backward compat
682
673
  workers: external_exports2.number().int().min(1).optional(),
683
- provider_batching: external_exports2.boolean().optional(),
684
- providerBatching: external_exports2.boolean().optional()
685
- });
674
+ provider_batching: external_exports2.boolean().optional()
675
+ }).passthrough();
686
676
  var CliHealthcheckHttpSchema = external_exports2.object({
687
677
  url: external_exports2.string().min(1),
688
678
  timeoutMs: external_exports2.number().positive().optional()
@@ -707,7 +697,7 @@ var CliTargetConfigSchema = external_exports2.object({
707
697
  keepTempFiles: external_exports2.boolean().optional()
708
698
  }).strict();
709
699
  function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
710
- const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
700
+ const timeoutSeconds = input.timeout_seconds;
711
701
  const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
712
702
  if ("url" in input && input.url) {
713
703
  const url = resolveString(input.url, env, `${targetName} healthcheck URL`);
@@ -741,9 +731,9 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
741
731
  function normalizeCliTargetInput(input, env, evalFilePath) {
742
732
  const targetName = input.name;
743
733
  const command = resolveString(input.command, env, `${targetName} CLI command`, true);
744
- const filesFormatSource = input.files_format ?? input.filesFormat ?? input.attachments_format ?? input.attachmentsFormat;
734
+ const filesFormatSource = input.files_format ?? input.attachments_format;
745
735
  const filesFormat = resolveOptionalLiteralString(filesFormatSource);
746
- const workspaceTemplateSource = input.workspace_template ?? input.workspaceTemplate;
736
+ const workspaceTemplateSource = input.workspace_template;
747
737
  let workspaceTemplate = resolveOptionalString(
748
738
  workspaceTemplateSource,
749
739
  env,
@@ -771,12 +761,10 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
771
761
  if (!cwd && !workspaceTemplate && evalFilePath) {
772
762
  cwd = path2.dirname(path2.resolve(evalFilePath));
773
763
  }
774
- const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
764
+ const timeoutSeconds = input.timeout_seconds;
775
765
  const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
776
- const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose ?? input.cliVerbose);
777
- const keepTempFiles = resolveOptionalBoolean(
778
- input.keep_temp_files ?? input.keepTempFiles ?? input.keep_output_files ?? input.keepOutputFiles
779
- );
766
+ const verbose = resolveOptionalBoolean(input.verbose ?? input.cli_verbose);
767
+ const keepTempFiles = resolveOptionalBoolean(input.keep_temp_files ?? input.keep_output_files);
780
768
  const healthcheck = input.healthcheck ? normalizeCliHealthcheck(input.healthcheck, env, targetName, evalFilePath) : void 0;
781
769
  return {
782
770
  command,
@@ -797,15 +785,106 @@ var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
797
785
  "FILES",
798
786
  "OUTPUT_FILE"
799
787
  ]);
788
/**
 * camelCase target fields that are no longer accepted, mapped to the
 * snake_case field that the resulting error message tells the user to use.
 *
 * The replacement is not always a literal case conversion (for example
 * `resourceName` -> `endpoint`, `deploymentName` -> `model`,
 * `maxTokens` -> `max_output_tokens`).
 *
 * Iteration order matters: the first matching entry is the one reported, so
 * new entries are appended rather than inserted.
 */
var DEPRECATED_TARGET_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
  ["providerBatching", "provider_batching"],
  ["subagentModeAllowed", "subagent_mode_allowed"],
  ["fallbackTargets", "fallback_targets"],
  ["resourceName", "endpoint"],
  ["baseUrl", "base_url"],
  ["apiKey", "api_key"],
  ["deploymentName", "model"],
  ["thinkingBudget", "thinking_budget"],
  ["maxTokens", "max_output_tokens"],
  ["apiFormat", "api_format"],
  ["timeoutSeconds", "timeout_seconds"],
  ["logDir", "log_dir"],
  ["logDirectory", "log_directory"],
  ["logFormat", "log_format"],
  ["logOutputFormat", "log_output_format"],
  ["systemPrompt", "system_prompt"],
  ["maxTurns", "max_turns"],
  ["maxBudgetUsd", "max_budget_usd"],
  ["dryRun", "dry_run"],
  ["subagentRoot", "subagent_root"],
  ["filesFormat", "files_format"],
  ["attachmentsFormat", "attachments_format"],
  ["cliUrl", "cli_url"],
  ["cliPath", "cli_path"],
  ["githubToken", "github_token"],
  ["sessionDir", "session_dir"],
  ["sessionId", "session_id"],
  ["sessionStateDir", "session_state_dir"],
  ["maxRetries", "max_retries"],
  ["retryInitialDelayMs", "retry_initial_delay_ms"],
  ["retryMaxDelayMs", "retry_max_delay_ms"],
  ["retryBackoffFactor", "retry_backoff_factor"],
  ["retryStatusCodes", "retry_status_codes"],
  // The CLI and pi resolvers in this file read only the snake_case spelling of
  // the fields below. Without an entry here the camelCase spelling would pass
  // the passthrough schemas and be silently ignored instead of rejected.
  ["cliVerbose", "cli_verbose"],
  ["keepTempFiles", "keep_temp_files"],
  ["keepOutputFiles", "keep_output_files"],
  ["piModel", "pi_model"],
  ["piTools", "pi_tools"],
  ["piThinking", "pi_thinking"]
]);

/**
 * camelCase fields that are no longer accepted inside a target's
 * `healthcheck` object, mapped to their snake_case replacement.
 */
var DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS = /* @__PURE__ */ new Map([
  ["timeoutSeconds", "timeout_seconds"]
]);
826
/**
 * Lists every deprecated camelCase key that `value` defines as an own property.
 *
 * @param {unknown} value - Candidate object; anything that is not a non-null,
 *   non-array object produces no warnings.
 * @param {string} location - Prefix used to build each warning's `location`.
 * @param {Iterable<[string, string]>} aliases - `[camelCase, snake_case]` pairs.
 * @returns {{location: string, message: string}[]} One warning per matching
 *   alias, in the iteration order of `aliases`.
 */
function collectDeprecatedCamelCaseWarnings(value, location, aliases) {
  const isRecord = typeof value === "object" && value !== null && !Array.isArray(value);
  if (!isRecord) {
    return [];
  }
  // Own-property check only: inherited keys must not trigger a warning.
  const definesKey = (key) => Object.prototype.hasOwnProperty.call(value, key);
  return [...aliases]
    .filter(([camelCaseField]) => definesKey(camelCaseField))
    .map(([camelCaseField, snakeCaseField]) => ({
      location: `${location}.${camelCaseField}`,
      message: `camelCase field '${camelCaseField}' is no longer supported in targets.yaml. Use '${snakeCaseField}' instead.`
    }));
}
841
/**
 * Throws when a raw target definition still uses a removed camelCase field.
 *
 * `resolveTargetDefinition` calls this before schema parsing, so the input is
 * not yet validated. Non-object input is therefore ignored here and left for
 * the schema to reject, instead of failing with a TypeError from the
 * own-property check.
 *
 * @param {unknown} definition - Raw target definition.
 * @returns {void}
 * @throws {Error} If `workspaceTemplate` is present, or if
 *   `findDeprecatedCamelCaseTargetWarnings` reports at least one warning
 *   (only the first one is surfaced).
 */
function assertNoDeprecatedCamelCaseTargetFields(definition) {
  if (typeof definition !== "object" || definition === null) {
    return;
  }
  // workspaceTemplate has no snake_case replacement at target level, so it
  // gets its own message rather than the generic "use X instead" one.
  if (Object.prototype.hasOwnProperty.call(definition, "workspaceTemplate")) {
    throw new Error(
      `${definition.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
    );
  }
  const [warning] = findDeprecatedCamelCaseTargetWarnings(
    definition,
    `target "${definition.name}"`
  );
  if (!warning) {
    return;
  }
  // The warning message already has the exact wording wanted here; reuse it
  // instead of parsing it apart and rebuilding the same sentence.
  throw new Error(`${warning.location}: ${warning.message}`);
}
862
/**
 * Collects deprecated-camelCase warnings for a target and for its nested
 * `healthcheck` object.
 *
 * @param {unknown} target - Raw target definition.
 * @param {string} location - Prefix for warning locations; healthcheck
 *   warnings are reported under `${location}.healthcheck`.
 * @returns {{location: string, message: string}[]} Target-level warnings
 *   followed by healthcheck warnings.
 */
function findDeprecatedCamelCaseTargetWarnings(target, location) {
  const targetWarnings = collectDeprecatedCamelCaseWarnings(
    target,
    location,
    DEPRECATED_TARGET_CAMEL_CASE_FIELDS
  );
  const canHaveHealthcheck = typeof target === "object" && target !== null && !Array.isArray(target);
  if (!canHaveHealthcheck) {
    return targetWarnings;
  }
  const healthcheckWarnings = collectDeprecatedCamelCaseWarnings(
    target.healthcheck,
    `${location}.healthcheck`,
    DEPRECATED_HEALTHCHECK_CAMEL_CASE_FIELDS
  );
  return [...targetWarnings, ...healthcheckWarnings];
}
800
881
  var COMMON_TARGET_SETTINGS = [
801
882
  "use_target",
802
883
  "provider_batching",
803
- "providerBatching",
804
884
  "subagent_mode_allowed",
805
- "subagentModeAllowed",
806
- "fallback_targets",
807
- "fallbackTargets"
885
+ "fallback_targets"
808
886
  ];
887
+ var USE_TARGET_ENV_PATTERN = /^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i;
809
888
  var BASE_TARGET_SCHEMA = external_exports2.object({
810
889
  name: external_exports2.string().min(1, "target name is required"),
811
890
  provider: external_exports2.string().optional(),
@@ -815,43 +894,40 @@ var BASE_TARGET_SCHEMA = external_exports2.object({
815
894
  // backward compat
816
895
  workers: external_exports2.number().int().min(1).optional(),
817
896
  workspace_template: external_exports2.string().optional(),
818
- workspaceTemplate: external_exports2.string().optional(),
819
897
  subagent_mode_allowed: external_exports2.boolean().optional(),
820
- fallback_targets: external_exports2.array(external_exports2.string().min(1)).optional(),
821
- fallbackTargets: external_exports2.array(external_exports2.string().min(1)).optional()
898
+ fallback_targets: external_exports2.array(external_exports2.string().min(1)).optional()
822
899
  }).passthrough();
823
900
  var DEFAULT_AZURE_API_VERSION = "2024-12-01-preview";
901
+ var DEFAULT_AZURE_RESPONSES_API_VERSION = "v1";
824
902
  var DEFAULT_OPENAI_BASE_URL = "https://api.openai.com/v1";
825
/**
 * Cleans up a configured Azure API version string.
 *
 * Surrounding whitespace and a leading `api-version=` / `api_version=` /
 * `apiversion=` prefix (case-insensitive) are removed. When nothing usable
 * remains, a default is returned instead.
 *
 * @param {string | undefined | null} value - Raw configured version.
 * @param {string | undefined} apiFormat - When exactly "responses", the
 *   fallback is DEFAULT_AZURE_RESPONSES_API_VERSION; otherwise
 *   DEFAULT_AZURE_API_VERSION.
 * @returns {string} The cleaned version, or the applicable default.
 */
function normalizeAzureApiVersion(value, apiFormat) {
  const fallback =
    apiFormat === "responses" ? DEFAULT_AZURE_RESPONSES_API_VERSION : DEFAULT_AZURE_API_VERSION;
  // A falsy value is treated like an empty string so every "nothing usable"
  // case funnels through the single length check below.
  const cleaned = (value ? value.trim() : "").replace(/^api[-_]?version\s*=\s*/i, "").trim();
  return cleaned.length > 0 ? cleaned : fallback;
}
836
915
  function resolveRetryConfig(target) {
837
- const maxRetries = resolveOptionalNumber(
838
- target.max_retries ?? target.maxRetries,
839
- `${target.name} max retries`
840
- );
916
+ const maxRetries = resolveOptionalNumber(target.max_retries, `${target.name} max retries`);
841
917
  const initialDelayMs = resolveOptionalNumber(
842
- target.retry_initial_delay_ms ?? target.retryInitialDelayMs,
918
+ target.retry_initial_delay_ms,
843
919
  `${target.name} retry initial delay`
844
920
  );
845
921
  const maxDelayMs = resolveOptionalNumber(
846
- target.retry_max_delay_ms ?? target.retryMaxDelayMs,
922
+ target.retry_max_delay_ms,
847
923
  `${target.name} retry max delay`
848
924
  );
849
925
  const backoffFactor = resolveOptionalNumber(
850
- target.retry_backoff_factor ?? target.retryBackoffFactor,
926
+ target.retry_backoff_factor,
851
927
  `${target.name} retry backoff factor`
852
928
  );
853
929
  const retryableStatusCodes = resolveOptionalNumberArray(
854
- target.retry_status_codes ?? target.retryStatusCodes,
930
+ target.retry_status_codes,
855
931
  `${target.name} retry status codes`
856
932
  );
857
933
  if (maxRetries === void 0 && initialDelayMs === void 0 && maxDelayMs === void 0 && backoffFactor === void 0 && retryableStatusCodes === void 0) {
@@ -865,9 +941,56 @@ function resolveRetryConfig(target) {
865
941
  retryableStatusCodes
866
942
  };
867
943
  }
868
- function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
944
/**
 * Follows a chain of `use_target` references until it reaches a definition
 * that does not delegate, and returns that definition.
 *
 * A `use_target` value is either a literal target name or an environment
 * placeholder of the form `${{ VAR_NAME }}` (matched by
 * USE_TARGET_ENV_PATTERN), in which case the name is read from `env`.
 *
 * @param {string} name21 - Name of the target to start from.
 * @param {Map<string, object>} definitions - Target definitions keyed by name.
 * @param {Record<string, string | undefined>} [env=process.env] - Source for
 *   placeholder lookups.
 * @returns {object | undefined} The final definition, or `undefined` when
 *   `name21` is not present in `definitions`.
 * @throws {Error} When a placeholder's variable is unset or blank, when
 *   `use_target` is a blank string, when the referenced target does not
 *   exist, when the chain loops back on itself, or when the loop's 10
 *   iterations are used up without reaching a non-delegating definition.
 */
function resolveDelegatedTargetDefinition(name21, definitions, env = process.env) {
  let definition = definitions.get(name21);
  if (!definition) {
    return void 0;
  }
  // Names seen so far, in order, so a loop can be reported as a readable chain.
  const visited = [definition.name];
  for (let depth = 0; depth < 10; depth++) {
    // Only a missing or non-string use_target means "this target is concrete".
    // A string that is blank after trimming is a configuration mistake and
    // falls through to the empty-value error below.
    if (typeof definition.use_target !== "string") {
      return definition;
    }
    const rawUseTarget = definition.use_target.trim();
    const envVarName = rawUseTarget.match(USE_TARGET_ENV_PATTERN)?.[1];
    const resolvedName = envVarName ? env[envVarName]?.trim() ?? "" : rawUseTarget;
    if (resolvedName.length === 0) {
      if (envVarName) {
        throw new Error(
          `Target "${definition.name}" uses use_target: \${{ ${envVarName} }}, but ${envVarName} is not set. Set ${envVarName} to the name of a concrete target (for example, "azure") before running the eval.`
        );
      }
      throw new Error(
        `Target "${definition.name}" has an empty use_target value. Point it at a concrete target name before running the eval.`
      );
    }
    const next = definitions.get(resolvedName);
    if (!next) {
      if (envVarName) {
        throw new Error(
          `Target "${definition.name}" uses use_target: \${{ ${envVarName} }}, which resolved to "${resolvedName}", but no target named "${resolvedName}" exists.`
        );
      }
      throw new Error(
        `Target "${definition.name}" uses use_target: "${resolvedName}", but no target named "${resolvedName}" exists.`
      );
    }
    if (visited.includes(next.name)) {
      const chain = [...visited, next.name].join(" -> ");
      throw new Error(`Circular use_target reference detected: ${chain}`);
    }
    definition = next;
    visited.push(definition.name);
  }
  throw new Error(
    `Target "${name21}" exceeded the maximum use_target resolution depth (10). Check for a delegation loop or overly deep alias chain.`
  );
}
990
+ function resolveTargetDefinition(definition, env = process.env, evalFilePath, options) {
991
+ assertNoDeprecatedCamelCaseTargetFields(definition);
869
992
  const parsed = BASE_TARGET_SCHEMA.parse(definition);
870
- if (parsed.workspace_template !== void 0 || parsed.workspaceTemplate !== void 0) {
993
+ if (parsed.workspace_template !== void 0) {
871
994
  throw new Error(
872
995
  `${parsed.name}: target-level workspace_template has been removed. Use eval-level workspace.template.`
873
996
  );
@@ -883,13 +1006,9 @@ function resolveTargetDefinition(definition, env = process.env, evalFilePath) {
883
1006
  `${parsed.name} provider`,
884
1007
  true
885
1008
  ).toLowerCase();
886
- const providerBatching = resolveOptionalBoolean(
887
- parsed.provider_batching ?? parsed.providerBatching
888
- );
889
- const subagentModeAllowed = resolveOptionalBoolean(
890
- parsed.subagent_mode_allowed ?? parsed.subagentModeAllowed
891
- );
892
- const fallbackTargets = parsed.fallback_targets ?? parsed.fallbackTargets;
1009
+ const providerBatching = resolveOptionalBoolean(parsed.provider_batching);
1010
+ const subagentModeAllowed = resolveOptionalBoolean(parsed.subagent_mode_allowed);
1011
+ const fallbackTargets = parsed.fallback_targets;
893
1012
  const base = {
894
1013
  name: parsed.name,
895
1014
  graderTarget: parsed.grader_target ?? parsed.judge_target,
@@ -1039,20 +1158,22 @@ function normalizeOpenAIBaseUrl(value) {
1039
1158
  return trimmed.endsWith("/v1") ? trimmed : `${trimmed}/v1`;
1040
1159
  }
1041
1160
  function resolveAzureConfig(target, env) {
1042
- const endpointSource = target.endpoint ?? target.resource ?? target.resourceName;
1043
- const apiKeySource = target.api_key ?? target.apiKey;
1044
- const deploymentSource = target.deployment ?? target.deploymentName ?? target.model;
1161
+ const endpointSource = target.endpoint ?? target.resource;
1162
+ const apiKeySource = target.api_key;
1163
+ const deploymentSource = target.deployment ?? target.model;
1045
1164
  const versionSource = target.version ?? target.api_version;
1046
1165
  const temperatureSource = target.temperature;
1047
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
1166
+ const maxTokensSource = target.max_output_tokens;
1048
1167
  const resourceName = resolveString(endpointSource, env, `${target.name} endpoint`);
1049
1168
  const apiKey = resolveString(apiKeySource, env, `${target.name} api key`);
1050
1169
  const deploymentName = resolveString(deploymentSource, env, `${target.name} deployment`);
1170
+ const apiFormat = resolveApiFormat(target, env, target.name);
1051
1171
  const version = normalizeAzureApiVersion(
1052
1172
  resolveOptionalString(versionSource, env, `${target.name} api version`, {
1053
1173
  allowLiteral: true,
1054
1174
  optionalEnv: true
1055
- })
1175
+ }),
1176
+ apiFormat
1056
1177
  );
1057
1178
  const temperature = resolveOptionalNumber(temperatureSource, `${target.name} temperature`);
1058
1179
  const maxOutputTokens = resolveOptionalNumber(
@@ -1065,13 +1186,17 @@ function resolveAzureConfig(target, env) {
1065
1186
  deploymentName,
1066
1187
  apiKey,
1067
1188
  version,
1189
+ apiFormat,
1068
1190
  temperature,
1069
1191
  maxOutputTokens,
1070
1192
  retry
1071
1193
  };
1072
1194
  }
1073
- function resolveApiFormat(target, targetName) {
1074
- const raw = target.api_format ?? target.apiFormat;
1195
+ function resolveApiFormat(target, env, targetName) {
1196
+ const raw = resolveOptionalString(target.api_format, env, `${targetName} api format`, {
1197
+ allowLiteral: true,
1198
+ optionalEnv: true
1199
+ });
1075
1200
  if (raw === void 0) return void 0;
1076
1201
  if (raw === "chat" || raw === "responses") return raw;
1077
1202
  throw new Error(
@@ -1079,11 +1204,11 @@ function resolveApiFormat(target, targetName) {
1079
1204
  );
1080
1205
  }
1081
1206
  function resolveOpenAIConfig(target, env) {
1082
- const endpointSource = target.endpoint ?? target.base_url ?? target.baseUrl;
1083
- const apiKeySource = target.api_key ?? target.apiKey;
1207
+ const endpointSource = target.endpoint ?? target.base_url;
1208
+ const apiKeySource = target.api_key;
1084
1209
  const modelSource = target.model ?? target.deployment ?? target.variant;
1085
1210
  const temperatureSource = target.temperature;
1086
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
1211
+ const maxTokensSource = target.max_output_tokens;
1087
1212
  const baseURL = normalizeOpenAIBaseUrl(
1088
1213
  resolveOptionalString(endpointSource, env, `${target.name} endpoint`, {
1089
1214
  allowLiteral: true,
@@ -1097,17 +1222,17 @@ function resolveOpenAIConfig(target, env) {
1097
1222
  baseURL,
1098
1223
  apiKey,
1099
1224
  model,
1100
- apiFormat: resolveApiFormat(target, target.name),
1225
+ apiFormat: resolveApiFormat(target, env, target.name),
1101
1226
  temperature: resolveOptionalNumber(temperatureSource, `${target.name} temperature`),
1102
1227
  maxOutputTokens: resolveOptionalNumber(maxTokensSource, `${target.name} max output tokens`),
1103
1228
  retry
1104
1229
  };
1105
1230
  }
1106
1231
  function resolveOpenRouterConfig(target, env) {
1107
- const apiKeySource = target.api_key ?? target.apiKey;
1232
+ const apiKeySource = target.api_key;
1108
1233
  const modelSource = target.model ?? target.deployment ?? target.variant;
1109
1234
  const temperatureSource = target.temperature;
1110
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
1235
+ const maxTokensSource = target.max_output_tokens;
1111
1236
  const retry = resolveRetryConfig(target);
1112
1237
  return {
1113
1238
  apiKey: resolveString(apiKeySource, env, `${target.name} OpenRouter api key`),
@@ -1118,11 +1243,11 @@ function resolveOpenRouterConfig(target, env) {
1118
1243
  };
1119
1244
  }
1120
1245
  function resolveAnthropicConfig(target, env) {
1121
- const apiKeySource = target.api_key ?? target.apiKey;
1246
+ const apiKeySource = target.api_key;
1122
1247
  const modelSource = target.model ?? target.deployment ?? target.variant;
1123
1248
  const temperatureSource = target.temperature;
1124
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
1125
- const thinkingBudgetSource = target.thinking_budget ?? target.thinkingBudget;
1249
+ const maxTokensSource = target.max_output_tokens;
1250
+ const thinkingBudgetSource = target.thinking_budget;
1126
1251
  const apiKey = resolveString(apiKeySource, env, `${target.name} Anthropic api key`);
1127
1252
  const model = resolveString(modelSource, env, `${target.name} Anthropic model`);
1128
1253
  const retry = resolveRetryConfig(target);
@@ -1136,10 +1261,10 @@ function resolveAnthropicConfig(target, env) {
1136
1261
  };
1137
1262
  }
1138
1263
  function resolveGeminiConfig(target, env) {
1139
- const apiKeySource = target.api_key ?? target.apiKey;
1264
+ const apiKeySource = target.api_key;
1140
1265
  const modelSource = target.model ?? target.deployment ?? target.variant;
1141
1266
  const temperatureSource = target.temperature;
1142
- const maxTokensSource = target.max_output_tokens ?? target.maxTokens;
1267
+ const maxTokensSource = target.max_output_tokens;
1143
1268
  const apiKey = resolveString(apiKeySource, env, `${target.name} Google API key`);
1144
1269
  const model = resolveOptionalString(modelSource, env, `${target.name} Gemini model`, {
1145
1270
  allowLiteral: true,
@@ -1159,11 +1284,11 @@ function resolveCodexConfig(target, env, evalFilePath) {
1159
1284
  const executableSource = target.executable ?? target.command ?? target.binary;
1160
1285
  const argsSource = target.args ?? target.arguments;
1161
1286
  const cwdSource = target.cwd;
1162
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
1163
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
1164
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
1165
- const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CODEX_LOG_FORMAT;
1166
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
1287
+ const workspaceTemplateSource = target.workspace_template;
1288
+ const timeoutSource = target.timeout_seconds;
1289
+ const logDirSource = target.log_dir ?? target.log_directory;
1290
+ const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CODEX_LOG_FORMAT;
1291
+ const systemPromptSource = target.system_prompt;
1167
1292
  const model = resolveOptionalString(modelSource, env, `${target.name} codex model`, {
1168
1293
  allowLiteral: true,
1169
1294
  optionalEnv: true
@@ -1227,16 +1352,16 @@ function normalizeCodexLogFormat(value) {
1227
1352
  throw new Error("codex log format must be 'summary' or 'json'");
1228
1353
  }
1229
1354
  function resolveCopilotSdkConfig(target, env, evalFilePath) {
1230
- const cliUrlSource = target.cli_url ?? target.cliUrl;
1231
- const cliPathSource = target.cli_path ?? target.cliPath;
1232
- const githubTokenSource = target.github_token ?? target.githubToken;
1355
+ const cliUrlSource = target.cli_url;
1356
+ const cliPathSource = target.cli_path;
1357
+ const githubTokenSource = target.github_token;
1233
1358
  const modelSource = target.model;
1234
1359
  const cwdSource = target.cwd;
1235
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
1236
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
1237
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
1238
- const logFormatSource = target.log_format ?? target.logFormat;
1239
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
1360
+ const workspaceTemplateSource = target.workspace_template;
1361
+ const timeoutSource = target.timeout_seconds;
1362
+ const logDirSource = target.log_dir ?? target.log_directory;
1363
+ const logFormatSource = target.log_format;
1364
+ const systemPromptSource = target.system_prompt;
1240
1365
  const cliUrl = resolveOptionalString(cliUrlSource, env, `${target.name} copilot-sdk cli URL`, {
1241
1366
  allowLiteral: true,
1242
1367
  optionalEnv: true
@@ -1309,11 +1434,11 @@ function resolveCopilotCliConfig(target, env, evalFilePath) {
1309
1434
  const modelSource = target.model;
1310
1435
  const argsSource = target.args ?? target.arguments;
1311
1436
  const cwdSource = target.cwd;
1312
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
1313
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
1314
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
1315
- const logFormatSource = target.log_format ?? target.logFormat;
1316
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
1437
+ const workspaceTemplateSource = target.workspace_template;
1438
+ const timeoutSource = target.timeout_seconds;
1439
+ const logDirSource = target.log_dir ?? target.log_directory;
1440
+ const logFormatSource = target.log_format;
1441
+ const systemPromptSource = target.system_prompt;
1317
1442
  const executable = resolveOptionalString(executableSource, env, `${target.name} copilot-cli executable`, {
1318
1443
  allowLiteral: true,
1319
1444
  optionalEnv: true
@@ -1377,16 +1502,16 @@ function normalizeCopilotLogFormat(value) {
1377
1502
  }
1378
1503
  function resolvePiCodingAgentConfig(target, env, evalFilePath) {
1379
1504
  const subproviderSource = target.subprovider;
1380
- const modelSource = target.model ?? target.pi_model ?? target.piModel;
1381
- const apiKeySource = target.api_key ?? target.apiKey;
1382
- const toolsSource = target.tools ?? target.pi_tools ?? target.piTools;
1383
- const thinkingSource = target.thinking ?? target.pi_thinking ?? target.piThinking;
1505
+ const modelSource = target.model ?? target.pi_model;
1506
+ const apiKeySource = target.api_key;
1507
+ const toolsSource = target.tools ?? target.pi_tools;
1508
+ const thinkingSource = target.thinking ?? target.pi_thinking;
1384
1509
  const cwdSource = target.cwd;
1385
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
1386
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
1387
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
1388
- const logFormatSource = target.log_format ?? target.logFormat;
1389
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
1510
+ const workspaceTemplateSource = target.workspace_template;
1511
+ const timeoutSource = target.timeout_seconds;
1512
+ const logDirSource = target.log_dir ?? target.log_directory;
1513
+ const logFormatSource = target.log_format;
1514
+ const systemPromptSource = target.system_prompt;
1390
1515
  const subprovider = resolveOptionalString(
1391
1516
  subproviderSource,
1392
1517
  env,
@@ -1404,6 +1529,11 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
1404
1529
  allowLiteral: false,
1405
1530
  optionalEnv: true
1406
1531
  });
1532
+ const baseUrlSource = target.base_url ?? target.endpoint;
1533
+ const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi base url`, {
1534
+ allowLiteral: true,
1535
+ optionalEnv: true
1536
+ });
1407
1537
  const tools = resolveOptionalString(toolsSource, env, `${target.name} pi tools`, {
1408
1538
  allowLiteral: true,
1409
1539
  optionalEnv: true
@@ -1444,6 +1574,7 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
1444
1574
  subprovider,
1445
1575
  model,
1446
1576
  apiKey,
1577
+ baseUrl,
1447
1578
  tools,
1448
1579
  thinking,
1449
1580
  cwd,
@@ -1457,16 +1588,16 @@ function resolvePiCodingAgentConfig(target, env, evalFilePath) {
1457
1588
  function resolvePiCliConfig(target, env, evalFilePath) {
1458
1589
  const executableSource = target.executable ?? target.command ?? target.binary;
1459
1590
  const subproviderSource = target.subprovider;
1460
- const modelSource = target.model ?? target.pi_model ?? target.piModel;
1461
- const apiKeySource = target.api_key ?? target.apiKey;
1462
- const toolsSource = target.tools ?? target.pi_tools ?? target.piTools;
1463
- const thinkingSource = target.thinking ?? target.pi_thinking ?? target.piThinking;
1591
+ const modelSource = target.model ?? target.pi_model;
1592
+ const apiKeySource = target.api_key;
1593
+ const toolsSource = target.tools ?? target.pi_tools;
1594
+ const thinkingSource = target.thinking ?? target.pi_thinking;
1464
1595
  const cwdSource = target.cwd;
1465
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
1466
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
1467
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
1468
- const logFormatSource = target.log_format ?? target.logFormat;
1469
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
1596
+ const workspaceTemplateSource = target.workspace_template;
1597
+ const timeoutSource = target.timeout_seconds;
1598
+ const logDirSource = target.log_dir ?? target.log_directory;
1599
+ const logFormatSource = target.log_format;
1600
+ const systemPromptSource = target.system_prompt;
1470
1601
  const executable = resolveOptionalString(executableSource, env, `${target.name} pi-cli executable`, {
1471
1602
  allowLiteral: true,
1472
1603
  optionalEnv: true
@@ -1485,6 +1616,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
1485
1616
  allowLiteral: false,
1486
1617
  optionalEnv: true
1487
1618
  });
1619
+ const baseUrlSource = target.base_url ?? target.endpoint;
1620
+ const baseUrl = resolveOptionalString(baseUrlSource, env, `${target.name} pi-cli base url`, {
1621
+ allowLiteral: true,
1622
+ optionalEnv: true
1623
+ });
1488
1624
  const tools = resolveOptionalString(toolsSource, env, `${target.name} pi-cli tools`, {
1489
1625
  allowLiteral: true,
1490
1626
  optionalEnv: true
@@ -1523,6 +1659,7 @@ function resolvePiCliConfig(target, env, evalFilePath) {
1523
1659
  subprovider,
1524
1660
  model,
1525
1661
  apiKey,
1662
+ baseUrl,
1526
1663
  tools,
1527
1664
  thinking,
1528
1665
  args,
@@ -1537,11 +1674,11 @@ function resolvePiCliConfig(target, env, evalFilePath) {
1537
1674
  function resolveClaudeConfig(target, env, evalFilePath) {
1538
1675
  const modelSource = target.model;
1539
1676
  const cwdSource = target.cwd;
1540
- const workspaceTemplateSource = target.workspace_template ?? target.workspaceTemplate;
1541
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
1542
- const logDirSource = target.log_dir ?? target.logDir ?? target.log_directory ?? target.logDirectory;
1543
- const logFormatSource = target.log_format ?? target.logFormat ?? target.log_output_format ?? target.logOutputFormat ?? env.AGENTV_CLAUDE_LOG_FORMAT;
1544
- const systemPromptSource = target.system_prompt ?? target.systemPrompt;
1677
+ const workspaceTemplateSource = target.workspace_template;
1678
+ const timeoutSource = target.timeout_seconds;
1679
+ const logDirSource = target.log_dir ?? target.log_directory;
1680
+ const logFormatSource = target.log_format ?? target.log_output_format ?? env.AGENTV_CLAUDE_LOG_FORMAT;
1681
+ const systemPromptSource = target.system_prompt;
1545
1682
  const model = resolveOptionalString(modelSource, env, `${target.name} claude model`, {
1546
1683
  allowLiteral: true,
1547
1684
  optionalEnv: true
@@ -1574,8 +1711,8 @@ function resolveClaudeConfig(target, env, evalFilePath) {
1574
1711
  });
1575
1712
  const logFormat = normalizeClaudeLogFormat(logFormatSource);
1576
1713
  const systemPrompt = typeof systemPromptSource === "string" && systemPromptSource.trim().length > 0 ? systemPromptSource.trim() : void 0;
1577
- const maxTurns = typeof target.max_turns === "number" ? target.max_turns : typeof target.maxTurns === "number" ? target.maxTurns : void 0;
1578
- const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd : typeof target.maxBudgetUsd === "number" ? target.maxBudgetUsd : void 0;
1714
+ const maxTurns = typeof target.max_turns === "number" ? target.max_turns : void 0;
1715
+ const maxBudgetUsd = typeof target.max_budget_usd === "number" ? target.max_budget_usd : void 0;
1579
1716
  return {
1580
1717
  model,
1581
1718
  systemPrompt,
@@ -1606,9 +1743,7 @@ function resolveMockConfig(target) {
1606
1743
  return { response };
1607
1744
  }
1608
1745
  function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
1609
- const workspaceTemplateEnvVar = resolveOptionalLiteralString(
1610
- target.workspace_template ?? target.workspaceTemplate
1611
- );
1746
+ const workspaceTemplateEnvVar = resolveOptionalLiteralString(target.workspace_template);
1612
1747
  let workspaceTemplate = workspaceTemplateEnvVar ? resolveOptionalString(
1613
1748
  workspaceTemplateEnvVar,
1614
1749
  env,
@@ -1623,9 +1758,9 @@ function resolveVSCodeConfig(target, env, insiders, evalFilePath) {
1623
1758
  }
1624
1759
  const executableSource = target.executable;
1625
1760
  const waitSource = target.wait;
1626
- const dryRunSource = target.dry_run ?? target.dryRun;
1627
- const subagentRootSource = target.subagent_root ?? target.subagentRoot;
1628
- const timeoutSource = target.timeout_seconds ?? target.timeoutSeconds;
1761
+ const dryRunSource = target.dry_run;
1762
+ const subagentRootSource = target.subagent_root;
1763
+ const timeoutSource = target.timeout_seconds;
1629
1764
  const defaultCommand = insiders ? "code-insiders" : "code";
1630
1765
  const executable = resolveOptionalString(executableSource, env, `${target.name} vscode executable`, {
1631
1766
  allowLiteral: true,
@@ -1660,8 +1795,8 @@ function resolveCliConfig(target, env, evalFilePath) {
1660
1795
  const parseResult = CliTargetInputSchema.safeParse(target, { errorMap: cliErrorMap });
1661
1796
  if (!parseResult.success) {
1662
1797
  const firstError = parseResult.error.errors[0];
1663
- const path49 = firstError?.path.join(".") || "";
1664
- const prefix = path49 ? `${target.name} ${path49}: ` : `${target.name}: `;
1798
+ const path410 = firstError?.path.join(".") || "";
1799
+ const prefix = path410 ? `${target.name} ${path410}: ` : `${target.name}: `;
1665
1800
  throw new Error(`${prefix}${firstError?.message}`);
1666
1801
  }
1667
1802
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
@@ -1676,7 +1811,7 @@ function resolveCliConfig(target, env, evalFilePath) {
1676
1811
  }
1677
1812
  function resolveDiscoveredProviderConfig(target, providerKind, env, evalFilePath) {
1678
1813
  const command = target.command ? resolveString(target.command, env, `${target.name} command`, true) : `bun run .agentv/providers/${providerKind}.ts {PROMPT}`;
1679
- const timeoutSeconds = target.timeout_seconds ?? target.timeoutSeconds;
1814
+ const timeoutSeconds = target.timeout_seconds;
1680
1815
  const timeoutMs = resolveTimeoutMs(timeoutSeconds, `${target.name} timeout`);
1681
1816
  let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
1682
1817
  allowLiteral: true,
@@ -1740,10 +1875,10 @@ function resolveDiscover(value, targetName) {
1740
1875
  throw new Error(`Target "${targetName}": discover must be "latest" (got "${String(value)}")`);
1741
1876
  }
1742
1877
  function resolveCopilotLogConfig(target, env) {
1743
- const sessionDirSource = target.session_dir ?? target.sessionDir;
1744
- const sessionIdSource = target.session_id ?? target.sessionId;
1878
+ const sessionDirSource = target.session_dir;
1879
+ const sessionIdSource = target.session_id;
1745
1880
  const discoverSource = target.discover;
1746
- const sessionStateDirSource = target.session_state_dir ?? target.sessionStateDir;
1881
+ const sessionStateDirSource = target.session_state_dir;
1747
1882
  const cwdSource = target.cwd;
1748
1883
  return {
1749
1884
  sessionDir: resolveOptionalString(
@@ -1916,6 +2051,15 @@ var AGENT_PROVIDER_KINDS = [
1916
2051
  "vscode",
1917
2052
  "vscode-insiders"
1918
2053
  ];
2054
+ var LLM_GRADER_CAPABLE_KINDS = [
2055
+ "openai",
2056
+ "openrouter",
2057
+ "azure",
2058
+ "anthropic",
2059
+ "gemini",
2060
+ "agentv",
2061
+ "mock"
2062
+ ];
1919
2063
  var KNOWN_PROVIDERS = [
1920
2064
  "openai",
1921
2065
  "openrouter",
@@ -1935,7 +2079,8 @@ var KNOWN_PROVIDERS = [
1935
2079
  "mock",
1936
2080
  "vscode",
1937
2081
  "vscode-insiders",
1938
- "agentv"
2082
+ "agentv",
2083
+ "transcript"
1939
2084
  ];
1940
2085
  var PROVIDER_ALIASES = [
1941
2086
  "azure-openai",
@@ -6744,7 +6889,7 @@ function createOpenRouter(options = {}) {
6744
6889
  );
6745
6890
  const createChatModel = (modelId, settings = {}) => new OpenRouterChatLanguageModel(modelId, settings, {
6746
6891
  provider: "openrouter.chat",
6747
- url: ({ path: path49 }) => `${baseURL}${path49}`,
6892
+ url: ({ path: path50 }) => `${baseURL}${path50}`,
6748
6893
  headers: getHeaders,
6749
6894
  compatibility,
6750
6895
  fetch: options.fetch,
@@ -6752,7 +6897,7 @@ function createOpenRouter(options = {}) {
6752
6897
  });
6753
6898
  const createCompletionModel = (modelId, settings = {}) => new OpenRouterCompletionLanguageModel(modelId, settings, {
6754
6899
  provider: "openrouter.completion",
6755
- url: ({ path: path49 }) => `${baseURL}${path49}`,
6900
+ url: ({ path: path50 }) => `${baseURL}${path50}`,
6756
6901
  headers: getHeaders,
6757
6902
  compatibility,
6758
6903
  fetch: options.fetch,
@@ -6760,14 +6905,14 @@ function createOpenRouter(options = {}) {
6760
6905
  });
6761
6906
  const createEmbeddingModel = (modelId, settings = {}) => new OpenRouterEmbeddingModel(modelId, settings, {
6762
6907
  provider: "openrouter.embedding",
6763
- url: ({ path: path49 }) => `${baseURL}${path49}`,
6908
+ url: ({ path: path50 }) => `${baseURL}${path50}`,
6764
6909
  headers: getHeaders,
6765
6910
  fetch: options.fetch,
6766
6911
  extraBody: options.extraBody
6767
6912
  });
6768
6913
  const createImageModel = (modelId, settings = {}) => new OpenRouterImageModel(modelId, settings, {
6769
6914
  provider: "openrouter.image",
6770
- url: ({ path: path49 }) => `${baseURL}${path49}`,
6915
+ url: ({ path: path50 }) => `${baseURL}${path50}`,
6771
6916
  headers: getHeaders,
6772
6917
  fetch: options.fetch,
6773
6918
  extraBody: options.extraBody
@@ -14278,19 +14423,21 @@ import { randomUUID as randomUUID6 } from "node:crypto";
14278
14423
  import { existsSync as existsSync2 } from "node:fs";
14279
14424
  import { mkdir as mkdir5 } from "node:fs/promises";
14280
14425
  import path18 from "node:path";
14281
- import { spawn as spawn3 } from "node:child_process";
14426
+ import { execSync, spawn as spawn3 } from "node:child_process";
14282
14427
  import { randomUUID as randomUUID7 } from "node:crypto";
14283
- import { createWriteStream as createWriteStream5 } from "node:fs";
14428
+ import { accessSync, createWriteStream as createWriteStream5, readFileSync as readFileSync2 } from "node:fs";
14284
14429
  import { mkdir as mkdir6, mkdtemp, rm, writeFile } from "node:fs/promises";
14285
14430
  import { tmpdir } from "node:os";
14286
14431
  import path19 from "node:path";
14287
- import { execSync } from "node:child_process";
14432
+ import { execSync as execSync2 } from "node:child_process";
14288
14433
  import { randomUUID as randomUUID8 } from "node:crypto";
14289
- import { accessSync, createWriteStream as createWriteStream6 } from "node:fs";
14434
+ import { accessSync as accessSync2, createWriteStream as createWriteStream6, mkdirSync } from "node:fs";
14290
14435
  import { mkdir as mkdir7 } from "node:fs/promises";
14291
- import path20 from "node:path";
14436
+ import path21 from "node:path";
14292
14437
  import { createInterface } from "node:readline";
14293
- import { fileURLToPath as fileURLToPath3 } from "node:url";
14438
+ import { fileURLToPath as fileURLToPath3, pathToFileURL } from "node:url";
14439
+ import os2 from "node:os";
14440
+ import path20 from "node:path";
14294
14441
  import { exec as exec2 } from "node:child_process";
14295
14442
  import { constants as constants3, access as access3, stat as stat5 } from "node:fs/promises";
14296
14443
  import path322 from "node:path";
@@ -14299,18 +14446,16 @@ import { stat as stat4, writeFile as writeFile4 } from "node:fs/promises";
14299
14446
  import path30 from "node:path";
14300
14447
  import { constants as constants22 } from "node:fs";
14301
14448
  import { access as access22, mkdir as mkdir8, readdir as readdir2, rm as rm2, stat as stat2 } from "node:fs/promises";
14302
- import path21 from "node:path";
14303
14449
  import path222 from "node:path";
14304
14450
  import path23 from "node:path";
14305
- import { readFile as readFile9 } from "node:fs/promises";
14306
14451
  import path24 from "node:path";
14452
+ import { readFile as readFile9 } from "node:fs/promises";
14453
+ import path25 from "node:path";
14307
14454
  import { exec, spawn as spawn4 } from "node:child_process";
14308
14455
  import { mkdir as mkdir9, writeFile as writeFile2 } from "node:fs/promises";
14309
14456
  import path27 from "node:path";
14310
14457
  import { promisify as promisify2 } from "node:util";
14311
14458
  import path26 from "node:path";
14312
- import os2 from "node:os";
14313
- import path25 from "node:path";
14314
14459
  import { copyFile, mkdir as mkdir10, readFile as readFile10, readdir as readdir3, stat as stat3, writeFile as writeFile3 } from "node:fs/promises";
14315
14460
  import path29 from "node:path";
14316
14461
  import path28 from "node:path";
@@ -14361,12 +14506,15 @@ import { existsSync as existsSync5 } from "node:fs";
14361
14506
  import path45 from "node:path";
14362
14507
  import { mkdir as mkdir15, readFile as readFile13, writeFile as writeFile8 } from "node:fs/promises";
14363
14508
  import path46 from "node:path";
14364
- import { existsSync as existsSync6, mkdirSync, readFileSync as readFileSync2, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
14509
+ import { existsSync as existsSync6, mkdirSync as mkdirSync2, readFileSync as readFileSync3, readdirSync as readdirSync3, statSync as statSync2, writeFileSync } from "node:fs";
14365
14510
  import path47 from "node:path";
14366
14511
  import { parse as parseYaml3, stringify as stringifyYaml } from "yaml";
14367
14512
  import { readdir as readdir8, stat as stat9 } from "node:fs/promises";
14368
14513
  import { homedir as homedir3 } from "node:os";
14369
14514
  import path48 from "node:path";
14515
+ import { readdir as readdir9, stat as stat10 } from "node:fs/promises";
14516
+ import { homedir as homedir4 } from "node:os";
14517
+ import path49 from "node:path";
14370
14518
  import { readFile as readFile14 } from "node:fs/promises";
14371
14519
  function computeTraceSummary(messages) {
14372
14520
  const toolCallCounts = {};
@@ -15154,8 +15302,13 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15154
15302
  const negate = rawEvaluator.negate === true ? true : void 0;
15155
15303
  if (isCustomType) {
15156
15304
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15157
- const required2 = parseRequired(rawEvaluator.required);
15158
- const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "negate"]);
15305
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15306
+ rawEvaluator.required,
15307
+ rawEvaluator.min_score,
15308
+ name21,
15309
+ evalId
15310
+ );
15311
+ const knownProps2 = /* @__PURE__ */ new Set(["name", "type", "weight", "required", "min_score", "negate"]);
15159
15312
  const config2 = {};
15160
15313
  for (const [key, value] of Object.entries(rawEvaluator)) {
15161
15314
  if (!knownProps2.has(key) && value !== void 0) {
@@ -15167,6 +15320,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15167
15320
  type: customTypeName,
15168
15321
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15169
15322
  ...required2 !== void 0 ? { required: required2 } : {},
15323
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15170
15324
  ...negate !== void 0 ? { negate } : {},
15171
15325
  ...Object.keys(config2).length > 0 ? { config: config2 } : {}
15172
15326
  });
@@ -15236,7 +15390,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15236
15390
  );
15237
15391
  }
15238
15392
  }
15239
- const required2 = parseRequired(rawEvaluator.required);
15393
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15394
+ rawEvaluator.required,
15395
+ rawEvaluator.min_score,
15396
+ name21,
15397
+ evalId
15398
+ );
15240
15399
  const knownProps2 = /* @__PURE__ */ new Set([
15241
15400
  "name",
15242
15401
  "type",
@@ -15262,6 +15421,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15262
15421
  resolvedCwd,
15263
15422
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15264
15423
  ...required2 !== void 0 ? { required: required2 } : {},
15424
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15265
15425
  ...negate !== void 0 ? { negate } : {},
15266
15426
  ...Object.keys(config2).length > 0 ? { config: config2 } : {},
15267
15427
  ...targetConfig !== void 0 ? { target: targetConfig } : {}
@@ -15390,7 +15550,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15390
15550
  };
15391
15551
  }
15392
15552
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15393
- const required2 = parseRequired(rawEvaluator.required);
15553
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15554
+ rawEvaluator.required,
15555
+ rawEvaluator.min_score,
15556
+ name21,
15557
+ evalId
15558
+ );
15394
15559
  evaluators.push({
15395
15560
  name: name21,
15396
15561
  type: "composite",
@@ -15398,6 +15563,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15398
15563
  aggregator,
15399
15564
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15400
15565
  ...required2 !== void 0 ? { required: required2 } : {},
15566
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15401
15567
  ...negate !== void 0 ? { negate } : {}
15402
15568
  });
15403
15569
  continue;
@@ -15508,7 +15674,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15508
15674
  continue;
15509
15675
  }
15510
15676
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15511
- const required2 = parseRequired(rawEvaluator.required);
15677
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15678
+ rawEvaluator.required,
15679
+ rawEvaluator.min_score,
15680
+ name21,
15681
+ evalId
15682
+ );
15512
15683
  const config2 = {
15513
15684
  name: name21,
15514
15685
  type: "tool-trajectory",
@@ -15517,6 +15688,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15517
15688
  ...expected ? { expected } : {},
15518
15689
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15519
15690
  ...required2 !== void 0 ? { required: required2 } : {},
15691
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15520
15692
  ...negate !== void 0 ? { negate } : {},
15521
15693
  ...argsMatch2 !== void 0 ? { argsMatch: argsMatch2 } : {}
15522
15694
  };
@@ -15579,7 +15751,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15579
15751
  const aggregation = asString(rawEvaluator.aggregation);
15580
15752
  const validAggregation = isValidFieldAggregationType(aggregation) ? aggregation : void 0;
15581
15753
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15582
- const required2 = parseRequired(rawEvaluator.required);
15754
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15755
+ rawEvaluator.required,
15756
+ rawEvaluator.min_score,
15757
+ name21,
15758
+ evalId
15759
+ );
15583
15760
  evaluators.push({
15584
15761
  name: name21,
15585
15762
  type: "field-accuracy",
@@ -15587,6 +15764,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15587
15764
  ...validAggregation ? { aggregation: validAggregation } : {},
15588
15765
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15589
15766
  ...required2 !== void 0 ? { required: required2 } : {},
15767
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15590
15768
  ...negate !== void 0 ? { negate } : {}
15591
15769
  });
15592
15770
  continue;
@@ -15600,13 +15778,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15600
15778
  continue;
15601
15779
  }
15602
15780
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15603
- const required2 = parseRequired(rawEvaluator.required);
15781
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15782
+ rawEvaluator.required,
15783
+ rawEvaluator.min_score,
15784
+ name21,
15785
+ evalId
15786
+ );
15604
15787
  evaluators.push({
15605
15788
  name: name21,
15606
15789
  type: "latency",
15607
15790
  threshold,
15608
15791
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15609
15792
  ...required2 !== void 0 ? { required: required2 } : {},
15793
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15610
15794
  ...negate !== void 0 ? { negate } : {}
15611
15795
  });
15612
15796
  continue;
@@ -15620,13 +15804,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15620
15804
  continue;
15621
15805
  }
15622
15806
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15623
- const required2 = parseRequired(rawEvaluator.required);
15807
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15808
+ rawEvaluator.required,
15809
+ rawEvaluator.min_score,
15810
+ name21,
15811
+ evalId
15812
+ );
15624
15813
  evaluators.push({
15625
15814
  name: name21,
15626
15815
  type: "cost",
15627
15816
  budget,
15628
15817
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15629
15818
  ...required2 !== void 0 ? { required: required2 } : {},
15819
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15630
15820
  ...negate !== void 0 ? { negate } : {}
15631
15821
  });
15632
15822
  continue;
@@ -15658,13 +15848,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15658
15848
  continue;
15659
15849
  }
15660
15850
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15661
- const required2 = parseRequired(rawEvaluator.required);
15851
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15852
+ rawEvaluator.required,
15853
+ rawEvaluator.min_score,
15854
+ name21,
15855
+ evalId
15856
+ );
15662
15857
  evaluators.push({
15663
15858
  name: name21,
15664
15859
  type: "token-usage",
15665
15860
  ...validLimits,
15666
15861
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15667
15862
  ...required2 !== void 0 ? { required: required2 } : {},
15863
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15668
15864
  ...negate !== void 0 ? { negate } : {}
15669
15865
  });
15670
15866
  continue;
@@ -15710,13 +15906,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15710
15906
  continue;
15711
15907
  }
15712
15908
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15713
- const required2 = parseRequired(rawEvaluator.required);
15909
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15910
+ rawEvaluator.required,
15911
+ rawEvaluator.min_score,
15912
+ name21,
15913
+ evalId
15914
+ );
15714
15915
  evaluators.push({
15715
15916
  name: name21,
15716
15917
  type: "execution-metrics",
15717
15918
  ...validThresholds,
15718
15919
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15719
15920
  ...required2 !== void 0 ? { required: required2 } : {},
15921
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15720
15922
  ...negate !== void 0 ? { negate } : {}
15721
15923
  });
15722
15924
  continue;
@@ -15730,7 +15932,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15730
15932
  const rawShouldTrigger = rawEvaluator.should_trigger;
15731
15933
  const shouldTrigger = typeof rawShouldTrigger === "boolean" ? rawShouldTrigger : void 0;
15732
15934
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15733
- const required2 = parseRequired(rawEvaluator.required);
15935
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15936
+ rawEvaluator.required,
15937
+ rawEvaluator.min_score,
15938
+ name21,
15939
+ evalId
15940
+ );
15734
15941
  evaluators.push({
15735
15942
  name: name21,
15736
15943
  type: "skill-trigger",
@@ -15738,6 +15945,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15738
15945
  ...shouldTrigger !== void 0 ? { should_trigger: shouldTrigger } : {},
15739
15946
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15740
15947
  ...required2 !== void 0 ? { required: required2 } : {},
15948
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15741
15949
  ...negate !== void 0 ? { negate } : {}
15742
15950
  });
15743
15951
  continue;
@@ -15749,13 +15957,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15749
15957
  continue;
15750
15958
  }
15751
15959
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15752
- const required2 = parseRequired(rawEvaluator.required);
15960
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15961
+ rawEvaluator.required,
15962
+ rawEvaluator.min_score,
15963
+ name21,
15964
+ evalId
15965
+ );
15753
15966
  evaluators.push({
15754
15967
  name: name21,
15755
15968
  type: "contains",
15756
15969
  value,
15757
15970
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15758
15971
  ...required2 !== void 0 ? { required: required2 } : {},
15972
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15759
15973
  ...negate !== void 0 ? { negate } : {}
15760
15974
  });
15761
15975
  continue;
@@ -15769,13 +15983,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15769
15983
  continue;
15770
15984
  }
15771
15985
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15772
- const required2 = parseRequired(rawEvaluator.required);
15986
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
15987
+ rawEvaluator.required,
15988
+ rawEvaluator.min_score,
15989
+ name21,
15990
+ evalId
15991
+ );
15773
15992
  evaluators.push({
15774
15993
  name: name21,
15775
15994
  type: typeValue,
15776
15995
  value,
15777
15996
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15778
15997
  ...required2 !== void 0 ? { required: required2 } : {},
15998
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15779
15999
  ...negate !== void 0 ? { negate } : {}
15780
16000
  });
15781
16001
  continue;
@@ -15787,13 +16007,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15787
16007
  continue;
15788
16008
  }
15789
16009
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15790
- const required2 = parseRequired(rawEvaluator.required);
16010
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16011
+ rawEvaluator.required,
16012
+ rawEvaluator.min_score,
16013
+ name21,
16014
+ evalId
16015
+ );
15791
16016
  evaluators.push({
15792
16017
  name: name21,
15793
16018
  type: "icontains",
15794
16019
  value,
15795
16020
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15796
16021
  ...required2 !== void 0 ? { required: required2 } : {},
16022
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15797
16023
  ...negate !== void 0 ? { negate } : {}
15798
16024
  });
15799
16025
  continue;
@@ -15807,13 +16033,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15807
16033
  continue;
15808
16034
  }
15809
16035
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15810
- const required2 = parseRequired(rawEvaluator.required);
16036
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16037
+ rawEvaluator.required,
16038
+ rawEvaluator.min_score,
16039
+ name21,
16040
+ evalId
16041
+ );
15811
16042
  evaluators.push({
15812
16043
  name: name21,
15813
16044
  type: typeValue,
15814
16045
  value,
15815
16046
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15816
16047
  ...required2 !== void 0 ? { required: required2 } : {},
16048
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15817
16049
  ...negate !== void 0 ? { negate } : {}
15818
16050
  });
15819
16051
  continue;
@@ -15825,13 +16057,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15825
16057
  continue;
15826
16058
  }
15827
16059
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15828
- const required2 = parseRequired(rawEvaluator.required);
16060
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16061
+ rawEvaluator.required,
16062
+ rawEvaluator.min_score,
16063
+ name21,
16064
+ evalId
16065
+ );
15829
16066
  evaluators.push({
15830
16067
  name: name21,
15831
16068
  type: typeValue,
15832
16069
  value,
15833
16070
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15834
16071
  ...required2 !== void 0 ? { required: required2 } : {},
16072
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15835
16073
  ...negate !== void 0 ? { negate } : {}
15836
16074
  });
15837
16075
  continue;
@@ -15844,7 +16082,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15844
16082
  }
15845
16083
  const flags = asString(rawEvaluator.flags);
15846
16084
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15847
- const required2 = parseRequired(rawEvaluator.required);
16085
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16086
+ rawEvaluator.required,
16087
+ rawEvaluator.min_score,
16088
+ name21,
16089
+ evalId
16090
+ );
15848
16091
  evaluators.push({
15849
16092
  name: name21,
15850
16093
  type: "regex",
@@ -15852,18 +16095,25 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15852
16095
  ...flags !== void 0 ? { flags } : {},
15853
16096
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15854
16097
  ...required2 !== void 0 ? { required: required2 } : {},
16098
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15855
16099
  ...negate !== void 0 ? { negate } : {}
15856
16100
  });
15857
16101
  continue;
15858
16102
  }
15859
16103
  if (typeValue === "is-json") {
15860
16104
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15861
- const required2 = parseRequired(rawEvaluator.required);
16105
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16106
+ rawEvaluator.required,
16107
+ rawEvaluator.min_score,
16108
+ name21,
16109
+ evalId
16110
+ );
15862
16111
  evaluators.push({
15863
16112
  name: name21,
15864
16113
  type: "is-json",
15865
16114
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15866
16115
  ...required2 !== void 0 ? { required: required2 } : {},
16116
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15867
16117
  ...negate !== void 0 ? { negate } : {}
15868
16118
  });
15869
16119
  continue;
@@ -15875,13 +16125,19 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15875
16125
  continue;
15876
16126
  }
15877
16127
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15878
- const required2 = parseRequired(rawEvaluator.required);
16128
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16129
+ rawEvaluator.required,
16130
+ rawEvaluator.min_score,
16131
+ name21,
16132
+ evalId
16133
+ );
15879
16134
  evaluators.push({
15880
16135
  name: name21,
15881
16136
  type: "equals",
15882
16137
  value,
15883
16138
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15884
16139
  ...required2 !== void 0 ? { required: required2 } : {},
16140
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15885
16141
  ...negate !== void 0 ? { negate } : {}
15886
16142
  });
15887
16143
  continue;
@@ -15917,7 +16173,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15917
16173
  continue;
15918
16174
  }
15919
16175
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15920
- const required2 = parseRequired(rawEvaluator.required);
16176
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16177
+ rawEvaluator.required,
16178
+ rawEvaluator.min_score,
16179
+ name21,
16180
+ evalId
16181
+ );
15921
16182
  evaluators.push({
15922
16183
  name: name21,
15923
16184
  type: "llm-grader",
@@ -15925,6 +16186,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15925
16186
  ...graderTargetName ? { target: graderTargetName } : {},
15926
16187
  ...weight2 !== void 0 ? { weight: weight2 } : {},
15927
16188
  ...required2 !== void 0 ? { required: required2 } : {},
16189
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
15928
16190
  ...negate !== void 0 ? { negate } : {}
15929
16191
  });
15930
16192
  continue;
@@ -15994,7 +16256,12 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
15994
16256
  continue;
15995
16257
  }
15996
16258
  const weight2 = validateWeight(rawEvaluator.weight, name21, evalId);
15997
- const required2 = parseRequired(rawEvaluator.required);
16259
+ const { required: required2, min_score: min_score2 } = parseRequiredAndMinScore(
16260
+ rawEvaluator.required,
16261
+ rawEvaluator.min_score,
16262
+ name21,
16263
+ evalId
16264
+ );
15998
16265
  evaluators.push({
15999
16266
  name: name21,
16000
16267
  type: "llm-grader",
@@ -16002,12 +16269,18 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
16002
16269
  ...graderTargetName ? { target: graderTargetName } : {},
16003
16270
  ...weight2 !== void 0 ? { weight: weight2 } : {},
16004
16271
  ...required2 !== void 0 ? { required: required2 } : {},
16272
+ ...min_score2 !== void 0 ? { min_score: min_score2 } : {},
16005
16273
  ...negate !== void 0 ? { negate } : {}
16006
16274
  });
16007
16275
  continue;
16008
16276
  }
16009
16277
  const weight = validateWeight(rawEvaluator.weight, name21, evalId);
16010
- const required = parseRequired(rawEvaluator.required);
16278
+ const { required, min_score } = parseRequiredAndMinScore(
16279
+ rawEvaluator.required,
16280
+ rawEvaluator.min_score,
16281
+ name21,
16282
+ evalId
16283
+ );
16011
16284
  const knownProps = /* @__PURE__ */ new Set([
16012
16285
  "name",
16013
16286
  "type",
@@ -16018,6 +16291,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
16018
16291
  "weight",
16019
16292
  "config",
16020
16293
  "required",
16294
+ "min_score",
16021
16295
  "negate",
16022
16296
  "max_steps",
16023
16297
  "maxSteps",
@@ -16047,6 +16321,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
16047
16321
  ...graderTargetName ? { target: graderTargetName } : {},
16048
16322
  ...weight !== void 0 ? { weight } : {},
16049
16323
  ...required !== void 0 ? { required } : {},
16324
+ ...min_score !== void 0 ? { min_score } : {},
16050
16325
  ...negate !== void 0 ? { negate } : {},
16051
16326
  ...finalConfig ? { config: finalConfig } : {},
16052
16327
  ...llmMaxSteps !== void 0 ? { max_steps: llmMaxSteps } : {},
@@ -16178,10 +16453,23 @@ ${detailBlock}${ANSI_RESET4}`);
16178
16453
  console.warn(`${ANSI_YELLOW3}Warning: ${message}${ANSI_RESET4}`);
16179
16454
  }
16180
16455
  }
16181
- function parseRequired(value) {
16182
- if (value === true) return true;
16183
- if (typeof value === "number" && value > 0 && value <= 1) return value;
16184
- return void 0;
16456
+ function parseRequiredAndMinScore(rawRequired, rawMinScore, evaluatorName, evalId) {
16457
+ const result = {};
16458
+ if (typeof rawMinScore === "number" && rawMinScore > 0 && rawMinScore <= 1) {
16459
+ result.min_score = rawMinScore;
16460
+ }
16461
+ if (rawRequired === true) {
16462
+ result.required = true;
16463
+ } else if (typeof rawRequired === "number" && rawRequired > 0 && rawRequired <= 1) {
16464
+ if (result.min_score === void 0) {
16465
+ result.min_score = rawRequired;
16466
+ }
16467
+ result.required = rawRequired;
16468
+ logWarning2(
16469
+ `Evaluator '${evaluatorName}' in '${evalId}': 'required: ${rawRequired}' is deprecated. Use 'required: true' + 'min_score: ${rawRequired}' instead.`
16470
+ );
16471
+ }
16472
+ return result;
16185
16473
  }
16186
16474
  function validateWeight(rawWeight, evaluatorName, evalId) {
16187
16475
  if (rawWeight === void 0) {
@@ -16224,16 +16512,30 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
16224
16512
  const id = asString(rawRubric.id) ?? `rubric-${index + 1}`;
16225
16513
  const expectedOutcome = asString(rawRubric.outcome) ?? "";
16226
16514
  const weight = typeof rawRubric.weight === "number" ? rawRubric.weight : 1;
16515
+ let minScore;
16227
16516
  let requiredMinScore;
16228
16517
  let required;
16229
- if (typeof rawRubric.required_min_score === "number") {
16230
- const minScore = rawRubric.required_min_score;
16231
- if (!Number.isInteger(minScore) || minScore < 0 || minScore > 10) {
16518
+ if (typeof rawRubric.min_score === "number") {
16519
+ const ms = rawRubric.min_score;
16520
+ if (ms <= 0 || ms > 1) {
16521
+ throw new Error(
16522
+ `Invalid min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be in (0, 1] (got ${ms})`
16523
+ );
16524
+ }
16525
+ minScore = ms;
16526
+ requiredMinScore = Math.round(ms * 10);
16527
+ } else if (typeof rawRubric.required_min_score === "number") {
16528
+ const rms = rawRubric.required_min_score;
16529
+ if (!Number.isInteger(rms) || rms < 0 || rms > 10) {
16232
16530
  throw new Error(
16233
- `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${minScore})`
16531
+ `Invalid required_min_score for rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': must be an integer 0-10 (got ${rms})`
16234
16532
  );
16235
16533
  }
16236
- requiredMinScore = minScore;
16534
+ requiredMinScore = rms;
16535
+ minScore = rms / 10;
16536
+ logWarning2(
16537
+ `Rubric '${id}' in evaluator '${evaluatorName}' in '${evalId}': 'required_min_score: ${rms}' is deprecated. Use 'min_score: ${rms / 10}' (0-1 scale) instead.`
16538
+ );
16237
16539
  }
16238
16540
  if (typeof rawRubric.required === "boolean") {
16239
16541
  required = rawRubric.required;
@@ -16253,6 +16555,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
16253
16555
  weight,
16254
16556
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
16255
16557
  ...required !== void 0 ? { required } : {},
16558
+ ...minScore !== void 0 ? { min_score: minScore } : {},
16256
16559
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {},
16257
16560
  score_ranges: scoreRanges
16258
16561
  });
@@ -16269,6 +16572,7 @@ function parseRubricItems(rawRubrics, evaluatorName, evalId) {
16269
16572
  weight,
16270
16573
  // Default to required: true if not specified (backward compatibility)
16271
16574
  required: required ?? true,
16575
+ ...minScore !== void 0 ? { min_score: minScore } : {},
16272
16576
  ...requiredMinScore !== void 0 ? { required_min_score: requiredMinScore } : {}
16273
16577
  });
16274
16578
  }
@@ -16397,12 +16701,22 @@ function parseInlineRubrics(rawRubrics) {
16397
16701
  id: asString(rubric.id) ?? `rubric-${index + 1}`,
16398
16702
  weight: typeof rubric.weight === "number" ? rubric.weight : 1
16399
16703
  };
16704
+ let inlineMinScore;
16705
+ let inlineRequiredMinScore;
16706
+ if (typeof rubric.min_score === "number") {
16707
+ inlineMinScore = rubric.min_score;
16708
+ inlineRequiredMinScore = Math.round(inlineMinScore * 10);
16709
+ } else if (typeof rubric.required_min_score === "number") {
16710
+ inlineRequiredMinScore = rubric.required_min_score;
16711
+ inlineMinScore = inlineRequiredMinScore / 10;
16712
+ }
16400
16713
  if (scoreRanges && scoreRanges.length > 0) {
16401
16714
  return {
16402
16715
  ...baseRubric,
16403
16716
  ...expectedOutcome.length > 0 ? { outcome: expectedOutcome } : {},
16404
16717
  ...typeof rubric.required === "boolean" ? { required: rubric.required } : {},
16405
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {},
16718
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
16719
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {},
16406
16720
  score_ranges: scoreRanges
16407
16721
  };
16408
16722
  }
@@ -16410,7 +16724,8 @@ function parseInlineRubrics(rawRubrics) {
16410
16724
  ...baseRubric,
16411
16725
  outcome: expectedOutcome,
16412
16726
  required: typeof rubric.required === "boolean" ? rubric.required : true,
16413
- ...typeof rubric.required_min_score === "number" ? { required_min_score: rubric.required_min_score } : {}
16727
+ ...inlineMinScore !== void 0 ? { min_score: inlineMinScore } : {},
16728
+ ...inlineRequiredMinScore !== void 0 ? { required_min_score: inlineRequiredMinScore } : {}
16414
16729
  };
16415
16730
  }).filter((r) => r.outcome && r.outcome.length > 0 || "score_ranges" in r && r.score_ranges);
16416
16731
  if (rubricItems.length === 0) {
@@ -16792,6 +17107,9 @@ function resolveExpectedMessages(raw) {
16792
17107
  var ANSI_YELLOW5 = "\x1B[33m";
16793
17108
  var ANSI_RED2 = "\x1B[31m";
16794
17109
  var ANSI_RESET6 = "\x1B[0m";
17110
+ function matchesFilter(id, filter2) {
17111
+ return typeof filter2 === "string" ? micromatch.isMatch(id, filter2) : filter2.some((pattern) => micromatch.isMatch(id, pattern));
17112
+ }
16795
17113
  function detectFormat(filePath) {
16796
17114
  const ext = path6.extname(filePath).toLowerCase();
16797
17115
  if (ext === ".jsonl") return "jsonl";
@@ -16859,40 +17177,40 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
16859
17177
  const sidecar = await loadSidecarMetadata(absoluteTestPath, verbose);
16860
17178
  const rawFile = await readFile5(absoluteTestPath, "utf8");
16861
17179
  const rawCases = parseJsonlContent(rawFile, evalFilePath);
16862
- const fallbackEvalSet = path6.basename(absoluteTestPath, ".jsonl") || "eval";
16863
- const evalSetName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackEvalSet;
17180
+ const fallbackSuiteName = path6.basename(absoluteTestPath, ".jsonl") || "eval";
17181
+ const suiteName = sidecar.name && sidecar.name.trim().length > 0 ? sidecar.name : fallbackSuiteName;
16864
17182
  const globalEvaluator = coerceEvaluator(sidecar.evaluator, "sidecar") ?? "llm-grader";
16865
17183
  const globalExecution = sidecar.execution;
16866
17184
  if (verbose) {
16867
17185
  console.log(`
16868
- [JSONL Dataset: ${evalFilePath}]`);
17186
+ [JSONL Suite: ${evalFilePath}]`);
16869
17187
  console.log(` Cases: ${rawCases.length}`);
16870
- console.log(` Eval set: ${evalSetName}`);
17188
+ console.log(` Suite: ${suiteName}`);
16871
17189
  if (sidecar.description) {
16872
17190
  console.log(` Description: ${sidecar.description}`);
16873
17191
  }
16874
17192
  }
16875
17193
  const results = [];
16876
17194
  for (let lineIndex = 0; lineIndex < rawCases.length; lineIndex++) {
16877
- const evalcase = rawCases[lineIndex];
17195
+ const testCaseConfig = rawCases[lineIndex];
16878
17196
  const lineNumber = lineIndex + 1;
16879
- const id = asString4(evalcase.id);
16880
- if (filterPattern && (!id || !micromatch.isMatch(id, filterPattern))) {
17197
+ const id = asString4(testCaseConfig.id);
17198
+ if (filterPattern && (!id || !matchesFilter(id, filterPattern))) {
16881
17199
  continue;
16882
17200
  }
16883
- const conversationId = asString4(evalcase.conversation_id);
16884
- let outcome = asString4(evalcase.criteria);
16885
- if (!outcome && evalcase.expected_outcome !== void 0) {
16886
- outcome = asString4(evalcase.expected_outcome);
17201
+ const conversationId = asString4(testCaseConfig.conversation_id);
17202
+ let outcome = asString4(testCaseConfig.criteria);
17203
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
17204
+ outcome = asString4(testCaseConfig.expected_outcome);
16887
17205
  if (outcome) {
16888
17206
  logWarning4(
16889
- `Test '${asString4(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
17207
+ `Test '${asString4(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
16890
17208
  );
16891
17209
  }
16892
17210
  }
16893
- const rawInputMessages = resolveInputMessages(evalcase);
16894
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
16895
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assert !== void 0;
17211
+ const rawInputMessages = resolveInputMessages(testCaseConfig);
17212
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
17213
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assert !== void 0;
16896
17214
  if (!id || !hasEvaluationSpec || !rawInputMessages || rawInputMessages.length === 0) {
16897
17215
  logError2(
16898
17216
  `Skipping incomplete test at line ${lineNumber}: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`
@@ -16929,18 +17247,23 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
16929
17247
  }
16930
17248
  }
16931
17249
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
16932
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
17250
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
16933
17251
  const mergedExecution = caseExecution ?? globalExecution;
16934
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
17252
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
16935
17253
  let evaluators;
16936
17254
  try {
16937
- evaluators = await parseEvaluators(evalcase, mergedExecution, searchRoots, id ?? "unknown");
17255
+ evaluators = await parseEvaluators(
17256
+ testCaseConfig,
17257
+ mergedExecution,
17258
+ searchRoots,
17259
+ id ?? "unknown"
17260
+ );
16938
17261
  } catch (error) {
16939
17262
  const message = error instanceof Error ? error.message : String(error);
16940
17263
  logError2(`Skipping test '${id}' at line ${lineNumber}: ${message}`);
16941
17264
  continue;
16942
17265
  }
16943
- const inlineRubrics = evalcase.rubrics;
17266
+ const inlineRubrics = testCaseConfig.rubrics;
16944
17267
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
16945
17268
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
16946
17269
  if (rubricEvaluator) {
@@ -16951,7 +17274,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
16951
17274
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
16952
17275
  const testCase = {
16953
17276
  id,
16954
- dataset: evalSetName,
17277
+ suite: suiteName,
16955
17278
  conversation_id: conversationId,
16956
17279
  question,
16957
17280
  input: inputMessages,
@@ -16959,7 +17282,7 @@ async function loadTestsFromJsonl(evalFilePath, repoRoot, options) {
16959
17282
  reference_answer: referenceAnswer,
16960
17283
  file_paths: userFilePaths,
16961
17284
  criteria: outcome ?? "",
16962
- evaluator: evalCaseEvaluatorKind,
17285
+ evaluator: testCaseEvaluatorKind,
16963
17286
  assertions: evaluators
16964
17287
  };
16965
17288
  results.push(testCase);
@@ -17135,6 +17458,9 @@ function buildChatPromptFromSegments(options) {
17135
17458
  var ANSI_YELLOW6 = "\x1B[33m";
17136
17459
  var ANSI_RED3 = "\x1B[31m";
17137
17460
  var ANSI_RESET7 = "\x1B[0m";
17461
+ function matchesFilter2(id, filter2) {
17462
+ return typeof filter2 === "string" ? micromatch2.isMatch(id, filter2) : filter2.some((pattern) => micromatch2.isMatch(id, pattern));
17463
+ }
17138
17464
  function resolveTests(suite) {
17139
17465
  if (suite.tests !== void 0) return suite.tests;
17140
17466
  if (suite.eval_cases !== void 0) {
@@ -17214,18 +17540,18 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
17214
17540
  throw new Error(`Invalid test file format: ${evalFilePath}`);
17215
17541
  }
17216
17542
  const suite = interpolated;
17217
- const evalSetNameFromSuite = asString5(suite.name)?.trim();
17218
- const fallbackEvalSet = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
17219
- const evalSetName = evalSetNameFromSuite && evalSetNameFromSuite.length > 0 ? evalSetNameFromSuite : fallbackEvalSet;
17220
- const rawTestcases = resolveTests(suite);
17543
+ const suiteNameFromFile = asString5(suite.name)?.trim();
17544
+ const fallbackSuiteName = path7.basename(absoluteTestPath).replace(/\.eval\.ya?ml$/i, "").replace(/\.ya?ml$/i, "") || "eval";
17545
+ const suiteName = suiteNameFromFile && suiteNameFromFile.length > 0 ? suiteNameFromFile : fallbackSuiteName;
17546
+ const rawTestCases = resolveTests(suite);
17221
17547
  const globalEvaluator = coerceEvaluator(suite.evaluator, "global") ?? "llm-grader";
17222
17548
  const evalFileDir = path7.dirname(absoluteTestPath);
17223
- let expandedTestcases;
17224
- if (typeof rawTestcases === "string") {
17225
- const externalPath = path7.resolve(evalFileDir, rawTestcases);
17226
- expandedTestcases = await loadCasesFromFile(externalPath);
17227
- } else if (Array.isArray(rawTestcases)) {
17228
- expandedTestcases = await expandFileReferences(rawTestcases, evalFileDir);
17549
+ let expandedTestCases;
17550
+ if (typeof rawTestCases === "string") {
17551
+ const externalPath = path7.resolve(evalFileDir, rawTestCases);
17552
+ expandedTestCases = await loadCasesFromFile(externalPath);
17553
+ } else if (Array.isArray(rawTestCases)) {
17554
+ expandedTestCases = await expandFileReferences(rawTestCases, evalFileDir);
17229
17555
  } else {
17230
17556
  throw new Error(`Invalid test file format: ${evalFilePath} - missing 'tests' field`);
17231
17557
  }
@@ -17240,32 +17566,33 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
17240
17566
  }
17241
17567
  const globalExecution = suiteAssertions !== void 0 ? { ...rawGlobalExecution ?? {}, assertions: suiteAssertions } : rawGlobalExecution;
17242
17568
  const results = [];
17243
- for (const rawEvalcase of expandedTestcases) {
17244
- if (!isJsonObject(rawEvalcase)) {
17569
+ for (const rawTestCase of expandedTestCases) {
17570
+ if (!isJsonObject(rawTestCase)) {
17245
17571
  logWarning5("Skipping invalid test entry (expected object)");
17246
17572
  continue;
17247
17573
  }
17248
- const evalcase = rawEvalcase;
17249
- const id = asString5(evalcase.id);
17250
- if (filterPattern && (!id || !micromatch2.isMatch(id, filterPattern))) {
17574
+ const testCaseConfig = rawTestCase;
17575
+ const id = asString5(testCaseConfig.id);
17576
+ if (filterPattern && (!id || !matchesFilter2(id, filterPattern))) {
17251
17577
  continue;
17252
17578
  }
17253
- const conversationId = asString5(evalcase.conversation_id);
17254
- let outcome = asString5(evalcase.criteria);
17255
- if (!outcome && evalcase.expected_outcome !== void 0) {
17256
- outcome = asString5(evalcase.expected_outcome);
17579
+ const conversationId = asString5(testCaseConfig.conversation_id);
17580
+ let outcome = asString5(testCaseConfig.criteria);
17581
+ if (!outcome && testCaseConfig.expected_outcome !== void 0) {
17582
+ outcome = asString5(testCaseConfig.expected_outcome);
17257
17583
  if (outcome) {
17258
17584
  logWarning5(
17259
- `Test '${asString5(evalcase.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
17585
+ `Test '${asString5(testCaseConfig.id) ?? "unknown"}': 'expected_outcome' is deprecated. Use 'criteria' instead.`
17260
17586
  );
17261
17587
  }
17262
17588
  }
17263
- const caseExecution = isJsonObject(evalcase.execution) ? evalcase.execution : void 0;
17589
+ const caseExecution = isJsonObject(testCaseConfig.execution) ? testCaseConfig.execution : void 0;
17264
17590
  const skipDefaults = caseExecution?.skip_defaults === true;
17591
+ const caseThreshold = typeof caseExecution?.threshold === "number" && caseExecution.threshold >= 0 && caseExecution.threshold <= 1 ? caseExecution.threshold : void 0;
17265
17592
  const effectiveSuiteInputFiles = suiteInputFiles && !skipDefaults ? suiteInputFiles : void 0;
17266
- const testInputMessages = resolveInputMessages(evalcase, effectiveSuiteInputFiles);
17267
- const expectedMessages = resolveExpectedMessages(evalcase) ?? [];
17268
- const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || evalcase.assertions !== void 0 || evalcase.assert !== void 0;
17593
+ const testInputMessages = resolveInputMessages(testCaseConfig, effectiveSuiteInputFiles);
17594
+ const expectedMessages = resolveExpectedMessages(testCaseConfig) ?? [];
17595
+ const hasEvaluationSpec = !!outcome || expectedMessages.length > 0 || testCaseConfig.assertions !== void 0 || testCaseConfig.assert !== void 0;
17269
17596
  if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) {
17270
17597
  logError3(
17271
17598
  `Skipping incomplete test: ${id ?? "unknown"}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`
@@ -17312,16 +17639,21 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
17312
17639
  }
17313
17640
  }
17314
17641
  const question = inputTextParts.map((part) => part.trim()).filter((part) => part.length > 0).join(" ");
17315
- const evalCaseEvaluatorKind = coerceEvaluator(evalcase.evaluator, id) ?? globalEvaluator;
17642
+ const testCaseEvaluatorKind = coerceEvaluator(testCaseConfig.evaluator, id) ?? globalEvaluator;
17316
17643
  let evaluators;
17317
17644
  try {
17318
- evaluators = await parseEvaluators(evalcase, globalExecution, searchRoots, id ?? "unknown");
17645
+ evaluators = await parseEvaluators(
17646
+ testCaseConfig,
17647
+ globalExecution,
17648
+ searchRoots,
17649
+ id ?? "unknown"
17650
+ );
17319
17651
  } catch (error) {
17320
17652
  const message = error instanceof Error ? error.message : String(error);
17321
17653
  logError3(`Skipping test '${id}': ${message}`);
17322
17654
  continue;
17323
17655
  }
17324
- const inlineRubrics = evalcase.rubrics;
17656
+ const inlineRubrics = testCaseConfig.rubrics;
17325
17657
  if (inlineRubrics !== void 0 && Array.isArray(inlineRubrics)) {
17326
17658
  const rubricEvaluator = parseInlineRubrics(inlineRubrics);
17327
17659
  if (rubricEvaluator) {
@@ -17330,13 +17662,13 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
17330
17662
  }
17331
17663
  warnUnconsumedCriteria(outcome, evaluators, id ?? "unknown");
17332
17664
  const userFilePaths = collectResolvedInputFilePaths(inputMessages);
17333
- const caseWorkspace = await resolveWorkspaceConfig(evalcase.workspace, evalFileDir);
17665
+ const caseWorkspace = await resolveWorkspaceConfig(testCaseConfig.workspace, evalFileDir);
17334
17666
  const mergedWorkspace = mergeWorkspaceConfigs(suiteWorkspace, caseWorkspace);
17335
- const metadata = isJsonObject(evalcase.metadata) ? evalcase.metadata : void 0;
17336
- const caseTargets = extractTargetsFromTestCase(evalcase);
17667
+ const metadata = isJsonObject(testCaseConfig.metadata) ? testCaseConfig.metadata : void 0;
17668
+ const caseTargets = extractTargetsFromTestCase(testCaseConfig);
17337
17669
  const testCase = {
17338
17670
  id,
17339
- dataset: evalSetName,
17671
+ suite: suiteName,
17340
17672
  category: options?.category,
17341
17673
  conversation_id: conversationId,
17342
17674
  question,
@@ -17345,11 +17677,12 @@ async function loadTestsFromYaml(evalFilePath, repoRoot, options) {
17345
17677
  reference_answer: referenceAnswer,
17346
17678
  file_paths: userFilePaths,
17347
17679
  criteria: outcome ?? "",
17348
- evaluator: evalCaseEvaluatorKind,
17680
+ evaluator: testCaseEvaluatorKind,
17349
17681
  assertions: evaluators,
17350
17682
  workspace: mergedWorkspace,
17351
17683
  metadata,
17352
- targets: caseTargets
17684
+ targets: caseTargets,
17685
+ ...caseThreshold !== void 0 ? { threshold: caseThreshold } : {}
17353
17686
  };
17354
17687
  results.push(testCase);
17355
17688
  }
@@ -17880,7 +18213,7 @@ var AzureProvider = class {
17880
18213
  };
17881
18214
  this.retryConfig = config.retry;
17882
18215
  const azure = createAzure(buildAzureOptions(config));
17883
- this.model = azure.chat(config.deploymentName);
18216
+ this.model = config.apiFormat === "responses" ? azure(config.deploymentName) : azure.chat(config.deploymentName);
17884
18217
  }
17885
18218
  id;
17886
18219
  kind = "azure";
@@ -18006,7 +18339,9 @@ function buildAzureOptions(config) {
18006
18339
  const options = {
18007
18340
  apiKey: config.apiKey,
18008
18341
  apiVersion: config.version,
18009
- useDeploymentBasedUrls: true
18342
+ // Chat completions still use deployment-scoped Azure URLs for compatibility
18343
+ // with existing deployments. Responses API should use the SDK's v1 path.
18344
+ useDeploymentBasedUrls: config.apiFormat !== "responses"
18010
18345
  };
18011
18346
  const baseURL = normalizeAzureBaseUrl(config.resourceName);
18012
18347
  if (baseURL) {
@@ -19447,15 +19782,16 @@ var CliProvider = class {
19447
19782
  outputFilePath
19448
19783
  );
19449
19784
  const renderedCommand = renderTemplate(this.config.command, templateValues);
19785
+ const effectiveCwd = requests[0]?.cwd ?? this.config.cwd;
19450
19786
  if (this.verbose) {
19451
19787
  console.log(
19452
- `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
19788
+ `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${effectiveCwd ?? ""} command=${renderedCommand}`
19453
19789
  );
19454
19790
  }
19455
19791
  try {
19456
19792
  const startTime = Date.now();
19457
19793
  const result = await this.runCommand(renderedCommand, {
19458
- cwd: this.config.cwd,
19794
+ cwd: effectiveCwd,
19459
19795
  env: process.env,
19460
19796
  timeoutMs: this.config.timeoutMs,
19461
19797
  signal: controller.signal
@@ -19488,7 +19824,7 @@ var CliProvider = class {
19488
19824
  command: renderedCommand,
19489
19825
  stderr: result.stderr,
19490
19826
  exitCode: result.exitCode ?? 0,
19491
- cwd: this.config.cwd,
19827
+ cwd: effectiveCwd,
19492
19828
  outputFile: outputFilePath
19493
19829
  }
19494
19830
  };
@@ -19506,7 +19842,7 @@ var CliProvider = class {
19506
19842
  command: renderedCommand,
19507
19843
  stderr: result.stderr,
19508
19844
  exitCode: result.exitCode ?? 0,
19509
- cwd: this.config.cwd,
19845
+ cwd: effectiveCwd,
19510
19846
  outputFile: outputFilePath,
19511
19847
  error: errorMessage
19512
19848
  }
@@ -19521,7 +19857,7 @@ var CliProvider = class {
19521
19857
  command: renderedCommand,
19522
19858
  stderr: result.stderr,
19523
19859
  exitCode: result.exitCode ?? 0,
19524
- cwd: this.config.cwd,
19860
+ cwd: effectiveCwd,
19525
19861
  outputFile: outputFilePath,
19526
19862
  recordId: evalCaseId
19527
19863
  }
@@ -21464,6 +21800,73 @@ function subscribeToPiLogEntries(listener) {
21464
21800
  store.delete(listener);
21465
21801
  };
21466
21802
  }
21803
+ var SUBPROVIDER_ALIASES = {
21804
+ azure: "azure-openai-responses"
21805
+ };
21806
+ var SUBPROVIDER_ALIASES_WITH_BASE_URL = {
21807
+ // Azure v1 endpoints are OpenAI-compatible; use the standard client
21808
+ // to avoid AzureOpenAI adding api-version query params.
21809
+ azure: "openai-responses"
21810
+ };
21811
+ var ENV_KEY_MAP = {
21812
+ google: "GEMINI_API_KEY",
21813
+ gemini: "GEMINI_API_KEY",
21814
+ anthropic: "ANTHROPIC_API_KEY",
21815
+ openai: "OPENAI_API_KEY",
21816
+ groq: "GROQ_API_KEY",
21817
+ xai: "XAI_API_KEY",
21818
+ openrouter: "OPENROUTER_API_KEY",
21819
+ azure: "AZURE_OPENAI_API_KEY"
21820
+ };
21821
+ var ENV_BASE_URL_MAP = {
21822
+ openai: "OPENAI_BASE_URL",
21823
+ azure: "AZURE_OPENAI_BASE_URL",
21824
+ openrouter: "OPENROUTER_BASE_URL"
21825
+ };
21826
+ function resolveSubprovider(name21, hasBaseUrl = false) {
21827
+ const lower = name21.toLowerCase();
21828
+ if (hasBaseUrl) {
21829
+ const alias = SUBPROVIDER_ALIASES_WITH_BASE_URL[lower];
21830
+ if (alias) return alias;
21831
+ }
21832
+ return SUBPROVIDER_ALIASES[lower] ?? name21;
21833
+ }
21834
+ function resolveCliProvider(name21) {
21835
+ const lower = name21.toLowerCase();
21836
+ if (lower === "azure") return "azure-openai-responses";
21837
+ return name21;
21838
+ }
21839
+ function resolveEnvKeyName(provider, hasBaseUrl = false) {
21840
+ const lower = provider.toLowerCase();
21841
+ if (hasBaseUrl && lower === "azure") return "OPENAI_API_KEY";
21842
+ return ENV_KEY_MAP[lower];
21843
+ }
21844
+ function resolveEnvBaseUrlName(provider, hasBaseUrl = false) {
21845
+ const lower = provider.toLowerCase();
21846
+ if (hasBaseUrl && lower === "azure") return "OPENAI_BASE_URL";
21847
+ return ENV_BASE_URL_MAP[lower];
21848
+ }
21849
+ function extractAzureResourceName(baseUrl) {
21850
+ const urlMatch = baseUrl.match(/^https?:\/\/([^./]+)/);
21851
+ if (urlMatch) return urlMatch[1];
21852
+ return baseUrl;
21853
+ }
21854
+ function normalizeAzureSdkBaseUrl(baseUrl) {
21855
+ const trimmed = baseUrl.trim().replace(/\/+$/, "");
21856
+ if (!trimmed) {
21857
+ return trimmed;
21858
+ }
21859
+ if (!/^https?:\/\//i.test(trimmed)) {
21860
+ return `https://${trimmed}.openai.azure.com/openai/v1`;
21861
+ }
21862
+ if (/\/openai\/v1$/i.test(trimmed)) {
21863
+ return trimmed;
21864
+ }
21865
+ if (/\/openai$/i.test(trimmed)) {
21866
+ return `${trimmed}/v1`;
21867
+ }
21868
+ return `${trimmed}/openai/v1`;
21869
+ }
21467
21870
  function extractPiTextContent(content) {
21468
21871
  if (typeof content === "string") {
21469
21872
  return content;
@@ -21619,12 +22022,12 @@ var PiCliProvider = class {
21619
22022
  buildPiArgs(prompt, inputFiles) {
21620
22023
  const args = [];
21621
22024
  if (this.config.subprovider) {
21622
- args.push("--provider", this.config.subprovider);
22025
+ args.push("--provider", resolveCliProvider(this.config.subprovider));
21623
22026
  }
21624
22027
  if (this.config.model) {
21625
22028
  args.push("--model", this.config.model);
21626
22029
  }
21627
- if (this.config.apiKey) {
22030
+ if (this.config.apiKey && this.config.subprovider?.toLowerCase() !== "azure") {
21628
22031
  args.push("--api-key", this.config.apiKey);
21629
22032
  }
21630
22033
  args.push("--mode", "json");
@@ -21676,35 +22079,35 @@ ${prompt}` : prompt;
21676
22079
  }
21677
22080
  buildEnv() {
21678
22081
  const env = { ...process.env };
21679
- if (this.config.apiKey) {
21680
- const provider = this.config.subprovider?.toLowerCase() ?? "google";
21681
- const ENV_KEY_MAP = {
21682
- google: "GEMINI_API_KEY",
21683
- gemini: "GEMINI_API_KEY",
21684
- anthropic: "ANTHROPIC_API_KEY",
21685
- openai: "OPENAI_API_KEY",
21686
- groq: "GROQ_API_KEY",
21687
- xai: "XAI_API_KEY",
21688
- openrouter: "OPENROUTER_API_KEY"
21689
- };
21690
- const envKey = ENV_KEY_MAP[provider];
21691
- if (envKey) {
21692
- env[envKey] = this.config.apiKey;
22082
+ const provider = this.config.subprovider?.toLowerCase() ?? "google";
22083
+ if (provider === "azure") {
22084
+ if (this.config.apiKey) {
22085
+ env.AZURE_OPENAI_API_KEY = this.config.apiKey;
22086
+ }
22087
+ if (this.config.baseUrl) {
22088
+ env.AZURE_OPENAI_RESOURCE_NAME = extractAzureResourceName(this.config.baseUrl);
22089
+ }
22090
+ } else {
22091
+ if (this.config.apiKey) {
22092
+ const envKey = resolveEnvKeyName(provider);
22093
+ if (envKey) {
22094
+ env[envKey] = this.config.apiKey;
22095
+ }
21693
22096
  }
21694
22097
  }
21695
22098
  if (this.config.subprovider) {
21696
- const provider = this.config.subprovider.toLowerCase();
22099
+ const resolvedProvider = resolveCliProvider(this.config.subprovider);
21697
22100
  const PROVIDER_OWN_PREFIXES = {
21698
22101
  openrouter: ["OPENROUTER_"],
21699
22102
  anthropic: ["ANTHROPIC_"],
21700
22103
  openai: ["OPENAI_"],
21701
- azure: ["AZURE_OPENAI_"],
22104
+ "azure-openai-responses": ["AZURE_OPENAI_"],
21702
22105
  google: ["GEMINI_", "GOOGLE_GENERATIVE_AI_"],
21703
22106
  gemini: ["GEMINI_", "GOOGLE_GENERATIVE_AI_"],
21704
22107
  groq: ["GROQ_"],
21705
22108
  xai: ["XAI_"]
21706
22109
  };
21707
- const ownPrefixes = PROVIDER_OWN_PREFIXES[provider] ?? [];
22110
+ const ownPrefixes = PROVIDER_OWN_PREFIXES[resolvedProvider] ?? [];
21708
22111
  const allOtherPrefixes = Object.entries(PROVIDER_OWN_PREFIXES).filter(([key]) => key !== provider).flatMap(([, prefixes]) => prefixes);
21709
22112
  for (const key of Object.keys(env)) {
21710
22113
  if (allOtherPrefixes.some((prefix) => key.startsWith(prefix)) && !ownPrefixes.some((prefix) => key.startsWith(prefix))) {
@@ -21995,6 +22398,24 @@ function extractMessages(events) {
21995
22398
  }
21996
22399
  }
21997
22400
  }
22401
+ if (messages) {
22402
+ for (let i = messages.length - 1; i >= 0; i--) {
22403
+ if (messages[i].role === "assistant" && !messages[i].content) {
22404
+ for (let j = events.length - 1; j >= 0; j--) {
22405
+ const evt = events[j];
22406
+ if (!evt || evt.type !== "message_end") continue;
22407
+ const msg = evt.message;
22408
+ if (msg?.role !== "assistant") continue;
22409
+ const text2 = extractPiTextContent(msg.content);
22410
+ if (text2) {
22411
+ messages[i] = { ...messages[i], content: text2 };
22412
+ break;
22413
+ }
22414
+ }
22415
+ break;
22416
+ }
22417
+ }
22418
+ }
21998
22419
  const eventToolCalls = extractToolCallsFromEvents(events);
21999
22420
  if (eventToolCalls.length > 0) {
22000
22421
  injectEventToolCalls(messages, eventToolCalls);
@@ -22179,17 +22600,43 @@ function formatTimeoutSuffix3(timeoutMs) {
22179
22600
  if (!timeoutMs || timeoutMs <= 0) return "";
22180
22601
  return ` after ${Math.ceil(timeoutMs / 1e3)}s`;
22181
22602
  }
22603
+ function resolveWindowsCmd(executable) {
22604
+ if (process.platform !== "win32") return [executable, []];
22605
+ const lower = executable.toLowerCase();
22606
+ if (lower.endsWith(".js") || lower.endsWith(".exe")) return [executable, []];
22607
+ let fullPath;
22608
+ try {
22609
+ fullPath = execSync(`where ${executable}`, { encoding: "utf-8" }).trim().split(/\r?\n/)[0].trim();
22610
+ } catch {
22611
+ return [executable, []];
22612
+ }
22613
+ const cmdPath = fullPath.endsWith(".cmd") ? fullPath : `${fullPath}.cmd`;
22614
+ try {
22615
+ const content = readFileSync2(cmdPath, "utf-8");
22616
+ const match = content.match(/"?%_prog%"?\s+"([^"]+\.js)"/);
22617
+ if (match) {
22618
+ const dp0 = path19.dirname(path19.resolve(cmdPath));
22619
+ const scriptPath = match[1].replace(/%dp0%[/\\]?/gi, `${dp0}${path19.sep}`);
22620
+ try {
22621
+ accessSync(scriptPath);
22622
+ return ["node", [scriptPath]];
22623
+ } catch {
22624
+ }
22625
+ }
22626
+ } catch {
22627
+ }
22628
+ return [executable, []];
22629
+ }
22182
22630
  async function defaultPiRunner(options) {
22183
22631
  return await new Promise((resolve2, reject) => {
22184
22632
  const parts = options.executable.split(/\s+/);
22185
- const executable = parts[0];
22186
- const executableArgs = parts.slice(1);
22633
+ const [resolvedExe, prefixArgs] = resolveWindowsCmd(parts[0]);
22634
+ const executableArgs = [...prefixArgs, ...parts.slice(1)];
22187
22635
  const allArgs = [...executableArgs, ...options.args];
22188
- const child = spawn3(executable, allArgs, {
22636
+ const child = spawn3(resolvedExe, allArgs, {
22189
22637
  cwd: options.cwd,
22190
22638
  env: options.env,
22191
- stdio: ["pipe", "pipe", "pipe"],
22192
- shell: false
22639
+ stdio: ["pipe", "pipe", "pipe"]
22193
22640
  });
22194
22641
  let stdout = "";
22195
22642
  let stderr = "";
@@ -22242,6 +22689,30 @@ async function defaultPiRunner(options) {
22242
22689
  });
22243
22690
  });
22244
22691
  }
22692
+ var logged = false;
22693
+ function getAgentvHome() {
22694
+ const envHome = process.env.AGENTV_HOME;
22695
+ if (envHome && envHome !== "undefined") {
22696
+ if (!logged) {
22697
+ logged = true;
22698
+ console.warn(`Using AGENTV_HOME: ${envHome}`);
22699
+ }
22700
+ return envHome;
22701
+ }
22702
+ return path20.join(os2.homedir(), ".agentv");
22703
+ }
22704
+ function getWorkspacesRoot() {
22705
+ return path20.join(getAgentvHome(), "workspaces");
22706
+ }
22707
+ function getSubagentsRoot() {
22708
+ return path20.join(getAgentvHome(), "subagents");
22709
+ }
22710
+ function getTraceStateRoot() {
22711
+ return path20.join(getAgentvHome(), "trace-state");
22712
+ }
22713
+ function getWorkspacePoolRoot() {
22714
+ return path20.join(getAgentvHome(), "workspace-pool");
22715
+ }
22245
22716
  var piCodingAgentModule = null;
22246
22717
  var piAiModule = null;
22247
22718
  var loadingPromise = null;
@@ -22259,46 +22730,126 @@ async function promptInstall() {
22259
22730
  rl.close();
22260
22731
  }
22261
22732
  }
22262
- function findAgentvRoot() {
22263
- const thisFile = fileURLToPath3(import.meta.url);
22264
- let dir = path20.dirname(thisFile);
22265
- for (let i = 0; i < 10; i++) {
22733
+ function findManagedSdkInstallRoot() {
22734
+ return path21.join(getAgentvHome(), "deps", "pi-sdk");
22735
+ }
22736
+ function resolveGlobalNpmRoot() {
22737
+ try {
22738
+ const root = execSync2("npm root -g", {
22739
+ encoding: "utf-8",
22740
+ stdio: ["ignore", "pipe", "ignore"]
22741
+ }).trim();
22742
+ return root.length > 0 ? root : void 0;
22743
+ } catch {
22744
+ return void 0;
22745
+ }
22746
+ }
22747
+ function buildGlobalModuleEntry(moduleName, globalNpmRoot) {
22748
+ return path21.join(globalNpmRoot, ...moduleName.split("/"), "dist", "index.js");
22749
+ }
22750
+ function findAccessiblePath(paths) {
22751
+ for (const candidate of paths) {
22266
22752
  try {
22267
- const pkg = path20.join(dir, "package.json");
22268
- accessSync(pkg);
22269
- return dir;
22753
+ accessSync2(candidate);
22754
+ return candidate;
22270
22755
  } catch {
22271
- const parent = path20.dirname(dir);
22272
- if (parent === dir) break;
22273
- dir = parent;
22274
22756
  }
22275
22757
  }
22276
- return path20.dirname(thisFile);
22758
+ return void 0;
22277
22759
  }
22278
- async function doLoadSdkModules() {
22760
+ async function tryImportLocalSdkModules() {
22279
22761
  try {
22280
22762
  [piCodingAgentModule, piAiModule] = await Promise.all([
22281
22763
  import("@mariozechner/pi-coding-agent"),
22282
22764
  import("@mariozechner/pi-ai")
22283
22765
  ]);
22766
+ return true;
22284
22767
  } catch {
22285
- if (await promptInstall()) {
22286
- const installDir = findAgentvRoot();
22287
- console.error(`Installing @mariozechner/pi-coding-agent into ${installDir}...`);
22288
- execSync("bun add @mariozechner/pi-coding-agent", {
22289
- cwd: installDir,
22290
- stdio: "inherit"
22291
- });
22292
- [piCodingAgentModule, piAiModule] = await Promise.all([
22293
- import("@mariozechner/pi-coding-agent"),
22294
- import("@mariozechner/pi-ai")
22295
- ]);
22296
- } else {
22297
- throw new Error(
22298
- "pi-coding-agent SDK is not installed. Install it with:\n bun add @mariozechner/pi-coding-agent"
22299
- );
22768
+ return false;
22769
+ }
22770
+ }
22771
+ async function tryImportManagedSdkModules() {
22772
+ const managedRoot = findManagedSdkInstallRoot();
22773
+ const piCodingAgentEntry = findAccessiblePath([
22774
+ path21.join(managedRoot, "node_modules", "@mariozechner", "pi-coding-agent", "dist", "index.js")
22775
+ ]);
22776
+ const piAiEntry = findAccessiblePath([
22777
+ path21.join(managedRoot, "node_modules", "@mariozechner", "pi-ai", "dist", "index.js"),
22778
+ path21.join(
22779
+ managedRoot,
22780
+ "node_modules",
22781
+ "@mariozechner",
22782
+ "pi-coding-agent",
22783
+ "node_modules",
22784
+ "@mariozechner",
22785
+ "pi-ai",
22786
+ "dist",
22787
+ "index.js"
22788
+ )
22789
+ ]);
22790
+ if (!piCodingAgentEntry || !piAiEntry) return false;
22791
+ try {
22792
+ [piCodingAgentModule, piAiModule] = await Promise.all([
22793
+ import(pathToFileURL(piCodingAgentEntry).href),
22794
+ import(pathToFileURL(piAiEntry).href)
22795
+ ]);
22796
+ return true;
22797
+ } catch {
22798
+ return false;
22799
+ }
22800
+ }
22801
+ async function tryImportGlobalSdkModules() {
22802
+ const globalNpmRoot = resolveGlobalNpmRoot();
22803
+ if (!globalNpmRoot) return false;
22804
+ const piCodingAgentEntry = findAccessiblePath([
22805
+ buildGlobalModuleEntry("@mariozechner/pi-coding-agent", globalNpmRoot)
22806
+ ]);
22807
+ const piAiEntry = findAccessiblePath([
22808
+ buildGlobalModuleEntry("@mariozechner/pi-ai", globalNpmRoot),
22809
+ path21.join(
22810
+ globalNpmRoot,
22811
+ "@mariozechner",
22812
+ "pi-coding-agent",
22813
+ "node_modules",
22814
+ "@mariozechner",
22815
+ "pi-ai",
22816
+ "dist",
22817
+ "index.js"
22818
+ )
22819
+ ]);
22820
+ if (!piCodingAgentEntry || !piAiEntry) return false;
22821
+ try {
22822
+ [piCodingAgentModule, piAiModule] = await Promise.all([
22823
+ import(pathToFileURL(piCodingAgentEntry).href),
22824
+ import(pathToFileURL(piAiEntry).href)
22825
+ ]);
22826
+ return true;
22827
+ } catch {
22828
+ return false;
22829
+ }
22830
+ }
22831
+ function installSdkModules(installDir) {
22832
+ console.error(`Installing @mariozechner/pi-coding-agent into ${installDir} via npm...`);
22833
+ mkdirSync(installDir, { recursive: true });
22834
+ execSync2("npm install --no-save --no-package-lock @mariozechner/pi-coding-agent", {
22835
+ cwd: installDir,
22836
+ stdio: "inherit"
22837
+ });
22838
+ }
22839
+ async function doLoadSdkModules() {
22840
+ if (await tryImportLocalSdkModules() || await tryImportManagedSdkModules() || await tryImportGlobalSdkModules()) {
22841
+ return;
22842
+ }
22843
+ if (await promptInstall()) {
22844
+ const installDir = findManagedSdkInstallRoot();
22845
+ installSdkModules(installDir);
22846
+ if (await tryImportManagedSdkModules()) {
22847
+ return;
22300
22848
  }
22301
22849
  }
22850
+ throw new Error(
22851
+ "pi-coding-agent SDK is not installed. Install it with:\n npm install @mariozechner/pi-coding-agent"
22852
+ );
22302
22853
  }
22303
22854
  async function loadSdkModules() {
22304
22855
  if (!piCodingAgentModule || !piAiModule) {
@@ -22326,7 +22877,9 @@ async function loadSdkModules() {
22326
22877
  codingTools: piSdk.codingTools,
22327
22878
  toolMap,
22328
22879
  SessionManager: piSdk.SessionManager,
22329
- getModel: piAi.getModel
22880
+ getModel: piAi.getModel,
22881
+ // biome-ignore lint/suspicious/noExplicitAny: registerBuiltInApiProviders exists at runtime but not in type defs
22882
+ registerBuiltInApiProviders: piAi.registerBuiltInApiProviders
22330
22883
  };
22331
22884
  }
22332
22885
  var PiCodingAgentProvider = class {
@@ -22348,17 +22901,35 @@ var PiCodingAgentProvider = class {
22348
22901
  const startTime = (/* @__PURE__ */ new Date()).toISOString();
22349
22902
  const startMs = Date.now();
22350
22903
  const sdk = await loadSdkModules();
22904
+ sdk.registerBuiltInApiProviders();
22351
22905
  const logger = await this.createStreamLogger(request).catch(() => void 0);
22352
22906
  try {
22353
22907
  const cwd = this.resolveCwd(request.cwd);
22354
- const providerName = this.config.subprovider ?? "google";
22908
+ const rawProvider = this.config.subprovider ?? "google";
22909
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(rawProvider, this.config.baseUrl);
22910
+ const hasBaseUrl = !!normalizedBaseUrl;
22911
+ const providerName = resolveSubprovider(rawProvider, hasBaseUrl);
22355
22912
  const modelId = this.config.model ?? "gemini-2.5-flash";
22356
- this.setApiKeyEnv(providerName);
22357
- const model = sdk.getModel(providerName, modelId);
22913
+ this.setApiKeyEnv(rawProvider, hasBaseUrl);
22914
+ this.setBaseUrlEnv(rawProvider, normalizedBaseUrl, hasBaseUrl);
22915
+ let model = sdk.getModel(providerName, modelId);
22916
+ if (model && normalizedBaseUrl) {
22917
+ model = { ...model, baseUrl: normalizedBaseUrl };
22918
+ }
22358
22919
  if (!model) {
22359
- throw new Error(
22360
- `pi-coding-agent: getModel('${providerName}', '${modelId}') returned undefined. The model '${modelId}' is not registered for provider '${providerName}' in pi-ai. Check that subprovider and model are correct in your target config.`
22361
- );
22920
+ const envProvider = providerName.replace(/-responses$/, "");
22921
+ model = {
22922
+ id: modelId,
22923
+ name: modelId,
22924
+ api: providerName,
22925
+ provider: envProvider,
22926
+ baseUrl: normalizedBaseUrl ?? "",
22927
+ reasoning: false,
22928
+ input: ["text"],
22929
+ cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
22930
+ contextWindow: 128e3,
22931
+ maxTokens: 16384
22932
+ };
22362
22933
  }
22363
22934
  const tools = this.resolveTools(sdk);
22364
22935
  const { session } = await sdk.createAgentSession({
@@ -22511,28 +23082,35 @@ ${fileList}`;
22511
23082
  }
22512
23083
  }
22513
23084
  /** Maps config apiKey to the provider-specific env var the SDK reads. */
22514
- setApiKeyEnv(providerName) {
23085
+ setApiKeyEnv(providerName, hasBaseUrl = false) {
22515
23086
  if (!this.config.apiKey) return;
22516
- const ENV_KEY_MAP = {
22517
- google: "GEMINI_API_KEY",
22518
- gemini: "GEMINI_API_KEY",
22519
- anthropic: "ANTHROPIC_API_KEY",
22520
- openai: "OPENAI_API_KEY",
22521
- groq: "GROQ_API_KEY",
22522
- xai: "XAI_API_KEY",
22523
- openrouter: "OPENROUTER_API_KEY"
22524
- };
22525
- const envKey = ENV_KEY_MAP[providerName.toLowerCase()];
23087
+ const envKey = resolveEnvKeyName(providerName, hasBaseUrl);
22526
23088
  if (envKey) {
22527
23089
  process.env[envKey] = this.config.apiKey;
22528
23090
  }
22529
23091
  }
23092
+ /** Maps config baseUrl to the provider-specific env var the SDK reads. */
23093
+ setBaseUrlEnv(providerName, baseUrl = this.config.baseUrl, hasBaseUrl = false) {
23094
+ const normalizedBaseUrl = this.normalizeSdkBaseUrl(providerName, baseUrl);
23095
+ if (!normalizedBaseUrl) return;
23096
+ const envKey = resolveEnvBaseUrlName(providerName, hasBaseUrl);
23097
+ if (envKey) {
23098
+ process.env[envKey] = normalizedBaseUrl;
23099
+ }
23100
+ }
23101
+ normalizeSdkBaseUrl(providerName, baseUrl) {
23102
+ if (!baseUrl) return void 0;
23103
+ if (providerName.toLowerCase() === "azure") {
23104
+ return normalizeAzureSdkBaseUrl(baseUrl);
23105
+ }
23106
+ return baseUrl;
23107
+ }
22530
23108
  resolveCwd(cwdOverride) {
22531
23109
  if (cwdOverride) {
22532
- return path20.resolve(cwdOverride);
23110
+ return path21.resolve(cwdOverride);
22533
23111
  }
22534
23112
  if (this.config.cwd) {
22535
- return path20.resolve(this.config.cwd);
23113
+ return path21.resolve(this.config.cwd);
22536
23114
  }
22537
23115
  return process.cwd();
22538
23116
  }
@@ -22551,9 +23129,9 @@ ${fileList}`;
22551
23129
  }
22552
23130
  resolveLogDirectory() {
22553
23131
  if (this.config.logDir) {
22554
- return path20.resolve(this.config.logDir);
23132
+ return path21.resolve(this.config.logDir);
22555
23133
  }
22556
- return path20.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
23134
+ return path21.join(process.cwd(), ".agentv", "logs", "pi-coding-agent");
22557
23135
  }
22558
23136
  async createStreamLogger(request) {
22559
23137
  const logDir = this.resolveLogDirectory();
@@ -22567,7 +23145,7 @@ ${fileList}`;
22567
23145
  console.warn(`Skipping Pi stream logging (could not create ${logDir}): ${message}`);
22568
23146
  return void 0;
22569
23147
  }
22570
- const filePath = path20.join(logDir, buildLogFilename6(request, this.targetName));
23148
+ const filePath = path21.join(logDir, buildLogFilename6(request, this.targetName));
22571
23149
  try {
22572
23150
  const logger = await PiStreamLogger2.create({
22573
23151
  filePath,
@@ -22791,7 +23369,7 @@ async function readDirEntries(target) {
22791
23369
  const entries = await readdir2(target, { withFileTypes: true });
22792
23370
  return entries.map((entry) => ({
22793
23371
  name: entry.name,
22794
- absolutePath: path21.join(target, entry.name),
23372
+ absolutePath: path222.join(target, entry.name),
22795
23373
  isDirectory: entry.isDirectory()
22796
23374
  }));
22797
23375
  }
@@ -22805,7 +23383,7 @@ async function removeIfExists(target) {
22805
23383
  }
22806
23384
  }
22807
23385
  function pathToFileUri2(filePath) {
22808
- const absolutePath = path222.isAbsolute(filePath) ? filePath : path222.resolve(filePath);
23386
+ const absolutePath = path23.isAbsolute(filePath) ? filePath : path23.resolve(filePath);
22809
23387
  const normalizedPath = absolutePath.replace(/\\/g, "/");
22810
23388
  if (/^[a-zA-Z]:\//.test(normalizedPath)) {
22811
23389
  return `file:///${normalizedPath}`;
@@ -22897,8 +23475,8 @@ function createBatchRequestPrompt(userQuery, responseFileTmp, responseFileFinal,
22897
23475
  });
22898
23476
  }
22899
23477
  function createBatchOrchestratorPrompt(requestFiles, responseFiles, templateContent) {
22900
- const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path23.basename(file)}`).join("\n");
22901
- const responseList = responseFiles.map((file) => `"${path23.basename(file)}"`).join(", ");
23478
+ const requestLines = requestFiles.map((file, index) => `${index + 1}. messages/${path24.basename(file)}`).join("\n");
23479
+ const responseList = responseFiles.map((file) => `"${path24.basename(file)}"`).join(", ");
22902
23480
  return renderTemplate2(templateContent, {
22903
23481
  requestFiles: requestLines,
22904
23482
  responseList
@@ -22958,7 +23536,7 @@ async function waitForResponseOutput(responseFileFinal, pollInterval = 1e3, sile
22958
23536
  }
22959
23537
  async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, silent = false, timeoutMs = DEFAULT_TIMEOUT_MS) {
22960
23538
  if (!silent) {
22961
- const fileList = responseFilesFinal.map((file) => path24.basename(file)).join(", ");
23539
+ const fileList = responseFilesFinal.map((file) => path25.basename(file)).join(", ");
22962
23540
  console.error(`waiting for ${responseFilesFinal.length} batch response(s): ${fileList}`);
22963
23541
  }
22964
23542
  const deadline = Date.now() + timeoutMs;
@@ -22967,7 +23545,7 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
22967
23545
  while (pending.size > 0) {
22968
23546
  if (Date.now() >= deadline) {
22969
23547
  if (!silent) {
22970
- const remaining = [...pending].map((f) => path24.basename(f)).join(", ");
23548
+ const remaining = [...pending].map((f) => path25.basename(f)).join(", ");
22971
23549
  console.error(
22972
23550
  `error: timed out after ${Math.round(timeoutMs / 1e3)}s waiting for batch responses. Still pending: ${remaining}`
22973
23551
  );
@@ -23014,30 +23592,6 @@ async function waitForBatchResponses(responseFilesFinal, pollInterval = 1e3, sil
23014
23592
  }
23015
23593
  return true;
23016
23594
  }
23017
- var logged = false;
23018
- function getAgentvHome() {
23019
- const envHome = process.env.AGENTV_HOME;
23020
- if (envHome && envHome !== "undefined") {
23021
- if (!logged) {
23022
- logged = true;
23023
- console.warn(`Using AGENTV_HOME: ${envHome}`);
23024
- }
23025
- return envHome;
23026
- }
23027
- return path25.join(os2.homedir(), ".agentv");
23028
- }
23029
- function getWorkspacesRoot() {
23030
- return path25.join(getAgentvHome(), "workspaces");
23031
- }
23032
- function getSubagentsRoot() {
23033
- return path25.join(getAgentvHome(), "subagents");
23034
- }
23035
- function getTraceStateRoot() {
23036
- return path25.join(getAgentvHome(), "trace-state");
23037
- }
23038
- function getWorkspacePoolRoot() {
23039
- return path25.join(getAgentvHome(), "workspace-pool");
23040
- }
23041
23595
  var DEFAULT_LOCK_NAME = "subagent.lock";
23042
23596
  var DEFAULT_ALIVE_FILENAME = ".alive";
23043
23597
  function getDefaultSubagentRoot(vscodeCmd = "code") {
@@ -24258,9 +24812,10 @@ function resolveAndCreateProvider(definition, env = process.env) {
24258
24812
  const resolved = resolveTargetDefinition(definition, env);
24259
24813
  return createProvider(resolved);
24260
24814
  }
24261
- var PASS_THRESHOLD = 0.8;
24262
- function scoreToVerdict(score) {
24263
- return score >= PASS_THRESHOLD ? "pass" : "fail";
24815
+ var DEFAULT_THRESHOLD = 0.8;
24816
+ var PASS_THRESHOLD = DEFAULT_THRESHOLD;
24817
+ function scoreToVerdict(score, threshold = DEFAULT_THRESHOLD) {
24818
+ return score >= threshold ? "pass" : "fail";
24264
24819
  }
24265
24820
  function clampScore(value) {
24266
24821
  if (Number.isNaN(value) || !Number.isFinite(value)) {
@@ -24442,13 +24997,13 @@ async function execFileWithStdinNode(argv, stdinPayload, options) {
24442
24997
  async function execShellWithStdin(command, stdinPayload, options = {}) {
24443
24998
  const { mkdir: mkdir16, readFile: readFile15, rm: rm6, writeFile: writeFile9 } = await import("node:fs/promises");
24444
24999
  const { tmpdir: tmpdir3 } = await import("node:os");
24445
- const path49 = await import("node:path");
25000
+ const path50 = await import("node:path");
24446
25001
  const { randomUUID: randomUUID10 } = await import("node:crypto");
24447
- const dir = path49.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
25002
+ const dir = path50.join(tmpdir3(), `agentv-exec-${randomUUID10()}`);
24448
25003
  await mkdir16(dir, { recursive: true });
24449
- const stdinPath = path49.join(dir, "stdin.txt");
24450
- const stdoutPath = path49.join(dir, "stdout.txt");
24451
- const stderrPath = path49.join(dir, "stderr.txt");
25004
+ const stdinPath = path50.join(dir, "stdin.txt");
25005
+ const stdoutPath = path50.join(dir, "stdout.txt");
25006
+ const stderrPath = path50.join(dir, "stderr.txt");
24452
25007
  await writeFile9(stdinPath, stdinPayload, "utf8");
24453
25008
  const wrappedCommand = process.platform === "win32" ? `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}` : `(${command}) < ${shellEscapePath(stdinPath)} > ${shellEscapePath(stdoutPath)} 2> ${shellEscapePath(stderrPath)}`;
24454
25009
  const { spawn: spawn5 } = await import("node:child_process");
@@ -25629,7 +26184,7 @@ ${outputSchema2}`;
25629
26184
  parts.push("[[ ## scoring_criteria ## ]]");
25630
26185
  for (const rubric of rubrics) {
25631
26186
  const weightLabel = rubric.weight !== 1 ? ` (weight: ${rubric.weight})` : "";
25632
- const minScoreLabel = rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
26187
+ const minScoreLabel = rubric.min_score !== void 0 ? ` [REQUIRED: min score ${rubric.min_score}]` : rubric.required_min_score !== void 0 ? ` [REQUIRED: min score ${rubric.required_min_score}]` : "";
25633
26188
  parts.push("", `### Criterion: ${rubric.id}${weightLabel}${minScoreLabel}`);
25634
26189
  if (rubric.outcome) {
25635
26190
  parts.push(`Description: ${rubric.outcome}`);
@@ -25683,54 +26238,106 @@ ${outputSchema2}`;
25683
26238
  async runWithRetry(options) {
25684
26239
  const { context: context2, graderProvider, systemPrompt, userPrompt, schema, images } = options;
25685
26240
  let lastError;
26241
+ let lastInvalidResponse;
26242
+ let shouldAttemptStructureFix = false;
25686
26243
  for (let attempt = 1; attempt <= 3; attempt++) {
25687
26244
  try {
25688
- const model = graderProvider.asLanguageModel?.();
25689
- if (model) {
25690
- const modelOptions = {
25691
- ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
25692
- ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
25693
- };
25694
- const hasImages = images && images.length > 0;
25695
- const result = hasImages ? await generateText({
25696
- model,
25697
- system: systemPrompt,
25698
- messages: [
25699
- {
25700
- role: "user",
25701
- content: [
25702
- { type: "text", text: userPrompt },
25703
- ...toAiSdkImageParts(images)
25704
- ]
25705
- }
25706
- ],
25707
- ...modelOptions
25708
- }) : await generateText({
25709
- model,
25710
- system: systemPrompt,
25711
- prompt: userPrompt,
25712
- ...modelOptions
25713
- });
25714
- const data2 = schema.parse(parseJsonFromText(result.text));
25715
- const rawUsage = result.usage;
25716
- const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
25717
- return { data: data2, tokenUsage };
26245
+ const result = await this.generateStructuredResponse({
26246
+ context: context2,
26247
+ graderProvider,
26248
+ systemPrompt,
26249
+ userPrompt,
26250
+ images
26251
+ });
26252
+ const canRepairResponse = result.text.trim().length > 0;
26253
+ lastInvalidResponse = canRepairResponse ? result : void 0;
26254
+ let data;
26255
+ try {
26256
+ data = schema.parse(parseJsonFromText(result.text));
26257
+ } catch (e) {
26258
+ lastError = e instanceof Error ? e : new Error(String(e));
26259
+ shouldAttemptStructureFix = canRepairResponse;
26260
+ continue;
25718
26261
  }
25719
- const response = await graderProvider.invoke({
25720
- question: userPrompt,
26262
+ return {
26263
+ data,
26264
+ providerResponse: result.providerResponse,
26265
+ tokenUsage: result.tokenUsage
26266
+ };
26267
+ } catch (e) {
26268
+ lastError = e instanceof Error ? e : new Error(String(e));
26269
+ }
26270
+ }
26271
+ if (shouldAttemptStructureFix && lastInvalidResponse) {
26272
+ try {
26273
+ const repaired = await this.generateStructuredResponse({
26274
+ context: context2,
26275
+ graderProvider,
25721
26276
  systemPrompt,
25722
- evalCaseId: context2.evalCase.id,
25723
- attempt: context2.attempt,
25724
- maxOutputTokens: this.maxOutputTokens,
25725
- temperature: this.temperature
26277
+ userPrompt: buildStructureRepairPrompt({
26278
+ validationError: lastError?.message ?? "Schema validation failed",
26279
+ invalidResponse: lastInvalidResponse.text
26280
+ })
25726
26281
  });
25727
- const data = schema.parse(parseJsonFromText(extractLastAssistantContent(response.output)));
25728
- return { data, providerResponse: response, tokenUsage: response.tokenUsage };
26282
+ const data = schema.parse(parseJsonFromText(repaired.text));
26283
+ return {
26284
+ data,
26285
+ providerResponse: repaired.providerResponse,
26286
+ tokenUsage: sumTokenUsage(lastInvalidResponse.tokenUsage, repaired.tokenUsage)
26287
+ };
25729
26288
  } catch (e) {
25730
26289
  lastError = e instanceof Error ? e : new Error(String(e));
25731
26290
  }
25732
26291
  }
25733
- throw new Error(`Failed to parse evaluator response after 3 attempts: ${lastError?.message}`);
26292
+ throw new Error(
26293
+ `Failed to parse evaluator response after 3 attempts and 1 structure-fix attempt: ${lastError?.message}`
26294
+ );
26295
+ }
26296
+ async generateStructuredResponse(options) {
26297
+ const { context: context2, graderProvider, systemPrompt, userPrompt, images } = options;
26298
+ const model = graderProvider.asLanguageModel?.();
26299
+ if (model) {
26300
+ const modelOptions = {
26301
+ ...this.maxOutputTokens ? { maxTokens: this.maxOutputTokens } : {},
26302
+ ...typeof this.temperature === "number" ? { temperature: this.temperature } : {}
26303
+ };
26304
+ const hasImages = images && images.length > 0;
26305
+ const result = hasImages ? await generateText({
26306
+ model,
26307
+ system: systemPrompt,
26308
+ messages: [
26309
+ {
26310
+ role: "user",
26311
+ content: [
26312
+ { type: "text", text: userPrompt },
26313
+ ...toAiSdkImageParts(images)
26314
+ ]
26315
+ }
26316
+ ],
26317
+ ...modelOptions
26318
+ }) : await generateText({
26319
+ model,
26320
+ system: systemPrompt,
26321
+ prompt: userPrompt,
26322
+ ...modelOptions
26323
+ });
26324
+ const rawUsage = result.usage;
26325
+ const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } : void 0;
26326
+ return { text: result.text, tokenUsage };
26327
+ }
26328
+ const response = await graderProvider.invoke({
26329
+ question: userPrompt,
26330
+ systemPrompt,
26331
+ evalCaseId: context2.evalCase.id,
26332
+ attempt: context2.attempt,
26333
+ maxOutputTokens: this.maxOutputTokens,
26334
+ temperature: this.temperature
26335
+ });
26336
+ return {
26337
+ text: extractLastAssistantContent(response.output),
26338
+ providerResponse: response,
26339
+ tokenUsage: response.tokenUsage
26340
+ };
25734
26341
  }
25735
26342
  };
25736
26343
  function buildOutputSchema() {
@@ -25750,6 +26357,29 @@ function buildOutputSchema() {
25750
26357
  "}"
25751
26358
  ].join("\n");
25752
26359
  }
26360
+ function buildStructureRepairPrompt(options) {
26361
+ const { validationError, invalidResponse } = options;
26362
+ return [
26363
+ "The following evaluation response has useful grading content but invalid JSON structure.",
26364
+ "Repair it to satisfy the schema in the system prompt.",
26365
+ "Preserve the evaluation meaning, do not re-grade the answer, and return only a single JSON object.",
26366
+ "",
26367
+ "Validation error:",
26368
+ validationError,
26369
+ "",
26370
+ "Invalid response:",
26371
+ invalidResponse
26372
+ ].join("\n");
26373
+ }
26374
+ function sumTokenUsage(first, second) {
26375
+ if (!first && !second) {
26376
+ return void 0;
26377
+ }
26378
+ return {
26379
+ input: (first?.input ?? 0) + (second?.input ?? 0),
26380
+ output: (first?.output ?? 0) + (second?.output ?? 0)
26381
+ };
26382
+ }
25753
26383
  function buildRubricOutputSchema() {
25754
26384
  return `You are an expert evaluator. Evaluate the candidate answer against each rubric item.
25755
26385
  You must return a valid JSON object matching this schema:
@@ -25849,19 +26479,21 @@ function calculateScoreRangeResult(result, rubrics) {
25849
26479
  rawScores[rubric.id] = rawScore;
25850
26480
  totalWeight += rubric.weight;
25851
26481
  weightedScoreSum += normalizedScore * rubric.weight;
25852
- let requiredMinScore;
25853
- if (rubric.required_min_score !== void 0) {
25854
- requiredMinScore = rubric.required_min_score;
26482
+ let minScoreThreshold;
26483
+ if (rubric.min_score !== void 0) {
26484
+ minScoreThreshold = rubric.min_score;
26485
+ } else if (rubric.required_min_score !== void 0) {
26486
+ minScoreThreshold = rubric.required_min_score / 10;
25855
26487
  } else if (rubric.required === true) {
25856
- requiredMinScore = 10;
26488
+ minScoreThreshold = 1;
25857
26489
  }
25858
26490
  const matchingRange = rubric.score_ranges?.find(
25859
26491
  (r) => rawScore >= r.score_range[0] && rawScore <= r.score_range[1]
25860
26492
  );
25861
26493
  const rangeDescription = matchingRange?.outcome ?? "";
25862
26494
  const criterionLabel = rubric.outcome ?? rubric.id;
25863
- const passed = !(requiredMinScore !== void 0 && rawScore < requiredMinScore) && rawScore >= 7;
25864
- if (requiredMinScore !== void 0 && rawScore < requiredMinScore) {
26495
+ const passed = !(minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) && rawScore >= 7;
26496
+ if (minScoreThreshold !== void 0 && normalizedScore < minScoreThreshold) {
25865
26497
  failedRequired = true;
25866
26498
  }
25867
26499
  assertions.push({
@@ -25938,11 +26570,11 @@ function createFilesystemTools(workspacePath) {
25938
26570
  execute: async (input) => {
25939
26571
  try {
25940
26572
  const resolved = resolveSandboxed(workspacePath, input.path);
25941
- const stat10 = await fs2.stat(resolved);
25942
- if (stat10.isDirectory()) {
26573
+ const stat11 = await fs2.stat(resolved);
26574
+ if (stat11.isDirectory()) {
25943
26575
  return { error: `'${input.path}' is a directory, not a file` };
25944
26576
  }
25945
- const buffer = Buffer.alloc(Math.min(stat10.size, MAX_FILE_SIZE));
26577
+ const buffer = Buffer.alloc(Math.min(stat11.size, MAX_FILE_SIZE));
25946
26578
  const fd = await fs2.open(resolved, "r");
25947
26579
  try {
25948
26580
  await fd.read(buffer, 0, buffer.length, 0);
@@ -25950,8 +26582,8 @@ function createFilesystemTools(workspacePath) {
25950
26582
  await fd.close();
25951
26583
  }
25952
26584
  const content = buffer.toString("utf-8");
25953
- const truncated = stat10.size > MAX_FILE_SIZE;
25954
- return { content, truncated, size: stat10.size };
26585
+ const truncated = stat11.size > MAX_FILE_SIZE;
26586
+ return { content, truncated, size: stat11.size };
25955
26587
  } catch (error) {
25956
26588
  return { error: error instanceof Error ? error.message : String(error) };
25957
26589
  }
@@ -26002,8 +26634,8 @@ async function searchDirectory(dirPath, workspacePath, regex, matches) {
26002
26634
  const ext = path35.extname(entry.name).toLowerCase();
26003
26635
  if (BINARY_EXTENSIONS.has(ext)) continue;
26004
26636
  try {
26005
- const stat10 = await fs2.stat(fullPath);
26006
- if (stat10.size > MAX_FILE_SIZE) continue;
26637
+ const stat11 = await fs2.stat(fullPath);
26638
+ if (stat11.size > MAX_FILE_SIZE) continue;
26007
26639
  const content = await fs2.readFile(fullPath, "utf-8");
26008
26640
  const lines = content.split("\n");
26009
26641
  for (let i = 0; i < lines.length; i++) {
@@ -26636,115 +27268,115 @@ var FieldAccuracyEvaluator = class {
26636
27268
  * Evaluate a single field against the expected value.
26637
27269
  */
26638
27270
  evaluateField(fieldConfig, candidateData, expectedData) {
26639
- const { path: path49, match, required = true, weight = 1 } = fieldConfig;
26640
- const candidateValue = resolvePath(candidateData, path49);
26641
- const expectedValue = resolvePath(expectedData, path49);
27271
+ const { path: path50, match, required = true, weight = 1 } = fieldConfig;
27272
+ const candidateValue = resolvePath(candidateData, path50);
27273
+ const expectedValue = resolvePath(expectedData, path50);
26642
27274
  if (expectedValue === void 0) {
26643
27275
  return {
26644
- path: path49,
27276
+ path: path50,
26645
27277
  score: 1,
26646
27278
  // No expected value means no comparison needed
26647
27279
  weight,
26648
27280
  hit: true,
26649
- message: `${path49}: no expected value`
27281
+ message: `${path50}: no expected value`
26650
27282
  };
26651
27283
  }
26652
27284
  if (candidateValue === void 0) {
26653
27285
  if (required) {
26654
27286
  return {
26655
- path: path49,
27287
+ path: path50,
26656
27288
  score: 0,
26657
27289
  weight,
26658
27290
  hit: false,
26659
- message: `${path49} (required, missing)`
27291
+ message: `${path50} (required, missing)`
26660
27292
  };
26661
27293
  }
26662
27294
  return {
26663
- path: path49,
27295
+ path: path50,
26664
27296
  score: 1,
26665
27297
  // Don't penalize missing optional fields
26666
27298
  weight: 0,
26667
27299
  // Zero weight means it won't affect the score
26668
27300
  hit: true,
26669
- message: `${path49}: optional field missing`
27301
+ message: `${path50}: optional field missing`
26670
27302
  };
26671
27303
  }
26672
27304
  switch (match) {
26673
27305
  case "exact":
26674
- return this.compareExact(path49, candidateValue, expectedValue, weight);
27306
+ return this.compareExact(path50, candidateValue, expectedValue, weight);
26675
27307
  case "numeric_tolerance":
26676
27308
  return this.compareNumericTolerance(
26677
- path49,
27309
+ path50,
26678
27310
  candidateValue,
26679
27311
  expectedValue,
26680
27312
  fieldConfig,
26681
27313
  weight
26682
27314
  );
26683
27315
  case "date":
26684
- return this.compareDate(path49, candidateValue, expectedValue, fieldConfig, weight);
27316
+ return this.compareDate(path50, candidateValue, expectedValue, fieldConfig, weight);
26685
27317
  default:
26686
27318
  return {
26687
- path: path49,
27319
+ path: path50,
26688
27320
  score: 0,
26689
27321
  weight,
26690
27322
  hit: false,
26691
- message: `${path49}: unknown match type "${match}"`
27323
+ message: `${path50}: unknown match type "${match}"`
26692
27324
  };
26693
27325
  }
26694
27326
  }
26695
27327
  /**
26696
27328
  * Exact equality comparison.
26697
27329
  */
26698
- compareExact(path49, candidateValue, expectedValue, weight) {
27330
+ compareExact(path50, candidateValue, expectedValue, weight) {
26699
27331
  if (deepEqual(candidateValue, expectedValue)) {
26700
27332
  return {
26701
- path: path49,
27333
+ path: path50,
26702
27334
  score: 1,
26703
27335
  weight,
26704
27336
  hit: true,
26705
- message: path49
27337
+ message: path50
26706
27338
  };
26707
27339
  }
26708
27340
  if (typeof candidateValue !== typeof expectedValue) {
26709
27341
  return {
26710
- path: path49,
27342
+ path: path50,
26711
27343
  score: 0,
26712
27344
  weight,
26713
27345
  hit: false,
26714
- message: `${path49} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
27346
+ message: `${path50} (type mismatch: got ${typeof candidateValue}, expected ${typeof expectedValue})`
26715
27347
  };
26716
27348
  }
26717
27349
  return {
26718
- path: path49,
27350
+ path: path50,
26719
27351
  score: 0,
26720
27352
  weight,
26721
27353
  hit: false,
26722
- message: `${path49} (value mismatch)`
27354
+ message: `${path50} (value mismatch)`
26723
27355
  };
26724
27356
  }
26725
27357
  /**
26726
27358
  * Numeric comparison with absolute or relative tolerance.
26727
27359
  */
26728
- compareNumericTolerance(path49, candidateValue, expectedValue, fieldConfig, weight) {
27360
+ compareNumericTolerance(path50, candidateValue, expectedValue, fieldConfig, weight) {
26729
27361
  const { tolerance = 0, relative = false } = fieldConfig;
26730
27362
  const candidateNum = toNumber(candidateValue);
26731
27363
  const expectedNum = toNumber(expectedValue);
26732
27364
  if (candidateNum === null || expectedNum === null) {
26733
27365
  return {
26734
- path: path49,
27366
+ path: path50,
26735
27367
  score: 0,
26736
27368
  weight,
26737
27369
  hit: false,
26738
- message: `${path49} (non-numeric value)`
27370
+ message: `${path50} (non-numeric value)`
26739
27371
  };
26740
27372
  }
26741
27373
  if (!Number.isFinite(candidateNum) || !Number.isFinite(expectedNum)) {
26742
27374
  return {
26743
- path: path49,
27375
+ path: path50,
26744
27376
  score: 0,
26745
27377
  weight,
26746
27378
  hit: false,
26747
- message: `${path49} (invalid numeric value)`
27379
+ message: `${path50} (invalid numeric value)`
26748
27380
  };
26749
27381
  }
26750
27382
  const diff = Math.abs(candidateNum - expectedNum);
@@ -26757,61 +27389,61 @@ var FieldAccuracyEvaluator = class {
26757
27389
  }
26758
27390
  if (withinTolerance) {
26759
27391
  return {
26760
- path: path49,
27392
+ path: path50,
26761
27393
  score: 1,
26762
27394
  weight,
26763
27395
  hit: true,
26764
- message: `${path49} (within tolerance: diff=${diff.toFixed(2)})`
27396
+ message: `${path50} (within tolerance: diff=${diff.toFixed(2)})`
26765
27397
  };
26766
27398
  }
26767
27399
  return {
26768
- path: path49,
27400
+ path: path50,
26769
27401
  score: 0,
26770
27402
  weight,
26771
27403
  hit: false,
26772
- message: `${path49} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
27404
+ message: `${path50} (outside tolerance: diff=${diff.toFixed(2)}, tolerance=${tolerance})`
26773
27405
  };
26774
27406
  }
26775
27407
  /**
26776
27408
  * Date comparison with format normalization.
26777
27409
  */
26778
- compareDate(path49, candidateValue, expectedValue, fieldConfig, weight) {
27410
+ compareDate(path50, candidateValue, expectedValue, fieldConfig, weight) {
26779
27411
  const formats = fieldConfig.formats ?? DEFAULT_DATE_FORMATS;
26780
27412
  const candidateDate = parseDate(String(candidateValue), formats);
26781
27413
  const expectedDate = parseDate(String(expectedValue), formats);
26782
27414
  if (candidateDate === null) {
26783
27415
  return {
26784
- path: path49,
27416
+ path: path50,
26785
27417
  score: 0,
26786
27418
  weight,
26787
27419
  hit: false,
26788
- message: `${path49} (unparseable candidate date)`
27420
+ message: `${path50} (unparseable candidate date)`
26789
27421
  };
26790
27422
  }
26791
27423
  if (expectedDate === null) {
26792
27424
  return {
26793
- path: path49,
27425
+ path: path50,
26794
27426
  score: 0,
26795
27427
  weight,
26796
27428
  hit: false,
26797
- message: `${path49} (unparseable expected date)`
27429
+ message: `${path50} (unparseable expected date)`
26798
27430
  };
26799
27431
  }
26800
27432
  if (candidateDate.getFullYear() === expectedDate.getFullYear() && candidateDate.getMonth() === expectedDate.getMonth() && candidateDate.getDate() === expectedDate.getDate()) {
26801
27433
  return {
26802
- path: path49,
27434
+ path: path50,
26803
27435
  score: 1,
26804
27436
  weight,
26805
27437
  hit: true,
26806
- message: path49
27438
+ message: path50
26807
27439
  };
26808
27440
  }
26809
27441
  return {
26810
- path: path49,
27442
+ path: path50,
26811
27443
  score: 0,
26812
27444
  weight,
26813
27445
  hit: false,
26814
- message: `${path49} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
27446
+ message: `${path50} (date mismatch: got ${formatDateISO(candidateDate)}, expected ${formatDateISO(expectedDate)})`
26815
27447
  };
26816
27448
  }
26817
27449
  /**
@@ -26844,11 +27476,11 @@ var FieldAccuracyEvaluator = class {
26844
27476
  };
26845
27477
  }
26846
27478
  };
26847
- function resolvePath(obj, path49) {
26848
- if (!path49 || !obj) {
27479
+ function resolvePath(obj, path50) {
27480
+ if (!path50 || !obj) {
26849
27481
  return void 0;
26850
27482
  }
26851
- const parts = path49.split(/\.|\[|\]/).filter((p) => p.length > 0);
27483
+ const parts = path50.split(/\.|\[|\]/).filter((p) => p.length > 0);
26852
27484
  let current = obj;
26853
27485
  for (const part of parts) {
26854
27486
  if (current === null || current === void 0) {
@@ -27330,8 +27962,8 @@ var TokenUsageEvaluator = class {
27330
27962
  };
27331
27963
  }
27332
27964
  };
27333
- function getNestedValue(obj, path49) {
27334
- const parts = path49.split(".");
27965
+ function getNestedValue(obj, path50) {
27966
+ const parts = path50.split(".");
27335
27967
  let current = obj;
27336
27968
  for (const part of parts) {
27337
27969
  if (current === null || current === void 0 || typeof current !== "object") {
@@ -29054,7 +29686,7 @@ var WorkspacePoolManager = class {
29054
29686
  }
29055
29687
  /**
29056
29688
  * Reset an existing slot for reuse:
29057
- * 1. Reset repos (git reset --hard {ref} && git clean -fd per repo)
29689
+ * 1. Reset repos (fetch from origin when resolve=remote, then git reset --hard && git clean per repo)
29058
29690
  * 2. Re-copy template files (skip repo directories)
29059
29691
  */
29060
29692
  async resetSlot(slotPath, templatePath, repos, poolReset = "fast") {
@@ -29067,7 +29699,17 @@ var WorkspacePoolManager = class {
29067
29699
  continue;
29068
29700
  }
29069
29701
  const ref = repo.checkout?.ref ?? "HEAD";
29070
- await git(["reset", "--hard", ref], { cwd: repoDir });
29702
+ const resolve2 = repo.checkout?.resolve ?? "remote";
29703
+ if (resolve2 === "remote") {
29704
+ const fetchArgs = ["fetch", "origin", ref];
29705
+ if (repo.clone?.depth) {
29706
+ fetchArgs.splice(1, 0, "--depth", String(repo.clone.depth));
29707
+ }
29708
+ await git(fetchArgs, { cwd: repoDir });
29709
+ await git(["reset", "--hard", "FETCH_HEAD"], { cwd: repoDir });
29710
+ } else {
29711
+ await git(["reset", "--hard", ref], { cwd: repoDir });
29712
+ }
29071
29713
  const cleanFlag = poolReset === "strict" ? "-fdx" : "-fd";
29072
29714
  await git(["clean", cleanFlag], { cwd: repoDir });
29073
29715
  }
@@ -29350,7 +29992,7 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
29350
29992
  }
29351
29993
  return result.stdout;
29352
29994
  }
29353
- function classifyQualityStatus(score, threshold = PASS_THRESHOLD) {
29995
+ function classifyQualityStatus(score, threshold = DEFAULT_THRESHOLD) {
29354
29996
  return score >= threshold ? "ok" : "quality_failure";
29355
29997
  }
29356
29998
  function buildSkippedEvaluatorError(scores) {
@@ -29442,7 +30084,7 @@ async function runEvaluation(options) {
29442
30084
  const filteredEvalCases = filterEvalCases(evalCases, filter2);
29443
30085
  if (filteredEvalCases.length === 0) {
29444
30086
  if (filter2) {
29445
- throw new Error(`No tests matched filter '${filter2}' in ${evalFilePath}`);
30087
+ throw new Error(`No tests matched filter '${formatFilter(filter2)}' in ${evalFilePath}`);
29446
30088
  }
29447
30089
  return [];
29448
30090
  }
@@ -29468,20 +30110,10 @@ async function runEvaluation(options) {
29468
30110
  if (resolvedTargetsByName.has(name21)) {
29469
30111
  return resolvedTargetsByName.get(name21);
29470
30112
  }
29471
- let definition = targetDefinitions.get(name21);
30113
+ const definition = resolveDelegatedTargetDefinition(name21, targetDefinitions, envLookup);
29472
30114
  if (!definition) {
29473
30115
  return void 0;
29474
30116
  }
29475
- for (let depth = 0; depth < 5; depth++) {
29476
- const useTarget = definition.use_target;
29477
- if (typeof useTarget !== "string" || useTarget.trim().length === 0) break;
29478
- const envMatch = useTarget.trim().match(/^\$\{\{\s*([A-Z0-9_]+)\s*\}\}$/i);
29479
- const resolvedName = envMatch ? envLookup[envMatch[1]] ?? "" : useTarget.trim();
29480
- if (resolvedName.length === 0) break;
29481
- const next = targetDefinitions.get(resolvedName);
29482
- if (!next) break;
29483
- definition = next;
29484
- }
29485
30117
  const resolved = resolveTargetDefinition(definition, envLookup, evalFilePath);
29486
30118
  resolvedTargetsByName.set(name21, resolved);
29487
30119
  return resolved;
@@ -29504,6 +30136,9 @@ async function runEvaluation(options) {
29504
30136
  const graderName = targetContext.graderTarget ?? targetContext.name;
29505
30137
  const resolvedGrader = resolveTargetByName(graderName);
29506
30138
  if (!resolvedGrader) {
30139
+ if (!LLM_GRADER_CAPABLE_KINDS.includes(targetContext.kind)) {
30140
+ return void 0;
30141
+ }
29507
30142
  return getOrCreateProvider(targetContext);
29508
30143
  }
29509
30144
  return getOrCreateProvider(resolvedGrader);
@@ -29834,7 +30469,7 @@ async function runEvaluation(options) {
29834
30469
  const budgetResult = {
29835
30470
  timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
29836
30471
  testId: evalCase.id,
29837
- dataset: evalCase.dataset,
30472
+ suite: evalCase.suite,
29838
30473
  category: evalCase.category,
29839
30474
  score: 0,
29840
30475
  assertions: [],
@@ -29871,7 +30506,7 @@ async function runEvaluation(options) {
29871
30506
  const haltResult = {
29872
30507
  timestamp: (now2 ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
29873
30508
  testId: evalCase.id,
29874
- dataset: evalCase.dataset,
30509
+ suite: evalCase.suite,
29875
30510
  category: evalCase.category,
29876
30511
  score: 0,
29877
30512
  assertions: [],
@@ -30183,7 +30818,7 @@ async function runBatchEvaluation(options) {
30183
30818
  targetResolver,
30184
30819
  availableTargets,
30185
30820
  verbose,
30186
- threshold: batchThreshold
30821
+ threshold: evalCase.threshold ?? batchThreshold
30187
30822
  });
30188
30823
  if (providerError) {
30189
30824
  result = {
@@ -30645,8 +31280,9 @@ async function runEvalCase(options) {
30645
31280
  fileChanges,
30646
31281
  workspacePath,
30647
31282
  verbose,
30648
- threshold: caseThreshold
31283
+ threshold: evalCase.threshold ?? caseThreshold
30649
31284
  });
31285
+ const effectiveThreshold = evalCase.threshold ?? caseThreshold;
30650
31286
  const totalDurationMs = Date.now() - caseStartMs;
30651
31287
  const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
30652
31288
  const evalRunTokenUsage = tokenUsage || graderTokens ? {
@@ -30660,7 +31296,7 @@ async function runEvalCase(options) {
30660
31296
  ...evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}
30661
31297
  };
30662
31298
  const skippedEvaluatorError = buildSkippedEvaluatorError(result.scores);
30663
- const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, caseThreshold);
31299
+ const executionStatus = providerError || skippedEvaluatorError ? "execution_error" : classifyQualityStatus(result.score, effectiveThreshold);
30664
31300
  const targetUsedField = targetUsed ? { targetUsed } : {};
30665
31301
  const finalResult = providerError ? {
30666
31302
  ...result,
@@ -30861,7 +31497,8 @@ async function evaluateCandidate(options) {
30861
31497
  targetResolver,
30862
31498
  availableTargets,
30863
31499
  fileChanges,
30864
- workspacePath
31500
+ workspacePath,
31501
+ threshold: evalThreshold
30865
31502
  });
30866
31503
  const completedAt = nowFn();
30867
31504
  let agentRequest;
@@ -30892,7 +31529,7 @@ async function evaluateCandidate(options) {
30892
31529
  return {
30893
31530
  timestamp: completedAt.toISOString(),
30894
31531
  testId: evalCase.id,
30895
- dataset: evalCase.dataset,
31532
+ suite: evalCase.suite,
30896
31533
  category: evalCase.category,
30897
31534
  conversationId: evalCase.conversation_id,
30898
31535
  score: score.score,
@@ -30935,7 +31572,8 @@ async function runEvaluatorsForCase(options) {
30935
31572
  targetResolver,
30936
31573
  availableTargets,
30937
31574
  fileChanges,
30938
- workspacePath
31575
+ workspacePath,
31576
+ threshold
30939
31577
  } = options;
30940
31578
  if (evalCase.assertions && evalCase.assertions.length > 0) {
30941
31579
  return runEvaluatorList({
@@ -30961,7 +31599,8 @@ async function runEvaluatorsForCase(options) {
30961
31599
  targetResolver,
30962
31600
  availableTargets,
30963
31601
  fileChanges,
30964
- workspacePath
31602
+ workspacePath,
31603
+ threshold
30965
31604
  });
30966
31605
  }
30967
31606
  const evaluatorKind = evalCase.evaluator ?? "llm-grader";
@@ -31063,7 +31702,8 @@ async function runEvaluatorList(options) {
31063
31702
  name: evaluatorConfig.name,
31064
31703
  type: evaluatorConfig.type,
31065
31704
  weight,
31066
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
31705
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
31706
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
31067
31707
  });
31068
31708
  scores.push({
31069
31709
  name: evaluatorConfig.name,
@@ -31098,7 +31738,8 @@ async function runEvaluatorList(options) {
31098
31738
  name: evaluatorConfig.name ?? "unknown",
31099
31739
  type: evaluatorConfig.type ?? "llm-grader",
31100
31740
  weight,
31101
- ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {}
31741
+ ...evaluatorConfig.required !== void 0 ? { required: evaluatorConfig.required } : {},
31742
+ ...evaluatorConfig.min_score !== void 0 ? { min_score: evaluatorConfig.min_score } : {}
31102
31743
  });
31103
31744
  scores.push({
31104
31745
  name: evaluatorConfig.name ?? "unknown",
@@ -31132,9 +31773,10 @@ async function runEvaluatorList(options) {
31132
31773
  }
31133
31774
  }
31134
31775
  }
31776
+ const effectiveThreshold = options.threshold ?? DEFAULT_THRESHOLD;
31135
31777
  const hasRequiredFailure = scored.some((entry) => {
31136
31778
  if (!entry.required) return false;
31137
- const minScore = typeof entry.required === "number" ? entry.required : PASS_THRESHOLD;
31779
+ const minScore = entry.min_score ?? (typeof entry.required === "number" ? entry.required : effectiveThreshold);
31138
31780
  return entry.score.score < minScore;
31139
31781
  });
31140
31782
  const scorable = scored.filter((entry) => entry.score.verdict !== "skip");
@@ -31145,17 +31787,23 @@ async function runEvaluatorList(options) {
31145
31787
  const expectedAspectCount = assertions.length || 1;
31146
31788
  const score = {
31147
31789
  score: aggregateScore,
31148
- verdict: scoreToVerdict(aggregateScore),
31790
+ verdict: scoreToVerdict(aggregateScore, effectiveThreshold),
31149
31791
  assertions,
31150
31792
  expectedAspectCount
31151
31793
  };
31152
31794
  return { score, scores };
31153
31795
  }
31796
+ function formatFilter(filter2) {
31797
+ return typeof filter2 === "string" ? filter2 : filter2.join(", ");
31798
+ }
31799
+ function matchesFilter3(id, filter2) {
31800
+ return typeof filter2 === "string" ? micromatch3.isMatch(id, filter2) : filter2.some((pattern) => micromatch3.isMatch(id, pattern));
31801
+ }
31154
31802
  function filterEvalCases(evalCases, filter2) {
31155
31803
  if (!filter2) {
31156
31804
  return evalCases;
31157
31805
  }
31158
- return evalCases.filter((evalCase) => micromatch3.isMatch(evalCase.id, filter2));
31806
+ return evalCases.filter((evalCase) => matchesFilter3(evalCase.id, filter2));
31159
31807
  }
31160
31808
  function buildEvaluatorRegistry(overrides, resolveGraderProvider) {
31161
31809
  const llmGrader = overrides?.["llm-grader"] ?? new LlmGraderEvaluator({
@@ -31242,7 +31890,7 @@ function buildErrorResult(evalCase, targetName, timestamp, error, promptInputs,
31242
31890
  return {
31243
31891
  timestamp: timestamp.toISOString(),
31244
31892
  testId: evalCase.id,
31245
- dataset: evalCase.dataset,
31893
+ suite: evalCase.suite,
31246
31894
  category: evalCase.category,
31247
31895
  conversationId: evalCase.conversation_id,
31248
31896
  score: 0,
@@ -31506,6 +32154,7 @@ async function evaluate(config) {
31506
32154
  verbose: config.verbose,
31507
32155
  maxConcurrency: config.workers ?? 3,
31508
32156
  filter: config.filter,
32157
+ threshold: config.threshold,
31509
32158
  evalCases,
31510
32159
  onResult: async (result) => {
31511
32160
  collectedResults.push(result);
@@ -31516,19 +32165,19 @@ async function evaluate(config) {
31516
32165
  const durationMs = Date.now() - startTime;
31517
32166
  return {
31518
32167
  results: allResults,
31519
- summary: computeSummary(allResults, durationMs)
32168
+ summary: computeSummary(allResults, durationMs, config.threshold)
31520
32169
  };
31521
32170
  }
31522
32171
  function mapAssertionType(type) {
31523
32172
  return type.replace(/_/g, "-");
31524
32173
  }
31525
- function computeSummary(results, durationMs) {
32174
+ function computeSummary(results, durationMs, threshold = DEFAULT_THRESHOLD) {
31526
32175
  const total = results.length;
31527
32176
  let passed = 0;
31528
32177
  let scoreSum = 0;
31529
32178
  for (const r of results) {
31530
32179
  scoreSum += r.score;
31531
- if (r.score >= PASS_THRESHOLD) {
32180
+ if (r.score >= threshold) {
31532
32181
  passed++;
31533
32182
  }
31534
32183
  }
@@ -31559,7 +32208,7 @@ async function discoverDefaultTarget(repoRoot) {
31559
32208
  return null;
31560
32209
  }
31561
32210
  async function loadEnvHierarchy(repoRoot, startPath) {
31562
- const { readFileSync: readFileSync3 } = await import("node:fs");
32211
+ const { readFileSync: readFileSync4 } = await import("node:fs");
31563
32212
  const chain = buildDirectoryChain(startPath, repoRoot);
31564
32213
  const envFiles = [];
31565
32214
  for (const dir of chain) {
@@ -31568,7 +32217,7 @@ async function loadEnvHierarchy(repoRoot, startPath) {
31568
32217
  }
31569
32218
  for (let i = 0; i < envFiles.length; i++) {
31570
32219
  try {
31571
- const content = readFileSync3(envFiles[i], "utf8");
32220
+ const content = readFileSync4(envFiles[i], "utf8");
31572
32221
  for (const line of content.split("\n")) {
31573
32222
  const trimmed = line.trim();
31574
32223
  if (!trimmed || trimmed.startsWith("#")) continue;
@@ -31638,7 +32287,7 @@ var CONFIG_FILE_NAMES = [
31638
32287
  ];
31639
32288
  async function loadTsConfig(projectRoot) {
31640
32289
  const { existsSync: existsSync7 } = await import("node:fs");
31641
- const { pathToFileURL } = await import("node:url");
32290
+ const { pathToFileURL: pathToFileURL2 } = await import("node:url");
31642
32291
  const { join: join2 } = await import("node:path");
31643
32292
  for (const fileName of CONFIG_FILE_NAMES) {
31644
32293
  const filePath = join2(projectRoot, fileName);
@@ -31646,7 +32295,7 @@ async function loadTsConfig(projectRoot) {
31646
32295
  continue;
31647
32296
  }
31648
32297
  try {
31649
- const fileUrl = pathToFileURL(filePath).href;
32298
+ const fileUrl = pathToFileURL2(filePath).href;
31650
32299
  const mod = await import(fileUrl);
31651
32300
  const config = mod.default ?? mod;
31652
32301
  return AgentVConfigSchema.parse(config);
@@ -31779,7 +32428,7 @@ function loadProjectRegistry() {
31779
32428
  return { projects: [] };
31780
32429
  }
31781
32430
  try {
31782
- const raw = readFileSync2(registryPath, "utf-8");
32431
+ const raw = readFileSync3(registryPath, "utf-8");
31783
32432
  const parsed = parseYaml3(raw);
31784
32433
  if (!parsed || !Array.isArray(parsed.projects)) {
31785
32434
  return { projects: [] };
@@ -31793,7 +32442,7 @@ function saveProjectRegistry(registry) {
31793
32442
  const registryPath = getProjectsRegistryPath();
31794
32443
  const dir = path47.dirname(registryPath);
31795
32444
  if (!existsSync6(dir)) {
31796
- mkdirSync(dir, { recursive: true });
32445
+ mkdirSync2(dir, { recursive: true });
31797
32446
  }
31798
32447
  writeFileSync(registryPath, stringifyYaml(registry), "utf-8");
31799
32448
  }
@@ -32053,7 +32702,7 @@ var OtelTraceExporter = class {
32053
32702
  rootSpan.setAttribute("gen_ai.system", "agentv");
32054
32703
  rootSpan.setAttribute("agentv.test_id", result.testId);
32055
32704
  rootSpan.setAttribute("agentv.target", result.target);
32056
- if (result.dataset) rootSpan.setAttribute("agentv.dataset", result.dataset);
32705
+ if (result.suite) rootSpan.setAttribute("agentv.suite", result.suite);
32057
32706
  rootSpan.setAttribute("agentv.score", result.score);
32058
32707
  if (captureContent && result.output.length > 0) {
32059
32708
  const lastMsg = result.output[result.output.length - 1];
@@ -32262,7 +32911,7 @@ var OtelStreamingObserver = class {
32262
32911
  this.rootSpan.setAttribute("gen_ai.system", "agentv");
32263
32912
  this.rootSpan.setAttribute("agentv.test_id", testId);
32264
32913
  this.rootSpan.setAttribute("agentv.target", target);
32265
- if (evalSet) this.rootSpan.setAttribute("agentv.dataset", evalSet);
32914
+ if (evalSet) this.rootSpan.setAttribute("agentv.suite", evalSet);
32266
32915
  this.rootCtx = this.api.trace.setSpan(this.api.context.active(), this.rootSpan);
32267
32916
  }
32268
32917
  /** Create and immediately export a tool span */
@@ -32608,7 +33257,230 @@ function extractToolResultContent(content) {
32608
33257
  }
32609
33258
  return parts.length > 0 ? parts.join("") : void 0;
32610
33259
  }
32611
- var DEFAULT_PROJECTS_DIR = () => path48.join(homedir3(), ".claude", "projects");
33260
+ function parseCodexSession(jsonl) {
33261
+ const messages = [];
33262
+ let sessionId = "";
33263
+ let cwd;
33264
+ let model;
33265
+ let version;
33266
+ let startTimestamp;
33267
+ let endTimestamp;
33268
+ const pendingCalls = /* @__PURE__ */ new Map();
33269
+ const lines = jsonl.split("\n").filter((l) => l.trim().length > 0);
33270
+ for (const line of lines) {
33271
+ let entry;
33272
+ try {
33273
+ entry = JSON.parse(line);
33274
+ } catch {
33275
+ continue;
33276
+ }
33277
+ if (!entry.type) continue;
33278
+ if (entry.timestamp) {
33279
+ if (!startTimestamp) startTimestamp = entry.timestamp;
33280
+ endTimestamp = entry.timestamp;
33281
+ }
33282
+ const payload = entry.payload ?? {};
33283
+ switch (entry.type) {
33284
+ case "session_meta": {
33285
+ sessionId = String(payload.id ?? "");
33286
+ cwd = payload.cwd ? String(payload.cwd) : void 0;
33287
+ version = payload.cli_version ? String(payload.cli_version) : void 0;
33288
+ if (payload.model && !model) {
33289
+ model = String(payload.model);
33290
+ }
33291
+ break;
33292
+ }
33293
+ case "turn_context": {
33294
+ if (payload.model && !model) {
33295
+ model = String(payload.model);
33296
+ }
33297
+ if (payload.cwd && !cwd) {
33298
+ cwd = String(payload.cwd);
33299
+ }
33300
+ break;
33301
+ }
33302
+ case "response_item": {
33303
+ const itemType = String(payload.type ?? "");
33304
+ const role = String(payload.role ?? "");
33305
+ switch (itemType) {
33306
+ case "message": {
33307
+ if (role === "developer") break;
33308
+ const content = extractResponseItemContent(payload.content);
33309
+ if (role === "user" && content) {
33310
+ messages.push({ role: "user", content });
33311
+ } else if (role === "assistant" && content) {
33312
+ messages.push({ role: "assistant", content });
33313
+ }
33314
+ break;
33315
+ }
33316
+ case "function_call": {
33317
+ const toolName = String(payload.name ?? "");
33318
+ const callId = String(payload.call_id ?? "");
33319
+ let input;
33320
+ if (typeof payload.arguments === "string") {
33321
+ try {
33322
+ input = JSON.parse(payload.arguments);
33323
+ } catch {
33324
+ input = payload.arguments;
33325
+ }
33326
+ } else {
33327
+ input = payload.arguments;
33328
+ }
33329
+ const toolCall = { tool: toolName, input, id: callId };
33330
+ const msgIdx = messages.length;
33331
+ messages.push({
33332
+ role: "assistant",
33333
+ toolCalls: [toolCall]
33334
+ });
33335
+ if (callId) {
33336
+ pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
33337
+ }
33338
+ break;
33339
+ }
33340
+ case "custom_tool_call": {
33341
+ const toolName = String(payload.name ?? "");
33342
+ const callId = String(payload.call_id ?? "");
33343
+ let input;
33344
+ if (typeof payload.arguments === "string") {
33345
+ try {
33346
+ input = JSON.parse(payload.arguments);
33347
+ } catch {
33348
+ input = payload.arguments;
33349
+ }
33350
+ } else {
33351
+ input = payload.arguments;
33352
+ }
33353
+ const toolCall = { tool: toolName, input, id: callId };
33354
+ const msgIdx = messages.length;
33355
+ messages.push({
33356
+ role: "assistant",
33357
+ toolCalls: [toolCall]
33358
+ });
33359
+ if (callId) {
33360
+ pendingCalls.set(callId, { msgIdx, toolIdx: 0 });
33361
+ }
33362
+ break;
33363
+ }
33364
+ case "function_call_output":
33365
+ case "custom_tool_call_output": {
33366
+ const callId = String(payload.call_id ?? "");
33367
+ const pending = pendingCalls.get(callId);
33368
+ if (pending) {
33369
+ const existingMsg = messages[pending.msgIdx];
33370
+ const existingCalls = [...existingMsg.toolCalls ?? []];
33371
+ existingCalls[pending.toolIdx] = {
33372
+ ...existingCalls[pending.toolIdx],
33373
+ output: payload.output
33374
+ };
33375
+ messages[pending.msgIdx] = { ...existingMsg, toolCalls: existingCalls };
33376
+ pendingCalls.delete(callId);
33377
+ }
33378
+ break;
33379
+ }
33380
+ // Skip reasoning blocks (thinking tokens)
33381
+ case "reasoning":
33382
+ break;
33383
+ }
33384
+ break;
33385
+ }
33386
+ }
33387
+ }
33388
+ let durationMs;
33389
+ if (startTimestamp && endTimestamp) {
33390
+ durationMs = new Date(endTimestamp).getTime() - new Date(startTimestamp).getTime();
33391
+ }
33392
+ const source = {
33393
+ provider: "codex",
33394
+ sessionId,
33395
+ cwd,
33396
+ startedAt: startTimestamp,
33397
+ model,
33398
+ version
33399
+ };
33400
+ return {
33401
+ messages,
33402
+ source,
33403
+ // Codex rollout files don't include token counts (only rate limit info)
33404
+ tokenUsage: void 0,
33405
+ durationMs,
33406
+ costUsd: null
33407
+ };
33408
+ }
33409
+ function extractResponseItemContent(content) {
33410
+ if (typeof content === "string") return content;
33411
+ if (!Array.isArray(content)) return void 0;
33412
+ const parts = [];
33413
+ for (const block of content) {
33414
+ if (typeof block === "object" && block !== null) {
33415
+ const b = block;
33416
+ if (typeof b.text === "string") {
33417
+ parts.push(b.text);
33418
+ }
33419
+ }
33420
+ }
33421
+ return parts.length > 0 ? parts.join("") : void 0;
33422
+ }
33423
+ var DEFAULT_SESSIONS_DIR = () => path48.join(homedir3(), ".codex", "sessions");
33424
+ async function discoverCodexSessions(opts) {
33425
+ const sessionsDir = opts?.sessionsDir ?? DEFAULT_SESSIONS_DIR();
33426
+ const limit = opts?.latest ? 1 : opts?.limit ?? 10;
33427
+ const sessions = [];
33428
+ let yearDirs;
33429
+ try {
33430
+ yearDirs = await readdir8(sessionsDir);
33431
+ } catch {
33432
+ return [];
33433
+ }
33434
+ for (const year of yearDirs) {
33435
+ const yearPath = path48.join(sessionsDir, year);
33436
+ let monthDirs;
33437
+ try {
33438
+ monthDirs = await readdir8(yearPath);
33439
+ } catch {
33440
+ continue;
33441
+ }
33442
+ for (const month of monthDirs) {
33443
+ const monthPath = path48.join(yearPath, month);
33444
+ let dayDirs;
33445
+ try {
33446
+ dayDirs = await readdir8(monthPath);
33447
+ } catch {
33448
+ continue;
33449
+ }
33450
+ for (const day of dayDirs) {
33451
+ if (opts?.date) {
33452
+ const dirDate = `${year}-${month}-${day}`;
33453
+ if (dirDate !== opts.date) continue;
33454
+ }
33455
+ const dayPath = path48.join(monthPath, day);
33456
+ let files;
33457
+ try {
33458
+ files = await readdir8(dayPath);
33459
+ } catch {
33460
+ continue;
33461
+ }
33462
+ for (const file of files) {
33463
+ if (!file.startsWith("rollout-") || !file.endsWith(".jsonl")) continue;
33464
+ const filePath = path48.join(dayPath, file);
33465
+ const nameWithoutExt = file.replace(/\.jsonl$/, "");
33466
+ const parts = nameWithoutExt.split("-");
33467
+ const sessionId = parts.length >= 6 ? parts.slice(-5).join("-") : nameWithoutExt;
33468
+ let updatedAt;
33469
+ try {
33470
+ const fileStat = await stat9(filePath);
33471
+ updatedAt = fileStat.mtime;
33472
+ } catch {
33473
+ updatedAt = /* @__PURE__ */ new Date(0);
33474
+ }
33475
+ sessions.push({ sessionId, filePath, filename: file, updatedAt });
33476
+ }
33477
+ }
33478
+ }
33479
+ }
33480
+ sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
33481
+ return sessions.slice(0, limit);
33482
+ }
33483
+ var DEFAULT_PROJECTS_DIR = () => path49.join(homedir4(), ".claude", "projects");
32612
33484
  function encodeProjectPath(projectPath) {
32613
33485
  return projectPath.replace(/\//g, "-");
32614
33486
  }
@@ -32617,7 +33489,7 @@ async function discoverClaudeSessions(opts) {
32617
33489
  const limit = opts?.latest ? 1 : opts?.limit ?? 10;
32618
33490
  let projectDirs;
32619
33491
  try {
32620
- projectDirs = await readdir8(projectsDir);
33492
+ projectDirs = await readdir9(projectsDir);
32621
33493
  } catch {
32622
33494
  return [];
32623
33495
  }
@@ -32627,10 +33499,10 @@ async function discoverClaudeSessions(opts) {
32627
33499
  }
32628
33500
  const sessions = [];
32629
33501
  for (const projectDir of projectDirs) {
32630
- const dirPath = path48.join(projectsDir, projectDir);
33502
+ const dirPath = path49.join(projectsDir, projectDir);
32631
33503
  let entries;
32632
33504
  try {
32633
- entries = await readdir8(dirPath);
33505
+ entries = await readdir9(dirPath);
32634
33506
  } catch {
32635
33507
  continue;
32636
33508
  }
@@ -32638,10 +33510,10 @@ async function discoverClaudeSessions(opts) {
32638
33510
  if (!entry.endsWith(".jsonl")) continue;
32639
33511
  const sessionId = entry.replace(/\.jsonl$/, "");
32640
33512
  if (opts?.sessionId && sessionId !== opts.sessionId) continue;
32641
- const filePath = path48.join(dirPath, entry);
33513
+ const filePath = path49.join(dirPath, entry);
32642
33514
  let updatedAt;
32643
33515
  try {
32644
- const fileStat = await stat9(filePath);
33516
+ const fileStat = await stat10(filePath);
32645
33517
  updatedAt = fileStat.mtime;
32646
33518
  } catch {
32647
33519
  updatedAt = /* @__PURE__ */ new Date(0);
@@ -32657,9 +33529,82 @@ async function discoverClaudeSessions(opts) {
32657
33529
  sessions.sort((a, b) => b.updatedAt.getTime() - a.updatedAt.getTime());
32658
33530
  return sessions.slice(0, limit);
32659
33531
  }
33532
/**
 * Convert a transcript entry (camelCase fields) into the snake_case record
 * that is stored as one line of a transcript JSONL file.
 *
 * `input` is the content of the first message whose role is "user", but only
 * when that content is a string; otherwise it is the empty string. `output`
 * is the entry's message array itself (same reference, not a copy).
 *
 * @param {object} entry - Entry with `messages`, `source`, and optional
 *   `tokenUsage`, `durationMs`, `costUsd`.
 * @returns {object} Record with `input`, `output`, `token_usage`,
 *   `duration_ms`, `cost_usd` and `source`.
 */
function toTranscriptJsonLine(entry) {
  const { messages, tokenUsage: usage, source: origin } = entry;

  const leadingUserTurn = messages.find((message) => message.role === "user");
  const leadingContent = leadingUserTurn?.content;

  // Only the three known counters are copied; absent usage stays undefined.
  let tokenUsage;
  if (usage) {
    tokenUsage = {
      input: usage.input,
      output: usage.output,
      cached: usage.cached
    };
  }

  const sourceInfo = {
    provider: origin.provider,
    session_id: origin.sessionId,
    model: origin.model,
    timestamp: origin.startedAt,
    git_branch: origin.gitBranch,
    // Fall back to the project path when no working directory was recorded.
    cwd: origin.cwd ?? origin.projectPath,
    version: origin.version
  };

  return {
    input: typeof leadingContent === "string" ? leadingContent : "",
    output: messages,
    token_usage: tokenUsage,
    duration_ms: entry.durationMs,
    cost_usd: entry.costUsd,
    source: sourceInfo
  };
}
33556
/**
 * Read a transcript JSONL file and parse every non-blank line as JSON.
 *
 * A leading UTF-8 byte-order mark is stripped first, because `JSON.parse`
 * does not accept it as whitespace and would reject the first record.
 *
 * @param {string} filePath - Path of the JSONL file to read.
 * @returns {Promise<unknown[]>} Parsed values, in file order.
 * @throws {SyntaxError} When a line is not valid JSON. The message names the
 *   file and 1-based line number; the original error is attached as `cause`.
 */
async function readTranscriptJsonl(filePath) {
  const raw = await readFile14(filePath, "utf8");
  const text2 = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;

  const records = [];
  for (const [index, line] of text2.split("\n").entries()) {
    // Blank and whitespace-only lines (including a trailing newline) are skipped.
    if (line.trim().length === 0) continue;
    try {
      records.push(JSON.parse(line));
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      // Stay a SyntaxError so callers that catch parse failures still match.
      throw new SyntaxError(
        `Invalid JSON in transcript ${filePath}:${index + 1}: ${reason}`,
        { cause: error }
      );
    }
  }
  return records;
}
32660
33560
  async function readTranscriptFile(filePath) {
32661
33561
  return readFile14(filePath, "utf8");
32662
33562
  }
33563
/**
 * Provider that replays recorded transcript lines instead of calling a live
 * agent.
 *
 * Lines are served strictly in order: each `invoke` call consumes the next
 * line, so the number of invocations must not exceed the number of lines.
 */
var TranscriptProvider = class _TranscriptProvider {
  id;
  kind = "transcript";
  targetName;
  lines;
  cursor = 0;
  /**
   * @param {string} targetName - Name used for `targetName` and in the `transcript:<name>` id.
   * @param {object[]} lines - Parsed transcript lines, replayed in array order.
   */
  constructor(targetName, lines) {
    this.targetName = targetName;
    this.id = `transcript:${targetName}`;
    this.lines = lines;
  }
  /**
   * Create a TranscriptProvider from a JSONL file path.
   *
   * The target name is taken from the first line's `source.provider`, falling
   * back to "transcript" when the line has no `source` or no provider.
   *
   * @param {string} filePath - Path of the transcript JSONL file.
   * @returns {Promise<TranscriptProvider>} Provider loaded with every line of the file.
   * @throws {Error} When the file contains no transcript lines.
   */
  static async fromFile(filePath) {
    const lines = await readTranscriptJsonl(filePath);
    if (lines.length === 0) {
      throw new Error(`Transcript file is empty: ${filePath}`);
    }
    // `source` is optional here: a line without it must not defeat the fallback.
    const providerName = lines[0].source?.provider ?? "transcript";
    return new _TranscriptProvider(providerName, lines);
  }
  /** Total number of transcript lines held, consumed or not. */
  get lineCount() {
    return this.lines.length;
  }
  /**
   * Return the next recorded line as a provider response. The request is
   * ignored; only the call order decides which line is returned.
   *
   * @param {unknown} _request - Unused.
   * @returns {Promise<object>} Response with `output`, `tokenUsage`,
   *   `durationMs`, `costUsd` and `startTime` taken from the line.
   * @throws {Error} When every line has already been consumed.
   */
  async invoke(_request) {
    if (this.cursor >= this.lines.length) {
      throw new Error(
        `Transcript exhausted: ${this.lines.length} line(s) available but ${this.cursor + 1} invocations attempted. Each transcript line maps to one test case.`
      );
    }
    const line = this.lines[this.cursor++];
    return {
      output: line.output,
      tokenUsage: line.token_usage ? {
        input: line.token_usage.input,
        output: line.token_usage.output,
        cached: line.token_usage.cached
      } : void 0,
      durationMs: line.duration_ms,
      // Normalise a JSON null cost to undefined.
      costUsd: line.cost_usd ?? void 0,
      // A line without `source` yields no start time instead of a TypeError.
      startTime: line.source?.timestamp
    };
  }
};
32663
33608
  function createAgentKernel() {
32664
33609
  return { status: "stub" };
32665
33610
  }
@@ -32683,7 +33628,9 @@ export {
32683
33628
  buildSearchRoots,
32684
33629
  resolveFileReference,
32685
33630
  CLI_PLACEHOLDERS,
33631
+ findDeprecatedCamelCaseTargetWarnings,
32686
33632
  COMMON_TARGET_SETTINGS,
33633
+ resolveDelegatedTargetDefinition,
32687
33634
  resolveTargetDefinition,
32688
33635
  KNOWN_PROVIDERS,
32689
33636
  PROVIDER_ALIASES,
@@ -32726,17 +33673,18 @@ export {
32726
33673
  subscribeToCodexLogEntries,
32727
33674
  consumeCopilotCliLogEntries,
32728
33675
  subscribeToCopilotCliLogEntries,
33676
+ parseCopilotEvents,
32729
33677
  discoverCopilotSessions,
32730
33678
  consumeCopilotSdkLogEntries,
32731
33679
  subscribeToCopilotSdkLogEntries,
32732
33680
  consumePiLogEntries,
32733
33681
  subscribeToPiLogEntries,
32734
- ProviderRegistry,
32735
33682
  getAgentvHome,
32736
33683
  getWorkspacesRoot,
32737
33684
  getSubagentsRoot,
32738
33685
  getTraceStateRoot,
32739
33686
  getWorkspacePoolRoot,
33687
+ ProviderRegistry,
32740
33688
  ensureVSCodeSubagents,
32741
33689
  readTargetDefinitions,
32742
33690
  listTargetNames,
@@ -32744,6 +33692,7 @@ export {
32744
33692
  createBuiltinProviderRegistry,
32745
33693
  createProvider,
32746
33694
  resolveAndCreateProvider,
33695
+ DEFAULT_THRESHOLD,
32747
33696
  PASS_THRESHOLD,
32748
33697
  scoreToVerdict,
32749
33698
  clampScore,
@@ -32831,8 +33780,13 @@ export {
32831
33780
  OtelTraceExporter,
32832
33781
  OtelStreamingObserver,
32833
33782
  parseClaudeSession,
33783
+ parseCodexSession,
33784
+ discoverCodexSessions,
32834
33785
  discoverClaudeSessions,
33786
+ toTranscriptJsonLine,
33787
+ readTranscriptJsonl,
32835
33788
  readTranscriptFile,
33789
+ TranscriptProvider,
32836
33790
  createAgentKernel
32837
33791
  };
32838
- //# sourceMappingURL=chunk-KQQTEWZF.js.map
33792
+ //# sourceMappingURL=chunk-I6UE4LHZ.js.map