@agentv/core 2.7.1-next.5 → 2.8.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -17,7 +17,7 @@ import {
17
17
  readTextFile,
18
18
  resolveFileReference,
19
19
  resolveTargetDefinition
20
- } from "./chunk-6W5E3VR6.js";
20
+ } from "./chunk-P2465XAH.js";
21
21
  import {
22
22
  OtlpJsonFileExporter
23
23
  } from "./chunk-HFSYZHGF.js";
@@ -538,6 +538,24 @@ function extractCacheConfig(suite) {
538
538
  const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
539
539
  return { enabled: cache, cachePath: resolvedCachePath };
540
540
  }
541
+ function extractTotalBudgetUsd(suite) {
542
+ const execution = suite.execution;
543
+ if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
544
+ return void 0;
545
+ }
546
+ const executionObj = execution;
547
+ const rawBudget = executionObj.total_budget_usd ?? executionObj.totalBudgetUsd;
548
+ if (rawBudget === void 0 || rawBudget === null) {
549
+ return void 0;
550
+ }
551
+ if (typeof rawBudget === "number" && rawBudget > 0) {
552
+ return rawBudget;
553
+ }
554
+ logWarning(
555
+ `Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`
556
+ );
557
+ return void 0;
558
+ }
541
559
  function logWarning(message) {
542
560
  console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
543
561
  }
@@ -670,24 +688,24 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
670
688
  continue;
671
689
  }
672
690
  if (typeValue === "code_judge") {
673
- let script;
674
- const rawScript = rawEvaluator.script;
675
- if (typeof rawScript === "string") {
676
- const trimmed = rawScript.trim();
691
+ let command;
692
+ const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
693
+ if (typeof rawCommand === "string") {
694
+ const trimmed = rawCommand.trim();
677
695
  if (trimmed.length === 0) {
678
696
  throw new Error(
679
- `Invalid code_judge script for evaluator '${name}' in '${evalId}': script cannot be empty`
697
+ `Invalid code_judge command for evaluator '${name}' in '${evalId}': command cannot be empty`
680
698
  );
681
699
  }
682
- script = parseCommandToArgv(trimmed);
700
+ command = parseCommandToArgv(trimmed);
683
701
  } else {
684
- script = asStringArray(
685
- rawScript,
686
- `code_judge script for evaluator '${name}' in '${evalId}'`
702
+ command = asStringArray(
703
+ rawCommand,
704
+ `code_judge command for evaluator '${name}' in '${evalId}'`
687
705
  );
688
706
  }
689
- if (!script) {
690
- logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
707
+ if (!command) {
708
+ logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing command`);
691
709
  continue;
692
710
  }
693
711
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
@@ -732,6 +750,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
732
750
  const knownProps2 = /* @__PURE__ */ new Set([
733
751
  "name",
734
752
  "type",
753
+ "command",
735
754
  "script",
736
755
  "cwd",
737
756
  "weight",
@@ -748,7 +767,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
748
767
  evaluators.push({
749
768
  name,
750
769
  type: "code",
751
- script,
770
+ command,
752
771
  cwd,
753
772
  resolvedCwd,
754
773
  ...weight2 !== void 0 ? { weight: weight2 } : {},
@@ -1350,20 +1369,20 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
1350
1369
  let resolvedPromptScript;
1351
1370
  let promptScriptConfig;
1352
1371
  if (isJsonObject2(rawPrompt)) {
1353
- const scriptArray = asStringArray(
1354
- rawPrompt.script,
1355
- `prompt.script for evaluator '${name}' in '${evalId}'`
1372
+ const commandArray = asStringArray(
1373
+ rawPrompt.command ?? rawPrompt.script,
1374
+ `prompt.command for evaluator '${name}' in '${evalId}'`
1356
1375
  );
1357
- if (!scriptArray) {
1358
- throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires script array`);
1376
+ if (!commandArray) {
1377
+ throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires command array`);
1359
1378
  }
1360
- const scriptPath = scriptArray[scriptArray.length - 1];
1361
- const resolved = await resolveFileReference3(scriptPath, searchRoots);
1379
+ const commandPath = commandArray[commandArray.length - 1];
1380
+ const resolved = await resolveFileReference3(commandPath, searchRoots);
1362
1381
  if (resolved.resolvedPath) {
1363
- resolvedPromptScript = [...scriptArray.slice(0, -1), path4.resolve(resolved.resolvedPath)];
1382
+ resolvedPromptScript = [...commandArray.slice(0, -1), path4.resolve(resolved.resolvedPath)];
1364
1383
  } else {
1365
1384
  throw new Error(
1366
- `Evaluator '${name}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
1385
+ `Evaluator '${name}' in '${evalId}': prompt command file not found: ${resolved.displayPath}`
1367
1386
  );
1368
1387
  }
1369
1388
  if (isJsonObject2(rawPrompt.config)) {
@@ -2594,6 +2613,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
2594
2613
  trials: extractTrialsConfig(parsed),
2595
2614
  targets: extractTargetsFromSuite(parsed),
2596
2615
  cacheConfig: extractCacheConfig(parsed),
2616
+ totalBudgetUsd: extractTotalBudgetUsd(parsed),
2597
2617
  ...metadata !== void 0 && { metadata }
2598
2618
  };
2599
2619
  }
@@ -2784,16 +2804,16 @@ var loadEvalCaseById = loadTestById;
2784
2804
  function parseWorkspaceScriptConfig(raw, evalFileDir) {
2785
2805
  if (!isJsonObject(raw)) return void 0;
2786
2806
  const obj = raw;
2787
- const script = obj.script;
2788
- if (!Array.isArray(script) || script.length === 0) return void 0;
2789
- const scriptArr = script.filter((s) => typeof s === "string");
2790
- if (scriptArr.length === 0) return void 0;
2807
+ const commandSource = obj.command ?? obj.script;
2808
+ if (!Array.isArray(commandSource) || commandSource.length === 0) return void 0;
2809
+ const commandArr = commandSource.filter((s) => typeof s === "string");
2810
+ if (commandArr.length === 0) return void 0;
2791
2811
  const timeoutMs = typeof obj.timeout_ms === "number" ? obj.timeout_ms : void 0;
2792
2812
  let cwd = typeof obj.cwd === "string" ? obj.cwd : void 0;
2793
2813
  if (cwd && !path8.isAbsolute(cwd)) {
2794
2814
  cwd = path8.resolve(evalFileDir, cwd);
2795
2815
  }
2796
- const config = { script: scriptArr };
2816
+ const config = { command: commandArr };
2797
2817
  if (timeoutMs !== void 0) {
2798
2818
  return { ...config, timeout_ms: timeoutMs, ...cwd !== void 0 && { cwd } };
2799
2819
  }
@@ -3871,50 +3891,58 @@ var CliProvider = class {
3871
3891
  await this.ensureHealthy(request.signal);
3872
3892
  const effectiveCwd = request.cwd ?? this.config.cwd;
3873
3893
  const outputFilePath = generateOutputFilePath(request.evalCaseId);
3874
- const templateValues = buildTemplateValues(request, this.config, outputFilePath);
3875
- const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
3894
+ const { values: templateValues, promptFilePath } = await buildTemplateValues(
3895
+ request,
3896
+ this.config,
3897
+ outputFilePath
3898
+ );
3899
+ const renderedCommand = renderTemplate(this.config.command, templateValues);
3876
3900
  if (this.verbose) {
3877
3901
  console.log(
3878
3902
  `[cli-provider:${this.targetName}] cwd=${effectiveCwd ?? ""} command=${renderedCommand}`
3879
3903
  );
3880
3904
  }
3881
- const startTime = Date.now();
3882
- const result = await this.runCommand(renderedCommand, {
3883
- cwd: effectiveCwd,
3884
- env: process.env,
3885
- timeoutMs: this.config.timeoutMs,
3886
- signal: request.signal
3887
- });
3888
- const measuredDurationMs = Date.now() - startTime;
3889
- if (result.failed || (result.exitCode ?? 0) !== 0) {
3890
- if (request.signal?.aborted) {
3891
- throw new Error("CLI provider request was aborted");
3892
- }
3893
- if (result.timedOut) {
3894
- throw new Error(
3895
- `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
3896
- );
3897
- }
3898
- const codeText = result.exitCode !== null ? result.exitCode : "unknown";
3899
- const detail = result.stderr.trim() || result.stdout.trim();
3900
- const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
3901
- throw new Error(message);
3902
- }
3903
- const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
3904
- const parsed = this.parseOutputContent(responseContent);
3905
- return {
3906
- output: parsed.output,
3907
- tokenUsage: parsed.tokenUsage,
3908
- costUsd: parsed.costUsd,
3909
- durationMs: parsed.durationMs ?? measuredDurationMs,
3910
- raw: {
3911
- command: renderedCommand,
3912
- stderr: result.stderr,
3913
- exitCode: result.exitCode ?? 0,
3905
+ try {
3906
+ const startTime = Date.now();
3907
+ const result = await this.runCommand(renderedCommand, {
3914
3908
  cwd: effectiveCwd,
3915
- outputFile: outputFilePath
3909
+ env: process.env,
3910
+ timeoutMs: this.config.timeoutMs,
3911
+ signal: request.signal
3912
+ });
3913
+ const measuredDurationMs = Date.now() - startTime;
3914
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
3915
+ if (request.signal?.aborted) {
3916
+ throw new Error("CLI provider request was aborted");
3917
+ }
3918
+ if (result.timedOut) {
3919
+ throw new Error(
3920
+ `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
3921
+ );
3922
+ }
3923
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
3924
+ const detail = result.stderr.trim() || result.stdout.trim();
3925
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
3926
+ throw new Error(message);
3916
3927
  }
3917
- };
3928
+ const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
3929
+ const parsed = this.parseOutputContent(responseContent);
3930
+ return {
3931
+ output: parsed.output,
3932
+ tokenUsage: parsed.tokenUsage,
3933
+ costUsd: parsed.costUsd,
3934
+ durationMs: parsed.durationMs ?? measuredDurationMs,
3935
+ raw: {
3936
+ command: renderedCommand,
3937
+ stderr: result.stderr,
3938
+ exitCode: result.exitCode ?? 0,
3939
+ cwd: effectiveCwd,
3940
+ outputFile: outputFilePath
3941
+ }
3942
+ };
3943
+ } finally {
3944
+ await cleanupTempFile(promptFilePath, this.keepTempFiles);
3945
+ }
3918
3946
  }
3919
3947
  async invokeBatch(requests) {
3920
3948
  if (requests.length === 0) {
@@ -3937,7 +3965,7 @@ var CliProvider = class {
3937
3965
  batchInputFiles.push(...request.inputFiles);
3938
3966
  }
3939
3967
  }
3940
- const templateValues = buildTemplateValues(
3968
+ const { values: templateValues, promptFilePath } = await buildTemplateValues(
3941
3969
  {
3942
3970
  question: "",
3943
3971
  guidelines: "",
@@ -3948,87 +3976,91 @@ var CliProvider = class {
3948
3976
  this.config,
3949
3977
  outputFilePath
3950
3978
  );
3951
- const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
3979
+ const renderedCommand = renderTemplate(this.config.command, templateValues);
3952
3980
  if (this.verbose) {
3953
3981
  console.log(
3954
3982
  `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
3955
3983
  );
3956
3984
  }
3957
- const startTime = Date.now();
3958
- const result = await this.runCommand(renderedCommand, {
3959
- cwd: this.config.cwd,
3960
- env: process.env,
3961
- timeoutMs: this.config.timeoutMs,
3962
- signal: controller.signal
3963
- });
3964
- const measuredDurationMs = Date.now() - startTime;
3965
- if (result.failed || (result.exitCode ?? 0) !== 0) {
3966
- if (controller.signal.aborted) {
3967
- throw new Error("CLI provider request was aborted");
3968
- }
3969
- if (result.timedOut) {
3970
- throw new Error(
3971
- `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
3972
- );
3973
- }
3974
- const codeText = result.exitCode !== null ? result.exitCode : "unknown";
3975
- const detail = result.stderr.trim() || result.stdout.trim();
3976
- const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
3977
- throw new Error(message);
3978
- }
3979
- const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
3980
- const recordsById = this.parseJsonlBatchOutput(responseContent);
3981
- const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
3982
- const responses = requests.map((request) => {
3983
- const evalCaseId = request.evalCaseId;
3984
- if (!evalCaseId) {
3985
- return {
3986
- output: [],
3987
- durationMs: perRequestFallbackMs,
3988
- raw: {
3989
- command: renderedCommand,
3990
- stderr: result.stderr,
3991
- exitCode: result.exitCode ?? 0,
3992
- cwd: this.config.cwd,
3993
- outputFile: outputFilePath
3985
+ try {
3986
+ const startTime = Date.now();
3987
+ const result = await this.runCommand(renderedCommand, {
3988
+ cwd: this.config.cwd,
3989
+ env: process.env,
3990
+ timeoutMs: this.config.timeoutMs,
3991
+ signal: controller.signal
3992
+ });
3993
+ const measuredDurationMs = Date.now() - startTime;
3994
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
3995
+ if (controller.signal.aborted) {
3996
+ throw new Error("CLI provider request was aborted");
3997
+ }
3998
+ if (result.timedOut) {
3999
+ throw new Error(
4000
+ `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
4001
+ );
4002
+ }
4003
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
4004
+ const detail = result.stderr.trim() || result.stdout.trim();
4005
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
4006
+ throw new Error(message);
4007
+ }
4008
+ const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
4009
+ const recordsById = this.parseJsonlBatchOutput(responseContent);
4010
+ const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
4011
+ const responses = requests.map((request) => {
4012
+ const evalCaseId = request.evalCaseId;
4013
+ if (!evalCaseId) {
4014
+ return {
4015
+ output: [],
4016
+ durationMs: perRequestFallbackMs,
4017
+ raw: {
4018
+ command: renderedCommand,
4019
+ stderr: result.stderr,
4020
+ exitCode: result.exitCode ?? 0,
4021
+ cwd: this.config.cwd,
4022
+ outputFile: outputFilePath
4023
+ }
4024
+ };
4025
+ }
4026
+ const parsed = recordsById.get(evalCaseId);
4027
+ if (!parsed) {
4028
+ const errorMessage = `Batch output missing id '${evalCaseId}'`;
4029
+ if (this.verbose) {
4030
+ console.warn(`[cli-provider:${this.targetName}] ${errorMessage}`);
3994
4031
  }
3995
- };
3996
- }
3997
- const parsed = recordsById.get(evalCaseId);
3998
- if (!parsed) {
3999
- const errorMessage = `Batch output missing id '${evalCaseId}'`;
4000
- if (this.verbose) {
4001
- console.warn(`[cli-provider:${this.targetName}] ${errorMessage}`);
4032
+ return {
4033
+ output: [{ role: "assistant", content: `Error: ${errorMessage}` }],
4034
+ durationMs: perRequestFallbackMs,
4035
+ raw: {
4036
+ command: renderedCommand,
4037
+ stderr: result.stderr,
4038
+ exitCode: result.exitCode ?? 0,
4039
+ cwd: this.config.cwd,
4040
+ outputFile: outputFilePath,
4041
+ error: errorMessage
4042
+ }
4043
+ };
4002
4044
  }
4003
4045
  return {
4004
- output: [{ role: "assistant", content: `Error: ${errorMessage}` }],
4005
- durationMs: perRequestFallbackMs,
4046
+ output: parsed.output,
4047
+ tokenUsage: parsed.tokenUsage,
4048
+ costUsd: parsed.costUsd,
4049
+ durationMs: parsed.durationMs ?? perRequestFallbackMs,
4006
4050
  raw: {
4007
4051
  command: renderedCommand,
4008
4052
  stderr: result.stderr,
4009
4053
  exitCode: result.exitCode ?? 0,
4010
4054
  cwd: this.config.cwd,
4011
4055
  outputFile: outputFilePath,
4012
- error: errorMessage
4056
+ recordId: evalCaseId
4013
4057
  }
4014
4058
  };
4015
- }
4016
- return {
4017
- output: parsed.output,
4018
- tokenUsage: parsed.tokenUsage,
4019
- costUsd: parsed.costUsd,
4020
- durationMs: parsed.durationMs ?? perRequestFallbackMs,
4021
- raw: {
4022
- command: renderedCommand,
4023
- stderr: result.stderr,
4024
- exitCode: result.exitCode ?? 0,
4025
- cwd: this.config.cwd,
4026
- outputFile: outputFilePath,
4027
- recordId: evalCaseId
4028
- }
4029
- };
4030
- });
4031
- return responses;
4059
+ });
4060
+ return responses;
4061
+ } finally {
4062
+ await cleanupTempFile(promptFilePath, this.keepTempFiles);
4063
+ }
4032
4064
  }
4033
4065
  /**
4034
4066
  * Parse output content from CLI.
@@ -4143,7 +4175,7 @@ var CliProvider = class {
4143
4175
  return;
4144
4176
  }
4145
4177
  const timeoutMs = healthcheck.timeoutMs ?? this.config.timeoutMs;
4146
- if (healthcheck.type === "http") {
4178
+ if ("url" in healthcheck && healthcheck.url) {
4147
4179
  const controller = new AbortController();
4148
4180
  const timer = timeoutMs ? setTimeout(() => controller.abort(), timeoutMs) : void 0;
4149
4181
  signal?.addEventListener("abort", () => controller.abort(), { once: true });
@@ -4162,50 +4194,70 @@ var CliProvider = class {
4162
4194
  }
4163
4195
  return;
4164
4196
  }
4165
- const renderedCommand = renderTemplate(
4166
- healthcheck.commandTemplate,
4167
- buildTemplateValues(
4168
- {
4169
- question: "",
4170
- guidelines: "",
4171
- inputFiles: [],
4172
- evalCaseId: "healthcheck",
4173
- attempt: 0
4174
- },
4175
- this.config,
4176
- generateOutputFilePath("healthcheck")
4177
- )
4197
+ const hcCommand = "command" in healthcheck ? healthcheck.command : void 0;
4198
+ if (!hcCommand) {
4199
+ throw new Error(`CLI healthcheck for '${this.targetName}': 'command' or 'url' is required`);
4200
+ }
4201
+ const { values: templateValues, promptFilePath } = await buildTemplateValues(
4202
+ {
4203
+ question: "",
4204
+ guidelines: "",
4205
+ inputFiles: [],
4206
+ evalCaseId: "healthcheck",
4207
+ attempt: 0
4208
+ },
4209
+ this.config,
4210
+ generateOutputFilePath("healthcheck")
4178
4211
  );
4212
+ const renderedCommand = renderTemplate(hcCommand, templateValues);
4213
+ const hcCwd = "cwd" in healthcheck ? healthcheck.cwd : void 0;
4179
4214
  if (this.verbose) {
4180
4215
  console.log(
4181
- `[cli-provider:${this.targetName}] (healthcheck) cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
4216
+ `[cli-provider:${this.targetName}] (healthcheck) cwd=${hcCwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
4182
4217
  );
4183
4218
  }
4184
- const result = await this.runCommand(renderedCommand, {
4185
- cwd: healthcheck.cwd ?? this.config.cwd,
4186
- env: process.env,
4187
- timeoutMs,
4188
- signal
4189
- });
4190
- if (result.failed || (result.exitCode ?? 0) !== 0) {
4191
- const codeText = result.exitCode !== null ? result.exitCode : "unknown";
4192
- const detail = result.stderr.trim() || result.stdout.trim();
4193
- const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
4194
- throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
4219
+ try {
4220
+ const result = await this.runCommand(renderedCommand, {
4221
+ cwd: hcCwd ?? this.config.cwd,
4222
+ env: process.env,
4223
+ timeoutMs,
4224
+ signal
4225
+ });
4226
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
4227
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
4228
+ const detail = result.stderr.trim() || result.stdout.trim();
4229
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
4230
+ throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
4231
+ }
4232
+ } finally {
4233
+ await cleanupTempFile(promptFilePath, this.keepTempFiles);
4195
4234
  }
4196
4235
  }
4197
4236
  };
4198
- function buildTemplateValues(request, config, outputFilePath) {
4237
+ async function buildTemplateValues(request, config, outputFilePath) {
4199
4238
  const inputFiles = normalizeInputFiles2(request.inputFiles);
4239
+ const promptFilePath = generateOutputFilePath(request.evalCaseId, ".prompt.txt");
4240
+ await fs.writeFile(promptFilePath, request.question ?? "", "utf8");
4200
4241
  return {
4201
- PROMPT: shellEscape(request.question ?? ""),
4202
- GUIDELINES: shellEscape(request.guidelines ?? ""),
4203
- EVAL_ID: shellEscape(request.evalCaseId ?? ""),
4204
- ATTEMPT: shellEscape(String(request.attempt ?? 0)),
4205
- FILES: formatFileList(inputFiles, config.filesFormat),
4206
- OUTPUT_FILE: shellEscape(outputFilePath)
4242
+ values: {
4243
+ PROMPT: shellEscape(request.question ?? ""),
4244
+ PROMPT_FILE: shellEscape(promptFilePath),
4245
+ GUIDELINES: shellEscape(request.guidelines ?? ""),
4246
+ EVAL_ID: shellEscape(request.evalCaseId ?? ""),
4247
+ ATTEMPT: shellEscape(String(request.attempt ?? 0)),
4248
+ FILES: formatFileList(inputFiles, config.filesFormat),
4249
+ OUTPUT_FILE: shellEscape(outputFilePath)
4250
+ },
4251
+ promptFilePath
4207
4252
  };
4208
4253
  }
4254
+ async function cleanupTempFile(filePath, keepTempFiles) {
4255
+ if (!filePath || keepTempFiles) {
4256
+ return;
4257
+ }
4258
+ await fs.unlink(filePath).catch(() => {
4259
+ });
4260
+ }
4209
4261
  function normalizeInputFiles2(inputFiles) {
4210
4262
  if (!inputFiles || inputFiles.length === 0) {
4211
4263
  return void 0;
@@ -8086,7 +8138,7 @@ async function discoverProviders(registry, baseDir) {
8086
8138
  }
8087
8139
  registry.register(kindName, (target) => {
8088
8140
  return new CliProvider(target.name, {
8089
- commandTemplate: `bun run ${filePath} {PROMPT}`
8141
+ command: `bun run ${filePath} {PROMPT}`
8090
8142
  });
8091
8143
  });
8092
8144
  discoveredKinds.push(kindName);
@@ -8599,13 +8651,13 @@ function toCamelCaseDeep(obj) {
8599
8651
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
8600
8652
  var CodeEvaluator = class {
8601
8653
  kind = "code";
8602
- script;
8654
+ command;
8603
8655
  cwd;
8604
8656
  agentTimeoutMs;
8605
8657
  config;
8606
8658
  target;
8607
8659
  constructor(options) {
8608
- this.script = options.script;
8660
+ this.command = options.command ?? options.script ?? [];
8609
8661
  this.cwd = options.cwd;
8610
8662
  this.agentTimeoutMs = options.agentTimeoutMs;
8611
8663
  this.config = options.config;
@@ -8664,7 +8716,7 @@ var CodeEvaluator = class {
8664
8716
  const env = proxyEnv || workspaceEnv ? { ...proxyEnv, ...workspaceEnv } : void 0;
8665
8717
  try {
8666
8718
  const stdout = await executeScript(
8667
- this.script,
8719
+ this.command,
8668
8720
  inputPayload,
8669
8721
  this.agentTimeoutMs,
8670
8722
  this.cwd,
@@ -8678,7 +8730,7 @@ var CodeEvaluator = class {
8678
8730
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
8679
8731
  const proxyUsage = getProxyUsage?.();
8680
8732
  const evaluatorRawRequest = {
8681
- script: this.script,
8733
+ command: this.command,
8682
8734
  ...this.cwd ? { cwd: this.cwd } : {},
8683
8735
  ...proxyUsage ? {
8684
8736
  target_proxy: {
@@ -8708,7 +8760,7 @@ var CodeEvaluator = class {
8708
8760
  expectedAspectCount: 1,
8709
8761
  reasoning: message,
8710
8762
  evaluatorRawRequest: {
8711
- script: this.script,
8763
+ command: this.command,
8712
8764
  ...this.cwd ? { cwd: this.cwd } : {},
8713
8765
  ...proxyUsage ? {
8714
8766
  target_proxy: {
@@ -11638,7 +11690,7 @@ var llmJudgeFactory = (config, context) => {
11638
11690
  var codeFactory = (config, context) => {
11639
11691
  const c = config;
11640
11692
  return new CodeEvaluator({
11641
- script: c.script,
11693
+ command: c.command ?? c.script ?? [],
11642
11694
  cwd: c.resolvedCwd ?? c.cwd,
11643
11695
  agentTimeoutMs: context.agentTimeoutMs,
11644
11696
  config: c.config,
@@ -11820,7 +11872,7 @@ async function discoverAssertions(registry, baseDir) {
11820
11872
  }
11821
11873
  const factory = (_config, context) => {
11822
11874
  return new CodeEvaluator({
11823
- script: ["bun", "run", filePath],
11875
+ command: ["bun", "run", filePath],
11824
11876
  agentTimeoutMs: context.agentTimeoutMs
11825
11877
  });
11826
11878
  };
@@ -12174,7 +12226,8 @@ async function executeWorkspaceScript(config, context, failureMode = "fatal") {
12174
12226
  });
12175
12227
  const timeoutMs = config.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
12176
12228
  const cwd = config.cwd;
12177
- const result = await execFileWithStdin(config.script, stdin, {
12229
+ const commandArray = config.command ?? config.script ?? [];
12230
+ const result = await execFileWithStdin(commandArray, stdin, {
12178
12231
  timeoutMs,
12179
12232
  cwd
12180
12233
  });
@@ -12221,7 +12274,8 @@ async function runEvaluation(options) {
12221
12274
  keepWorkspaces,
12222
12275
  cleanupWorkspaces,
12223
12276
  trials,
12224
- streamCallbacks
12277
+ streamCallbacks,
12278
+ totalBudgetUsd
12225
12279
  } = options;
12226
12280
  let useCache = options.useCache;
12227
12281
  if (trials && trials.count > 1 && useCache) {
@@ -12394,10 +12448,39 @@ async function runEvaluation(options) {
12394
12448
  let nextWorkerId = 1;
12395
12449
  const workerIdByEvalId = /* @__PURE__ */ new Map();
12396
12450
  let beforeAllOutputAttached = false;
12451
+ let cumulativeBudgetCost = 0;
12452
+ let budgetExhausted = false;
12397
12453
  const promises = filteredEvalCases.map(
12398
12454
  (evalCase) => limit(async () => {
12399
12455
  const workerId = nextWorkerId++;
12400
12456
  workerIdByEvalId.set(evalCase.id, workerId);
12457
+ if (totalBudgetUsd !== void 0 && budgetExhausted) {
12458
+ const budgetResult = {
12459
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
12460
+ testId: evalCase.id,
12461
+ dataset: evalCase.dataset,
12462
+ score: 0,
12463
+ hits: [],
12464
+ misses: [],
12465
+ answer: "",
12466
+ target: target.name,
12467
+ error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
12468
+ budgetExceeded: true
12469
+ };
12470
+ if (onProgress) {
12471
+ await onProgress({
12472
+ workerId,
12473
+ testId: evalCase.id,
12474
+ status: "failed",
12475
+ completedAt: Date.now(),
12476
+ error: budgetResult.error
12477
+ });
12478
+ }
12479
+ if (onResult) {
12480
+ await onResult(budgetResult);
12481
+ }
12482
+ return budgetResult;
12483
+ }
12401
12484
  if (onProgress) {
12402
12485
  await onProgress({
12403
12486
  workerId,
@@ -12431,6 +12514,23 @@ async function runEvaluation(options) {
12431
12514
  typeRegistry
12432
12515
  };
12433
12516
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
12517
+ if (totalBudgetUsd !== void 0) {
12518
+ let caseCost;
12519
+ if (result.trials && result.trials.length > 0) {
12520
+ const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
12521
+ if (trialCostSum > 0) {
12522
+ caseCost = trialCostSum;
12523
+ }
12524
+ } else {
12525
+ caseCost = result.trace?.costUsd;
12526
+ }
12527
+ if (caseCost !== void 0) {
12528
+ cumulativeBudgetCost += caseCost;
12529
+ if (cumulativeBudgetCost >= totalBudgetUsd) {
12530
+ budgetExhausted = true;
12531
+ }
12532
+ }
12533
+ }
12434
12534
  if (beforeAllOutput && !beforeAllOutputAttached) {
12435
12535
  result = { ...result, beforeAllOutput };
12436
12536
  beforeAllOutputAttached = true;