@agentv/core 2.7.1-next.5 → 2.8.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -2141,6 +2141,24 @@ function extractCacheConfig(suite) {
2141
2141
  const resolvedCachePath = typeof cachePath === "string" && cachePath.trim().length > 0 ? cachePath.trim() : void 0;
2142
2142
  return { enabled: cache, cachePath: resolvedCachePath };
2143
2143
  }
2144
+ function extractTotalBudgetUsd(suite) {
2145
+ const execution = suite.execution;
2146
+ if (!execution || typeof execution !== "object" || Array.isArray(execution)) {
2147
+ return void 0;
2148
+ }
2149
+ const executionObj = execution;
2150
+ const rawBudget = executionObj.total_budget_usd ?? executionObj.totalBudgetUsd;
2151
+ if (rawBudget === void 0 || rawBudget === null) {
2152
+ return void 0;
2153
+ }
2154
+ if (typeof rawBudget === "number" && rawBudget > 0) {
2155
+ return rawBudget;
2156
+ }
2157
+ logWarning(
2158
+ `Invalid execution.total_budget_usd: ${rawBudget}. Must be a positive number. Ignoring.`
2159
+ );
2160
+ return void 0;
2161
+ }
2144
2162
  function logWarning(message) {
2145
2163
  console.warn(`${ANSI_YELLOW2}Warning: ${message}${ANSI_RESET2}`);
2146
2164
  }
@@ -2273,24 +2291,24 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2273
2291
  continue;
2274
2292
  }
2275
2293
  if (typeValue === "code_judge") {
2276
- let script;
2277
- const rawScript = rawEvaluator.script;
2278
- if (typeof rawScript === "string") {
2279
- const trimmed = rawScript.trim();
2294
+ let command;
2295
+ const rawCommand = rawEvaluator.command ?? rawEvaluator.script;
2296
+ if (typeof rawCommand === "string") {
2297
+ const trimmed = rawCommand.trim();
2280
2298
  if (trimmed.length === 0) {
2281
2299
  throw new Error(
2282
- `Invalid code_judge script for evaluator '${name}' in '${evalId}': script cannot be empty`
2300
+ `Invalid code_judge command for evaluator '${name}' in '${evalId}': command cannot be empty`
2283
2301
  );
2284
2302
  }
2285
- script = parseCommandToArgv(trimmed);
2303
+ command = parseCommandToArgv(trimmed);
2286
2304
  } else {
2287
- script = asStringArray(
2288
- rawScript,
2289
- `code_judge script for evaluator '${name}' in '${evalId}'`
2305
+ command = asStringArray(
2306
+ rawCommand,
2307
+ `code_judge command for evaluator '${name}' in '${evalId}'`
2290
2308
  );
2291
2309
  }
2292
- if (!script) {
2293
- logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing script`);
2310
+ if (!command) {
2311
+ logWarning2(`Skipping code_judge evaluator '${name}' in '${evalId}': missing command`);
2294
2312
  continue;
2295
2313
  }
2296
2314
  const weight2 = validateWeight(rawEvaluator.weight, name, evalId);
@@ -2335,6 +2353,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2335
2353
  const knownProps2 = /* @__PURE__ */ new Set([
2336
2354
  "name",
2337
2355
  "type",
2356
+ "command",
2338
2357
  "script",
2339
2358
  "cwd",
2340
2359
  "weight",
@@ -2351,7 +2370,7 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2351
2370
  evaluators.push({
2352
2371
  name,
2353
2372
  type: "code",
2354
- script,
2373
+ command,
2355
2374
  cwd,
2356
2375
  resolvedCwd,
2357
2376
  ...weight2 !== void 0 ? { weight: weight2 } : {},
@@ -2953,20 +2972,20 @@ async function parseEvaluatorList(candidateEvaluators, searchRoots, evalId) {
2953
2972
  let resolvedPromptScript;
2954
2973
  let promptScriptConfig;
2955
2974
  if (isJsonObject2(rawPrompt)) {
2956
- const scriptArray = asStringArray(
2957
- rawPrompt.script,
2958
- `prompt.script for evaluator '${name}' in '${evalId}'`
2975
+ const commandArray = asStringArray(
2976
+ rawPrompt.command ?? rawPrompt.script,
2977
+ `prompt.command for evaluator '${name}' in '${evalId}'`
2959
2978
  );
2960
- if (!scriptArray) {
2961
- throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires script array`);
2979
+ if (!commandArray) {
2980
+ throw new Error(`Evaluator '${name}' in '${evalId}': prompt object requires command array`);
2962
2981
  }
2963
- const scriptPath = scriptArray[scriptArray.length - 1];
2964
- const resolved = await resolveFileReference2(scriptPath, searchRoots);
2982
+ const commandPath = commandArray[commandArray.length - 1];
2983
+ const resolved = await resolveFileReference2(commandPath, searchRoots);
2965
2984
  if (resolved.resolvedPath) {
2966
- resolvedPromptScript = [...scriptArray.slice(0, -1), import_node_path4.default.resolve(resolved.resolvedPath)];
2985
+ resolvedPromptScript = [...commandArray.slice(0, -1), import_node_path4.default.resolve(resolved.resolvedPath)];
2967
2986
  } else {
2968
2987
  throw new Error(
2969
- `Evaluator '${name}' in '${evalId}': prompt script file not found: ${resolved.displayPath}`
2988
+ `Evaluator '${name}' in '${evalId}': prompt command file not found: ${resolved.displayPath}`
2970
2989
  );
2971
2990
  }
2972
2991
  if (isJsonObject2(rawPrompt.config)) {
@@ -4197,6 +4216,7 @@ async function loadTestSuite(evalFilePath, repoRoot, options) {
4197
4216
  trials: extractTrialsConfig(parsed),
4198
4217
  targets: extractTargetsFromSuite(parsed),
4199
4218
  cacheConfig: extractCacheConfig(parsed),
4219
+ totalBudgetUsd: extractTotalBudgetUsd(parsed),
4200
4220
  ...metadata !== void 0 && { metadata }
4201
4221
  };
4202
4222
  }
@@ -4387,16 +4407,16 @@ var loadEvalCaseById = loadTestById;
4387
4407
  function parseWorkspaceScriptConfig(raw, evalFileDir) {
4388
4408
  if (!isJsonObject(raw)) return void 0;
4389
4409
  const obj = raw;
4390
- const script = obj.script;
4391
- if (!Array.isArray(script) || script.length === 0) return void 0;
4392
- const scriptArr = script.filter((s) => typeof s === "string");
4393
- if (scriptArr.length === 0) return void 0;
4410
+ const commandSource = obj.command ?? obj.script;
4411
+ if (!Array.isArray(commandSource) || commandSource.length === 0) return void 0;
4412
+ const commandArr = commandSource.filter((s) => typeof s === "string");
4413
+ if (commandArr.length === 0) return void 0;
4394
4414
  const timeoutMs = typeof obj.timeout_ms === "number" ? obj.timeout_ms : void 0;
4395
4415
  let cwd = typeof obj.cwd === "string" ? obj.cwd : void 0;
4396
4416
  if (cwd && !import_node_path8.default.isAbsolute(cwd)) {
4397
4417
  cwd = import_node_path8.default.resolve(evalFileDir, cwd);
4398
4418
  }
4399
- const config = { script: scriptArr };
4419
+ const config = { command: commandArr };
4400
4420
  if (timeoutMs !== void 0) {
4401
4421
  return { ...config, timeout_ms: timeoutMs, ...cwd !== void 0 && { cwd } };
4402
4422
  }
@@ -5589,50 +5609,58 @@ var CliProvider = class {
5589
5609
  await this.ensureHealthy(request.signal);
5590
5610
  const effectiveCwd = request.cwd ?? this.config.cwd;
5591
5611
  const outputFilePath = generateOutputFilePath(request.evalCaseId);
5592
- const templateValues = buildTemplateValues(request, this.config, outputFilePath);
5593
- const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
5612
+ const { values: templateValues, promptFilePath } = await buildTemplateValues(
5613
+ request,
5614
+ this.config,
5615
+ outputFilePath
5616
+ );
5617
+ const renderedCommand = renderTemplate(this.config.command, templateValues);
5594
5618
  if (this.verbose) {
5595
5619
  console.log(
5596
5620
  `[cli-provider:${this.targetName}] cwd=${effectiveCwd ?? ""} command=${renderedCommand}`
5597
5621
  );
5598
5622
  }
5599
- const startTime = Date.now();
5600
- const result = await this.runCommand(renderedCommand, {
5601
- cwd: effectiveCwd,
5602
- env: process.env,
5603
- timeoutMs: this.config.timeoutMs,
5604
- signal: request.signal
5605
- });
5606
- const measuredDurationMs = Date.now() - startTime;
5607
- if (result.failed || (result.exitCode ?? 0) !== 0) {
5608
- if (request.signal?.aborted) {
5609
- throw new Error("CLI provider request was aborted");
5610
- }
5611
- if (result.timedOut) {
5612
- throw new Error(
5613
- `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
5614
- );
5615
- }
5616
- const codeText = result.exitCode !== null ? result.exitCode : "unknown";
5617
- const detail = result.stderr.trim() || result.stdout.trim();
5618
- const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
5619
- throw new Error(message);
5620
- }
5621
- const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
5622
- const parsed = this.parseOutputContent(responseContent);
5623
- return {
5624
- output: parsed.output,
5625
- tokenUsage: parsed.tokenUsage,
5626
- costUsd: parsed.costUsd,
5627
- durationMs: parsed.durationMs ?? measuredDurationMs,
5628
- raw: {
5629
- command: renderedCommand,
5630
- stderr: result.stderr,
5631
- exitCode: result.exitCode ?? 0,
5623
+ try {
5624
+ const startTime = Date.now();
5625
+ const result = await this.runCommand(renderedCommand, {
5632
5626
  cwd: effectiveCwd,
5633
- outputFile: outputFilePath
5627
+ env: process.env,
5628
+ timeoutMs: this.config.timeoutMs,
5629
+ signal: request.signal
5630
+ });
5631
+ const measuredDurationMs = Date.now() - startTime;
5632
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
5633
+ if (request.signal?.aborted) {
5634
+ throw new Error("CLI provider request was aborted");
5635
+ }
5636
+ if (result.timedOut) {
5637
+ throw new Error(
5638
+ `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
5639
+ );
5640
+ }
5641
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
5642
+ const detail = result.stderr.trim() || result.stdout.trim();
5643
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
5644
+ throw new Error(message);
5634
5645
  }
5635
- };
5646
+ const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
5647
+ const parsed = this.parseOutputContent(responseContent);
5648
+ return {
5649
+ output: parsed.output,
5650
+ tokenUsage: parsed.tokenUsage,
5651
+ costUsd: parsed.costUsd,
5652
+ durationMs: parsed.durationMs ?? measuredDurationMs,
5653
+ raw: {
5654
+ command: renderedCommand,
5655
+ stderr: result.stderr,
5656
+ exitCode: result.exitCode ?? 0,
5657
+ cwd: effectiveCwd,
5658
+ outputFile: outputFilePath
5659
+ }
5660
+ };
5661
+ } finally {
5662
+ await cleanupTempFile(promptFilePath, this.keepTempFiles);
5663
+ }
5636
5664
  }
5637
5665
  async invokeBatch(requests) {
5638
5666
  if (requests.length === 0) {
@@ -5655,7 +5683,7 @@ var CliProvider = class {
5655
5683
  batchInputFiles.push(...request.inputFiles);
5656
5684
  }
5657
5685
  }
5658
- const templateValues = buildTemplateValues(
5686
+ const { values: templateValues, promptFilePath } = await buildTemplateValues(
5659
5687
  {
5660
5688
  question: "",
5661
5689
  guidelines: "",
@@ -5666,87 +5694,91 @@ var CliProvider = class {
5666
5694
  this.config,
5667
5695
  outputFilePath
5668
5696
  );
5669
- const renderedCommand = renderTemplate(this.config.commandTemplate, templateValues);
5697
+ const renderedCommand = renderTemplate(this.config.command, templateValues);
5670
5698
  if (this.verbose) {
5671
5699
  console.log(
5672
5700
  `[cli-provider:${this.targetName}] (batch size=${requests.length}) cwd=${this.config.cwd ?? ""} command=${renderedCommand}`
5673
5701
  );
5674
5702
  }
5675
- const startTime = Date.now();
5676
- const result = await this.runCommand(renderedCommand, {
5677
- cwd: this.config.cwd,
5678
- env: process.env,
5679
- timeoutMs: this.config.timeoutMs,
5680
- signal: controller.signal
5681
- });
5682
- const measuredDurationMs = Date.now() - startTime;
5683
- if (result.failed || (result.exitCode ?? 0) !== 0) {
5684
- if (controller.signal.aborted) {
5685
- throw new Error("CLI provider request was aborted");
5686
- }
5687
- if (result.timedOut) {
5688
- throw new Error(
5689
- `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
5690
- );
5691
- }
5692
- const codeText = result.exitCode !== null ? result.exitCode : "unknown";
5693
- const detail = result.stderr.trim() || result.stdout.trim();
5694
- const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
5695
- throw new Error(message);
5696
- }
5697
- const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
5698
- const recordsById = this.parseJsonlBatchOutput(responseContent);
5699
- const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
5700
- const responses = requests.map((request) => {
5701
- const evalCaseId = request.evalCaseId;
5702
- if (!evalCaseId) {
5703
- return {
5704
- output: [],
5705
- durationMs: perRequestFallbackMs,
5706
- raw: {
5707
- command: renderedCommand,
5708
- stderr: result.stderr,
5709
- exitCode: result.exitCode ?? 0,
5710
- cwd: this.config.cwd,
5711
- outputFile: outputFilePath
5703
+ try {
5704
+ const startTime = Date.now();
5705
+ const result = await this.runCommand(renderedCommand, {
5706
+ cwd: this.config.cwd,
5707
+ env: process.env,
5708
+ timeoutMs: this.config.timeoutMs,
5709
+ signal: controller.signal
5710
+ });
5711
+ const measuredDurationMs = Date.now() - startTime;
5712
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
5713
+ if (controller.signal.aborted) {
5714
+ throw new Error("CLI provider request was aborted");
5715
+ }
5716
+ if (result.timedOut) {
5717
+ throw new Error(
5718
+ `CLI provider timed out${formatTimeoutSuffix(this.config.timeoutMs ?? void 0)}`
5719
+ );
5720
+ }
5721
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
5722
+ const detail = result.stderr.trim() || result.stdout.trim();
5723
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI exited with code ${codeText}`;
5724
+ throw new Error(message);
5725
+ }
5726
+ const responseContent = await this.readAndCleanupOutputFile(outputFilePath);
5727
+ const recordsById = this.parseJsonlBatchOutput(responseContent);
5728
+ const perRequestFallbackMs = Math.round(measuredDurationMs / requests.length);
5729
+ const responses = requests.map((request) => {
5730
+ const evalCaseId = request.evalCaseId;
5731
+ if (!evalCaseId) {
5732
+ return {
5733
+ output: [],
5734
+ durationMs: perRequestFallbackMs,
5735
+ raw: {
5736
+ command: renderedCommand,
5737
+ stderr: result.stderr,
5738
+ exitCode: result.exitCode ?? 0,
5739
+ cwd: this.config.cwd,
5740
+ outputFile: outputFilePath
5741
+ }
5742
+ };
5743
+ }
5744
+ const parsed = recordsById.get(evalCaseId);
5745
+ if (!parsed) {
5746
+ const errorMessage = `Batch output missing id '${evalCaseId}'`;
5747
+ if (this.verbose) {
5748
+ console.warn(`[cli-provider:${this.targetName}] ${errorMessage}`);
5712
5749
  }
5713
- };
5714
- }
5715
- const parsed = recordsById.get(evalCaseId);
5716
- if (!parsed) {
5717
- const errorMessage = `Batch output missing id '${evalCaseId}'`;
5718
- if (this.verbose) {
5719
- console.warn(`[cli-provider:${this.targetName}] ${errorMessage}`);
5750
+ return {
5751
+ output: [{ role: "assistant", content: `Error: ${errorMessage}` }],
5752
+ durationMs: perRequestFallbackMs,
5753
+ raw: {
5754
+ command: renderedCommand,
5755
+ stderr: result.stderr,
5756
+ exitCode: result.exitCode ?? 0,
5757
+ cwd: this.config.cwd,
5758
+ outputFile: outputFilePath,
5759
+ error: errorMessage
5760
+ }
5761
+ };
5720
5762
  }
5721
5763
  return {
5722
- output: [{ role: "assistant", content: `Error: ${errorMessage}` }],
5723
- durationMs: perRequestFallbackMs,
5764
+ output: parsed.output,
5765
+ tokenUsage: parsed.tokenUsage,
5766
+ costUsd: parsed.costUsd,
5767
+ durationMs: parsed.durationMs ?? perRequestFallbackMs,
5724
5768
  raw: {
5725
5769
  command: renderedCommand,
5726
5770
  stderr: result.stderr,
5727
5771
  exitCode: result.exitCode ?? 0,
5728
5772
  cwd: this.config.cwd,
5729
5773
  outputFile: outputFilePath,
5730
- error: errorMessage
5774
+ recordId: evalCaseId
5731
5775
  }
5732
5776
  };
5733
- }
5734
- return {
5735
- output: parsed.output,
5736
- tokenUsage: parsed.tokenUsage,
5737
- costUsd: parsed.costUsd,
5738
- durationMs: parsed.durationMs ?? perRequestFallbackMs,
5739
- raw: {
5740
- command: renderedCommand,
5741
- stderr: result.stderr,
5742
- exitCode: result.exitCode ?? 0,
5743
- cwd: this.config.cwd,
5744
- outputFile: outputFilePath,
5745
- recordId: evalCaseId
5746
- }
5747
- };
5748
- });
5749
- return responses;
5777
+ });
5778
+ return responses;
5779
+ } finally {
5780
+ await cleanupTempFile(promptFilePath, this.keepTempFiles);
5781
+ }
5750
5782
  }
5751
5783
  /**
5752
5784
  * Parse output content from CLI.
@@ -5861,7 +5893,7 @@ var CliProvider = class {
5861
5893
  return;
5862
5894
  }
5863
5895
  const timeoutMs = healthcheck.timeoutMs ?? this.config.timeoutMs;
5864
- if (healthcheck.type === "http") {
5896
+ if ("url" in healthcheck && healthcheck.url) {
5865
5897
  const controller = new AbortController();
5866
5898
  const timer = timeoutMs ? setTimeout(() => controller.abort(), timeoutMs) : void 0;
5867
5899
  signal?.addEventListener("abort", () => controller.abort(), { once: true });
@@ -5880,50 +5912,70 @@ var CliProvider = class {
5880
5912
  }
5881
5913
  return;
5882
5914
  }
5883
- const renderedCommand = renderTemplate(
5884
- healthcheck.commandTemplate,
5885
- buildTemplateValues(
5886
- {
5887
- question: "",
5888
- guidelines: "",
5889
- inputFiles: [],
5890
- evalCaseId: "healthcheck",
5891
- attempt: 0
5892
- },
5893
- this.config,
5894
- generateOutputFilePath("healthcheck")
5895
- )
5915
+ const hcCommand = "command" in healthcheck ? healthcheck.command : void 0;
5916
+ if (!hcCommand) {
5917
+ throw new Error(`CLI healthcheck for '${this.targetName}': 'command' or 'url' is required`);
5918
+ }
5919
+ const { values: templateValues, promptFilePath } = await buildTemplateValues(
5920
+ {
5921
+ question: "",
5922
+ guidelines: "",
5923
+ inputFiles: [],
5924
+ evalCaseId: "healthcheck",
5925
+ attempt: 0
5926
+ },
5927
+ this.config,
5928
+ generateOutputFilePath("healthcheck")
5896
5929
  );
5930
+ const renderedCommand = renderTemplate(hcCommand, templateValues);
5931
+ const hcCwd = "cwd" in healthcheck ? healthcheck.cwd : void 0;
5897
5932
  if (this.verbose) {
5898
5933
  console.log(
5899
- `[cli-provider:${this.targetName}] (healthcheck) cwd=${healthcheck.cwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
5934
+ `[cli-provider:${this.targetName}] (healthcheck) cwd=${hcCwd ?? this.config.cwd ?? ""} command=${renderedCommand}`
5900
5935
  );
5901
5936
  }
5902
- const result = await this.runCommand(renderedCommand, {
5903
- cwd: healthcheck.cwd ?? this.config.cwd,
5904
- env: process.env,
5905
- timeoutMs,
5906
- signal
5907
- });
5908
- if (result.failed || (result.exitCode ?? 0) !== 0) {
5909
- const codeText = result.exitCode !== null ? result.exitCode : "unknown";
5910
- const detail = result.stderr.trim() || result.stdout.trim();
5911
- const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
5912
- throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
5937
+ try {
5938
+ const result = await this.runCommand(renderedCommand, {
5939
+ cwd: hcCwd ?? this.config.cwd,
5940
+ env: process.env,
5941
+ timeoutMs,
5942
+ signal
5943
+ });
5944
+ if (result.failed || (result.exitCode ?? 0) !== 0) {
5945
+ const codeText = result.exitCode !== null ? result.exitCode : "unknown";
5946
+ const detail = result.stderr.trim() || result.stdout.trim();
5947
+ const message = detail ? `${detail} (exit code ${codeText})` : `CLI healthcheck command exited with code ${codeText}`;
5948
+ throw new Error(`CLI healthcheck failed for '${this.targetName}': ${message}`);
5949
+ }
5950
+ } finally {
5951
+ await cleanupTempFile(promptFilePath, this.keepTempFiles);
5913
5952
  }
5914
5953
  }
5915
5954
  };
5916
- function buildTemplateValues(request, config, outputFilePath) {
5955
+ async function buildTemplateValues(request, config, outputFilePath) {
5917
5956
  const inputFiles = normalizeInputFiles2(request.inputFiles);
5957
+ const promptFilePath = generateOutputFilePath(request.evalCaseId, ".prompt.txt");
5958
+ await import_promises11.default.writeFile(promptFilePath, request.question ?? "", "utf8");
5918
5959
  return {
5919
- PROMPT: shellEscape(request.question ?? ""),
5920
- GUIDELINES: shellEscape(request.guidelines ?? ""),
5921
- EVAL_ID: shellEscape(request.evalCaseId ?? ""),
5922
- ATTEMPT: shellEscape(String(request.attempt ?? 0)),
5923
- FILES: formatFileList(inputFiles, config.filesFormat),
5924
- OUTPUT_FILE: shellEscape(outputFilePath)
5960
+ values: {
5961
+ PROMPT: shellEscape(request.question ?? ""),
5962
+ PROMPT_FILE: shellEscape(promptFilePath),
5963
+ GUIDELINES: shellEscape(request.guidelines ?? ""),
5964
+ EVAL_ID: shellEscape(request.evalCaseId ?? ""),
5965
+ ATTEMPT: shellEscape(String(request.attempt ?? 0)),
5966
+ FILES: formatFileList(inputFiles, config.filesFormat),
5967
+ OUTPUT_FILE: shellEscape(outputFilePath)
5968
+ },
5969
+ promptFilePath
5925
5970
  };
5926
5971
  }
5972
+ async function cleanupTempFile(filePath, keepTempFiles) {
5973
+ if (!filePath || keepTempFiles) {
5974
+ return;
5975
+ }
5976
+ await import_promises11.default.unlink(filePath).catch(() => {
5977
+ });
5978
+ }
5927
5979
  function normalizeInputFiles2(inputFiles) {
5928
5980
  if (!inputFiles || inputFiles.length === 0) {
5929
5981
  return void 0;
@@ -8285,29 +8337,25 @@ var ProviderRegistry = class {
8285
8337
  var import_node_path18 = __toESM(require("path"), 1);
8286
8338
  var import_zod3 = require("zod");
8287
8339
  var CliHealthcheckHttpInputSchema = import_zod3.z.object({
8288
- type: import_zod3.z.literal("http"),
8289
8340
  url: import_zod3.z.string().min(1, "healthcheck URL is required"),
8290
8341
  timeout_seconds: import_zod3.z.number().positive().optional(),
8291
8342
  timeoutSeconds: import_zod3.z.number().positive().optional()
8292
8343
  });
8293
8344
  var CliHealthcheckCommandInputSchema = import_zod3.z.object({
8294
- type: import_zod3.z.literal("command"),
8295
- command_template: import_zod3.z.string().optional(),
8296
- commandTemplate: import_zod3.z.string().optional(),
8345
+ command: import_zod3.z.string().min(1, "healthcheck command is required"),
8297
8346
  cwd: import_zod3.z.string().optional(),
8298
8347
  timeout_seconds: import_zod3.z.number().positive().optional(),
8299
8348
  timeoutSeconds: import_zod3.z.number().positive().optional()
8300
8349
  });
8301
- var CliHealthcheckInputSchema = import_zod3.z.discriminatedUnion("type", [
8350
+ var CliHealthcheckInputSchema = import_zod3.z.union([
8302
8351
  CliHealthcheckHttpInputSchema,
8303
8352
  CliHealthcheckCommandInputSchema
8304
8353
  ]);
8305
8354
  var CliTargetInputSchema = import_zod3.z.object({
8306
8355
  name: import_zod3.z.string().min(1, "target name is required"),
8307
8356
  provider: import_zod3.z.string().refine((p) => p.toLowerCase() === "cli", { message: "provider must be 'cli'" }),
8308
- // Command template - required (accept both naming conventions)
8309
- command_template: import_zod3.z.string().optional(),
8310
- commandTemplate: import_zod3.z.string().optional(),
8357
+ // Command - required
8358
+ command: import_zod3.z.string(),
8311
8359
  // Files format - optional
8312
8360
  files_format: import_zod3.z.string().optional(),
8313
8361
  filesFormat: import_zod3.z.string().optional(),
@@ -8337,26 +8385,22 @@ var CliTargetInputSchema = import_zod3.z.object({
8337
8385
  workers: import_zod3.z.number().int().min(1).optional(),
8338
8386
  provider_batching: import_zod3.z.boolean().optional(),
8339
8387
  providerBatching: import_zod3.z.boolean().optional()
8340
- }).refine((data) => data.command_template !== void 0 || data.commandTemplate !== void 0, {
8341
- message: "Either command_template or commandTemplate is required"
8342
8388
  });
8343
8389
  var CliHealthcheckHttpSchema = import_zod3.z.object({
8344
- type: import_zod3.z.literal("http"),
8345
8390
  url: import_zod3.z.string().min(1),
8346
8391
  timeoutMs: import_zod3.z.number().positive().optional()
8347
8392
  }).strict();
8348
8393
  var CliHealthcheckCommandSchema = import_zod3.z.object({
8349
- type: import_zod3.z.literal("command"),
8350
- commandTemplate: import_zod3.z.string().min(1),
8394
+ command: import_zod3.z.string().min(1),
8351
8395
  cwd: import_zod3.z.string().optional(),
8352
8396
  timeoutMs: import_zod3.z.number().positive().optional()
8353
8397
  }).strict();
8354
- var CliHealthcheckSchema = import_zod3.z.discriminatedUnion("type", [
8398
+ var CliHealthcheckSchema = import_zod3.z.union([
8355
8399
  CliHealthcheckHttpSchema,
8356
8400
  CliHealthcheckCommandSchema
8357
8401
  ]);
8358
8402
  var CliTargetConfigSchema = import_zod3.z.object({
8359
- commandTemplate: import_zod3.z.string().min(1),
8403
+ command: import_zod3.z.string().min(1),
8360
8404
  filesFormat: import_zod3.z.string().optional(),
8361
8405
  cwd: import_zod3.z.string().optional(),
8362
8406
  workspaceTemplate: import_zod3.z.string().optional(),
@@ -8368,26 +8412,19 @@ var CliTargetConfigSchema = import_zod3.z.object({
8368
8412
  function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
8369
8413
  const timeoutSeconds = input.timeout_seconds ?? input.timeoutSeconds;
8370
8414
  const timeoutMs = timeoutSeconds !== void 0 ? Math.floor(timeoutSeconds * 1e3) : void 0;
8371
- if (input.type === "http") {
8415
+ if ("url" in input && input.url) {
8372
8416
  const url = resolveString(input.url, env, `${targetName} healthcheck URL`);
8373
8417
  return {
8374
- type: "http",
8375
8418
  url,
8376
8419
  timeoutMs
8377
8420
  };
8378
8421
  }
8379
- const commandTemplateSource = input.command_template ?? input.commandTemplate;
8380
- if (commandTemplateSource === void 0) {
8422
+ if (!("command" in input) || !input.command) {
8381
8423
  throw new Error(
8382
- `${targetName} healthcheck: Either command_template or commandTemplate is required for command healthcheck`
8424
+ `${targetName} healthcheck: Either 'command' or 'url' is required for healthcheck`
8383
8425
  );
8384
8426
  }
8385
- const commandTemplate = resolveString(
8386
- commandTemplateSource,
8387
- env,
8388
- `${targetName} healthcheck command template`,
8389
- true
8390
- );
8427
+ const command = resolveString(input.command, env, `${targetName} healthcheck command`, true);
8391
8428
  let cwd = resolveOptionalString(input.cwd, env, `${targetName} healthcheck cwd`, {
8392
8429
  allowLiteral: true,
8393
8430
  optionalEnv: true
@@ -8399,24 +8436,14 @@ function normalizeCliHealthcheck(input, env, targetName, evalFilePath) {
8399
8436
  cwd = import_node_path18.default.dirname(import_node_path18.default.resolve(evalFilePath));
8400
8437
  }
8401
8438
  return {
8402
- type: "command",
8403
- commandTemplate,
8439
+ command,
8404
8440
  cwd,
8405
8441
  timeoutMs
8406
8442
  };
8407
8443
  }
8408
8444
  function normalizeCliTargetInput(input, env, evalFilePath) {
8409
8445
  const targetName = input.name;
8410
- const commandTemplateSource = input.command_template ?? input.commandTemplate;
8411
- if (commandTemplateSource === void 0) {
8412
- throw new Error(`${targetName}: Either command_template or commandTemplate is required`);
8413
- }
8414
- const commandTemplate = resolveString(
8415
- commandTemplateSource,
8416
- env,
8417
- `${targetName} CLI command template`,
8418
- true
8419
- );
8446
+ const command = resolveString(input.command, env, `${targetName} CLI command`, true);
8420
8447
  const filesFormatSource = input.files_format ?? input.filesFormat ?? input.attachments_format ?? input.attachmentsFormat;
8421
8448
  const filesFormat = resolveOptionalLiteralString(filesFormatSource);
8422
8449
  const workspaceTemplateSource = input.workspace_template ?? input.workspaceTemplate;
@@ -8455,7 +8482,7 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
8455
8482
  );
8456
8483
  const healthcheck = input.healthcheck ? normalizeCliHealthcheck(input.healthcheck, env, targetName, evalFilePath) : void 0;
8457
8484
  return {
8458
- commandTemplate,
8485
+ command,
8459
8486
  filesFormat,
8460
8487
  cwd,
8461
8488
  workspaceTemplate,
@@ -8467,6 +8494,7 @@ function normalizeCliTargetInput(input, env, evalFilePath) {
8467
8494
  }
8468
8495
  var CLI_PLACEHOLDERS = /* @__PURE__ */ new Set([
8469
8496
  "PROMPT",
8497
+ "PROMPT_FILE",
8470
8498
  "GUIDELINES",
8471
8499
  "EVAL_ID",
8472
8500
  "ATTEMPT",
@@ -9181,8 +9209,8 @@ var cliErrorMap = (issue, ctx) => {
9181
9209
  if (issue.code === import_zod3.z.ZodIssueCode.unrecognized_keys) {
9182
9210
  return { message: `Unknown CLI provider settings: ${issue.keys.join(", ")}` };
9183
9211
  }
9184
- if (issue.code === import_zod3.z.ZodIssueCode.invalid_union_discriminator) {
9185
- return { message: "healthcheck type must be 'http' or 'command'" };
9212
+ if (issue.code === import_zod3.z.ZodIssueCode.invalid_union) {
9213
+ return { message: "healthcheck must have either 'url' (HTTP) or 'command' (command)" };
9186
9214
  }
9187
9215
  if (issue.code === import_zod3.z.ZodIssueCode.invalid_type && issue.expected === "string") {
9188
9216
  return { message: `${ctx.defaultError} (expected a string value)` };
@@ -9198,18 +9226,17 @@ function resolveCliConfig(target, env, evalFilePath) {
9198
9226
  throw new Error(`${prefix}${firstError?.message}`);
9199
9227
  }
9200
9228
  const normalized = normalizeCliTargetInput(parseResult.data, env, evalFilePath);
9201
- assertSupportedCliPlaceholders(normalized.commandTemplate, `${target.name} CLI command template`);
9202
- if (normalized.healthcheck?.type === "command") {
9229
+ assertSupportedCliPlaceholders(normalized.command, `${target.name} CLI command`);
9230
+ if ("command" in (normalized.healthcheck ?? {}) && normalized.healthcheck.command) {
9203
9231
  assertSupportedCliPlaceholders(
9204
- normalized.healthcheck.commandTemplate,
9205
- `${target.name} healthcheck command template`
9232
+ normalized.healthcheck.command,
9233
+ `${target.name} healthcheck command`
9206
9234
  );
9207
9235
  }
9208
9236
  return normalized;
9209
9237
  }
9210
9238
  function resolveDiscoveredProviderConfig(target, providerKind, env, evalFilePath) {
9211
- const commandTemplateSource = target.command_template ?? target.commandTemplate;
9212
- const commandTemplate = commandTemplateSource ? resolveString(commandTemplateSource, env, `${target.name} command template`, true) : `bun run .agentv/providers/${providerKind}.ts {PROMPT}`;
9239
+ const command = target.command ? resolveString(target.command, env, `${target.name} command`, true) : `bun run .agentv/providers/${providerKind}.ts {PROMPT}`;
9213
9240
  const timeoutSeconds = target.timeout_seconds ?? target.timeoutSeconds;
9214
9241
  const timeoutMs = resolveTimeoutMs(timeoutSeconds, `${target.name} timeout`);
9215
9242
  let cwd = resolveOptionalString(target.cwd, env, `${target.name} working directory`, {
@@ -9223,7 +9250,7 @@ function resolveDiscoveredProviderConfig(target, providerKind, env, evalFilePath
9223
9250
  cwd = import_node_path18.default.dirname(import_node_path18.default.resolve(evalFilePath));
9224
9251
  }
9225
9252
  return {
9226
- commandTemplate,
9253
+ command,
9227
9254
  cwd,
9228
9255
  timeoutMs
9229
9256
  };
@@ -10926,7 +10953,7 @@ async function discoverProviders(registry, baseDir) {
10926
10953
  }
10927
10954
  registry.register(kindName, (target) => {
10928
10955
  return new CliProvider(target.name, {
10929
- commandTemplate: `bun run ${filePath} {PROMPT}`
10956
+ command: `bun run ${filePath} {PROMPT}`
10930
10957
  });
10931
10958
  });
10932
10959
  discoveredKinds.push(kindName);
@@ -11439,13 +11466,13 @@ function toCamelCaseDeep(obj) {
11439
11466
  var FILE_BACKED_OUTPUT_THRESHOLD = 5e4;
11440
11467
  var CodeEvaluator = class {
11441
11468
  kind = "code";
11442
- script;
11469
+ command;
11443
11470
  cwd;
11444
11471
  agentTimeoutMs;
11445
11472
  config;
11446
11473
  target;
11447
11474
  constructor(options) {
11448
- this.script = options.script;
11475
+ this.command = options.command ?? options.script ?? [];
11449
11476
  this.cwd = options.cwd;
11450
11477
  this.agentTimeoutMs = options.agentTimeoutMs;
11451
11478
  this.config = options.config;
@@ -11504,7 +11531,7 @@ var CodeEvaluator = class {
11504
11531
  const env = proxyEnv || workspaceEnv ? { ...proxyEnv, ...workspaceEnv } : void 0;
11505
11532
  try {
11506
11533
  const stdout = await executeScript(
11507
- this.script,
11534
+ this.command,
11508
11535
  inputPayload,
11509
11536
  this.agentTimeoutMs,
11510
11537
  this.cwd,
@@ -11518,7 +11545,7 @@ var CodeEvaluator = class {
11518
11545
  const details = parsed?.details && typeof parsed.details === "object" && !Array.isArray(parsed.details) ? parsed.details : void 0;
11519
11546
  const proxyUsage = getProxyUsage?.();
11520
11547
  const evaluatorRawRequest = {
11521
- script: this.script,
11548
+ command: this.command,
11522
11549
  ...this.cwd ? { cwd: this.cwd } : {},
11523
11550
  ...proxyUsage ? {
11524
11551
  target_proxy: {
@@ -11548,7 +11575,7 @@ var CodeEvaluator = class {
11548
11575
  expectedAspectCount: 1,
11549
11576
  reasoning: message,
11550
11577
  evaluatorRawRequest: {
11551
- script: this.script,
11578
+ command: this.command,
11552
11579
  ...this.cwd ? { cwd: this.cwd } : {},
11553
11580
  ...proxyUsage ? {
11554
11581
  target_proxy: {
@@ -14507,7 +14534,7 @@ var llmJudgeFactory = (config, context2) => {
14507
14534
  var codeFactory = (config, context2) => {
14508
14535
  const c = config;
14509
14536
  return new CodeEvaluator({
14510
- script: c.script,
14537
+ command: c.command ?? c.script ?? [],
14511
14538
  cwd: c.resolvedCwd ?? c.cwd,
14512
14539
  agentTimeoutMs: context2.agentTimeoutMs,
14513
14540
  config: c.config,
@@ -14689,7 +14716,7 @@ async function discoverAssertions(registry, baseDir) {
14689
14716
  }
14690
14717
  const factory = (_config, context2) => {
14691
14718
  return new CodeEvaluator({
14692
- script: ["bun", "run", filePath],
14719
+ command: ["bun", "run", filePath],
14693
14720
  agentTimeoutMs: context2.agentTimeoutMs
14694
14721
  });
14695
14722
  };
@@ -15043,7 +15070,8 @@ async function executeWorkspaceScript(config, context2, failureMode = "fatal") {
15043
15070
  });
15044
15071
  const timeoutMs = config.timeout_ms ?? (failureMode === "fatal" ? 6e4 : 3e4);
15045
15072
  const cwd = config.cwd;
15046
- const result = await execFileWithStdin(config.script, stdin, {
15073
+ const commandArray = config.command ?? config.script ?? [];
15074
+ const result = await execFileWithStdin(commandArray, stdin, {
15047
15075
  timeoutMs,
15048
15076
  cwd
15049
15077
  });
@@ -15090,7 +15118,8 @@ async function runEvaluation(options) {
15090
15118
  keepWorkspaces,
15091
15119
  cleanupWorkspaces,
15092
15120
  trials,
15093
- streamCallbacks
15121
+ streamCallbacks,
15122
+ totalBudgetUsd
15094
15123
  } = options;
15095
15124
  let useCache = options.useCache;
15096
15125
  if (trials && trials.count > 1 && useCache) {
@@ -15263,10 +15292,39 @@ async function runEvaluation(options) {
15263
15292
  let nextWorkerId = 1;
15264
15293
  const workerIdByEvalId = /* @__PURE__ */ new Map();
15265
15294
  let beforeAllOutputAttached = false;
15295
+ let cumulativeBudgetCost = 0;
15296
+ let budgetExhausted = false;
15266
15297
  const promises = filteredEvalCases.map(
15267
15298
  (evalCase) => limit(async () => {
15268
15299
  const workerId = nextWorkerId++;
15269
15300
  workerIdByEvalId.set(evalCase.id, workerId);
15301
+ if (totalBudgetUsd !== void 0 && budgetExhausted) {
15302
+ const budgetResult = {
15303
+ timestamp: (now ?? (() => /* @__PURE__ */ new Date()))().toISOString(),
15304
+ testId: evalCase.id,
15305
+ dataset: evalCase.dataset,
15306
+ score: 0,
15307
+ hits: [],
15308
+ misses: [],
15309
+ answer: "",
15310
+ target: target.name,
15311
+ error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`,
15312
+ budgetExceeded: true
15313
+ };
15314
+ if (onProgress) {
15315
+ await onProgress({
15316
+ workerId,
15317
+ testId: evalCase.id,
15318
+ status: "failed",
15319
+ completedAt: Date.now(),
15320
+ error: budgetResult.error
15321
+ });
15322
+ }
15323
+ if (onResult) {
15324
+ await onResult(budgetResult);
15325
+ }
15326
+ return budgetResult;
15327
+ }
15270
15328
  if (onProgress) {
15271
15329
  await onProgress({
15272
15330
  workerId,
@@ -15300,6 +15358,23 @@ async function runEvaluation(options) {
15300
15358
  typeRegistry
15301
15359
  };
15302
15360
  let result = trials && trials.count > 1 ? await runEvalCaseWithTrials(runCaseOptions, trials) : await runEvalCase(runCaseOptions);
15361
+ if (totalBudgetUsd !== void 0) {
15362
+ let caseCost;
15363
+ if (result.trials && result.trials.length > 0) {
15364
+ const trialCostSum = result.trials.reduce((sum, t) => sum + (t.costUsd ?? 0), 0);
15365
+ if (trialCostSum > 0) {
15366
+ caseCost = trialCostSum;
15367
+ }
15368
+ } else {
15369
+ caseCost = result.trace?.costUsd;
15370
+ }
15371
+ if (caseCost !== void 0) {
15372
+ cumulativeBudgetCost += caseCost;
15373
+ if (cumulativeBudgetCost >= totalBudgetUsd) {
15374
+ budgetExhausted = true;
15375
+ }
15376
+ }
15377
+ }
15303
15378
  if (beforeAllOutput && !beforeAllOutputAttached) {
15304
15379
  result = { ...result, beforeAllOutput };
15305
15380
  beforeAllOutputAttached = true;