agentv 4.19.0 → 4.20.0-next.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -15,9 +15,10 @@ import {
15
15
  resolveWorkspaceOrFilePath,
16
16
  toSnakeCaseDeep,
17
17
  writeArtifactsFromResults
18
- } from "./chunk-NL6P5MUH.js";
18
+ } from "./chunk-ZNS74WKH.js";
19
19
  import {
20
20
  ResponseCache,
21
+ RunBudgetTracker,
21
22
  commitAndPushResultsBranch,
22
23
  createDraftResultsPr,
23
24
  deriveCategory,
@@ -30,7 +31,7 @@ import {
30
31
  shouldSkipCacheForTemperature,
31
32
  stageResultsArtifacts,
32
33
  syncResultsRepo
33
- } from "./chunk-R2QDYORI.js";
34
+ } from "./chunk-36HXBYUY.js";
34
35
  import {
35
36
  CLI_PLACEHOLDERS,
36
37
  COMMON_TARGET_SETTINGS,
@@ -60,12 +61,12 @@ import {
60
61
  subscribeToCopilotSdkLogEntries,
61
62
  subscribeToPiLogEntries,
62
63
  toCamelCaseDeep
63
- } from "./chunk-PTYQS37Y.js";
64
+ } from "./chunk-LP4Y5D2Z.js";
64
65
 
65
66
  // package.json
66
67
  var package_default = {
67
68
  name: "agentv",
68
- version: "4.19.0",
69
+ version: "4.20.0-next.1",
69
70
  description: "CLI entry point for AgentV",
70
71
  type: "module",
71
72
  repository: {
@@ -4764,7 +4765,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
4764
4765
  tags: normalizeStringArray(rawOptions.tag),
4765
4766
  excludeTags: normalizeStringArray(rawOptions.excludeTag),
4766
4767
  transcript: normalizeString(rawOptions.transcript),
4767
- experiment: normalizeString(rawOptions.experiment)
4768
+ experiment: normalizeString(rawOptions.experiment),
4769
+ budgetUsd: normalizeOptionalNumber(rawOptions.budgetUsd)
4768
4770
  };
4769
4771
  }
4770
4772
  async function ensureFileExists(filePath, description) {
@@ -5022,6 +5024,7 @@ async function runSingleEvalFile(params) {
5022
5024
  trialsConfig,
5023
5025
  matrixMode,
5024
5026
  budgetUsd,
5027
+ runBudgetTracker,
5025
5028
  failOnError,
5026
5029
  providerFactory
5027
5030
  } = params;
@@ -5088,6 +5091,7 @@ async function runSingleEvalFile(params) {
5088
5091
  keepWorkspaces: options.keepWorkspaces,
5089
5092
  trials: trialsConfig,
5090
5093
  budgetUsd,
5094
+ runBudgetTracker,
5091
5095
  failOnError,
5092
5096
  graderTarget: options.graderTarget,
5093
5097
  model: options.model,
@@ -5277,7 +5281,7 @@ async function runEvalCommand(input) {
5277
5281
  const useFileExport = !!options.otelFile;
5278
5282
  if (options.exportOtel || useFileExport) {
5279
5283
  try {
5280
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-RTIUSC6L.js");
5284
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-GURCO6IS.js");
5281
5285
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
5282
5286
  let headers = {};
5283
5287
  if (options.otelBackend) {
@@ -5335,6 +5339,10 @@ async function runEvalCommand(input) {
5335
5339
  const remoteEvalSummaries = [];
5336
5340
  const seenTestCases = /* @__PURE__ */ new Set();
5337
5341
  const displayIdTracker = createDisplayIdTracker();
5342
+ const runBudgetTracker = options.budgetUsd ? new RunBudgetTracker(options.budgetUsd) : void 0;
5343
+ if (runBudgetTracker) {
5344
+ console.log(`Run budget cap: $${runBudgetTracker.budgetCapUsd.toFixed(2)}`);
5345
+ }
5338
5346
  const perFileWorkers = options.workers;
5339
5347
  const fileMetadata = /* @__PURE__ */ new Map();
5340
5348
  for (const testFilePath of resolvedTestFiles) {
@@ -5472,7 +5480,7 @@ async function runEvalCommand(input) {
5472
5480
  const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
5473
5481
  let transcriptProviderFactory;
5474
5482
  if (options.transcript) {
5475
- const { TranscriptProvider } = await import("./dist-RTIUSC6L.js");
5483
+ const { TranscriptProvider } = await import("./dist-GURCO6IS.js");
5476
5484
  const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
5477
5485
  const totalTests = [...fileMetadata.values()].reduce(
5478
5486
  (sum, meta) => sum + meta.testCases.length,
@@ -5490,6 +5498,34 @@ async function runEvalCommand(input) {
5490
5498
  }
5491
5499
  try {
5492
5500
  for (const testFilePath of activeTestFiles) {
5501
+ if (runBudgetTracker?.isExceeded()) {
5502
+ const targetPrep2 = fileMetadata.get(testFilePath);
5503
+ if (!targetPrep2) continue;
5504
+ const budgetMsg = `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`;
5505
+ console.log(`
5506
+ \u26A0 ${budgetMsg} \u2014 skipping ${path15.basename(testFilePath)}`);
5507
+ for (const { selection } of targetPrep2.selections) {
5508
+ const skippedResults = targetPrep2.testCases.map((testCase) => ({
5509
+ timestamp: (/* @__PURE__ */ new Date()).toISOString(),
5510
+ testId: testCase.id,
5511
+ score: 0,
5512
+ assertions: [],
5513
+ output: [],
5514
+ error: budgetMsg,
5515
+ budgetExceeded: true,
5516
+ executionStatus: "execution_error",
5517
+ failureStage: "setup",
5518
+ failureReasonCode: "budget_exceeded",
5519
+ executionError: { message: budgetMsg, stage: "setup" },
5520
+ target: selection.targetName
5521
+ }));
5522
+ for (const r of skippedResults) {
5523
+ await outputWriter.append(r);
5524
+ }
5525
+ allResults.push(...skippedResults);
5526
+ }
5527
+ continue;
5528
+ }
5493
5529
  const targetPrep = fileMetadata.get(testFilePath);
5494
5530
  if (!targetPrep) {
5495
5531
  throw new Error(`Missing metadata for ${testFilePath}`);
@@ -5530,6 +5566,7 @@ async function runEvalCommand(input) {
5530
5566
  trialsConfig: options.transcript ? void 0 : targetPrep.trialsConfig,
5531
5567
  matrixMode: targetPrep.selections.length > 1,
5532
5568
  budgetUsd: targetPrep.budgetUsd,
5569
+ runBudgetTracker,
5533
5570
  failOnError: targetPrep.failOnError,
5534
5571
  threshold: resolvedThreshold,
5535
5572
  providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory
@@ -5612,7 +5649,7 @@ async function runEvalCommand(input) {
5612
5649
  if (usesDefaultArtifactWorkspace && allResults.length > 0) {
5613
5650
  const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : "";
5614
5651
  if (isResumeAppend) {
5615
- const { writePerTestArtifacts } = await import("./artifact-writer-YATMDPWI.js");
5652
+ const { writePerTestArtifacts } = await import("./artifact-writer-RFXWXUOV.js");
5616
5653
  await writePerTestArtifacts(allResults, runDir, {
5617
5654
  experiment: normalizeExperimentName(options.experiment)
5618
5655
  });
@@ -5702,13 +5739,21 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
5702
5739
  agentv eval run ${evalFileArgs}${targetFlag} --output ${relativeRunDir} --rerun-failed`
5703
5740
  );
5704
5741
  }
5742
+ const runBudgetExceeded = runBudgetTracker?.isExceeded() ?? false;
5743
+ if (runBudgetExceeded) {
5744
+ console.log(
5745
+ `
5746
+ \u26A0 Run budget exceeded: $${runBudgetTracker?.currentCostUsd.toFixed(4)} spent of $${runBudgetTracker?.budgetCapUsd.toFixed(4)} cap`
5747
+ );
5748
+ }
5705
5749
  return {
5706
5750
  executionErrorCount: summary.executionErrorCount,
5707
5751
  outputPath,
5708
5752
  testFiles: activeTestFiles,
5709
5753
  target: options.target,
5710
5754
  thresholdFailed,
5711
- allExecutionErrors
5755
+ allExecutionErrors,
5756
+ budgetExceeded: runBudgetExceeded || void 0
5712
5757
  };
5713
5758
  } finally {
5714
5759
  unsubscribeCodexLogs();
@@ -5822,4 +5867,4 @@ export {
5822
5867
  getCategories,
5823
5868
  filterByCategory
5824
5869
  };
5825
- //# sourceMappingURL=chunk-IWI4AJRS.js.map
5870
+ //# sourceMappingURL=chunk-LMQFWJJL.js.map