agentv 4.19.0 → 4.20.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{artifact-writer-YATMDPWI.js → artifact-writer-RFXWXUOV.js} +4 -4
- package/dist/{chunk-R2QDYORI.js → chunk-36HXBYUY.js} +25 -2
- package/dist/chunk-36HXBYUY.js.map +1 -0
- package/dist/{chunk-62M5MR5K.js → chunk-G7I2BPLB.js} +19 -6
- package/dist/chunk-G7I2BPLB.js.map +1 -0
- package/dist/{chunk-IWI4AJRS.js → chunk-LMQFWJJL.js} +55 -10
- package/dist/chunk-LMQFWJJL.js.map +1 -0
- package/dist/{chunk-PTYQS37Y.js → chunk-LP4Y5D2Z.js} +161 -24
- package/dist/chunk-LP4Y5D2Z.js.map +1 -0
- package/dist/{chunk-NL6P5MUH.js → chunk-ZNS74WKH.js} +3 -3
- package/dist/cli.js +5 -5
- package/dist/{dist-RTIUSC6L.js → dist-GURCO6IS.js} +7 -3
- package/dist/index.js +5 -5
- package/dist/{interactive-7AZMOH2V.js → interactive-5GFT3WPN.js} +5 -5
- package/dist/{ts-eval-loader-XFQ6S4DT-S7P2UUBX.js → ts-eval-loader-32COE32J-TCT4RIRT.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-62M5MR5K.js.map +0 -1
- package/dist/chunk-IWI4AJRS.js.map +0 -1
- package/dist/chunk-PTYQS37Y.js.map +0 -1
- package/dist/chunk-R2QDYORI.js.map +0 -1
- /package/dist/{artifact-writer-YATMDPWI.js.map → artifact-writer-RFXWXUOV.js.map} +0 -0
- /package/dist/{chunk-NL6P5MUH.js.map → chunk-ZNS74WKH.js.map} +0 -0
- /package/dist/{dist-RTIUSC6L.js.map → dist-GURCO6IS.js.map} +0 -0
- /package/dist/{interactive-7AZMOH2V.js.map → interactive-5GFT3WPN.js.map} +0 -0
- /package/dist/{ts-eval-loader-XFQ6S4DT-S7P2UUBX.js.map → ts-eval-loader-32COE32J-TCT4RIRT.js.map} +0 -0
|
@@ -15,9 +15,10 @@ import {
|
|
|
15
15
|
resolveWorkspaceOrFilePath,
|
|
16
16
|
toSnakeCaseDeep,
|
|
17
17
|
writeArtifactsFromResults
|
|
18
|
-
} from "./chunk-NL6P5MUH.js";
|
|
18
|
+
} from "./chunk-ZNS74WKH.js";
|
|
19
19
|
import {
|
|
20
20
|
ResponseCache,
|
|
21
|
+
RunBudgetTracker,
|
|
21
22
|
commitAndPushResultsBranch,
|
|
22
23
|
createDraftResultsPr,
|
|
23
24
|
deriveCategory,
|
|
@@ -30,7 +31,7 @@ import {
|
|
|
30
31
|
shouldSkipCacheForTemperature,
|
|
31
32
|
stageResultsArtifacts,
|
|
32
33
|
syncResultsRepo
|
|
33
|
-
} from "./chunk-R2QDYORI.js";
|
|
34
|
+
} from "./chunk-36HXBYUY.js";
|
|
34
35
|
import {
|
|
35
36
|
CLI_PLACEHOLDERS,
|
|
36
37
|
COMMON_TARGET_SETTINGS,
|
|
@@ -60,12 +61,12 @@ import {
|
|
|
60
61
|
subscribeToCopilotSdkLogEntries,
|
|
61
62
|
subscribeToPiLogEntries,
|
|
62
63
|
toCamelCaseDeep
|
|
63
|
-
} from "./chunk-PTYQS37Y.js";
|
|
64
|
+
} from "./chunk-LP4Y5D2Z.js";
|
|
64
65
|
|
|
65
66
|
// package.json
|
|
66
67
|
var package_default = {
|
|
67
68
|
name: "agentv",
|
|
68
|
-
version: "4.19.0",
|
|
69
|
+
version: "4.20.0-next.1",
|
|
69
70
|
description: "CLI entry point for AgentV",
|
|
70
71
|
type: "module",
|
|
71
72
|
repository: {
|
|
@@ -4764,7 +4765,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
4764
4765
|
tags: normalizeStringArray(rawOptions.tag),
|
|
4765
4766
|
excludeTags: normalizeStringArray(rawOptions.excludeTag),
|
|
4766
4767
|
transcript: normalizeString(rawOptions.transcript),
|
|
4767
|
-
experiment: normalizeString(rawOptions.experiment)
|
|
4768
|
+
experiment: normalizeString(rawOptions.experiment),
|
|
4769
|
+
budgetUsd: normalizeOptionalNumber(rawOptions.budgetUsd)
|
|
4768
4770
|
};
|
|
4769
4771
|
}
|
|
4770
4772
|
async function ensureFileExists(filePath, description) {
|
|
@@ -5022,6 +5024,7 @@ async function runSingleEvalFile(params) {
|
|
|
5022
5024
|
trialsConfig,
|
|
5023
5025
|
matrixMode,
|
|
5024
5026
|
budgetUsd,
|
|
5027
|
+
runBudgetTracker,
|
|
5025
5028
|
failOnError,
|
|
5026
5029
|
providerFactory
|
|
5027
5030
|
} = params;
|
|
@@ -5088,6 +5091,7 @@ async function runSingleEvalFile(params) {
|
|
|
5088
5091
|
keepWorkspaces: options.keepWorkspaces,
|
|
5089
5092
|
trials: trialsConfig,
|
|
5090
5093
|
budgetUsd,
|
|
5094
|
+
runBudgetTracker,
|
|
5091
5095
|
failOnError,
|
|
5092
5096
|
graderTarget: options.graderTarget,
|
|
5093
5097
|
model: options.model,
|
|
@@ -5277,7 +5281,7 @@ async function runEvalCommand(input) {
|
|
|
5277
5281
|
const useFileExport = !!options.otelFile;
|
|
5278
5282
|
if (options.exportOtel || useFileExport) {
|
|
5279
5283
|
try {
|
|
5280
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-RTIUSC6L.js");
|
|
5284
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-GURCO6IS.js");
|
|
5281
5285
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
5282
5286
|
let headers = {};
|
|
5283
5287
|
if (options.otelBackend) {
|
|
@@ -5335,6 +5339,10 @@ async function runEvalCommand(input) {
|
|
|
5335
5339
|
const remoteEvalSummaries = [];
|
|
5336
5340
|
const seenTestCases = /* @__PURE__ */ new Set();
|
|
5337
5341
|
const displayIdTracker = createDisplayIdTracker();
|
|
5342
|
+
const runBudgetTracker = options.budgetUsd ? new RunBudgetTracker(options.budgetUsd) : void 0;
|
|
5343
|
+
if (runBudgetTracker) {
|
|
5344
|
+
console.log(`Run budget cap: $${runBudgetTracker.budgetCapUsd.toFixed(2)}`);
|
|
5345
|
+
}
|
|
5338
5346
|
const perFileWorkers = options.workers;
|
|
5339
5347
|
const fileMetadata = /* @__PURE__ */ new Map();
|
|
5340
5348
|
for (const testFilePath of resolvedTestFiles) {
|
|
@@ -5472,7 +5480,7 @@ async function runEvalCommand(input) {
|
|
|
5472
5480
|
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
|
|
5473
5481
|
let transcriptProviderFactory;
|
|
5474
5482
|
if (options.transcript) {
|
|
5475
|
-
const { TranscriptProvider } = await import("./dist-RTIUSC6L.js");
|
|
5483
|
+
const { TranscriptProvider } = await import("./dist-GURCO6IS.js");
|
|
5476
5484
|
const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
|
|
5477
5485
|
const totalTests = [...fileMetadata.values()].reduce(
|
|
5478
5486
|
(sum, meta) => sum + meta.testCases.length,
|
|
@@ -5490,6 +5498,34 @@ async function runEvalCommand(input) {
|
|
|
5490
5498
|
}
|
|
5491
5499
|
try {
|
|
5492
5500
|
for (const testFilePath of activeTestFiles) {
|
|
5501
|
+
if (runBudgetTracker?.isExceeded()) {
|
|
5502
|
+
const targetPrep2 = fileMetadata.get(testFilePath);
|
|
5503
|
+
if (!targetPrep2) continue;
|
|
5504
|
+
const budgetMsg = `Run budget exceeded ($${runBudgetTracker.currentCostUsd.toFixed(4)} / $${runBudgetTracker.budgetCapUsd.toFixed(4)})`;
|
|
5505
|
+
console.log(`
|
|
5506
|
+
\u26A0 ${budgetMsg} \u2014 skipping ${path15.basename(testFilePath)}`);
|
|
5507
|
+
for (const { selection } of targetPrep2.selections) {
|
|
5508
|
+
const skippedResults = targetPrep2.testCases.map((testCase) => ({
|
|
5509
|
+
timestamp: (/* @__PURE__ */ new Date()).toISOString(),
|
|
5510
|
+
testId: testCase.id,
|
|
5511
|
+
score: 0,
|
|
5512
|
+
assertions: [],
|
|
5513
|
+
output: [],
|
|
5514
|
+
error: budgetMsg,
|
|
5515
|
+
budgetExceeded: true,
|
|
5516
|
+
executionStatus: "execution_error",
|
|
5517
|
+
failureStage: "setup",
|
|
5518
|
+
failureReasonCode: "budget_exceeded",
|
|
5519
|
+
executionError: { message: budgetMsg, stage: "setup" },
|
|
5520
|
+
target: selection.targetName
|
|
5521
|
+
}));
|
|
5522
|
+
for (const r of skippedResults) {
|
|
5523
|
+
await outputWriter.append(r);
|
|
5524
|
+
}
|
|
5525
|
+
allResults.push(...skippedResults);
|
|
5526
|
+
}
|
|
5527
|
+
continue;
|
|
5528
|
+
}
|
|
5493
5529
|
const targetPrep = fileMetadata.get(testFilePath);
|
|
5494
5530
|
if (!targetPrep) {
|
|
5495
5531
|
throw new Error(`Missing metadata for ${testFilePath}`);
|
|
@@ -5530,6 +5566,7 @@ async function runEvalCommand(input) {
|
|
|
5530
5566
|
trialsConfig: options.transcript ? void 0 : targetPrep.trialsConfig,
|
|
5531
5567
|
matrixMode: targetPrep.selections.length > 1,
|
|
5532
5568
|
budgetUsd: targetPrep.budgetUsd,
|
|
5569
|
+
runBudgetTracker,
|
|
5533
5570
|
failOnError: targetPrep.failOnError,
|
|
5534
5571
|
threshold: resolvedThreshold,
|
|
5535
5572
|
providerFactory: transcriptProviderFactory ?? targetPrep.providerFactory
|
|
@@ -5612,7 +5649,7 @@ async function runEvalCommand(input) {
|
|
|
5612
5649
|
if (usesDefaultArtifactWorkspace && allResults.length > 0) {
|
|
5613
5650
|
const evalFile = activeTestFiles.length === 1 ? activeTestFiles[0] : "";
|
|
5614
5651
|
if (isResumeAppend) {
|
|
5615
|
-
const { writePerTestArtifacts } = await import("./artifact-writer-YATMDPWI.js");
|
|
5652
|
+
const { writePerTestArtifacts } = await import("./artifact-writer-RFXWXUOV.js");
|
|
5616
5653
|
await writePerTestArtifacts(allResults, runDir, {
|
|
5617
5654
|
experiment: normalizeExperimentName(options.experiment)
|
|
5618
5655
|
});
|
|
@@ -5702,13 +5739,21 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
|
|
|
5702
5739
|
agentv eval run ${evalFileArgs}${targetFlag} --output ${relativeRunDir} --rerun-failed`
|
|
5703
5740
|
);
|
|
5704
5741
|
}
|
|
5742
|
+
const runBudgetExceeded = runBudgetTracker?.isExceeded() ?? false;
|
|
5743
|
+
if (runBudgetExceeded) {
|
|
5744
|
+
console.log(
|
|
5745
|
+
`
|
|
5746
|
+
\u26A0 Run budget exceeded: $${runBudgetTracker?.currentCostUsd.toFixed(4)} spent of $${runBudgetTracker?.budgetCapUsd.toFixed(4)} cap`
|
|
5747
|
+
);
|
|
5748
|
+
}
|
|
5705
5749
|
return {
|
|
5706
5750
|
executionErrorCount: summary.executionErrorCount,
|
|
5707
5751
|
outputPath,
|
|
5708
5752
|
testFiles: activeTestFiles,
|
|
5709
5753
|
target: options.target,
|
|
5710
5754
|
thresholdFailed,
|
|
5711
|
-
allExecutionErrors
|
|
5755
|
+
allExecutionErrors,
|
|
5756
|
+
budgetExceeded: runBudgetExceeded || void 0
|
|
5712
5757
|
};
|
|
5713
5758
|
} finally {
|
|
5714
5759
|
unsubscribeCodexLogs();
|
|
@@ -5822,4 +5867,4 @@ export {
|
|
|
5822
5867
|
getCategories,
|
|
5823
5868
|
filterByCategory
|
|
5824
5869
|
};
|
|
5825
|
-
//# sourceMappingURL=chunk-IWI4AJRS.js.map
|
|
5870
|
+
//# sourceMappingURL=chunk-LMQFWJJL.js.map
|