agentv 4.35.1 → 4.36.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{artifact-writer-G57MG52C.js → artifact-writer-3YRN6YTA.js} +4 -4
- package/dist/{chunk-CRMGUVRZ.js → chunk-4M6FAQTW.js} +85 -19
- package/dist/chunk-4M6FAQTW.js.map +1 -0
- package/dist/{chunk-INOKS5LF.js → chunk-7KZ2AF26.js} +269 -57
- package/dist/chunk-7KZ2AF26.js.map +1 -0
- package/dist/{chunk-KJGYL3M3.js → chunk-HVBAVOAH.js} +72 -50
- package/dist/chunk-HVBAVOAH.js.map +1 -0
- package/dist/{chunk-KNF3AGCI.js → chunk-P5JONEWJ.js} +231 -35
- package/dist/chunk-P5JONEWJ.js.map +1 -0
- package/dist/{chunk-6QEIZ33V.js → chunk-TUTURE2B.js} +1227 -372
- package/dist/chunk-TUTURE2B.js.map +1 -0
- package/dist/cli.js +5 -5
- package/dist/dashboard/assets/index-DA96FAM5.js +119 -0
- package/dist/dashboard/assets/{index-Bdk-9a_8.js → index-l4t97uO8.js} +1 -1
- package/dist/dashboard/assets/index-nmrFBoNd.css +1 -0
- package/dist/dashboard/index.html +2 -2
- package/dist/{dist-M4B77IW4.js → dist-BSFUYS54.js} +73 -3
- package/dist/index.js +5 -5
- package/dist/{interactive-VYQ5SYMR.js → interactive-IEC63EVP.js} +5 -5
- package/dist/skills/agentv-eval-writer/SKILL.md +6 -0
- package/dist/{ts-eval-loader-EQJX3OLT-THE7D3GR.js → ts-eval-loader-4DU65XGW-YM47FFG2.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-6QEIZ33V.js.map +0 -1
- package/dist/chunk-CRMGUVRZ.js.map +0 -1
- package/dist/chunk-INOKS5LF.js.map +0 -1
- package/dist/chunk-KJGYL3M3.js.map +0 -1
- package/dist/chunk-KNF3AGCI.js.map +0 -1
- package/dist/dashboard/assets/index-BPMAZqjE.css +0 -1
- package/dist/dashboard/assets/index-BWO0UcxG.js +0 -118
- /package/dist/{artifact-writer-G57MG52C.js.map → artifact-writer-3YRN6YTA.js.map} +0 -0
- /package/dist/{dist-M4B77IW4.js.map → dist-BSFUYS54.js.map} +0 -0
- /package/dist/{interactive-VYQ5SYMR.js.map → interactive-IEC63EVP.js.map} +0 -0
- /package/dist/{ts-eval-loader-EQJX3OLT-THE7D3GR.js.map → ts-eval-loader-4DU65XGW-YM47FFG2.js.map} +0 -0
|
@@ -13,12 +13,14 @@ import {
|
|
|
13
13
|
resolveRunIndexPath,
|
|
14
14
|
resolveRunManifestPath,
|
|
15
15
|
resolveWorkspaceOrFilePath,
|
|
16
|
-
toSnakeCaseDeep,
|
|
16
|
+
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
17
17
|
writeArtifactsFromResults,
|
|
18
18
|
writeInitialBenchmarkArtifact
|
|
19
|
-
} from "./chunk-KJGYL3M3.js";
|
|
19
|
+
} from "./chunk-HVBAVOAH.js";
|
|
20
20
|
import {
|
|
21
21
|
RunBudgetTracker,
|
|
22
|
+
buildWipBranchName,
|
|
23
|
+
deleteWipBranch,
|
|
22
24
|
deriveCategory,
|
|
23
25
|
directPushResults,
|
|
24
26
|
directorySizeBytes,
|
|
@@ -29,9 +31,12 @@ import {
|
|
|
29
31
|
loadTsConfig,
|
|
30
32
|
materializeGitRun,
|
|
31
33
|
normalizeResultsConfig,
|
|
34
|
+
pushWipCheckpoint,
|
|
32
35
|
resolveResultsRepoRunsDir,
|
|
33
|
-
|
|
34
|
-
|
|
36
|
+
setupWipWorktree,
|
|
37
|
+
syncResultsRepoForProject,
|
|
38
|
+
traceFromTranscriptJsonLines
|
|
39
|
+
} from "./chunk-P5JONEWJ.js";
|
|
35
40
|
import {
|
|
36
41
|
CLI_PLACEHOLDERS,
|
|
37
42
|
COMMON_TARGET_SETTINGS,
|
|
@@ -42,6 +47,7 @@ import {
|
|
|
42
47
|
ResponseCache,
|
|
43
48
|
buildDirectoryChain,
|
|
44
49
|
buildSearchRoots,
|
|
50
|
+
buildTraceFromMessages,
|
|
45
51
|
ensureVSCodeSubagents,
|
|
46
52
|
findDeprecatedCamelCaseTargetWarnings,
|
|
47
53
|
findGitRoot,
|
|
@@ -66,8 +72,9 @@ import {
|
|
|
66
72
|
subscribeToCopilotCliLogEntries,
|
|
67
73
|
subscribeToCopilotSdkLogEntries,
|
|
68
74
|
subscribeToPiLogEntries,
|
|
69
|
-
toCamelCaseDeep
|
|
70
|
-
|
|
75
|
+
toCamelCaseDeep,
|
|
76
|
+
toSnakeCaseDeep
|
|
77
|
+
} from "./chunk-TUTURE2B.js";
|
|
71
78
|
|
|
72
79
|
// src/commands/eval/shared.ts
|
|
73
80
|
import { constants } from "node:fs";
|
|
@@ -183,7 +190,7 @@ async function findRepoRoot(start) {
|
|
|
183
190
|
// package.json
|
|
184
191
|
var package_default = {
|
|
185
192
|
name: "agentv",
|
|
186
|
-
version: "4.35.1",
|
|
193
|
+
version: "4.36.0-next.1",
|
|
187
194
|
description: "CLI entry point for AgentV",
|
|
188
195
|
type: "module",
|
|
189
196
|
repository: {
|
|
@@ -574,18 +581,31 @@ function hydrateInput(baseDir, record) {
|
|
|
574
581
|
return messages.length > 0 ? messages : [{ role: "user", content: inputText.trimEnd() }];
|
|
575
582
|
}
|
|
576
583
|
function hydrateOutput(baseDir, record) {
|
|
577
|
-
const responseText = readOptionalText(
|
|
584
|
+
const responseText = readOptionalText(
|
|
585
|
+
baseDir,
|
|
586
|
+
record.output_path ?? record.answer_path ?? record.response_path
|
|
587
|
+
);
|
|
578
588
|
if (!responseText) {
|
|
579
589
|
return void 0;
|
|
580
590
|
}
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
591
|
+
return responseText.trimEnd();
|
|
592
|
+
}
|
|
593
|
+
function hydrateTrace(baseDir, record) {
|
|
594
|
+
const transcriptText = readOptionalText(baseDir, record.transcript_path);
|
|
595
|
+
if (transcriptText) {
|
|
596
|
+
try {
|
|
597
|
+
return traceFromTranscriptJsonLines(parseJsonlLines(transcriptText));
|
|
598
|
+
} catch {
|
|
599
|
+
}
|
|
587
600
|
}
|
|
588
|
-
|
|
601
|
+
const output = hydrateOutput(baseDir, record) ?? "";
|
|
602
|
+
return buildTraceFromMessages({
|
|
603
|
+
input: hydrateInput(baseDir, record),
|
|
604
|
+
output: output ? [{ role: "assistant", content: output }] : [],
|
|
605
|
+
finalOutput: output,
|
|
606
|
+
target: record.target,
|
|
607
|
+
testId: record.test_id
|
|
608
|
+
});
|
|
589
609
|
}
|
|
590
610
|
function hydrateManifestRecord(baseDir, record) {
|
|
591
611
|
const grading = readOptionalJson(baseDir, record.grading_path);
|
|
@@ -630,7 +650,8 @@ function hydrateManifestRecord(baseDir, record) {
|
|
|
630
650
|
durationMs: timing?.duration_ms ?? record.duration_ms,
|
|
631
651
|
costUsd: record.cost_usd,
|
|
632
652
|
input: hydrateInput(baseDir, record),
|
|
633
|
-
output: hydrateOutput(baseDir, record),
|
|
653
|
+
output: hydrateOutput(baseDir, record) ?? "",
|
|
654
|
+
trace: hydrateTrace(baseDir, record),
|
|
634
655
|
metadata: record.metadata
|
|
635
656
|
};
|
|
636
657
|
}
|
|
@@ -756,6 +777,7 @@ function toRawResult(result) {
|
|
|
756
777
|
end_time: result.endTime,
|
|
757
778
|
input: result.input,
|
|
758
779
|
output: result.output,
|
|
780
|
+
trace: toSnakeCaseDeep(result.trace),
|
|
759
781
|
file_changes: result.fileChanges
|
|
760
782
|
};
|
|
761
783
|
}
|
|
@@ -1336,25 +1358,33 @@ function deleteRemoteRunTags(repoDir, manifestPath) {
|
|
|
1336
1358
|
// src/commands/results/remote.ts
|
|
1337
1359
|
var gitRunsCache = /* @__PURE__ */ new Map();
|
|
1338
1360
|
var GIT_RUNS_CACHE_TTL_MS = 6e4;
|
|
1339
|
-
function cachedListGitRuns(repoDir) {
|
|
1361
|
+
function getResultsStorageRef(config) {
|
|
1362
|
+
return config.branch ? `origin/${config.branch}` : void 0;
|
|
1363
|
+
}
|
|
1364
|
+
function cachedListGitRuns(repoDir, ref) {
|
|
1340
1365
|
const now = Date.now();
|
|
1341
|
-
const
|
|
1366
|
+
const cacheKey = `${repoDir}\0${ref ?? ""}`;
|
|
1367
|
+
const cached = gitRunsCache.get(cacheKey);
|
|
1342
1368
|
if (cached && cached.expiresAt > now) {
|
|
1343
1369
|
return cached.data;
|
|
1344
1370
|
}
|
|
1345
|
-
const promise = listGitRuns(repoDir);
|
|
1346
|
-
gitRunsCache.set(
|
|
1371
|
+
const promise = ref ? listGitRuns(repoDir, ref) : listGitRuns(repoDir);
|
|
1372
|
+
gitRunsCache.set(cacheKey, { data: promise, expiresAt: now + GIT_RUNS_CACHE_TTL_MS });
|
|
1347
1373
|
promise.catch(() => {
|
|
1348
1374
|
}).finally(() => {
|
|
1349
|
-
const entry = gitRunsCache.get(
|
|
1375
|
+
const entry = gitRunsCache.get(cacheKey);
|
|
1350
1376
|
if (entry && entry.expiresAt <= Date.now()) {
|
|
1351
|
-
gitRunsCache.delete(
|
|
1377
|
+
gitRunsCache.delete(cacheKey);
|
|
1352
1378
|
}
|
|
1353
1379
|
});
|
|
1354
1380
|
return promise;
|
|
1355
1381
|
}
|
|
1356
1382
|
function invalidateGitRunsCache(repoDir) {
|
|
1357
|
-
gitRunsCache.
|
|
1383
|
+
for (const key of gitRunsCache.keys()) {
|
|
1384
|
+
if (key.startsWith(`${repoDir}\0`)) {
|
|
1385
|
+
gitRunsCache.delete(key);
|
|
1386
|
+
}
|
|
1387
|
+
}
|
|
1358
1388
|
}
|
|
1359
1389
|
var REMOTE_RUN_PREFIX = "remote::";
|
|
1360
1390
|
var SIZE_WARNING_BYTES = 10 * 1024 * 1024;
|
|
@@ -1391,6 +1421,7 @@ async function loadNormalizedResultsConfig(cwd, projectId) {
|
|
|
1391
1421
|
const projectResults = project?.results ? {
|
|
1392
1422
|
mode: "github",
|
|
1393
1423
|
repo: project.results.repoUrl,
|
|
1424
|
+
branch: project.results.branch,
|
|
1394
1425
|
path: project.results.path,
|
|
1395
1426
|
auto_push: project.results.sync?.autoPush,
|
|
1396
1427
|
branch_prefix: project.results.branchPrefix
|
|
@@ -1420,9 +1451,11 @@ async function getRemoteRunCount(config, status) {
|
|
|
1420
1451
|
let runCount = 0;
|
|
1421
1452
|
if (config && status.available) {
|
|
1422
1453
|
try {
|
|
1423
|
-
runCount = (await cachedListGitRuns(config.path)).length;
|
|
1454
|
+
runCount = (await cachedListGitRuns(config.path, getResultsStorageRef(config))).length;
|
|
1424
1455
|
} catch {
|
|
1425
|
-
|
|
1456
|
+
if (!config.branch) {
|
|
1457
|
+
runCount = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).length;
|
|
1458
|
+
}
|
|
1426
1459
|
}
|
|
1427
1460
|
}
|
|
1428
1461
|
return runCount;
|
|
@@ -1482,7 +1515,7 @@ async function listMergedResultFiles(cwd, limit, projectId) {
|
|
|
1482
1515
|
let remoteRuns = [];
|
|
1483
1516
|
if (config.mode === "github") {
|
|
1484
1517
|
try {
|
|
1485
|
-
const gitRuns = await cachedListGitRuns(config.path);
|
|
1518
|
+
const gitRuns = await cachedListGitRuns(config.path, getResultsStorageRef(config));
|
|
1486
1519
|
remoteRuns = gitRuns.map((r) => ({
|
|
1487
1520
|
filename: encodeRemoteRunId(r.run_id),
|
|
1488
1521
|
raw_filename: r.run_id,
|
|
@@ -1496,15 +1529,19 @@ async function listMergedResultFiles(cwd, limit, projectId) {
|
|
|
1496
1529
|
sizeBytes: r.size_bytes || 0
|
|
1497
1530
|
}));
|
|
1498
1531
|
} catch (error) {
|
|
1499
|
-
|
|
1500
|
-
|
|
1501
|
-
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
|
|
1505
|
-
|
|
1506
|
-
|
|
1507
|
-
|
|
1532
|
+
if (config.branch) {
|
|
1533
|
+
console.error("git-native listing failed for configured results branch", error);
|
|
1534
|
+
} else {
|
|
1535
|
+
console.error("git-native listing failed, falling back", error);
|
|
1536
|
+
remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
|
|
1537
|
+
(meta) => ({
|
|
1538
|
+
...meta,
|
|
1539
|
+
filename: encodeRemoteRunId(meta.filename),
|
|
1540
|
+
raw_filename: meta.filename,
|
|
1541
|
+
source: "remote"
|
|
1542
|
+
})
|
|
1543
|
+
);
|
|
1544
|
+
}
|
|
1508
1545
|
}
|
|
1509
1546
|
} else {
|
|
1510
1547
|
remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
|
|
@@ -1544,7 +1581,7 @@ async function ensureRemoteRunAvailable(cwd, meta, projectId) {
|
|
|
1544
1581
|
".agentv/results/runs",
|
|
1545
1582
|
path7.posix.dirname(relativeManifestPath)
|
|
1546
1583
|
);
|
|
1547
|
-
await materializeGitRun(config.path, relativeRunPath);
|
|
1584
|
+
await materializeGitRun(config.path, relativeRunPath, getResultsStorageRef(config));
|
|
1548
1585
|
}
|
|
1549
1586
|
async function readRemoteRunTagState(cwd, meta, projectId) {
|
|
1550
1587
|
if (meta.source !== "remote") return void 0;
|
|
@@ -1581,7 +1618,7 @@ async function clearRemoteRunTags(cwd, meta, projectId) {
|
|
|
1581
1618
|
async function maybeAutoExportRunArtifacts(payload) {
|
|
1582
1619
|
const config = await loadNormalizedResultsConfig(payload.cwd);
|
|
1583
1620
|
if (!config?.auto_push) {
|
|
1584
|
-
return;
|
|
1621
|
+
return "disabled";
|
|
1585
1622
|
}
|
|
1586
1623
|
try {
|
|
1587
1624
|
await maybeWarnLargeArtifact(payload.run_dir);
|
|
@@ -1595,12 +1632,14 @@ async function maybeAutoExportRunArtifacts(payload) {
|
|
|
1595
1632
|
});
|
|
1596
1633
|
if (!pushed) {
|
|
1597
1634
|
console.warn("Warning: results export produced no git changes. Skipping push.");
|
|
1598
|
-
return;
|
|
1635
|
+
return "already_published";
|
|
1599
1636
|
}
|
|
1600
1637
|
console.log(`Results pushed to ${config.repo} (${config.path}/${relativeRunPath})`);
|
|
1638
|
+
return "published";
|
|
1601
1639
|
} catch (error) {
|
|
1602
1640
|
console.warn(`Warning: skipping results export: ${getStatusMessage(error)}`);
|
|
1603
1641
|
console.warn("Warning: Run 'gh auth login' if GitHub authentication is missing.");
|
|
1642
|
+
return "failed";
|
|
1604
1643
|
}
|
|
1605
1644
|
}
|
|
1606
1645
|
|
|
@@ -1906,7 +1945,7 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
1906
1945
|
if (this.closed) {
|
|
1907
1946
|
throw new Error("Cannot write to closed JSONL writer");
|
|
1908
1947
|
}
|
|
1909
|
-
const snakeCaseRecord = toSnakeCaseDeep(record);
|
|
1948
|
+
const snakeCaseRecord = toSnakeCaseDeep2(record);
|
|
1910
1949
|
const line = `${JSON.stringify(snakeCaseRecord)}
|
|
1911
1950
|
`;
|
|
1912
1951
|
if (!this.stream.write(line)) {
|
|
@@ -3425,6 +3464,14 @@ var MOCK_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
3425
3464
|
"trace"
|
|
3426
3465
|
// For testing tool-trajectory evaluator
|
|
3427
3466
|
]);
|
|
3467
|
+
var REPLAY_SETTINGS = /* @__PURE__ */ new Set([
|
|
3468
|
+
...COMMON_SETTINGS,
|
|
3469
|
+
"fixtures",
|
|
3470
|
+
"source_target",
|
|
3471
|
+
"suite",
|
|
3472
|
+
"eval_path",
|
|
3473
|
+
"variant"
|
|
3474
|
+
]);
|
|
3428
3475
|
var CLAUDE_SETTINGS = /* @__PURE__ */ new Set([
|
|
3429
3476
|
...COMMON_SETTINGS,
|
|
3430
3477
|
"executable",
|
|
@@ -3480,6 +3527,8 @@ function getKnownSettings(provider) {
|
|
|
3480
3527
|
return VSCODE_SETTINGS;
|
|
3481
3528
|
case "mock":
|
|
3482
3529
|
return MOCK_SETTINGS;
|
|
3530
|
+
case "replay":
|
|
3531
|
+
return REPLAY_SETTINGS;
|
|
3483
3532
|
case "cli":
|
|
3484
3533
|
return null;
|
|
3485
3534
|
default:
|
|
@@ -3761,6 +3810,24 @@ async function validateTargetsFile(filePath) {
|
|
|
3761
3810
|
if (providerValue === "cli") {
|
|
3762
3811
|
validateCliSettings(target, absolutePath, location, errors);
|
|
3763
3812
|
}
|
|
3813
|
+
if (providerValue === "replay") {
|
|
3814
|
+
if (!isNonEmptyString(target.fixtures)) {
|
|
3815
|
+
errors.push({
|
|
3816
|
+
severity: "error",
|
|
3817
|
+
filePath: absolutePath,
|
|
3818
|
+
location: `${location}.fixtures`,
|
|
3819
|
+
message: "Replay provider requires 'fixtures' as a non-empty string"
|
|
3820
|
+
});
|
|
3821
|
+
}
|
|
3822
|
+
if (!isNonEmptyString(target.source_target)) {
|
|
3823
|
+
errors.push({
|
|
3824
|
+
severity: "error",
|
|
3825
|
+
filePath: absolutePath,
|
|
3826
|
+
location: `${location}.source_target`,
|
|
3827
|
+
message: "Replay provider requires 'source_target' as a non-empty string"
|
|
3828
|
+
});
|
|
3829
|
+
}
|
|
3830
|
+
}
|
|
3764
3831
|
if (typeof provider === "string" && !isTemplated) {
|
|
3765
3832
|
validateUnknownSettings(target, provider, absolutePath, location, errors);
|
|
3766
3833
|
}
|
|
@@ -4005,6 +4072,14 @@ function validateProjectResultsConfig(errors, filePath, rawResults, location) {
|
|
|
4005
4072
|
}
|
|
4006
4073
|
}
|
|
4007
4074
|
validateGitRemoteUrl(errors, filePath, resultsRecord.repo_url, `${location}.repo_url`);
|
|
4075
|
+
if (resultsRecord.branch !== void 0 && (typeof resultsRecord.branch !== "string" || resultsRecord.branch.trim().length === 0)) {
|
|
4076
|
+
errors.push({
|
|
4077
|
+
severity: "error",
|
|
4078
|
+
filePath,
|
|
4079
|
+
location: `${location}.branch`,
|
|
4080
|
+
message: `Field '${location}.branch' must be a non-empty string`
|
|
4081
|
+
});
|
|
4082
|
+
}
|
|
4008
4083
|
if (resultsRecord.path !== void 0) {
|
|
4009
4084
|
if (typeof resultsRecord.path !== "string" || resultsRecord.path.trim().length === 0) {
|
|
4010
4085
|
errors.push({
|
|
@@ -4074,6 +4149,14 @@ function validateResultsConfig(errors, filePath, rawResults, location) {
|
|
|
4074
4149
|
});
|
|
4075
4150
|
}
|
|
4076
4151
|
validateRequiredString(errors, filePath, resultsRecord.repo, `${location}.repo`);
|
|
4152
|
+
if (resultsRecord.branch !== void 0 && (typeof resultsRecord.branch !== "string" || resultsRecord.branch.trim().length === 0)) {
|
|
4153
|
+
errors.push({
|
|
4154
|
+
severity: "error",
|
|
4155
|
+
filePath,
|
|
4156
|
+
location: `${location}.branch`,
|
|
4157
|
+
message: `Field '${location}.branch' must be a non-empty string`
|
|
4158
|
+
});
|
|
4159
|
+
}
|
|
4077
4160
|
if (resultsRecord.path !== void 0) {
|
|
4078
4161
|
if (typeof resultsRecord.path !== "string" || resultsRecord.path.trim().length === 0) {
|
|
4079
4162
|
errors.push({
|
|
@@ -4582,6 +4665,94 @@ Errors in ${targetsFilePath}:`);
|
|
|
4582
4665
|
return results;
|
|
4583
4666
|
}
|
|
4584
4667
|
|
|
4668
|
+
// src/commands/eval/wip-checkpoint.ts
|
|
4669
|
+
var WIP_CHECKPOINT_INTERVAL_MS = 3e4;
|
|
4670
|
+
var defaultDependencies = {
|
|
4671
|
+
buildWipBranchName,
|
|
4672
|
+
deleteWipBranch,
|
|
4673
|
+
pushWipCheckpoint,
|
|
4674
|
+
setupWipWorktree
|
|
4675
|
+
};
|
|
4676
|
+
function warnCheckpointError(context, error) {
|
|
4677
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
4678
|
+
console.warn(`WIP checkpoint: ${context}: ${message}`);
|
|
4679
|
+
}
|
|
4680
|
+
var WipCheckpointLoop = class {
|
|
4681
|
+
wipBranch;
|
|
4682
|
+
config;
|
|
4683
|
+
runDir;
|
|
4684
|
+
destinationPath;
|
|
4685
|
+
intervalMs;
|
|
4686
|
+
deps;
|
|
4687
|
+
handle;
|
|
4688
|
+
timer;
|
|
4689
|
+
checkpointInFlight;
|
|
4690
|
+
active = false;
|
|
4691
|
+
constructor(params) {
|
|
4692
|
+
this.config = params.config;
|
|
4693
|
+
this.runDir = params.runDir;
|
|
4694
|
+
this.destinationPath = params.destinationPath;
|
|
4695
|
+
this.intervalMs = params.intervalMs ?? WIP_CHECKPOINT_INTERVAL_MS;
|
|
4696
|
+
this.deps = params.dependencies ?? defaultDependencies;
|
|
4697
|
+
this.wipBranch = this.deps.buildWipBranchName(params.runDir);
|
|
4698
|
+
}
|
|
4699
|
+
async start() {
|
|
4700
|
+
try {
|
|
4701
|
+
this.handle = await this.deps.setupWipWorktree({
|
|
4702
|
+
config: this.config,
|
|
4703
|
+
wipBranch: this.wipBranch
|
|
4704
|
+
});
|
|
4705
|
+
} catch (err) {
|
|
4706
|
+
warnCheckpointError("failed to set up WIP worktree", err);
|
|
4707
|
+
return;
|
|
4708
|
+
}
|
|
4709
|
+
this.active = true;
|
|
4710
|
+
this.timer = setInterval(() => {
|
|
4711
|
+
this.runCheckpointIfIdle();
|
|
4712
|
+
}, this.intervalMs);
|
|
4713
|
+
this.timer.unref?.();
|
|
4714
|
+
}
|
|
4715
|
+
runCheckpointIfIdle() {
|
|
4716
|
+
if (!this.active || this.checkpointInFlight) return;
|
|
4717
|
+
this.checkpointInFlight = this.checkpoint().catch((err) => warnCheckpointError("push failed", err)).finally(() => {
|
|
4718
|
+
this.checkpointInFlight = void 0;
|
|
4719
|
+
});
|
|
4720
|
+
}
|
|
4721
|
+
async checkpoint() {
|
|
4722
|
+
if (!this.handle) return;
|
|
4723
|
+
await this.deps.pushWipCheckpoint({
|
|
4724
|
+
handle: this.handle,
|
|
4725
|
+
sourceDir: this.runDir,
|
|
4726
|
+
destinationPath: this.destinationPath
|
|
4727
|
+
});
|
|
4728
|
+
}
|
|
4729
|
+
/** Stop the loop and clean up the local worktree. Does NOT delete the remote WIP branch. */
|
|
4730
|
+
async stop() {
|
|
4731
|
+
this.active = false;
|
|
4732
|
+
if (this.timer !== void 0) {
|
|
4733
|
+
clearInterval(this.timer);
|
|
4734
|
+
this.timer = void 0;
|
|
4735
|
+
}
|
|
4736
|
+
await this.checkpointInFlight;
|
|
4737
|
+
if (this.handle) {
|
|
4738
|
+
await this.handle.cleanup().catch((err) => warnCheckpointError("worktree cleanup failed", err));
|
|
4739
|
+
this.handle = void 0;
|
|
4740
|
+
}
|
|
4741
|
+
}
|
|
4742
|
+
/**
|
|
4743
|
+
* Stop the loop and delete the remote WIP branch.
|
|
4744
|
+
* Call after a successful run to keep the results repo tidy.
|
|
4745
|
+
*/
|
|
4746
|
+
async stopAndDeleteWipBranch() {
|
|
4747
|
+
await this.stop();
|
|
4748
|
+
try {
|
|
4749
|
+
await this.deps.deleteWipBranch({ config: this.config, wipBranch: this.wipBranch });
|
|
4750
|
+
} catch (err) {
|
|
4751
|
+
warnCheckpointError(`failed to delete remote branch ${this.wipBranch}`, err);
|
|
4752
|
+
}
|
|
4753
|
+
}
|
|
4754
|
+
};
|
|
4755
|
+
|
|
4585
4756
|
// src/commands/eval/run-eval.ts
|
|
4586
4757
|
var DEFAULT_WORKERS = 3;
|
|
4587
4758
|
function shouldSkipExistingResultForResume(result, rerunFailed) {
|
|
@@ -4722,17 +4893,8 @@ function normalizeOutputMessages(cliValue) {
|
|
|
4722
4893
|
}
|
|
4723
4894
|
return parsed;
|
|
4724
4895
|
}
|
|
4725
|
-
function trimOutputMessages(output, outputMessages) {
|
|
4726
|
-
|
|
4727
|
-
if (outputMessages === "all") {
|
|
4728
|
-
return messages.map((m) => ({ role: m.role, content: m.content }));
|
|
4729
|
-
}
|
|
4730
|
-
if (outputMessages === 1) {
|
|
4731
|
-
const lastAssistant = messages.filter((m) => m.role === "assistant").at(-1);
|
|
4732
|
-
return lastAssistant ? [{ role: lastAssistant.role, content: lastAssistant.content }] : [];
|
|
4733
|
-
}
|
|
4734
|
-
const sliced = messages.slice(-outputMessages);
|
|
4735
|
-
return sliced.map((m) => ({ role: m.role, content: m.content }));
|
|
4896
|
+
function trimOutputMessages(output, _outputMessages) {
|
|
4897
|
+
return output;
|
|
4736
4898
|
}
|
|
4737
4899
|
function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
4738
4900
|
const cliWorkers = normalizeOptionalNumber(rawOptions.workers);
|
|
@@ -4819,6 +4981,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
4819
4981
|
tags: normalizeStringArray(rawOptions.tag),
|
|
4820
4982
|
excludeTags: normalizeStringArray(rawOptions.excludeTag),
|
|
4821
4983
|
transcript: normalizeString(rawOptions.transcript),
|
|
4984
|
+
recordReplay: normalizeString(rawOptions.recordReplay),
|
|
4985
|
+
recordReplayVariant: normalizeString(rawOptions.recordReplayVariant),
|
|
4822
4986
|
experiment: normalizeString(rawOptions.experiment),
|
|
4823
4987
|
budgetUsd: normalizeOptionalNumber(rawOptions.budgetUsd),
|
|
4824
4988
|
sourceMetadataByEvalFile: normalizeSourceMetadataByEvalFile(
|
|
@@ -5115,6 +5279,11 @@ async function runSingleEvalFile(params) {
|
|
|
5115
5279
|
providerFactory
|
|
5116
5280
|
} = params;
|
|
5117
5281
|
const targetName = selection.targetName;
|
|
5282
|
+
const replayRecording = options.recordReplay ? {
|
|
5283
|
+
fixturesPath: path12.resolve(options.recordReplay),
|
|
5284
|
+
sourceTarget: targetName,
|
|
5285
|
+
variant: options.recordReplayVariant
|
|
5286
|
+
} : void 0;
|
|
5118
5287
|
await ensureFileExists(testFilePath, "Test file");
|
|
5119
5288
|
const resolvedTargetSelection = applyVerboseOverride(selection, options.verbose);
|
|
5120
5289
|
const providerLabel = options.dryRun ? `${resolvedTargetSelection.resolvedTarget.kind} (dry-run)` : resolvedTargetSelection.resolvedTarget.kind;
|
|
@@ -5191,6 +5360,7 @@ async function runSingleEvalFile(params) {
|
|
|
5191
5360
|
model: options.model,
|
|
5192
5361
|
threshold: options.threshold,
|
|
5193
5362
|
targetHooks: resolvedTargetSelection.targetHooks,
|
|
5363
|
+
replayRecording,
|
|
5194
5364
|
providerFactory,
|
|
5195
5365
|
streamCallbacks: streamingObserver?.getStreamCallbacks(),
|
|
5196
5366
|
onResult: async (result) => {
|
|
@@ -5381,7 +5551,7 @@ async function runEvalCommand(input) {
|
|
|
5381
5551
|
const useFileExport = !!options.otelFile;
|
|
5382
5552
|
if (options.exportOtel || useFileExport) {
|
|
5383
5553
|
try {
|
|
5384
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-M4B77IW4.js");
|
|
5554
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-BSFUYS54.js");
|
|
5385
5555
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
5386
5556
|
let headers = {};
|
|
5387
5557
|
if (options.otelBackend) {
|
|
@@ -5480,6 +5650,9 @@ async function runEvalCommand(input) {
|
|
|
5480
5650
|
if (cache) {
|
|
5481
5651
|
console.log(`Response cache: enabled (${cache.cachePath})`);
|
|
5482
5652
|
}
|
|
5653
|
+
if (options.recordReplay) {
|
|
5654
|
+
console.log(`Replay recording: ${path12.resolve(options.recordReplay)}`);
|
|
5655
|
+
}
|
|
5483
5656
|
const yamlThreshold = firstMeta?.threshold;
|
|
5484
5657
|
const resolvedThreshold = options.threshold ?? yamlThreshold;
|
|
5485
5658
|
if (resolvedThreshold !== void 0 && (resolvedThreshold < 0 || resolvedThreshold > 1)) {
|
|
@@ -5574,7 +5747,7 @@ async function runEvalCommand(input) {
|
|
|
5574
5747
|
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
|
|
5575
5748
|
let transcriptProviderFactory;
|
|
5576
5749
|
if (options.transcript) {
|
|
5577
|
-
const { TranscriptProvider } = await import("./dist-M4B77IW4.js");
|
|
5750
|
+
const { TranscriptProvider } = await import("./dist-BSFUYS54.js");
|
|
5578
5751
|
const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
|
|
5579
5752
|
const totalTests = [...fileMetadata.values()].reduce(
|
|
5580
5753
|
(sum, meta) => sum + meta.testCases.length,
|
|
@@ -5598,6 +5771,20 @@ async function runEvalCommand(input) {
|
|
|
5598
5771
|
experiment: normalizeExperimentName(options.experiment)
|
|
5599
5772
|
});
|
|
5600
5773
|
}
|
|
5774
|
+
let wipLoop;
|
|
5775
|
+
let wipCleanedUp = false;
|
|
5776
|
+
let finalExportStatus = "disabled";
|
|
5777
|
+
{
|
|
5778
|
+
const wipConfig = await loadNormalizedResultsConfig(cwd).catch(() => void 0);
|
|
5779
|
+
if (wipConfig?.auto_push) {
|
|
5780
|
+
wipLoop = new WipCheckpointLoop({
|
|
5781
|
+
config: wipConfig,
|
|
5782
|
+
runDir,
|
|
5783
|
+
destinationPath: getRelativeRunPath(cwd, runDir)
|
|
5784
|
+
});
|
|
5785
|
+
await wipLoop.start();
|
|
5786
|
+
}
|
|
5787
|
+
}
|
|
5601
5788
|
try {
|
|
5602
5789
|
for (const testFilePath of activeTestFiles) {
|
|
5603
5790
|
if (runBudgetTracker?.isExceeded()) {
|
|
@@ -5612,7 +5799,16 @@ async function runEvalCommand(input) {
|
|
|
5612
5799
|
testId: testCase.id,
|
|
5613
5800
|
score: 0,
|
|
5614
5801
|
assertions: [],
|
|
5615
|
-
output:
|
|
5802
|
+
output: budgetMsg,
|
|
5803
|
+
trace: buildTraceFromMessages({
|
|
5804
|
+
input: testCase.input,
|
|
5805
|
+
output: [{ role: "assistant", content: budgetMsg }],
|
|
5806
|
+
finalOutput: budgetMsg,
|
|
5807
|
+
target: selection.targetName,
|
|
5808
|
+
testId: testCase.id,
|
|
5809
|
+
conversationId: testCase.conversation_id,
|
|
5810
|
+
error: budgetMsg
|
|
5811
|
+
}),
|
|
5616
5812
|
error: budgetMsg,
|
|
5617
5813
|
budgetExceeded: true,
|
|
5618
5814
|
executionStatus: "execution_error",
|
|
@@ -5702,7 +5898,16 @@ async function runEvalCommand(input) {
|
|
|
5702
5898
|
testId: testCase.id,
|
|
5703
5899
|
score: 0,
|
|
5704
5900
|
assertions: [],
|
|
5705
|
-
output:
|
|
5901
|
+
output: message,
|
|
5902
|
+
trace: buildTraceFromMessages({
|
|
5903
|
+
input: testCase.input,
|
|
5904
|
+
output: [{ role: "assistant", content: message }],
|
|
5905
|
+
finalOutput: message,
|
|
5906
|
+
target: selection.targetName,
|
|
5907
|
+
testId: testCase.id,
|
|
5908
|
+
conversationId: testCase.conversation_id,
|
|
5909
|
+
error: message
|
|
5910
|
+
}),
|
|
5706
5911
|
scores: [],
|
|
5707
5912
|
error: message,
|
|
5708
5913
|
executionStatus: "execution_error",
|
|
@@ -5758,7 +5963,7 @@ async function runEvalCommand(input) {
|
|
|
5758
5963
|
);
|
|
5759
5964
|
const taskBundleTargets = buildTaskBundleTargetSelections(activeTestFiles, fileMetadata);
|
|
5760
5965
|
if (isResumeAppend) {
|
|
5761
|
-
const { writePerTestArtifacts } = await import("./artifact-writer-G57MG52C.js");
|
|
5966
|
+
const { writePerTestArtifacts } = await import("./artifact-writer-3YRN6YTA.js");
|
|
5762
5967
|
await writePerTestArtifacts(allResults, runDir, {
|
|
5763
5968
|
experiment: normalizeExperimentName(options.experiment),
|
|
5764
5969
|
cwd,
|
|
@@ -5815,7 +6020,7 @@ async function runEvalCommand(input) {
|
|
|
5815
6020
|
console.log(`
|
|
5816
6021
|
Results written to: ${outputPath}`);
|
|
5817
6022
|
await saveRunCache(cwd, outputPath).catch(() => void 0);
|
|
5818
|
-
await maybeAutoExportRunArtifacts({
|
|
6023
|
+
finalExportStatus = await maybeAutoExportRunArtifacts({
|
|
5819
6024
|
cwd,
|
|
5820
6025
|
run_dir: runDir,
|
|
5821
6026
|
test_files: activeTestFiles,
|
|
@@ -5851,6 +6056,10 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
|
|
|
5851
6056
|
\u26A0 Run budget exceeded: $${runBudgetTracker?.currentCostUsd.toFixed(4)} spent of $${runBudgetTracker?.budgetCapUsd.toFixed(4)} cap`
|
|
5852
6057
|
);
|
|
5853
6058
|
}
|
|
6059
|
+
if (wipLoop && (finalExportStatus === "published" || finalExportStatus === "already_published")) {
|
|
6060
|
+
wipCleanedUp = true;
|
|
6061
|
+
await wipLoop.stopAndDeleteWipBranch();
|
|
6062
|
+
}
|
|
5854
6063
|
return {
|
|
5855
6064
|
executionErrorCount: summary.executionErrorCount,
|
|
5856
6065
|
outputPath,
|
|
@@ -5861,6 +6070,9 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
|
|
|
5861
6070
|
budgetExceeded: runBudgetExceeded || void 0
|
|
5862
6071
|
};
|
|
5863
6072
|
} finally {
|
|
6073
|
+
if (wipLoop && !wipCleanedUp) {
|
|
6074
|
+
await wipLoop.stop().catch(() => void 0);
|
|
6075
|
+
}
|
|
5864
6076
|
unsubscribeCodexLogs();
|
|
5865
6077
|
unsubscribePiLogs();
|
|
5866
6078
|
unsubscribeCopilotSdkLogs();
|
|
@@ -5986,4 +6198,4 @@ export {
|
|
|
5986
6198
|
getCategories,
|
|
5987
6199
|
filterByCategory
|
|
5988
6200
|
};
|
|
5989
|
-
//# sourceMappingURL=chunk-INOKS5LF.js.map
|
|
6201
|
+
//# sourceMappingURL=chunk-7KZ2AF26.js.map
|