agentv 4.35.1 → 4.37.0-next.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{artifact-writer-G57MG52C.js → artifact-writer-GFNKYREE.js} +4 -4
- package/dist/{chunk-INOKS5LF.js → chunk-M7AMFWBZ.js} +275 -58
- package/dist/chunk-M7AMFWBZ.js.map +1 -0
- package/dist/{chunk-KJGYL3M3.js → chunk-N6E5XFOM.js} +213 -85
- package/dist/chunk-N6E5XFOM.js.map +1 -0
- package/dist/{chunk-KNF3AGCI.js → chunk-OYI35QFW.js} +314 -49
- package/dist/chunk-OYI35QFW.js.map +1 -0
- package/dist/{chunk-CRMGUVRZ.js → chunk-P4LSNFZR.js} +85 -19
- package/dist/chunk-P4LSNFZR.js.map +1 -0
- package/dist/{chunk-6QEIZ33V.js → chunk-RL4S2FBZ.js} +2700 -456
- package/dist/chunk-RL4S2FBZ.js.map +1 -0
- package/dist/cli.js +5 -5
- package/dist/dashboard/assets/index-9tV-u4HJ.css +1 -0
- package/dist/dashboard/assets/{index-Bdk-9a_8.js → index-BDRYJsGF.js} +1 -1
- package/dist/dashboard/assets/index-DuESU7zZ.js +118 -0
- package/dist/dashboard/index.html +2 -2
- package/dist/{dist-M4B77IW4.js → dist-OY3JSP6Z.js} +125 -3
- package/dist/index.js +5 -5
- package/dist/{interactive-VYQ5SYMR.js → interactive-CQELHITQ.js} +5 -5
- package/dist/skills/agentv-eval-writer/SKILL.md +6 -0
- package/dist/{ts-eval-loader-EQJX3OLT-THE7D3GR.js → ts-eval-loader-RBTB2HG2-H5TRXZLO.js} +2 -2
- package/package.json +1 -1
- package/dist/chunk-6QEIZ33V.js.map +0 -1
- package/dist/chunk-CRMGUVRZ.js.map +0 -1
- package/dist/chunk-INOKS5LF.js.map +0 -1
- package/dist/chunk-KJGYL3M3.js.map +0 -1
- package/dist/chunk-KNF3AGCI.js.map +0 -1
- package/dist/dashboard/assets/index-BPMAZqjE.css +0 -1
- package/dist/dashboard/assets/index-BWO0UcxG.js +0 -118
- /package/dist/{artifact-writer-G57MG52C.js.map → artifact-writer-GFNKYREE.js.map} +0 -0
- /package/dist/{dist-M4B77IW4.js.map → dist-OY3JSP6Z.js.map} +0 -0
- /package/dist/{interactive-VYQ5SYMR.js.map → interactive-CQELHITQ.js.map} +0 -0
- /package/dist/{ts-eval-loader-EQJX3OLT-THE7D3GR.js.map → ts-eval-loader-RBTB2HG2-H5TRXZLO.js.map} +0 -0
|
@@ -14,11 +14,11 @@ import {
|
|
|
14
14
|
writeArtifactsFromResults,
|
|
15
15
|
writeInitialBenchmarkArtifact,
|
|
16
16
|
writePerTestArtifacts
|
|
17
|
-
} from "./chunk-
|
|
18
|
-
import "./chunk-
|
|
17
|
+
} from "./chunk-N6E5XFOM.js";
|
|
18
|
+
import "./chunk-OYI35QFW.js";
|
|
19
19
|
import "./chunk-QOBQ5XYF.js";
|
|
20
20
|
import "./chunk-BPGJ4HBU.js";
|
|
21
|
-
import "./chunk-
|
|
21
|
+
import "./chunk-RL4S2FBZ.js";
|
|
22
22
|
import "./chunk-NPVGBFF6.js";
|
|
23
23
|
import "./chunk-M7BUKBAF.js";
|
|
24
24
|
import "./chunk-5H446C7X.js";
|
|
@@ -38,4 +38,4 @@ export {
|
|
|
38
38
|
writeInitialBenchmarkArtifact,
|
|
39
39
|
writePerTestArtifacts
|
|
40
40
|
};
|
|
41
|
-
//# sourceMappingURL=artifact-writer-
|
|
41
|
+
//# sourceMappingURL=artifact-writer-GFNKYREE.js.map
|
|
@@ -13,12 +13,14 @@ import {
|
|
|
13
13
|
resolveRunIndexPath,
|
|
14
14
|
resolveRunManifestPath,
|
|
15
15
|
resolveWorkspaceOrFilePath,
|
|
16
|
-
toSnakeCaseDeep,
|
|
16
|
+
toSnakeCaseDeep as toSnakeCaseDeep2,
|
|
17
17
|
writeArtifactsFromResults,
|
|
18
18
|
writeInitialBenchmarkArtifact
|
|
19
|
-
} from "./chunk-
|
|
19
|
+
} from "./chunk-N6E5XFOM.js";
|
|
20
20
|
import {
|
|
21
21
|
RunBudgetTracker,
|
|
22
|
+
buildWipBranchName,
|
|
23
|
+
deleteWipBranch,
|
|
22
24
|
deriveCategory,
|
|
23
25
|
directPushResults,
|
|
24
26
|
directorySizeBytes,
|
|
@@ -29,9 +31,12 @@ import {
|
|
|
29
31
|
loadTsConfig,
|
|
30
32
|
materializeGitRun,
|
|
31
33
|
normalizeResultsConfig,
|
|
34
|
+
pushWipCheckpoint,
|
|
32
35
|
resolveResultsRepoRunsDir,
|
|
33
|
-
|
|
34
|
-
|
|
36
|
+
setupWipWorktree,
|
|
37
|
+
syncResultsRepoForProject,
|
|
38
|
+
traceFromTranscriptJsonLines
|
|
39
|
+
} from "./chunk-OYI35QFW.js";
|
|
35
40
|
import {
|
|
36
41
|
CLI_PLACEHOLDERS,
|
|
37
42
|
COMMON_TARGET_SETTINGS,
|
|
@@ -42,6 +47,7 @@ import {
|
|
|
42
47
|
ResponseCache,
|
|
43
48
|
buildDirectoryChain,
|
|
44
49
|
buildSearchRoots,
|
|
50
|
+
buildTraceFromMessages,
|
|
45
51
|
ensureVSCodeSubagents,
|
|
46
52
|
findDeprecatedCamelCaseTargetWarnings,
|
|
47
53
|
findGitRoot,
|
|
@@ -66,8 +72,9 @@ import {
|
|
|
66
72
|
subscribeToCopilotCliLogEntries,
|
|
67
73
|
subscribeToCopilotSdkLogEntries,
|
|
68
74
|
subscribeToPiLogEntries,
|
|
69
|
-
toCamelCaseDeep
|
|
70
|
-
|
|
75
|
+
toCamelCaseDeep,
|
|
76
|
+
toSnakeCaseDeep
|
|
77
|
+
} from "./chunk-RL4S2FBZ.js";
|
|
71
78
|
|
|
72
79
|
// src/commands/eval/shared.ts
|
|
73
80
|
import { constants } from "node:fs";
|
|
@@ -183,7 +190,7 @@ async function findRepoRoot(start) {
|
|
|
183
190
|
// package.json
|
|
184
191
|
var package_default = {
|
|
185
192
|
name: "agentv",
|
|
186
|
-
version: "4.
|
|
193
|
+
version: "4.37.0-next.1",
|
|
187
194
|
description: "CLI entry point for AgentV",
|
|
188
195
|
type: "module",
|
|
189
196
|
repository: {
|
|
@@ -574,18 +581,31 @@ function hydrateInput(baseDir, record) {
|
|
|
574
581
|
return messages.length > 0 ? messages : [{ role: "user", content: inputText.trimEnd() }];
|
|
575
582
|
}
|
|
576
583
|
function hydrateOutput(baseDir, record) {
|
|
577
|
-
const responseText = readOptionalText(
|
|
584
|
+
const responseText = readOptionalText(
|
|
585
|
+
baseDir,
|
|
586
|
+
record.output_path ?? record.answer_path ?? record.response_path
|
|
587
|
+
);
|
|
578
588
|
if (!responseText) {
|
|
579
589
|
return void 0;
|
|
580
590
|
}
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
591
|
+
return responseText.trimEnd();
|
|
592
|
+
}
|
|
593
|
+
function hydrateTrace(baseDir, record) {
|
|
594
|
+
const transcriptText = readOptionalText(baseDir, record.transcript_path);
|
|
595
|
+
if (transcriptText) {
|
|
596
|
+
try {
|
|
597
|
+
return traceFromTranscriptJsonLines(parseJsonlLines(transcriptText));
|
|
598
|
+
} catch {
|
|
599
|
+
}
|
|
587
600
|
}
|
|
588
|
-
|
|
601
|
+
const output = hydrateOutput(baseDir, record) ?? "";
|
|
602
|
+
return buildTraceFromMessages({
|
|
603
|
+
input: hydrateInput(baseDir, record),
|
|
604
|
+
output: output ? [{ role: "assistant", content: output }] : [],
|
|
605
|
+
finalOutput: output,
|
|
606
|
+
target: record.target,
|
|
607
|
+
testId: record.test_id
|
|
608
|
+
});
|
|
589
609
|
}
|
|
590
610
|
function hydrateManifestRecord(baseDir, record) {
|
|
591
611
|
const grading = readOptionalJson(baseDir, record.grading_path);
|
|
@@ -630,7 +650,8 @@ function hydrateManifestRecord(baseDir, record) {
|
|
|
630
650
|
durationMs: timing?.duration_ms ?? record.duration_ms,
|
|
631
651
|
costUsd: record.cost_usd,
|
|
632
652
|
input: hydrateInput(baseDir, record),
|
|
633
|
-
output: hydrateOutput(baseDir, record),
|
|
653
|
+
output: hydrateOutput(baseDir, record) ?? "",
|
|
654
|
+
trace: hydrateTrace(baseDir, record),
|
|
634
655
|
metadata: record.metadata
|
|
635
656
|
};
|
|
636
657
|
}
|
|
@@ -756,6 +777,7 @@ function toRawResult(result) {
|
|
|
756
777
|
end_time: result.endTime,
|
|
757
778
|
input: result.input,
|
|
758
779
|
output: result.output,
|
|
780
|
+
trace: toSnakeCaseDeep(result.trace),
|
|
759
781
|
file_changes: result.fileChanges
|
|
760
782
|
};
|
|
761
783
|
}
|
|
@@ -1336,25 +1358,33 @@ function deleteRemoteRunTags(repoDir, manifestPath) {
|
|
|
1336
1358
|
// src/commands/results/remote.ts
|
|
1337
1359
|
var gitRunsCache = /* @__PURE__ */ new Map();
|
|
1338
1360
|
var GIT_RUNS_CACHE_TTL_MS = 6e4;
|
|
1339
|
-
function
|
|
1361
|
+
function getResultsStorageRef(config) {
|
|
1362
|
+
return config.branch ? `origin/${config.branch}` : void 0;
|
|
1363
|
+
}
|
|
1364
|
+
function cachedListGitRuns(repoDir, ref) {
|
|
1340
1365
|
const now = Date.now();
|
|
1341
|
-
const
|
|
1366
|
+
const cacheKey = `${repoDir}\0${ref ?? ""}`;
|
|
1367
|
+
const cached = gitRunsCache.get(cacheKey);
|
|
1342
1368
|
if (cached && cached.expiresAt > now) {
|
|
1343
1369
|
return cached.data;
|
|
1344
1370
|
}
|
|
1345
|
-
const promise = listGitRuns(repoDir);
|
|
1346
|
-
gitRunsCache.set(
|
|
1371
|
+
const promise = ref ? listGitRuns(repoDir, ref) : listGitRuns(repoDir);
|
|
1372
|
+
gitRunsCache.set(cacheKey, { data: promise, expiresAt: now + GIT_RUNS_CACHE_TTL_MS });
|
|
1347
1373
|
promise.catch(() => {
|
|
1348
1374
|
}).finally(() => {
|
|
1349
|
-
const entry = gitRunsCache.get(
|
|
1375
|
+
const entry = gitRunsCache.get(cacheKey);
|
|
1350
1376
|
if (entry && entry.expiresAt <= Date.now()) {
|
|
1351
|
-
gitRunsCache.delete(
|
|
1377
|
+
gitRunsCache.delete(cacheKey);
|
|
1352
1378
|
}
|
|
1353
1379
|
});
|
|
1354
1380
|
return promise;
|
|
1355
1381
|
}
|
|
1356
1382
|
function invalidateGitRunsCache(repoDir) {
|
|
1357
|
-
gitRunsCache.
|
|
1383
|
+
for (const key of gitRunsCache.keys()) {
|
|
1384
|
+
if (key.startsWith(`${repoDir}\0`)) {
|
|
1385
|
+
gitRunsCache.delete(key);
|
|
1386
|
+
}
|
|
1387
|
+
}
|
|
1358
1388
|
}
|
|
1359
1389
|
var REMOTE_RUN_PREFIX = "remote::";
|
|
1360
1390
|
var SIZE_WARNING_BYTES = 10 * 1024 * 1024;
|
|
@@ -1391,6 +1421,7 @@ async function loadNormalizedResultsConfig(cwd, projectId) {
|
|
|
1391
1421
|
const projectResults = project?.results ? {
|
|
1392
1422
|
mode: "github",
|
|
1393
1423
|
repo: project.results.repoUrl,
|
|
1424
|
+
branch: project.results.branch,
|
|
1394
1425
|
path: project.results.path,
|
|
1395
1426
|
auto_push: project.results.sync?.autoPush,
|
|
1396
1427
|
branch_prefix: project.results.branchPrefix
|
|
@@ -1420,9 +1451,11 @@ async function getRemoteRunCount(config, status) {
|
|
|
1420
1451
|
let runCount = 0;
|
|
1421
1452
|
if (config && status.available) {
|
|
1422
1453
|
try {
|
|
1423
|
-
runCount = (await cachedListGitRuns(config.path)).length;
|
|
1454
|
+
runCount = (await cachedListGitRuns(config.path, getResultsStorageRef(config))).length;
|
|
1424
1455
|
} catch {
|
|
1425
|
-
|
|
1456
|
+
if (!config.branch) {
|
|
1457
|
+
runCount = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).length;
|
|
1458
|
+
}
|
|
1426
1459
|
}
|
|
1427
1460
|
}
|
|
1428
1461
|
return runCount;
|
|
@@ -1482,7 +1515,7 @@ async function listMergedResultFiles(cwd, limit, projectId) {
|
|
|
1482
1515
|
let remoteRuns = [];
|
|
1483
1516
|
if (config.mode === "github") {
|
|
1484
1517
|
try {
|
|
1485
|
-
const gitRuns = await cachedListGitRuns(config.path);
|
|
1518
|
+
const gitRuns = await cachedListGitRuns(config.path, getResultsStorageRef(config));
|
|
1486
1519
|
remoteRuns = gitRuns.map((r) => ({
|
|
1487
1520
|
filename: encodeRemoteRunId(r.run_id),
|
|
1488
1521
|
raw_filename: r.run_id,
|
|
@@ -1496,15 +1529,19 @@ async function listMergedResultFiles(cwd, limit, projectId) {
|
|
|
1496
1529
|
sizeBytes: r.size_bytes || 0
|
|
1497
1530
|
}));
|
|
1498
1531
|
} catch (error) {
|
|
1499
|
-
|
|
1500
|
-
|
|
1501
|
-
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
|
|
1505
|
-
|
|
1506
|
-
|
|
1507
|
-
|
|
1532
|
+
if (config.branch) {
|
|
1533
|
+
console.error("git-native listing failed for configured results branch", error);
|
|
1534
|
+
} else {
|
|
1535
|
+
console.error("git-native listing failed, falling back", error);
|
|
1536
|
+
remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
|
|
1537
|
+
(meta) => ({
|
|
1538
|
+
...meta,
|
|
1539
|
+
filename: encodeRemoteRunId(meta.filename),
|
|
1540
|
+
raw_filename: meta.filename,
|
|
1541
|
+
source: "remote"
|
|
1542
|
+
})
|
|
1543
|
+
);
|
|
1544
|
+
}
|
|
1508
1545
|
}
|
|
1509
1546
|
} else {
|
|
1510
1547
|
remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
|
|
@@ -1544,7 +1581,7 @@ async function ensureRemoteRunAvailable(cwd, meta, projectId) {
|
|
|
1544
1581
|
".agentv/results/runs",
|
|
1545
1582
|
path7.posix.dirname(relativeManifestPath)
|
|
1546
1583
|
);
|
|
1547
|
-
await materializeGitRun(config.path, relativeRunPath);
|
|
1584
|
+
await materializeGitRun(config.path, relativeRunPath, getResultsStorageRef(config));
|
|
1548
1585
|
}
|
|
1549
1586
|
async function readRemoteRunTagState(cwd, meta, projectId) {
|
|
1550
1587
|
if (meta.source !== "remote") return void 0;
|
|
@@ -1581,7 +1618,7 @@ async function clearRemoteRunTags(cwd, meta, projectId) {
|
|
|
1581
1618
|
async function maybeAutoExportRunArtifacts(payload) {
|
|
1582
1619
|
const config = await loadNormalizedResultsConfig(payload.cwd);
|
|
1583
1620
|
if (!config?.auto_push) {
|
|
1584
|
-
return;
|
|
1621
|
+
return "disabled";
|
|
1585
1622
|
}
|
|
1586
1623
|
try {
|
|
1587
1624
|
await maybeWarnLargeArtifact(payload.run_dir);
|
|
@@ -1595,12 +1632,14 @@ async function maybeAutoExportRunArtifacts(payload) {
|
|
|
1595
1632
|
});
|
|
1596
1633
|
if (!pushed) {
|
|
1597
1634
|
console.warn("Warning: results export produced no git changes. Skipping push.");
|
|
1598
|
-
return;
|
|
1635
|
+
return "already_published";
|
|
1599
1636
|
}
|
|
1600
1637
|
console.log(`Results pushed to ${config.repo} (${config.path}/${relativeRunPath})`);
|
|
1638
|
+
return "published";
|
|
1601
1639
|
} catch (error) {
|
|
1602
1640
|
console.warn(`Warning: skipping results export: ${getStatusMessage(error)}`);
|
|
1603
1641
|
console.warn("Warning: Run 'gh auth login' if GitHub authentication is missing.");
|
|
1642
|
+
return "failed";
|
|
1604
1643
|
}
|
|
1605
1644
|
}
|
|
1606
1645
|
|
|
@@ -1906,7 +1945,7 @@ var JsonlWriter = class _JsonlWriter {
|
|
|
1906
1945
|
if (this.closed) {
|
|
1907
1946
|
throw new Error("Cannot write to closed JSONL writer");
|
|
1908
1947
|
}
|
|
1909
|
-
const snakeCaseRecord =
|
|
1948
|
+
const snakeCaseRecord = toSnakeCaseDeep2(record);
|
|
1910
1949
|
const line = `${JSON.stringify(snakeCaseRecord)}
|
|
1911
1950
|
`;
|
|
1912
1951
|
if (!this.stream.write(line)) {
|
|
@@ -3394,6 +3433,7 @@ var COPILOT_SDK_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
3394
3433
|
"log_format",
|
|
3395
3434
|
"stream_log",
|
|
3396
3435
|
"system_prompt",
|
|
3436
|
+
"custom_provider",
|
|
3397
3437
|
"byok"
|
|
3398
3438
|
]);
|
|
3399
3439
|
var COPILOT_CLI_SETTINGS = /* @__PURE__ */ new Set([
|
|
@@ -3409,7 +3449,8 @@ var COPILOT_CLI_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
3409
3449
|
"log_dir",
|
|
3410
3450
|
"log_format",
|
|
3411
3451
|
"stream_log",
|
|
3412
|
-
"system_prompt"
|
|
3452
|
+
"system_prompt",
|
|
3453
|
+
"custom_provider"
|
|
3413
3454
|
]);
|
|
3414
3455
|
var VSCODE_SETTINGS = /* @__PURE__ */ new Set([
|
|
3415
3456
|
...COMMON_SETTINGS,
|
|
@@ -3425,6 +3466,15 @@ var MOCK_SETTINGS = /* @__PURE__ */ new Set([
|
|
|
3425
3466
|
"trace"
|
|
3426
3467
|
// For testing tool-trajectory evaluator
|
|
3427
3468
|
]);
|
|
3469
|
+
var REPLAY_SETTINGS = /* @__PURE__ */ new Set([
|
|
3470
|
+
...COMMON_SETTINGS,
|
|
3471
|
+
"fixtures",
|
|
3472
|
+
"trace_envelopes",
|
|
3473
|
+
"source_target",
|
|
3474
|
+
"suite",
|
|
3475
|
+
"eval_path",
|
|
3476
|
+
"variant"
|
|
3477
|
+
]);
|
|
3428
3478
|
var CLAUDE_SETTINGS = /* @__PURE__ */ new Set([
|
|
3429
3479
|
...COMMON_SETTINGS,
|
|
3430
3480
|
"executable",
|
|
@@ -3480,6 +3530,8 @@ function getKnownSettings(provider) {
|
|
|
3480
3530
|
return VSCODE_SETTINGS;
|
|
3481
3531
|
case "mock":
|
|
3482
3532
|
return MOCK_SETTINGS;
|
|
3533
|
+
case "replay":
|
|
3534
|
+
return REPLAY_SETTINGS;
|
|
3483
3535
|
case "cli":
|
|
3484
3536
|
return null;
|
|
3485
3537
|
default:
|
|
@@ -3761,6 +3813,26 @@ async function validateTargetsFile(filePath) {
|
|
|
3761
3813
|
if (providerValue === "cli") {
|
|
3762
3814
|
validateCliSettings(target, absolutePath, location, errors);
|
|
3763
3815
|
}
|
|
3816
|
+
if (providerValue === "replay") {
|
|
3817
|
+
const hasFixtures = isNonEmptyString(target.fixtures);
|
|
3818
|
+
const hasTraceEnvelopes = isNonEmptyString(target.trace_envelopes);
|
|
3819
|
+
if (hasFixtures === hasTraceEnvelopes) {
|
|
3820
|
+
errors.push({
|
|
3821
|
+
severity: "error",
|
|
3822
|
+
filePath: absolutePath,
|
|
3823
|
+
location,
|
|
3824
|
+
message: "Replay provider requires exactly one replay source: 'fixtures' or 'trace_envelopes'"
|
|
3825
|
+
});
|
|
3826
|
+
}
|
|
3827
|
+
if (!isNonEmptyString(target.source_target)) {
|
|
3828
|
+
errors.push({
|
|
3829
|
+
severity: "error",
|
|
3830
|
+
filePath: absolutePath,
|
|
3831
|
+
location: `${location}.source_target`,
|
|
3832
|
+
message: "Replay provider requires 'source_target' as a non-empty string"
|
|
3833
|
+
});
|
|
3834
|
+
}
|
|
3835
|
+
}
|
|
3764
3836
|
if (typeof provider === "string" && !isTemplated) {
|
|
3765
3837
|
validateUnknownSettings(target, provider, absolutePath, location, errors);
|
|
3766
3838
|
}
|
|
@@ -4005,6 +4077,14 @@ function validateProjectResultsConfig(errors, filePath, rawResults, location) {
|
|
|
4005
4077
|
}
|
|
4006
4078
|
}
|
|
4007
4079
|
validateGitRemoteUrl(errors, filePath, resultsRecord.repo_url, `${location}.repo_url`);
|
|
4080
|
+
if (resultsRecord.branch !== void 0 && (typeof resultsRecord.branch !== "string" || resultsRecord.branch.trim().length === 0)) {
|
|
4081
|
+
errors.push({
|
|
4082
|
+
severity: "error",
|
|
4083
|
+
filePath,
|
|
4084
|
+
location: `${location}.branch`,
|
|
4085
|
+
message: `Field '${location}.branch' must be a non-empty string`
|
|
4086
|
+
});
|
|
4087
|
+
}
|
|
4008
4088
|
if (resultsRecord.path !== void 0) {
|
|
4009
4089
|
if (typeof resultsRecord.path !== "string" || resultsRecord.path.trim().length === 0) {
|
|
4010
4090
|
errors.push({
|
|
@@ -4074,6 +4154,14 @@ function validateResultsConfig(errors, filePath, rawResults, location) {
|
|
|
4074
4154
|
});
|
|
4075
4155
|
}
|
|
4076
4156
|
validateRequiredString(errors, filePath, resultsRecord.repo, `${location}.repo`);
|
|
4157
|
+
if (resultsRecord.branch !== void 0 && (typeof resultsRecord.branch !== "string" || resultsRecord.branch.trim().length === 0)) {
|
|
4158
|
+
errors.push({
|
|
4159
|
+
severity: "error",
|
|
4160
|
+
filePath,
|
|
4161
|
+
location: `${location}.branch`,
|
|
4162
|
+
message: `Field '${location}.branch' must be a non-empty string`
|
|
4163
|
+
});
|
|
4164
|
+
}
|
|
4077
4165
|
if (resultsRecord.path !== void 0) {
|
|
4078
4166
|
if (typeof resultsRecord.path !== "string" || resultsRecord.path.trim().length === 0) {
|
|
4079
4167
|
errors.push({
|
|
@@ -4582,6 +4670,94 @@ Errors in ${targetsFilePath}:`);
|
|
|
4582
4670
|
return results;
|
|
4583
4671
|
}
|
|
4584
4672
|
|
|
4673
|
+
// src/commands/eval/wip-checkpoint.ts
|
|
4674
|
+
var WIP_CHECKPOINT_INTERVAL_MS = 3e4;
|
|
4675
|
+
var defaultDependencies = {
|
|
4676
|
+
buildWipBranchName,
|
|
4677
|
+
deleteWipBranch,
|
|
4678
|
+
pushWipCheckpoint,
|
|
4679
|
+
setupWipWorktree
|
|
4680
|
+
};
|
|
4681
|
+
function warnCheckpointError(context, error) {
|
|
4682
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
4683
|
+
console.warn(`WIP checkpoint: ${context}: ${message}`);
|
|
4684
|
+
}
|
|
4685
|
+
var WipCheckpointLoop = class {
|
|
4686
|
+
wipBranch;
|
|
4687
|
+
config;
|
|
4688
|
+
runDir;
|
|
4689
|
+
destinationPath;
|
|
4690
|
+
intervalMs;
|
|
4691
|
+
deps;
|
|
4692
|
+
handle;
|
|
4693
|
+
timer;
|
|
4694
|
+
checkpointInFlight;
|
|
4695
|
+
active = false;
|
|
4696
|
+
constructor(params) {
|
|
4697
|
+
this.config = params.config;
|
|
4698
|
+
this.runDir = params.runDir;
|
|
4699
|
+
this.destinationPath = params.destinationPath;
|
|
4700
|
+
this.intervalMs = params.intervalMs ?? WIP_CHECKPOINT_INTERVAL_MS;
|
|
4701
|
+
this.deps = params.dependencies ?? defaultDependencies;
|
|
4702
|
+
this.wipBranch = this.deps.buildWipBranchName(params.runDir);
|
|
4703
|
+
}
|
|
4704
|
+
async start() {
|
|
4705
|
+
try {
|
|
4706
|
+
this.handle = await this.deps.setupWipWorktree({
|
|
4707
|
+
config: this.config,
|
|
4708
|
+
wipBranch: this.wipBranch
|
|
4709
|
+
});
|
|
4710
|
+
} catch (err) {
|
|
4711
|
+
warnCheckpointError("failed to set up WIP worktree", err);
|
|
4712
|
+
return;
|
|
4713
|
+
}
|
|
4714
|
+
this.active = true;
|
|
4715
|
+
this.timer = setInterval(() => {
|
|
4716
|
+
this.runCheckpointIfIdle();
|
|
4717
|
+
}, this.intervalMs);
|
|
4718
|
+
this.timer.unref?.();
|
|
4719
|
+
}
|
|
4720
|
+
runCheckpointIfIdle() {
|
|
4721
|
+
if (!this.active || this.checkpointInFlight) return;
|
|
4722
|
+
this.checkpointInFlight = this.checkpoint().catch((err) => warnCheckpointError("push failed", err)).finally(() => {
|
|
4723
|
+
this.checkpointInFlight = void 0;
|
|
4724
|
+
});
|
|
4725
|
+
}
|
|
4726
|
+
async checkpoint() {
|
|
4727
|
+
if (!this.handle) return;
|
|
4728
|
+
await this.deps.pushWipCheckpoint({
|
|
4729
|
+
handle: this.handle,
|
|
4730
|
+
sourceDir: this.runDir,
|
|
4731
|
+
destinationPath: this.destinationPath
|
|
4732
|
+
});
|
|
4733
|
+
}
|
|
4734
|
+
/** Stop the loop and clean up the local worktree. Does NOT delete the remote WIP branch. */
|
|
4735
|
+
async stop() {
|
|
4736
|
+
this.active = false;
|
|
4737
|
+
if (this.timer !== void 0) {
|
|
4738
|
+
clearInterval(this.timer);
|
|
4739
|
+
this.timer = void 0;
|
|
4740
|
+
}
|
|
4741
|
+
await this.checkpointInFlight;
|
|
4742
|
+
if (this.handle) {
|
|
4743
|
+
await this.handle.cleanup().catch((err) => warnCheckpointError("worktree cleanup failed", err));
|
|
4744
|
+
this.handle = void 0;
|
|
4745
|
+
}
|
|
4746
|
+
}
|
|
4747
|
+
/**
|
|
4748
|
+
* Stop the loop and delete the remote WIP branch.
|
|
4749
|
+
* Call after a successful run to keep the results repo tidy.
|
|
4750
|
+
*/
|
|
4751
|
+
async stopAndDeleteWipBranch() {
|
|
4752
|
+
await this.stop();
|
|
4753
|
+
try {
|
|
4754
|
+
await this.deps.deleteWipBranch({ config: this.config, wipBranch: this.wipBranch });
|
|
4755
|
+
} catch (err) {
|
|
4756
|
+
warnCheckpointError(`failed to delete remote branch ${this.wipBranch}`, err);
|
|
4757
|
+
}
|
|
4758
|
+
}
|
|
4759
|
+
};
|
|
4760
|
+
|
|
4585
4761
|
// src/commands/eval/run-eval.ts
|
|
4586
4762
|
var DEFAULT_WORKERS = 3;
|
|
4587
4763
|
function shouldSkipExistingResultForResume(result, rerunFailed) {
|
|
@@ -4722,17 +4898,8 @@ function normalizeOutputMessages(cliValue) {
|
|
|
4722
4898
|
}
|
|
4723
4899
|
return parsed;
|
|
4724
4900
|
}
|
|
4725
|
-
function trimOutputMessages(output,
|
|
4726
|
-
|
|
4727
|
-
if (outputMessages === "all") {
|
|
4728
|
-
return messages.map((m) => ({ role: m.role, content: m.content }));
|
|
4729
|
-
}
|
|
4730
|
-
if (outputMessages === 1) {
|
|
4731
|
-
const lastAssistant = messages.filter((m) => m.role === "assistant").at(-1);
|
|
4732
|
-
return lastAssistant ? [{ role: lastAssistant.role, content: lastAssistant.content }] : [];
|
|
4733
|
-
}
|
|
4734
|
-
const sliced = messages.slice(-outputMessages);
|
|
4735
|
-
return sliced.map((m) => ({ role: m.role, content: m.content }));
|
|
4901
|
+
function trimOutputMessages(output, _outputMessages) {
|
|
4902
|
+
return output;
|
|
4736
4903
|
}
|
|
4737
4904
|
function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
4738
4905
|
const cliWorkers = normalizeOptionalNumber(rawOptions.workers);
|
|
@@ -4819,6 +4986,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
|
|
|
4819
4986
|
tags: normalizeStringArray(rawOptions.tag),
|
|
4820
4987
|
excludeTags: normalizeStringArray(rawOptions.excludeTag),
|
|
4821
4988
|
transcript: normalizeString(rawOptions.transcript),
|
|
4989
|
+
recordReplay: normalizeString(rawOptions.recordReplay),
|
|
4990
|
+
recordReplayVariant: normalizeString(rawOptions.recordReplayVariant),
|
|
4822
4991
|
experiment: normalizeString(rawOptions.experiment),
|
|
4823
4992
|
budgetUsd: normalizeOptionalNumber(rawOptions.budgetUsd),
|
|
4824
4993
|
sourceMetadataByEvalFile: normalizeSourceMetadataByEvalFile(
|
|
@@ -5115,6 +5284,11 @@ async function runSingleEvalFile(params) {
|
|
|
5115
5284
|
providerFactory
|
|
5116
5285
|
} = params;
|
|
5117
5286
|
const targetName = selection.targetName;
|
|
5287
|
+
const replayRecording = options.recordReplay ? {
|
|
5288
|
+
fixturesPath: path12.resolve(options.recordReplay),
|
|
5289
|
+
sourceTarget: targetName,
|
|
5290
|
+
variant: options.recordReplayVariant
|
|
5291
|
+
} : void 0;
|
|
5118
5292
|
await ensureFileExists(testFilePath, "Test file");
|
|
5119
5293
|
const resolvedTargetSelection = applyVerboseOverride(selection, options.verbose);
|
|
5120
5294
|
const providerLabel = options.dryRun ? `${resolvedTargetSelection.resolvedTarget.kind} (dry-run)` : resolvedTargetSelection.resolvedTarget.kind;
|
|
@@ -5191,6 +5365,7 @@ async function runSingleEvalFile(params) {
|
|
|
5191
5365
|
model: options.model,
|
|
5192
5366
|
threshold: options.threshold,
|
|
5193
5367
|
targetHooks: resolvedTargetSelection.targetHooks,
|
|
5368
|
+
replayRecording,
|
|
5194
5369
|
providerFactory,
|
|
5195
5370
|
streamCallbacks: streamingObserver?.getStreamCallbacks(),
|
|
5196
5371
|
onResult: async (result) => {
|
|
@@ -5381,7 +5556,7 @@ async function runEvalCommand(input) {
|
|
|
5381
5556
|
const useFileExport = !!options.otelFile;
|
|
5382
5557
|
if (options.exportOtel || useFileExport) {
|
|
5383
5558
|
try {
|
|
5384
|
-
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-
|
|
5559
|
+
const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-OY3JSP6Z.js");
|
|
5385
5560
|
let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
|
|
5386
5561
|
let headers = {};
|
|
5387
5562
|
if (options.otelBackend) {
|
|
@@ -5480,6 +5655,9 @@ async function runEvalCommand(input) {
|
|
|
5480
5655
|
if (cache) {
|
|
5481
5656
|
console.log(`Response cache: enabled (${cache.cachePath})`);
|
|
5482
5657
|
}
|
|
5658
|
+
if (options.recordReplay) {
|
|
5659
|
+
console.log(`Replay recording: ${path12.resolve(options.recordReplay)}`);
|
|
5660
|
+
}
|
|
5483
5661
|
const yamlThreshold = firstMeta?.threshold;
|
|
5484
5662
|
const resolvedThreshold = options.threshold ?? yamlThreshold;
|
|
5485
5663
|
if (resolvedThreshold !== void 0 && (resolvedThreshold < 0 || resolvedThreshold > 1)) {
|
|
@@ -5574,7 +5752,7 @@ async function runEvalCommand(input) {
|
|
|
5574
5752
|
const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
|
|
5575
5753
|
let transcriptProviderFactory;
|
|
5576
5754
|
if (options.transcript) {
|
|
5577
|
-
const { TranscriptProvider } = await import("./dist-
|
|
5755
|
+
const { TranscriptProvider } = await import("./dist-OY3JSP6Z.js");
|
|
5578
5756
|
const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
|
|
5579
5757
|
const totalTests = [...fileMetadata.values()].reduce(
|
|
5580
5758
|
(sum, meta) => sum + meta.testCases.length,
|
|
@@ -5598,6 +5776,20 @@ async function runEvalCommand(input) {
|
|
|
5598
5776
|
experiment: normalizeExperimentName(options.experiment)
|
|
5599
5777
|
});
|
|
5600
5778
|
}
|
|
5779
|
+
let wipLoop;
|
|
5780
|
+
let wipCleanedUp = false;
|
|
5781
|
+
let finalExportStatus = "disabled";
|
|
5782
|
+
{
|
|
5783
|
+
const wipConfig = await loadNormalizedResultsConfig(cwd).catch(() => void 0);
|
|
5784
|
+
if (wipConfig?.auto_push) {
|
|
5785
|
+
wipLoop = new WipCheckpointLoop({
|
|
5786
|
+
config: wipConfig,
|
|
5787
|
+
runDir,
|
|
5788
|
+
destinationPath: getRelativeRunPath(cwd, runDir)
|
|
5789
|
+
});
|
|
5790
|
+
await wipLoop.start();
|
|
5791
|
+
}
|
|
5792
|
+
}
|
|
5601
5793
|
try {
|
|
5602
5794
|
for (const testFilePath of activeTestFiles) {
|
|
5603
5795
|
if (runBudgetTracker?.isExceeded()) {
|
|
@@ -5612,7 +5804,16 @@ async function runEvalCommand(input) {
|
|
|
5612
5804
|
testId: testCase.id,
|
|
5613
5805
|
score: 0,
|
|
5614
5806
|
assertions: [],
|
|
5615
|
-
output:
|
|
5807
|
+
output: budgetMsg,
|
|
5808
|
+
trace: buildTraceFromMessages({
|
|
5809
|
+
input: testCase.input,
|
|
5810
|
+
output: [{ role: "assistant", content: budgetMsg }],
|
|
5811
|
+
finalOutput: budgetMsg,
|
|
5812
|
+
target: selection.targetName,
|
|
5813
|
+
testId: testCase.id,
|
|
5814
|
+
conversationId: testCase.conversation_id,
|
|
5815
|
+
error: budgetMsg
|
|
5816
|
+
}),
|
|
5616
5817
|
error: budgetMsg,
|
|
5617
5818
|
budgetExceeded: true,
|
|
5618
5819
|
executionStatus: "execution_error",
|
|
@@ -5702,7 +5903,16 @@ async function runEvalCommand(input) {
|
|
|
5702
5903
|
testId: testCase.id,
|
|
5703
5904
|
score: 0,
|
|
5704
5905
|
assertions: [],
|
|
5705
|
-
output:
|
|
5906
|
+
output: message,
|
|
5907
|
+
trace: buildTraceFromMessages({
|
|
5908
|
+
input: testCase.input,
|
|
5909
|
+
output: [{ role: "assistant", content: message }],
|
|
5910
|
+
finalOutput: message,
|
|
5911
|
+
target: selection.targetName,
|
|
5912
|
+
testId: testCase.id,
|
|
5913
|
+
conversationId: testCase.conversation_id,
|
|
5914
|
+
error: message
|
|
5915
|
+
}),
|
|
5706
5916
|
scores: [],
|
|
5707
5917
|
error: message,
|
|
5708
5918
|
executionStatus: "execution_error",
|
|
@@ -5758,7 +5968,7 @@ async function runEvalCommand(input) {
|
|
|
5758
5968
|
);
|
|
5759
5969
|
const taskBundleTargets = buildTaskBundleTargetSelections(activeTestFiles, fileMetadata);
|
|
5760
5970
|
if (isResumeAppend) {
|
|
5761
|
-
const { writePerTestArtifacts } = await import("./artifact-writer-
|
|
5971
|
+
const { writePerTestArtifacts } = await import("./artifact-writer-GFNKYREE.js");
|
|
5762
5972
|
await writePerTestArtifacts(allResults, runDir, {
|
|
5763
5973
|
experiment: normalizeExperimentName(options.experiment),
|
|
5764
5974
|
cwd,
|
|
@@ -5815,7 +6025,7 @@ async function runEvalCommand(input) {
|
|
|
5815
6025
|
console.log(`
|
|
5816
6026
|
Results written to: ${outputPath}`);
|
|
5817
6027
|
await saveRunCache(cwd, outputPath).catch(() => void 0);
|
|
5818
|
-
await maybeAutoExportRunArtifacts({
|
|
6028
|
+
finalExportStatus = await maybeAutoExportRunArtifacts({
|
|
5819
6029
|
cwd,
|
|
5820
6030
|
run_dir: runDir,
|
|
5821
6031
|
test_files: activeTestFiles,
|
|
@@ -5851,6 +6061,10 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
|
|
|
5851
6061
|
\u26A0 Run budget exceeded: $${runBudgetTracker?.currentCostUsd.toFixed(4)} spent of $${runBudgetTracker?.budgetCapUsd.toFixed(4)} cap`
|
|
5852
6062
|
);
|
|
5853
6063
|
}
|
|
6064
|
+
if (wipLoop && (finalExportStatus === "published" || finalExportStatus === "already_published")) {
|
|
6065
|
+
wipCleanedUp = true;
|
|
6066
|
+
await wipLoop.stopAndDeleteWipBranch();
|
|
6067
|
+
}
|
|
5854
6068
|
return {
|
|
5855
6069
|
executionErrorCount: summary.executionErrorCount,
|
|
5856
6070
|
outputPath,
|
|
@@ -5861,6 +6075,9 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
|
|
|
5861
6075
|
budgetExceeded: runBudgetExceeded || void 0
|
|
5862
6076
|
};
|
|
5863
6077
|
} finally {
|
|
6078
|
+
if (wipLoop && !wipCleanedUp) {
|
|
6079
|
+
await wipLoop.stop().catch(() => void 0);
|
|
6080
|
+
}
|
|
5864
6081
|
unsubscribeCodexLogs();
|
|
5865
6082
|
unsubscribePiLogs();
|
|
5866
6083
|
unsubscribeCopilotSdkLogs();
|
|
@@ -5986,4 +6203,4 @@ export {
|
|
|
5986
6203
|
getCategories,
|
|
5987
6204
|
filterByCategory
|
|
5988
6205
|
};
|
|
5989
|
-
//# sourceMappingURL=chunk-
|
|
6206
|
+
//# sourceMappingURL=chunk-M7AMFWBZ.js.map
|