agentv 4.35.1 → 4.37.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. package/dist/{artifact-writer-G57MG52C.js → artifact-writer-GFNKYREE.js} +4 -4
  2. package/dist/{chunk-INOKS5LF.js → chunk-M7AMFWBZ.js} +275 -58
  3. package/dist/chunk-M7AMFWBZ.js.map +1 -0
  4. package/dist/{chunk-KJGYL3M3.js → chunk-N6E5XFOM.js} +213 -85
  5. package/dist/chunk-N6E5XFOM.js.map +1 -0
  6. package/dist/{chunk-KNF3AGCI.js → chunk-OYI35QFW.js} +314 -49
  7. package/dist/chunk-OYI35QFW.js.map +1 -0
  8. package/dist/{chunk-CRMGUVRZ.js → chunk-P4LSNFZR.js} +85 -19
  9. package/dist/chunk-P4LSNFZR.js.map +1 -0
  10. package/dist/{chunk-6QEIZ33V.js → chunk-RL4S2FBZ.js} +2700 -456
  11. package/dist/chunk-RL4S2FBZ.js.map +1 -0
  12. package/dist/cli.js +5 -5
  13. package/dist/dashboard/assets/index-9tV-u4HJ.css +1 -0
  14. package/dist/dashboard/assets/{index-Bdk-9a_8.js → index-BDRYJsGF.js} +1 -1
  15. package/dist/dashboard/assets/index-DuESU7zZ.js +118 -0
  16. package/dist/dashboard/index.html +2 -2
  17. package/dist/{dist-M4B77IW4.js → dist-OY3JSP6Z.js} +125 -3
  18. package/dist/index.js +5 -5
  19. package/dist/{interactive-VYQ5SYMR.js → interactive-CQELHITQ.js} +5 -5
  20. package/dist/skills/agentv-eval-writer/SKILL.md +6 -0
  21. package/dist/{ts-eval-loader-EQJX3OLT-THE7D3GR.js → ts-eval-loader-RBTB2HG2-H5TRXZLO.js} +2 -2
  22. package/package.json +1 -1
  23. package/dist/chunk-6QEIZ33V.js.map +0 -1
  24. package/dist/chunk-CRMGUVRZ.js.map +0 -1
  25. package/dist/chunk-INOKS5LF.js.map +0 -1
  26. package/dist/chunk-KJGYL3M3.js.map +0 -1
  27. package/dist/chunk-KNF3AGCI.js.map +0 -1
  28. package/dist/dashboard/assets/index-BPMAZqjE.css +0 -1
  29. package/dist/dashboard/assets/index-BWO0UcxG.js +0 -118
  30. /package/dist/{artifact-writer-G57MG52C.js.map → artifact-writer-GFNKYREE.js.map} +0 -0
  31. /package/dist/{dist-M4B77IW4.js.map → dist-OY3JSP6Z.js.map} +0 -0
  32. /package/dist/{interactive-VYQ5SYMR.js.map → interactive-CQELHITQ.js.map} +0 -0
  33. /package/dist/{ts-eval-loader-EQJX3OLT-THE7D3GR.js.map → ts-eval-loader-RBTB2HG2-H5TRXZLO.js.map} +0 -0
@@ -14,11 +14,11 @@ import {
14
14
  writeArtifactsFromResults,
15
15
  writeInitialBenchmarkArtifact,
16
16
  writePerTestArtifacts
17
- } from "./chunk-KJGYL3M3.js";
18
- import "./chunk-KNF3AGCI.js";
17
+ } from "./chunk-N6E5XFOM.js";
18
+ import "./chunk-OYI35QFW.js";
19
19
  import "./chunk-QOBQ5XYF.js";
20
20
  import "./chunk-BPGJ4HBU.js";
21
- import "./chunk-6QEIZ33V.js";
21
+ import "./chunk-RL4S2FBZ.js";
22
22
  import "./chunk-NPVGBFF6.js";
23
23
  import "./chunk-M7BUKBAF.js";
24
24
  import "./chunk-5H446C7X.js";
@@ -38,4 +38,4 @@ export {
38
38
  writeInitialBenchmarkArtifact,
39
39
  writePerTestArtifacts
40
40
  };
41
- //# sourceMappingURL=artifact-writer-G57MG52C.js.map
41
+ //# sourceMappingURL=artifact-writer-GFNKYREE.js.map
@@ -13,12 +13,14 @@ import {
13
13
  resolveRunIndexPath,
14
14
  resolveRunManifestPath,
15
15
  resolveWorkspaceOrFilePath,
16
- toSnakeCaseDeep,
16
+ toSnakeCaseDeep as toSnakeCaseDeep2,
17
17
  writeArtifactsFromResults,
18
18
  writeInitialBenchmarkArtifact
19
- } from "./chunk-KJGYL3M3.js";
19
+ } from "./chunk-N6E5XFOM.js";
20
20
  import {
21
21
  RunBudgetTracker,
22
+ buildWipBranchName,
23
+ deleteWipBranch,
22
24
  deriveCategory,
23
25
  directPushResults,
24
26
  directorySizeBytes,
@@ -29,9 +31,12 @@ import {
29
31
  loadTsConfig,
30
32
  materializeGitRun,
31
33
  normalizeResultsConfig,
34
+ pushWipCheckpoint,
32
35
  resolveResultsRepoRunsDir,
33
- syncResultsRepoForProject
34
- } from "./chunk-KNF3AGCI.js";
36
+ setupWipWorktree,
37
+ syncResultsRepoForProject,
38
+ traceFromTranscriptJsonLines
39
+ } from "./chunk-OYI35QFW.js";
35
40
  import {
36
41
  CLI_PLACEHOLDERS,
37
42
  COMMON_TARGET_SETTINGS,
@@ -42,6 +47,7 @@ import {
42
47
  ResponseCache,
43
48
  buildDirectoryChain,
44
49
  buildSearchRoots,
50
+ buildTraceFromMessages,
45
51
  ensureVSCodeSubagents,
46
52
  findDeprecatedCamelCaseTargetWarnings,
47
53
  findGitRoot,
@@ -66,8 +72,9 @@ import {
66
72
  subscribeToCopilotCliLogEntries,
67
73
  subscribeToCopilotSdkLogEntries,
68
74
  subscribeToPiLogEntries,
69
- toCamelCaseDeep
70
- } from "./chunk-6QEIZ33V.js";
75
+ toCamelCaseDeep,
76
+ toSnakeCaseDeep
77
+ } from "./chunk-RL4S2FBZ.js";
71
78
 
72
79
  // src/commands/eval/shared.ts
73
80
  import { constants } from "node:fs";
@@ -183,7 +190,7 @@ async function findRepoRoot(start) {
183
190
  // package.json
184
191
  var package_default = {
185
192
  name: "agentv",
186
- version: "4.35.1",
193
+ version: "4.37.0-next.1",
187
194
  description: "CLI entry point for AgentV",
188
195
  type: "module",
189
196
  repository: {
@@ -574,18 +581,31 @@ function hydrateInput(baseDir, record) {
574
581
  return messages.length > 0 ? messages : [{ role: "user", content: inputText.trimEnd() }];
575
582
  }
576
583
  function hydrateOutput(baseDir, record) {
577
- const responseText = readOptionalText(baseDir, record.output_path ?? record.response_path);
584
+ const responseText = readOptionalText(
585
+ baseDir,
586
+ record.output_path ?? record.answer_path ?? record.response_path
587
+ );
578
588
  if (!responseText) {
579
589
  return void 0;
580
590
  }
581
- const messages = parseMarkdownMessages(responseText);
582
- if (messages.length > 0) {
583
- return messages.map((message) => ({
584
- role: message.role,
585
- content: message.content
586
- }));
591
+ return responseText.trimEnd();
592
+ }
593
+ function hydrateTrace(baseDir, record) {
594
+ const transcriptText = readOptionalText(baseDir, record.transcript_path);
595
+ if (transcriptText) {
596
+ try {
597
+ return traceFromTranscriptJsonLines(parseJsonlLines(transcriptText));
598
+ } catch {
599
+ }
587
600
  }
588
- return [{ role: "assistant", content: responseText.trimEnd() }];
601
+ const output = hydrateOutput(baseDir, record) ?? "";
602
+ return buildTraceFromMessages({
603
+ input: hydrateInput(baseDir, record),
604
+ output: output ? [{ role: "assistant", content: output }] : [],
605
+ finalOutput: output,
606
+ target: record.target,
607
+ testId: record.test_id
608
+ });
589
609
  }
590
610
  function hydrateManifestRecord(baseDir, record) {
591
611
  const grading = readOptionalJson(baseDir, record.grading_path);
@@ -630,7 +650,8 @@ function hydrateManifestRecord(baseDir, record) {
630
650
  durationMs: timing?.duration_ms ?? record.duration_ms,
631
651
  costUsd: record.cost_usd,
632
652
  input: hydrateInput(baseDir, record),
633
- output: hydrateOutput(baseDir, record),
653
+ output: hydrateOutput(baseDir, record) ?? "",
654
+ trace: hydrateTrace(baseDir, record),
634
655
  metadata: record.metadata
635
656
  };
636
657
  }
@@ -756,6 +777,7 @@ function toRawResult(result) {
756
777
  end_time: result.endTime,
757
778
  input: result.input,
758
779
  output: result.output,
780
+ trace: toSnakeCaseDeep(result.trace),
759
781
  file_changes: result.fileChanges
760
782
  };
761
783
  }
@@ -1336,25 +1358,33 @@ function deleteRemoteRunTags(repoDir, manifestPath) {
1336
1358
  // src/commands/results/remote.ts
1337
1359
  var gitRunsCache = /* @__PURE__ */ new Map();
1338
1360
  var GIT_RUNS_CACHE_TTL_MS = 6e4;
1339
- function cachedListGitRuns(repoDir) {
1361
+ function getResultsStorageRef(config) {
1362
+ return config.branch ? `origin/${config.branch}` : void 0;
1363
+ }
1364
+ function cachedListGitRuns(repoDir, ref) {
1340
1365
  const now = Date.now();
1341
- const cached = gitRunsCache.get(repoDir);
1366
+ const cacheKey = `${repoDir}\0${ref ?? ""}`;
1367
+ const cached = gitRunsCache.get(cacheKey);
1342
1368
  if (cached && cached.expiresAt > now) {
1343
1369
  return cached.data;
1344
1370
  }
1345
- const promise = listGitRuns(repoDir);
1346
- gitRunsCache.set(repoDir, { data: promise, expiresAt: now + GIT_RUNS_CACHE_TTL_MS });
1371
+ const promise = ref ? listGitRuns(repoDir, ref) : listGitRuns(repoDir);
1372
+ gitRunsCache.set(cacheKey, { data: promise, expiresAt: now + GIT_RUNS_CACHE_TTL_MS });
1347
1373
  promise.catch(() => {
1348
1374
  }).finally(() => {
1349
- const entry = gitRunsCache.get(repoDir);
1375
+ const entry = gitRunsCache.get(cacheKey);
1350
1376
  if (entry && entry.expiresAt <= Date.now()) {
1351
- gitRunsCache.delete(repoDir);
1377
+ gitRunsCache.delete(cacheKey);
1352
1378
  }
1353
1379
  });
1354
1380
  return promise;
1355
1381
  }
1356
1382
  function invalidateGitRunsCache(repoDir) {
1357
- gitRunsCache.delete(repoDir);
1383
+ for (const key of gitRunsCache.keys()) {
1384
+ if (key.startsWith(`${repoDir}\0`)) {
1385
+ gitRunsCache.delete(key);
1386
+ }
1387
+ }
1358
1388
  }
1359
1389
  var REMOTE_RUN_PREFIX = "remote::";
1360
1390
  var SIZE_WARNING_BYTES = 10 * 1024 * 1024;
@@ -1391,6 +1421,7 @@ async function loadNormalizedResultsConfig(cwd, projectId) {
1391
1421
  const projectResults = project?.results ? {
1392
1422
  mode: "github",
1393
1423
  repo: project.results.repoUrl,
1424
+ branch: project.results.branch,
1394
1425
  path: project.results.path,
1395
1426
  auto_push: project.results.sync?.autoPush,
1396
1427
  branch_prefix: project.results.branchPrefix
@@ -1420,9 +1451,11 @@ async function getRemoteRunCount(config, status) {
1420
1451
  let runCount = 0;
1421
1452
  if (config && status.available) {
1422
1453
  try {
1423
- runCount = (await cachedListGitRuns(config.path)).length;
1454
+ runCount = (await cachedListGitRuns(config.path, getResultsStorageRef(config))).length;
1424
1455
  } catch {
1425
- runCount = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).length;
1456
+ if (!config.branch) {
1457
+ runCount = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).length;
1458
+ }
1426
1459
  }
1427
1460
  }
1428
1461
  return runCount;
@@ -1482,7 +1515,7 @@ async function listMergedResultFiles(cwd, limit, projectId) {
1482
1515
  let remoteRuns = [];
1483
1516
  if (config.mode === "github") {
1484
1517
  try {
1485
- const gitRuns = await cachedListGitRuns(config.path);
1518
+ const gitRuns = await cachedListGitRuns(config.path, getResultsStorageRef(config));
1486
1519
  remoteRuns = gitRuns.map((r) => ({
1487
1520
  filename: encodeRemoteRunId(r.run_id),
1488
1521
  raw_filename: r.run_id,
@@ -1496,15 +1529,19 @@ async function listMergedResultFiles(cwd, limit, projectId) {
1496
1529
  sizeBytes: r.size_bytes || 0
1497
1530
  }));
1498
1531
  } catch (error) {
1499
- console.error("git-native listing failed, falling back", error);
1500
- remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
1501
- (meta) => ({
1502
- ...meta,
1503
- filename: encodeRemoteRunId(meta.filename),
1504
- raw_filename: meta.filename,
1505
- source: "remote"
1506
- })
1507
- );
1532
+ if (config.branch) {
1533
+ console.error("git-native listing failed for configured results branch", error);
1534
+ } else {
1535
+ console.error("git-native listing failed, falling back", error);
1536
+ remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
1537
+ (meta) => ({
1538
+ ...meta,
1539
+ filename: encodeRemoteRunId(meta.filename),
1540
+ raw_filename: meta.filename,
1541
+ source: "remote"
1542
+ })
1543
+ );
1544
+ }
1508
1545
  }
1509
1546
  } else {
1510
1547
  remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
@@ -1544,7 +1581,7 @@ async function ensureRemoteRunAvailable(cwd, meta, projectId) {
1544
1581
  ".agentv/results/runs",
1545
1582
  path7.posix.dirname(relativeManifestPath)
1546
1583
  );
1547
- await materializeGitRun(config.path, relativeRunPath);
1584
+ await materializeGitRun(config.path, relativeRunPath, getResultsStorageRef(config));
1548
1585
  }
1549
1586
  async function readRemoteRunTagState(cwd, meta, projectId) {
1550
1587
  if (meta.source !== "remote") return void 0;
@@ -1581,7 +1618,7 @@ async function clearRemoteRunTags(cwd, meta, projectId) {
1581
1618
  async function maybeAutoExportRunArtifacts(payload) {
1582
1619
  const config = await loadNormalizedResultsConfig(payload.cwd);
1583
1620
  if (!config?.auto_push) {
1584
- return;
1621
+ return "disabled";
1585
1622
  }
1586
1623
  try {
1587
1624
  await maybeWarnLargeArtifact(payload.run_dir);
@@ -1595,12 +1632,14 @@ async function maybeAutoExportRunArtifacts(payload) {
1595
1632
  });
1596
1633
  if (!pushed) {
1597
1634
  console.warn("Warning: results export produced no git changes. Skipping push.");
1598
- return;
1635
+ return "already_published";
1599
1636
  }
1600
1637
  console.log(`Results pushed to ${config.repo} (${config.path}/${relativeRunPath})`);
1638
+ return "published";
1601
1639
  } catch (error) {
1602
1640
  console.warn(`Warning: skipping results export: ${getStatusMessage(error)}`);
1603
1641
  console.warn("Warning: Run 'gh auth login' if GitHub authentication is missing.");
1642
+ return "failed";
1604
1643
  }
1605
1644
  }
1606
1645
 
@@ -1906,7 +1945,7 @@ var JsonlWriter = class _JsonlWriter {
1906
1945
  if (this.closed) {
1907
1946
  throw new Error("Cannot write to closed JSONL writer");
1908
1947
  }
1909
- const snakeCaseRecord = toSnakeCaseDeep(record);
1948
+ const snakeCaseRecord = toSnakeCaseDeep2(record);
1910
1949
  const line = `${JSON.stringify(snakeCaseRecord)}
1911
1950
  `;
1912
1951
  if (!this.stream.write(line)) {
@@ -3394,6 +3433,7 @@ var COPILOT_SDK_SETTINGS = /* @__PURE__ */ new Set([
3394
3433
  "log_format",
3395
3434
  "stream_log",
3396
3435
  "system_prompt",
3436
+ "custom_provider",
3397
3437
  "byok"
3398
3438
  ]);
3399
3439
  var COPILOT_CLI_SETTINGS = /* @__PURE__ */ new Set([
@@ -3409,7 +3449,8 @@ var COPILOT_CLI_SETTINGS = /* @__PURE__ */ new Set([
3409
3449
  "log_dir",
3410
3450
  "log_format",
3411
3451
  "stream_log",
3412
- "system_prompt"
3452
+ "system_prompt",
3453
+ "custom_provider"
3413
3454
  ]);
3414
3455
  var VSCODE_SETTINGS = /* @__PURE__ */ new Set([
3415
3456
  ...COMMON_SETTINGS,
@@ -3425,6 +3466,15 @@ var MOCK_SETTINGS = /* @__PURE__ */ new Set([
3425
3466
  "trace"
3426
3467
  // For testing tool-trajectory evaluator
3427
3468
  ]);
3469
+ var REPLAY_SETTINGS = /* @__PURE__ */ new Set([
3470
+ ...COMMON_SETTINGS,
3471
+ "fixtures",
3472
+ "trace_envelopes",
3473
+ "source_target",
3474
+ "suite",
3475
+ "eval_path",
3476
+ "variant"
3477
+ ]);
3428
3478
  var CLAUDE_SETTINGS = /* @__PURE__ */ new Set([
3429
3479
  ...COMMON_SETTINGS,
3430
3480
  "executable",
@@ -3480,6 +3530,8 @@ function getKnownSettings(provider) {
3480
3530
  return VSCODE_SETTINGS;
3481
3531
  case "mock":
3482
3532
  return MOCK_SETTINGS;
3533
+ case "replay":
3534
+ return REPLAY_SETTINGS;
3483
3535
  case "cli":
3484
3536
  return null;
3485
3537
  default:
@@ -3761,6 +3813,26 @@ async function validateTargetsFile(filePath) {
3761
3813
  if (providerValue === "cli") {
3762
3814
  validateCliSettings(target, absolutePath, location, errors);
3763
3815
  }
3816
+ if (providerValue === "replay") {
3817
+ const hasFixtures = isNonEmptyString(target.fixtures);
3818
+ const hasTraceEnvelopes = isNonEmptyString(target.trace_envelopes);
3819
+ if (hasFixtures === hasTraceEnvelopes) {
3820
+ errors.push({
3821
+ severity: "error",
3822
+ filePath: absolutePath,
3823
+ location,
3824
+ message: "Replay provider requires exactly one replay source: 'fixtures' or 'trace_envelopes'"
3825
+ });
3826
+ }
3827
+ if (!isNonEmptyString(target.source_target)) {
3828
+ errors.push({
3829
+ severity: "error",
3830
+ filePath: absolutePath,
3831
+ location: `${location}.source_target`,
3832
+ message: "Replay provider requires 'source_target' as a non-empty string"
3833
+ });
3834
+ }
3835
+ }
3764
3836
  if (typeof provider === "string" && !isTemplated) {
3765
3837
  validateUnknownSettings(target, provider, absolutePath, location, errors);
3766
3838
  }
@@ -4005,6 +4077,14 @@ function validateProjectResultsConfig(errors, filePath, rawResults, location) {
4005
4077
  }
4006
4078
  }
4007
4079
  validateGitRemoteUrl(errors, filePath, resultsRecord.repo_url, `${location}.repo_url`);
4080
+ if (resultsRecord.branch !== void 0 && (typeof resultsRecord.branch !== "string" || resultsRecord.branch.trim().length === 0)) {
4081
+ errors.push({
4082
+ severity: "error",
4083
+ filePath,
4084
+ location: `${location}.branch`,
4085
+ message: `Field '${location}.branch' must be a non-empty string`
4086
+ });
4087
+ }
4008
4088
  if (resultsRecord.path !== void 0) {
4009
4089
  if (typeof resultsRecord.path !== "string" || resultsRecord.path.trim().length === 0) {
4010
4090
  errors.push({
@@ -4074,6 +4154,14 @@ function validateResultsConfig(errors, filePath, rawResults, location) {
4074
4154
  });
4075
4155
  }
4076
4156
  validateRequiredString(errors, filePath, resultsRecord.repo, `${location}.repo`);
4157
+ if (resultsRecord.branch !== void 0 && (typeof resultsRecord.branch !== "string" || resultsRecord.branch.trim().length === 0)) {
4158
+ errors.push({
4159
+ severity: "error",
4160
+ filePath,
4161
+ location: `${location}.branch`,
4162
+ message: `Field '${location}.branch' must be a non-empty string`
4163
+ });
4164
+ }
4077
4165
  if (resultsRecord.path !== void 0) {
4078
4166
  if (typeof resultsRecord.path !== "string" || resultsRecord.path.trim().length === 0) {
4079
4167
  errors.push({
@@ -4582,6 +4670,94 @@ Errors in ${targetsFilePath}:`);
4582
4670
  return results;
4583
4671
  }
4584
4672
 
4673
+ // src/commands/eval/wip-checkpoint.ts
4674
+ var WIP_CHECKPOINT_INTERVAL_MS = 3e4;
4675
+ var defaultDependencies = {
4676
+ buildWipBranchName,
4677
+ deleteWipBranch,
4678
+ pushWipCheckpoint,
4679
+ setupWipWorktree
4680
+ };
4681
+ function warnCheckpointError(context, error) {
4682
+ const message = error instanceof Error ? error.message : String(error);
4683
+ console.warn(`WIP checkpoint: ${context}: ${message}`);
4684
+ }
4685
+ var WipCheckpointLoop = class {
4686
+ wipBranch;
4687
+ config;
4688
+ runDir;
4689
+ destinationPath;
4690
+ intervalMs;
4691
+ deps;
4692
+ handle;
4693
+ timer;
4694
+ checkpointInFlight;
4695
+ active = false;
4696
+ constructor(params) {
4697
+ this.config = params.config;
4698
+ this.runDir = params.runDir;
4699
+ this.destinationPath = params.destinationPath;
4700
+ this.intervalMs = params.intervalMs ?? WIP_CHECKPOINT_INTERVAL_MS;
4701
+ this.deps = params.dependencies ?? defaultDependencies;
4702
+ this.wipBranch = this.deps.buildWipBranchName(params.runDir);
4703
+ }
4704
+ async start() {
4705
+ try {
4706
+ this.handle = await this.deps.setupWipWorktree({
4707
+ config: this.config,
4708
+ wipBranch: this.wipBranch
4709
+ });
4710
+ } catch (err) {
4711
+ warnCheckpointError("failed to set up WIP worktree", err);
4712
+ return;
4713
+ }
4714
+ this.active = true;
4715
+ this.timer = setInterval(() => {
4716
+ this.runCheckpointIfIdle();
4717
+ }, this.intervalMs);
4718
+ this.timer.unref?.();
4719
+ }
4720
+ runCheckpointIfIdle() {
4721
+ if (!this.active || this.checkpointInFlight) return;
4722
+ this.checkpointInFlight = this.checkpoint().catch((err) => warnCheckpointError("push failed", err)).finally(() => {
4723
+ this.checkpointInFlight = void 0;
4724
+ });
4725
+ }
4726
+ async checkpoint() {
4727
+ if (!this.handle) return;
4728
+ await this.deps.pushWipCheckpoint({
4729
+ handle: this.handle,
4730
+ sourceDir: this.runDir,
4731
+ destinationPath: this.destinationPath
4732
+ });
4733
+ }
4734
+ /** Stop the loop and clean up the local worktree. Does NOT delete the remote WIP branch. */
4735
+ async stop() {
4736
+ this.active = false;
4737
+ if (this.timer !== void 0) {
4738
+ clearInterval(this.timer);
4739
+ this.timer = void 0;
4740
+ }
4741
+ await this.checkpointInFlight;
4742
+ if (this.handle) {
4743
+ await this.handle.cleanup().catch((err) => warnCheckpointError("worktree cleanup failed", err));
4744
+ this.handle = void 0;
4745
+ }
4746
+ }
4747
+ /**
4748
+ * Stop the loop and delete the remote WIP branch.
4749
+ * Call after a successful run to keep the results repo tidy.
4750
+ */
4751
+ async stopAndDeleteWipBranch() {
4752
+ await this.stop();
4753
+ try {
4754
+ await this.deps.deleteWipBranch({ config: this.config, wipBranch: this.wipBranch });
4755
+ } catch (err) {
4756
+ warnCheckpointError(`failed to delete remote branch ${this.wipBranch}`, err);
4757
+ }
4758
+ }
4759
+ };
4760
+
4585
4761
  // src/commands/eval/run-eval.ts
4586
4762
  var DEFAULT_WORKERS = 3;
4587
4763
  function shouldSkipExistingResultForResume(result, rerunFailed) {
@@ -4722,17 +4898,8 @@ function normalizeOutputMessages(cliValue) {
4722
4898
  }
4723
4899
  return parsed;
4724
4900
  }
4725
- function trimOutputMessages(output, outputMessages) {
4726
- const messages = output ?? [];
4727
- if (outputMessages === "all") {
4728
- return messages.map((m) => ({ role: m.role, content: m.content }));
4729
- }
4730
- if (outputMessages === 1) {
4731
- const lastAssistant = messages.filter((m) => m.role === "assistant").at(-1);
4732
- return lastAssistant ? [{ role: lastAssistant.role, content: lastAssistant.content }] : [];
4733
- }
4734
- const sliced = messages.slice(-outputMessages);
4735
- return sliced.map((m) => ({ role: m.role, content: m.content }));
4901
+ function trimOutputMessages(output, _outputMessages) {
4902
+ return output;
4736
4903
  }
4737
4904
  function normalizeOptions(rawOptions, config, yamlExecution) {
4738
4905
  const cliWorkers = normalizeOptionalNumber(rawOptions.workers);
@@ -4819,6 +4986,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
4819
4986
  tags: normalizeStringArray(rawOptions.tag),
4820
4987
  excludeTags: normalizeStringArray(rawOptions.excludeTag),
4821
4988
  transcript: normalizeString(rawOptions.transcript),
4989
+ recordReplay: normalizeString(rawOptions.recordReplay),
4990
+ recordReplayVariant: normalizeString(rawOptions.recordReplayVariant),
4822
4991
  experiment: normalizeString(rawOptions.experiment),
4823
4992
  budgetUsd: normalizeOptionalNumber(rawOptions.budgetUsd),
4824
4993
  sourceMetadataByEvalFile: normalizeSourceMetadataByEvalFile(
@@ -5115,6 +5284,11 @@ async function runSingleEvalFile(params) {
5115
5284
  providerFactory
5116
5285
  } = params;
5117
5286
  const targetName = selection.targetName;
5287
+ const replayRecording = options.recordReplay ? {
5288
+ fixturesPath: path12.resolve(options.recordReplay),
5289
+ sourceTarget: targetName,
5290
+ variant: options.recordReplayVariant
5291
+ } : void 0;
5118
5292
  await ensureFileExists(testFilePath, "Test file");
5119
5293
  const resolvedTargetSelection = applyVerboseOverride(selection, options.verbose);
5120
5294
  const providerLabel = options.dryRun ? `${resolvedTargetSelection.resolvedTarget.kind} (dry-run)` : resolvedTargetSelection.resolvedTarget.kind;
@@ -5191,6 +5365,7 @@ async function runSingleEvalFile(params) {
5191
5365
  model: options.model,
5192
5366
  threshold: options.threshold,
5193
5367
  targetHooks: resolvedTargetSelection.targetHooks,
5368
+ replayRecording,
5194
5369
  providerFactory,
5195
5370
  streamCallbacks: streamingObserver?.getStreamCallbacks(),
5196
5371
  onResult: async (result) => {
@@ -5381,7 +5556,7 @@ async function runEvalCommand(input) {
5381
5556
  const useFileExport = !!options.otelFile;
5382
5557
  if (options.exportOtel || useFileExport) {
5383
5558
  try {
5384
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-M4B77IW4.js");
5559
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-OY3JSP6Z.js");
5385
5560
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
5386
5561
  let headers = {};
5387
5562
  if (options.otelBackend) {
@@ -5480,6 +5655,9 @@ async function runEvalCommand(input) {
5480
5655
  if (cache) {
5481
5656
  console.log(`Response cache: enabled (${cache.cachePath})`);
5482
5657
  }
5658
+ if (options.recordReplay) {
5659
+ console.log(`Replay recording: ${path12.resolve(options.recordReplay)}`);
5660
+ }
5483
5661
  const yamlThreshold = firstMeta?.threshold;
5484
5662
  const resolvedThreshold = options.threshold ?? yamlThreshold;
5485
5663
  if (resolvedThreshold !== void 0 && (resolvedThreshold < 0 || resolvedThreshold > 1)) {
@@ -5574,7 +5752,7 @@ async function runEvalCommand(input) {
5574
5752
  const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
5575
5753
  let transcriptProviderFactory;
5576
5754
  if (options.transcript) {
5577
- const { TranscriptProvider } = await import("./dist-M4B77IW4.js");
5755
+ const { TranscriptProvider } = await import("./dist-OY3JSP6Z.js");
5578
5756
  const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
5579
5757
  const totalTests = [...fileMetadata.values()].reduce(
5580
5758
  (sum, meta) => sum + meta.testCases.length,
@@ -5598,6 +5776,20 @@ async function runEvalCommand(input) {
5598
5776
  experiment: normalizeExperimentName(options.experiment)
5599
5777
  });
5600
5778
  }
5779
+ let wipLoop;
5780
+ let wipCleanedUp = false;
5781
+ let finalExportStatus = "disabled";
5782
+ {
5783
+ const wipConfig = await loadNormalizedResultsConfig(cwd).catch(() => void 0);
5784
+ if (wipConfig?.auto_push) {
5785
+ wipLoop = new WipCheckpointLoop({
5786
+ config: wipConfig,
5787
+ runDir,
5788
+ destinationPath: getRelativeRunPath(cwd, runDir)
5789
+ });
5790
+ await wipLoop.start();
5791
+ }
5792
+ }
5601
5793
  try {
5602
5794
  for (const testFilePath of activeTestFiles) {
5603
5795
  if (runBudgetTracker?.isExceeded()) {
@@ -5612,7 +5804,16 @@ async function runEvalCommand(input) {
5612
5804
  testId: testCase.id,
5613
5805
  score: 0,
5614
5806
  assertions: [],
5615
- output: [],
5807
+ output: budgetMsg,
5808
+ trace: buildTraceFromMessages({
5809
+ input: testCase.input,
5810
+ output: [{ role: "assistant", content: budgetMsg }],
5811
+ finalOutput: budgetMsg,
5812
+ target: selection.targetName,
5813
+ testId: testCase.id,
5814
+ conversationId: testCase.conversation_id,
5815
+ error: budgetMsg
5816
+ }),
5616
5817
  error: budgetMsg,
5617
5818
  budgetExceeded: true,
5618
5819
  executionStatus: "execution_error",
@@ -5702,7 +5903,16 @@ async function runEvalCommand(input) {
5702
5903
  testId: testCase.id,
5703
5904
  score: 0,
5704
5905
  assertions: [],
5705
- output: [],
5906
+ output: message,
5907
+ trace: buildTraceFromMessages({
5908
+ input: testCase.input,
5909
+ output: [{ role: "assistant", content: message }],
5910
+ finalOutput: message,
5911
+ target: selection.targetName,
5912
+ testId: testCase.id,
5913
+ conversationId: testCase.conversation_id,
5914
+ error: message
5915
+ }),
5706
5916
  scores: [],
5707
5917
  error: message,
5708
5918
  executionStatus: "execution_error",
@@ -5758,7 +5968,7 @@ async function runEvalCommand(input) {
5758
5968
  );
5759
5969
  const taskBundleTargets = buildTaskBundleTargetSelections(activeTestFiles, fileMetadata);
5760
5970
  if (isResumeAppend) {
5761
- const { writePerTestArtifacts } = await import("./artifact-writer-G57MG52C.js");
5971
+ const { writePerTestArtifacts } = await import("./artifact-writer-GFNKYREE.js");
5762
5972
  await writePerTestArtifacts(allResults, runDir, {
5763
5973
  experiment: normalizeExperimentName(options.experiment),
5764
5974
  cwd,
@@ -5815,7 +6025,7 @@ async function runEvalCommand(input) {
5815
6025
  console.log(`
5816
6026
  Results written to: ${outputPath}`);
5817
6027
  await saveRunCache(cwd, outputPath).catch(() => void 0);
5818
- await maybeAutoExportRunArtifacts({
6028
+ finalExportStatus = await maybeAutoExportRunArtifacts({
5819
6029
  cwd,
5820
6030
  run_dir: runDir,
5821
6031
  test_files: activeTestFiles,
@@ -5851,6 +6061,10 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
5851
6061
  \u26A0 Run budget exceeded: $${runBudgetTracker?.currentCostUsd.toFixed(4)} spent of $${runBudgetTracker?.budgetCapUsd.toFixed(4)} cap`
5852
6062
  );
5853
6063
  }
6064
+ if (wipLoop && (finalExportStatus === "published" || finalExportStatus === "already_published")) {
6065
+ wipCleanedUp = true;
6066
+ await wipLoop.stopAndDeleteWipBranch();
6067
+ }
5854
6068
  return {
5855
6069
  executionErrorCount: summary.executionErrorCount,
5856
6070
  outputPath,
@@ -5861,6 +6075,9 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
5861
6075
  budgetExceeded: runBudgetExceeded || void 0
5862
6076
  };
5863
6077
  } finally {
6078
+ if (wipLoop && !wipCleanedUp) {
6079
+ await wipLoop.stop().catch(() => void 0);
6080
+ }
5864
6081
  unsubscribeCodexLogs();
5865
6082
  unsubscribePiLogs();
5866
6083
  unsubscribeCopilotSdkLogs();
@@ -5986,4 +6203,4 @@ export {
5986
6203
  getCategories,
5987
6204
  filterByCategory
5988
6205
  };
5989
- //# sourceMappingURL=chunk-INOKS5LF.js.map
6206
+ //# sourceMappingURL=chunk-M7AMFWBZ.js.map