agentv 4.35.1 → 4.36.0-next.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. package/dist/{artifact-writer-G57MG52C.js → artifact-writer-3YRN6YTA.js} +4 -4
  2. package/dist/{chunk-CRMGUVRZ.js → chunk-4M6FAQTW.js} +85 -19
  3. package/dist/chunk-4M6FAQTW.js.map +1 -0
  4. package/dist/{chunk-INOKS5LF.js → chunk-7KZ2AF26.js} +269 -57
  5. package/dist/chunk-7KZ2AF26.js.map +1 -0
  6. package/dist/{chunk-KJGYL3M3.js → chunk-HVBAVOAH.js} +72 -50
  7. package/dist/chunk-HVBAVOAH.js.map +1 -0
  8. package/dist/{chunk-KNF3AGCI.js → chunk-P5JONEWJ.js} +231 -35
  9. package/dist/chunk-P5JONEWJ.js.map +1 -0
  10. package/dist/{chunk-6QEIZ33V.js → chunk-TUTURE2B.js} +1227 -372
  11. package/dist/chunk-TUTURE2B.js.map +1 -0
  12. package/dist/cli.js +5 -5
  13. package/dist/dashboard/assets/index-DA96FAM5.js +119 -0
  14. package/dist/dashboard/assets/{index-Bdk-9a_8.js → index-l4t97uO8.js} +1 -1
  15. package/dist/dashboard/assets/index-nmrFBoNd.css +1 -0
  16. package/dist/dashboard/index.html +2 -2
  17. package/dist/{dist-M4B77IW4.js → dist-BSFUYS54.js} +73 -3
  18. package/dist/index.js +5 -5
  19. package/dist/{interactive-VYQ5SYMR.js → interactive-IEC63EVP.js} +5 -5
  20. package/dist/skills/agentv-eval-writer/SKILL.md +6 -0
  21. package/dist/{ts-eval-loader-EQJX3OLT-THE7D3GR.js → ts-eval-loader-4DU65XGW-YM47FFG2.js} +2 -2
  22. package/package.json +1 -1
  23. package/dist/chunk-6QEIZ33V.js.map +0 -1
  24. package/dist/chunk-CRMGUVRZ.js.map +0 -1
  25. package/dist/chunk-INOKS5LF.js.map +0 -1
  26. package/dist/chunk-KJGYL3M3.js.map +0 -1
  27. package/dist/chunk-KNF3AGCI.js.map +0 -1
  28. package/dist/dashboard/assets/index-BPMAZqjE.css +0 -1
  29. package/dist/dashboard/assets/index-BWO0UcxG.js +0 -118
  30. /package/dist/{artifact-writer-G57MG52C.js.map → artifact-writer-3YRN6YTA.js.map} +0 -0
  31. /package/dist/{dist-M4B77IW4.js.map → dist-BSFUYS54.js.map} +0 -0
  32. /package/dist/{interactive-VYQ5SYMR.js.map → interactive-IEC63EVP.js.map} +0 -0
  33. /package/dist/{ts-eval-loader-EQJX3OLT-THE7D3GR.js.map → ts-eval-loader-4DU65XGW-YM47FFG2.js.map} +0 -0
@@ -13,12 +13,14 @@ import {
13
13
  resolveRunIndexPath,
14
14
  resolveRunManifestPath,
15
15
  resolveWorkspaceOrFilePath,
16
- toSnakeCaseDeep,
16
+ toSnakeCaseDeep as toSnakeCaseDeep2,
17
17
  writeArtifactsFromResults,
18
18
  writeInitialBenchmarkArtifact
19
- } from "./chunk-KJGYL3M3.js";
19
+ } from "./chunk-HVBAVOAH.js";
20
20
  import {
21
21
  RunBudgetTracker,
22
+ buildWipBranchName,
23
+ deleteWipBranch,
22
24
  deriveCategory,
23
25
  directPushResults,
24
26
  directorySizeBytes,
@@ -29,9 +31,12 @@ import {
29
31
  loadTsConfig,
30
32
  materializeGitRun,
31
33
  normalizeResultsConfig,
34
+ pushWipCheckpoint,
32
35
  resolveResultsRepoRunsDir,
33
- syncResultsRepoForProject
34
- } from "./chunk-KNF3AGCI.js";
36
+ setupWipWorktree,
37
+ syncResultsRepoForProject,
38
+ traceFromTranscriptJsonLines
39
+ } from "./chunk-P5JONEWJ.js";
35
40
  import {
36
41
  CLI_PLACEHOLDERS,
37
42
  COMMON_TARGET_SETTINGS,
@@ -42,6 +47,7 @@ import {
42
47
  ResponseCache,
43
48
  buildDirectoryChain,
44
49
  buildSearchRoots,
50
+ buildTraceFromMessages,
45
51
  ensureVSCodeSubagents,
46
52
  findDeprecatedCamelCaseTargetWarnings,
47
53
  findGitRoot,
@@ -66,8 +72,9 @@ import {
66
72
  subscribeToCopilotCliLogEntries,
67
73
  subscribeToCopilotSdkLogEntries,
68
74
  subscribeToPiLogEntries,
69
- toCamelCaseDeep
70
- } from "./chunk-6QEIZ33V.js";
75
+ toCamelCaseDeep,
76
+ toSnakeCaseDeep
77
+ } from "./chunk-TUTURE2B.js";
71
78
 
72
79
  // src/commands/eval/shared.ts
73
80
  import { constants } from "node:fs";
@@ -183,7 +190,7 @@ async function findRepoRoot(start) {
183
190
  // package.json
184
191
  var package_default = {
185
192
  name: "agentv",
186
- version: "4.35.1",
193
+ version: "4.36.0-next.1",
187
194
  description: "CLI entry point for AgentV",
188
195
  type: "module",
189
196
  repository: {
@@ -574,18 +581,31 @@ function hydrateInput(baseDir, record) {
574
581
  return messages.length > 0 ? messages : [{ role: "user", content: inputText.trimEnd() }];
575
582
  }
576
583
  function hydrateOutput(baseDir, record) {
577
- const responseText = readOptionalText(baseDir, record.output_path ?? record.response_path);
584
+ const responseText = readOptionalText(
585
+ baseDir,
586
+ record.output_path ?? record.answer_path ?? record.response_path
587
+ );
578
588
  if (!responseText) {
579
589
  return void 0;
580
590
  }
581
- const messages = parseMarkdownMessages(responseText);
582
- if (messages.length > 0) {
583
- return messages.map((message) => ({
584
- role: message.role,
585
- content: message.content
586
- }));
591
+ return responseText.trimEnd();
592
+ }
593
+ function hydrateTrace(baseDir, record) {
594
+ const transcriptText = readOptionalText(baseDir, record.transcript_path);
595
+ if (transcriptText) {
596
+ try {
597
+ return traceFromTranscriptJsonLines(parseJsonlLines(transcriptText));
598
+ } catch {
599
+ }
587
600
  }
588
- return [{ role: "assistant", content: responseText.trimEnd() }];
601
+ const output = hydrateOutput(baseDir, record) ?? "";
602
+ return buildTraceFromMessages({
603
+ input: hydrateInput(baseDir, record),
604
+ output: output ? [{ role: "assistant", content: output }] : [],
605
+ finalOutput: output,
606
+ target: record.target,
607
+ testId: record.test_id
608
+ });
589
609
  }
590
610
  function hydrateManifestRecord(baseDir, record) {
591
611
  const grading = readOptionalJson(baseDir, record.grading_path);
@@ -630,7 +650,8 @@ function hydrateManifestRecord(baseDir, record) {
630
650
  durationMs: timing?.duration_ms ?? record.duration_ms,
631
651
  costUsd: record.cost_usd,
632
652
  input: hydrateInput(baseDir, record),
633
- output: hydrateOutput(baseDir, record),
653
+ output: hydrateOutput(baseDir, record) ?? "",
654
+ trace: hydrateTrace(baseDir, record),
634
655
  metadata: record.metadata
635
656
  };
636
657
  }
@@ -756,6 +777,7 @@ function toRawResult(result) {
756
777
  end_time: result.endTime,
757
778
  input: result.input,
758
779
  output: result.output,
780
+ trace: toSnakeCaseDeep(result.trace),
759
781
  file_changes: result.fileChanges
760
782
  };
761
783
  }
@@ -1336,25 +1358,33 @@ function deleteRemoteRunTags(repoDir, manifestPath) {
1336
1358
  // src/commands/results/remote.ts
1337
1359
  var gitRunsCache = /* @__PURE__ */ new Map();
1338
1360
  var GIT_RUNS_CACHE_TTL_MS = 6e4;
1339
- function cachedListGitRuns(repoDir) {
1361
+ function getResultsStorageRef(config) {
1362
+ return config.branch ? `origin/${config.branch}` : void 0;
1363
+ }
1364
+ function cachedListGitRuns(repoDir, ref) {
1340
1365
  const now = Date.now();
1341
- const cached = gitRunsCache.get(repoDir);
1366
+ const cacheKey = `${repoDir}\0${ref ?? ""}`;
1367
+ const cached = gitRunsCache.get(cacheKey);
1342
1368
  if (cached && cached.expiresAt > now) {
1343
1369
  return cached.data;
1344
1370
  }
1345
- const promise = listGitRuns(repoDir);
1346
- gitRunsCache.set(repoDir, { data: promise, expiresAt: now + GIT_RUNS_CACHE_TTL_MS });
1371
+ const promise = ref ? listGitRuns(repoDir, ref) : listGitRuns(repoDir);
1372
+ gitRunsCache.set(cacheKey, { data: promise, expiresAt: now + GIT_RUNS_CACHE_TTL_MS });
1347
1373
  promise.catch(() => {
1348
1374
  }).finally(() => {
1349
- const entry = gitRunsCache.get(repoDir);
1375
+ const entry = gitRunsCache.get(cacheKey);
1350
1376
  if (entry && entry.expiresAt <= Date.now()) {
1351
- gitRunsCache.delete(repoDir);
1377
+ gitRunsCache.delete(cacheKey);
1352
1378
  }
1353
1379
  });
1354
1380
  return promise;
1355
1381
  }
1356
1382
  function invalidateGitRunsCache(repoDir) {
1357
- gitRunsCache.delete(repoDir);
1383
+ for (const key of gitRunsCache.keys()) {
1384
+ if (key.startsWith(`${repoDir}\0`)) {
1385
+ gitRunsCache.delete(key);
1386
+ }
1387
+ }
1358
1388
  }
1359
1389
  var REMOTE_RUN_PREFIX = "remote::";
1360
1390
  var SIZE_WARNING_BYTES = 10 * 1024 * 1024;
@@ -1391,6 +1421,7 @@ async function loadNormalizedResultsConfig(cwd, projectId) {
1391
1421
  const projectResults = project?.results ? {
1392
1422
  mode: "github",
1393
1423
  repo: project.results.repoUrl,
1424
+ branch: project.results.branch,
1394
1425
  path: project.results.path,
1395
1426
  auto_push: project.results.sync?.autoPush,
1396
1427
  branch_prefix: project.results.branchPrefix
@@ -1420,9 +1451,11 @@ async function getRemoteRunCount(config, status) {
1420
1451
  let runCount = 0;
1421
1452
  if (config && status.available) {
1422
1453
  try {
1423
- runCount = (await cachedListGitRuns(config.path)).length;
1454
+ runCount = (await cachedListGitRuns(config.path, getResultsStorageRef(config))).length;
1424
1455
  } catch {
1425
- runCount = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).length;
1456
+ if (!config.branch) {
1457
+ runCount = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).length;
1458
+ }
1426
1459
  }
1427
1460
  }
1428
1461
  return runCount;
@@ -1482,7 +1515,7 @@ async function listMergedResultFiles(cwd, limit, projectId) {
1482
1515
  let remoteRuns = [];
1483
1516
  if (config.mode === "github") {
1484
1517
  try {
1485
- const gitRuns = await cachedListGitRuns(config.path);
1518
+ const gitRuns = await cachedListGitRuns(config.path, getResultsStorageRef(config));
1486
1519
  remoteRuns = gitRuns.map((r) => ({
1487
1520
  filename: encodeRemoteRunId(r.run_id),
1488
1521
  raw_filename: r.run_id,
@@ -1496,15 +1529,19 @@ async function listMergedResultFiles(cwd, limit, projectId) {
1496
1529
  sizeBytes: r.size_bytes || 0
1497
1530
  }));
1498
1531
  } catch (error) {
1499
- console.error("git-native listing failed, falling back", error);
1500
- remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
1501
- (meta) => ({
1502
- ...meta,
1503
- filename: encodeRemoteRunId(meta.filename),
1504
- raw_filename: meta.filename,
1505
- source: "remote"
1506
- })
1507
- );
1532
+ if (config.branch) {
1533
+ console.error("git-native listing failed for configured results branch", error);
1534
+ } else {
1535
+ console.error("git-native listing failed, falling back", error);
1536
+ remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
1537
+ (meta) => ({
1538
+ ...meta,
1539
+ filename: encodeRemoteRunId(meta.filename),
1540
+ raw_filename: meta.filename,
1541
+ source: "remote"
1542
+ })
1543
+ );
1544
+ }
1508
1545
  }
1509
1546
  } else {
1510
1547
  remoteRuns = listResultFilesFromRunsDir(resolveResultsRepoRunsDir(config)).map(
@@ -1544,7 +1581,7 @@ async function ensureRemoteRunAvailable(cwd, meta, projectId) {
1544
1581
  ".agentv/results/runs",
1545
1582
  path7.posix.dirname(relativeManifestPath)
1546
1583
  );
1547
- await materializeGitRun(config.path, relativeRunPath);
1584
+ await materializeGitRun(config.path, relativeRunPath, getResultsStorageRef(config));
1548
1585
  }
1549
1586
  async function readRemoteRunTagState(cwd, meta, projectId) {
1550
1587
  if (meta.source !== "remote") return void 0;
@@ -1581,7 +1618,7 @@ async function clearRemoteRunTags(cwd, meta, projectId) {
1581
1618
  async function maybeAutoExportRunArtifacts(payload) {
1582
1619
  const config = await loadNormalizedResultsConfig(payload.cwd);
1583
1620
  if (!config?.auto_push) {
1584
- return;
1621
+ return "disabled";
1585
1622
  }
1586
1623
  try {
1587
1624
  await maybeWarnLargeArtifact(payload.run_dir);
@@ -1595,12 +1632,14 @@ async function maybeAutoExportRunArtifacts(payload) {
1595
1632
  });
1596
1633
  if (!pushed) {
1597
1634
  console.warn("Warning: results export produced no git changes. Skipping push.");
1598
- return;
1635
+ return "already_published";
1599
1636
  }
1600
1637
  console.log(`Results pushed to ${config.repo} (${config.path}/${relativeRunPath})`);
1638
+ return "published";
1601
1639
  } catch (error) {
1602
1640
  console.warn(`Warning: skipping results export: ${getStatusMessage(error)}`);
1603
1641
  console.warn("Warning: Run 'gh auth login' if GitHub authentication is missing.");
1642
+ return "failed";
1604
1643
  }
1605
1644
  }
1606
1645
 
@@ -1906,7 +1945,7 @@ var JsonlWriter = class _JsonlWriter {
1906
1945
  if (this.closed) {
1907
1946
  throw new Error("Cannot write to closed JSONL writer");
1908
1947
  }
1909
- const snakeCaseRecord = toSnakeCaseDeep(record);
1948
+ const snakeCaseRecord = toSnakeCaseDeep2(record);
1910
1949
  const line = `${JSON.stringify(snakeCaseRecord)}
1911
1950
  `;
1912
1951
  if (!this.stream.write(line)) {
@@ -3425,6 +3464,14 @@ var MOCK_SETTINGS = /* @__PURE__ */ new Set([
3425
3464
  "trace"
3426
3465
  // For testing tool-trajectory evaluator
3427
3466
  ]);
3467
+ var REPLAY_SETTINGS = /* @__PURE__ */ new Set([
3468
+ ...COMMON_SETTINGS,
3469
+ "fixtures",
3470
+ "source_target",
3471
+ "suite",
3472
+ "eval_path",
3473
+ "variant"
3474
+ ]);
3428
3475
  var CLAUDE_SETTINGS = /* @__PURE__ */ new Set([
3429
3476
  ...COMMON_SETTINGS,
3430
3477
  "executable",
@@ -3480,6 +3527,8 @@ function getKnownSettings(provider) {
3480
3527
  return VSCODE_SETTINGS;
3481
3528
  case "mock":
3482
3529
  return MOCK_SETTINGS;
3530
+ case "replay":
3531
+ return REPLAY_SETTINGS;
3483
3532
  case "cli":
3484
3533
  return null;
3485
3534
  default:
@@ -3761,6 +3810,24 @@ async function validateTargetsFile(filePath) {
3761
3810
  if (providerValue === "cli") {
3762
3811
  validateCliSettings(target, absolutePath, location, errors);
3763
3812
  }
3813
+ if (providerValue === "replay") {
3814
+ if (!isNonEmptyString(target.fixtures)) {
3815
+ errors.push({
3816
+ severity: "error",
3817
+ filePath: absolutePath,
3818
+ location: `${location}.fixtures`,
3819
+ message: "Replay provider requires 'fixtures' as a non-empty string"
3820
+ });
3821
+ }
3822
+ if (!isNonEmptyString(target.source_target)) {
3823
+ errors.push({
3824
+ severity: "error",
3825
+ filePath: absolutePath,
3826
+ location: `${location}.source_target`,
3827
+ message: "Replay provider requires 'source_target' as a non-empty string"
3828
+ });
3829
+ }
3830
+ }
3764
3831
  if (typeof provider === "string" && !isTemplated) {
3765
3832
  validateUnknownSettings(target, provider, absolutePath, location, errors);
3766
3833
  }
@@ -4005,6 +4072,14 @@ function validateProjectResultsConfig(errors, filePath, rawResults, location) {
4005
4072
  }
4006
4073
  }
4007
4074
  validateGitRemoteUrl(errors, filePath, resultsRecord.repo_url, `${location}.repo_url`);
4075
+ if (resultsRecord.branch !== void 0 && (typeof resultsRecord.branch !== "string" || resultsRecord.branch.trim().length === 0)) {
4076
+ errors.push({
4077
+ severity: "error",
4078
+ filePath,
4079
+ location: `${location}.branch`,
4080
+ message: `Field '${location}.branch' must be a non-empty string`
4081
+ });
4082
+ }
4008
4083
  if (resultsRecord.path !== void 0) {
4009
4084
  if (typeof resultsRecord.path !== "string" || resultsRecord.path.trim().length === 0) {
4010
4085
  errors.push({
@@ -4074,6 +4149,14 @@ function validateResultsConfig(errors, filePath, rawResults, location) {
4074
4149
  });
4075
4150
  }
4076
4151
  validateRequiredString(errors, filePath, resultsRecord.repo, `${location}.repo`);
4152
+ if (resultsRecord.branch !== void 0 && (typeof resultsRecord.branch !== "string" || resultsRecord.branch.trim().length === 0)) {
4153
+ errors.push({
4154
+ severity: "error",
4155
+ filePath,
4156
+ location: `${location}.branch`,
4157
+ message: `Field '${location}.branch' must be a non-empty string`
4158
+ });
4159
+ }
4077
4160
  if (resultsRecord.path !== void 0) {
4078
4161
  if (typeof resultsRecord.path !== "string" || resultsRecord.path.trim().length === 0) {
4079
4162
  errors.push({
@@ -4582,6 +4665,94 @@ Errors in ${targetsFilePath}:`);
4582
4665
  return results;
4583
4666
  }
4584
4667
 
4668
+ // src/commands/eval/wip-checkpoint.ts
4669
+ var WIP_CHECKPOINT_INTERVAL_MS = 3e4;
4670
+ var defaultDependencies = {
4671
+ buildWipBranchName,
4672
+ deleteWipBranch,
4673
+ pushWipCheckpoint,
4674
+ setupWipWorktree
4675
+ };
4676
+ function warnCheckpointError(context, error) {
4677
+ const message = error instanceof Error ? error.message : String(error);
4678
+ console.warn(`WIP checkpoint: ${context}: ${message}`);
4679
+ }
4680
+ var WipCheckpointLoop = class {
4681
+ wipBranch;
4682
+ config;
4683
+ runDir;
4684
+ destinationPath;
4685
+ intervalMs;
4686
+ deps;
4687
+ handle;
4688
+ timer;
4689
+ checkpointInFlight;
4690
+ active = false;
4691
+ constructor(params) {
4692
+ this.config = params.config;
4693
+ this.runDir = params.runDir;
4694
+ this.destinationPath = params.destinationPath;
4695
+ this.intervalMs = params.intervalMs ?? WIP_CHECKPOINT_INTERVAL_MS;
4696
+ this.deps = params.dependencies ?? defaultDependencies;
4697
+ this.wipBranch = this.deps.buildWipBranchName(params.runDir);
4698
+ }
4699
+ async start() {
4700
+ try {
4701
+ this.handle = await this.deps.setupWipWorktree({
4702
+ config: this.config,
4703
+ wipBranch: this.wipBranch
4704
+ });
4705
+ } catch (err) {
4706
+ warnCheckpointError("failed to set up WIP worktree", err);
4707
+ return;
4708
+ }
4709
+ this.active = true;
4710
+ this.timer = setInterval(() => {
4711
+ this.runCheckpointIfIdle();
4712
+ }, this.intervalMs);
4713
+ this.timer.unref?.();
4714
+ }
4715
+ runCheckpointIfIdle() {
4716
+ if (!this.active || this.checkpointInFlight) return;
4717
+ this.checkpointInFlight = this.checkpoint().catch((err) => warnCheckpointError("push failed", err)).finally(() => {
4718
+ this.checkpointInFlight = void 0;
4719
+ });
4720
+ }
4721
+ async checkpoint() {
4722
+ if (!this.handle) return;
4723
+ await this.deps.pushWipCheckpoint({
4724
+ handle: this.handle,
4725
+ sourceDir: this.runDir,
4726
+ destinationPath: this.destinationPath
4727
+ });
4728
+ }
4729
+ /** Stop the loop and clean up the local worktree. Does NOT delete the remote WIP branch. */
4730
+ async stop() {
4731
+ this.active = false;
4732
+ if (this.timer !== void 0) {
4733
+ clearInterval(this.timer);
4734
+ this.timer = void 0;
4735
+ }
4736
+ await this.checkpointInFlight;
4737
+ if (this.handle) {
4738
+ await this.handle.cleanup().catch((err) => warnCheckpointError("worktree cleanup failed", err));
4739
+ this.handle = void 0;
4740
+ }
4741
+ }
4742
+ /**
4743
+ * Stop the loop and delete the remote WIP branch.
4744
+ * Call after a successful run to keep the results repo tidy.
4745
+ */
4746
+ async stopAndDeleteWipBranch() {
4747
+ await this.stop();
4748
+ try {
4749
+ await this.deps.deleteWipBranch({ config: this.config, wipBranch: this.wipBranch });
4750
+ } catch (err) {
4751
+ warnCheckpointError(`failed to delete remote branch ${this.wipBranch}`, err);
4752
+ }
4753
+ }
4754
+ };
4755
+
4585
4756
  // src/commands/eval/run-eval.ts
4586
4757
  var DEFAULT_WORKERS = 3;
4587
4758
  function shouldSkipExistingResultForResume(result, rerunFailed) {
@@ -4722,17 +4893,8 @@ function normalizeOutputMessages(cliValue) {
4722
4893
  }
4723
4894
  return parsed;
4724
4895
  }
4725
- function trimOutputMessages(output, outputMessages) {
4726
- const messages = output ?? [];
4727
- if (outputMessages === "all") {
4728
- return messages.map((m) => ({ role: m.role, content: m.content }));
4729
- }
4730
- if (outputMessages === 1) {
4731
- const lastAssistant = messages.filter((m) => m.role === "assistant").at(-1);
4732
- return lastAssistant ? [{ role: lastAssistant.role, content: lastAssistant.content }] : [];
4733
- }
4734
- const sliced = messages.slice(-outputMessages);
4735
- return sliced.map((m) => ({ role: m.role, content: m.content }));
4896
+ function trimOutputMessages(output, _outputMessages) {
4897
+ return output;
4736
4898
  }
4737
4899
  function normalizeOptions(rawOptions, config, yamlExecution) {
4738
4900
  const cliWorkers = normalizeOptionalNumber(rawOptions.workers);
@@ -4819,6 +4981,8 @@ function normalizeOptions(rawOptions, config, yamlExecution) {
4819
4981
  tags: normalizeStringArray(rawOptions.tag),
4820
4982
  excludeTags: normalizeStringArray(rawOptions.excludeTag),
4821
4983
  transcript: normalizeString(rawOptions.transcript),
4984
+ recordReplay: normalizeString(rawOptions.recordReplay),
4985
+ recordReplayVariant: normalizeString(rawOptions.recordReplayVariant),
4822
4986
  experiment: normalizeString(rawOptions.experiment),
4823
4987
  budgetUsd: normalizeOptionalNumber(rawOptions.budgetUsd),
4824
4988
  sourceMetadataByEvalFile: normalizeSourceMetadataByEvalFile(
@@ -5115,6 +5279,11 @@ async function runSingleEvalFile(params) {
5115
5279
  providerFactory
5116
5280
  } = params;
5117
5281
  const targetName = selection.targetName;
5282
+ const replayRecording = options.recordReplay ? {
5283
+ fixturesPath: path12.resolve(options.recordReplay),
5284
+ sourceTarget: targetName,
5285
+ variant: options.recordReplayVariant
5286
+ } : void 0;
5118
5287
  await ensureFileExists(testFilePath, "Test file");
5119
5288
  const resolvedTargetSelection = applyVerboseOverride(selection, options.verbose);
5120
5289
  const providerLabel = options.dryRun ? `${resolvedTargetSelection.resolvedTarget.kind} (dry-run)` : resolvedTargetSelection.resolvedTarget.kind;
@@ -5191,6 +5360,7 @@ async function runSingleEvalFile(params) {
5191
5360
  model: options.model,
5192
5361
  threshold: options.threshold,
5193
5362
  targetHooks: resolvedTargetSelection.targetHooks,
5363
+ replayRecording,
5194
5364
  providerFactory,
5195
5365
  streamCallbacks: streamingObserver?.getStreamCallbacks(),
5196
5366
  onResult: async (result) => {
@@ -5381,7 +5551,7 @@ async function runEvalCommand(input) {
5381
5551
  const useFileExport = !!options.otelFile;
5382
5552
  if (options.exportOtel || useFileExport) {
5383
5553
  try {
5384
- const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-M4B77IW4.js");
5554
+ const { OtelTraceExporter, OTEL_BACKEND_PRESETS } = await import("./dist-BSFUYS54.js");
5385
5555
  let endpoint = process.env.OTEL_EXPORTER_OTLP_ENDPOINT;
5386
5556
  let headers = {};
5387
5557
  if (options.otelBackend) {
@@ -5480,6 +5650,9 @@ async function runEvalCommand(input) {
5480
5650
  if (cache) {
5481
5651
  console.log(`Response cache: enabled (${cache.cachePath})`);
5482
5652
  }
5653
+ if (options.recordReplay) {
5654
+ console.log(`Replay recording: ${path12.resolve(options.recordReplay)}`);
5655
+ }
5483
5656
  const yamlThreshold = firstMeta?.threshold;
5484
5657
  const resolvedThreshold = options.threshold ?? yamlThreshold;
5485
5658
  if (resolvedThreshold !== void 0 && (resolvedThreshold < 0 || resolvedThreshold > 1)) {
@@ -5574,7 +5747,7 @@ async function runEvalCommand(input) {
5574
5747
  const activeTestFiles = resolvedTestFiles.filter((f) => fileMetadata.has(f));
5575
5748
  let transcriptProviderFactory;
5576
5749
  if (options.transcript) {
5577
- const { TranscriptProvider } = await import("./dist-M4B77IW4.js");
5750
+ const { TranscriptProvider } = await import("./dist-BSFUYS54.js");
5578
5751
  const transcriptProvider = await TranscriptProvider.fromFile(options.transcript);
5579
5752
  const totalTests = [...fileMetadata.values()].reduce(
5580
5753
  (sum, meta) => sum + meta.testCases.length,
@@ -5598,6 +5771,20 @@ async function runEvalCommand(input) {
5598
5771
  experiment: normalizeExperimentName(options.experiment)
5599
5772
  });
5600
5773
  }
5774
+ let wipLoop;
5775
+ let wipCleanedUp = false;
5776
+ let finalExportStatus = "disabled";
5777
+ {
5778
+ const wipConfig = await loadNormalizedResultsConfig(cwd).catch(() => void 0);
5779
+ if (wipConfig?.auto_push) {
5780
+ wipLoop = new WipCheckpointLoop({
5781
+ config: wipConfig,
5782
+ runDir,
5783
+ destinationPath: getRelativeRunPath(cwd, runDir)
5784
+ });
5785
+ await wipLoop.start();
5786
+ }
5787
+ }
5601
5788
  try {
5602
5789
  for (const testFilePath of activeTestFiles) {
5603
5790
  if (runBudgetTracker?.isExceeded()) {
@@ -5612,7 +5799,16 @@ async function runEvalCommand(input) {
5612
5799
  testId: testCase.id,
5613
5800
  score: 0,
5614
5801
  assertions: [],
5615
- output: [],
5802
+ output: budgetMsg,
5803
+ trace: buildTraceFromMessages({
5804
+ input: testCase.input,
5805
+ output: [{ role: "assistant", content: budgetMsg }],
5806
+ finalOutput: budgetMsg,
5807
+ target: selection.targetName,
5808
+ testId: testCase.id,
5809
+ conversationId: testCase.conversation_id,
5810
+ error: budgetMsg
5811
+ }),
5616
5812
  error: budgetMsg,
5617
5813
  budgetExceeded: true,
5618
5814
  executionStatus: "execution_error",
@@ -5702,7 +5898,16 @@ async function runEvalCommand(input) {
5702
5898
  testId: testCase.id,
5703
5899
  score: 0,
5704
5900
  assertions: [],
5705
- output: [],
5901
+ output: message,
5902
+ trace: buildTraceFromMessages({
5903
+ input: testCase.input,
5904
+ output: [{ role: "assistant", content: message }],
5905
+ finalOutput: message,
5906
+ target: selection.targetName,
5907
+ testId: testCase.id,
5908
+ conversationId: testCase.conversation_id,
5909
+ error: message
5910
+ }),
5706
5911
  scores: [],
5707
5912
  error: message,
5708
5913
  executionStatus: "execution_error",
@@ -5758,7 +5963,7 @@ async function runEvalCommand(input) {
5758
5963
  );
5759
5964
  const taskBundleTargets = buildTaskBundleTargetSelections(activeTestFiles, fileMetadata);
5760
5965
  if (isResumeAppend) {
5761
- const { writePerTestArtifacts } = await import("./artifact-writer-G57MG52C.js");
5966
+ const { writePerTestArtifacts } = await import("./artifact-writer-3YRN6YTA.js");
5762
5967
  await writePerTestArtifacts(allResults, runDir, {
5763
5968
  experiment: normalizeExperimentName(options.experiment),
5764
5969
  cwd,
@@ -5815,7 +6020,7 @@ async function runEvalCommand(input) {
5815
6020
  console.log(`
5816
6021
  Results written to: ${outputPath}`);
5817
6022
  await saveRunCache(cwd, outputPath).catch(() => void 0);
5818
- await maybeAutoExportRunArtifacts({
6023
+ finalExportStatus = await maybeAutoExportRunArtifacts({
5819
6024
  cwd,
5820
6025
  run_dir: runDir,
5821
6026
  test_files: activeTestFiles,
@@ -5851,6 +6056,10 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
5851
6056
  \u26A0 Run budget exceeded: $${runBudgetTracker?.currentCostUsd.toFixed(4)} spent of $${runBudgetTracker?.budgetCapUsd.toFixed(4)} cap`
5852
6057
  );
5853
6058
  }
6059
+ if (wipLoop && (finalExportStatus === "published" || finalExportStatus === "already_published")) {
6060
+ wipCleanedUp = true;
6061
+ await wipLoop.stopAndDeleteWipBranch();
6062
+ }
5854
6063
  return {
5855
6064
  executionErrorCount: summary.executionErrorCount,
5856
6065
  outputPath,
@@ -5861,6 +6070,9 @@ Tip: ${summary.executionErrorCount} execution error(s) detected. Re-run failed t
5861
6070
  budgetExceeded: runBudgetExceeded || void 0
5862
6071
  };
5863
6072
  } finally {
6073
+ if (wipLoop && !wipCleanedUp) {
6074
+ await wipLoop.stop().catch(() => void 0);
6075
+ }
5864
6076
  unsubscribeCodexLogs();
5865
6077
  unsubscribePiLogs();
5866
6078
  unsubscribeCopilotSdkLogs();
@@ -5986,4 +6198,4 @@ export {
5986
6198
  getCategories,
5987
6199
  filterByCategory
5988
6200
  };
5989
- //# sourceMappingURL=chunk-INOKS5LF.js.map
6201
+ //# sourceMappingURL=chunk-7KZ2AF26.js.map