@workbench-ai/workbench 0.0.71 → 0.0.72

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AA2DA,MAAM,WAAW,KAAK;IACpB,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;IAC9B,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;CAC/B;AAkUD,wBAAsB,MAAM,CAAC,IAAI,EAAE,SAAS,MAAM,EAAE,EAAE,EAAE,GAAE,KAGzD,GAAG,OAAO,CAAC,MAAM,CAAC,CAwMlB"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AA2DA,MAAM,WAAW,KAAK;IACpB,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;IAC9B,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;CAC/B;AAsUD,wBAAsB,MAAM,CAAC,IAAI,EAAE,SAAS,MAAM,EAAE,EAAE,EAAE,GAAE,KAGzD,GAAG,OAAO,CAAC,MAAM,CAAC,CAiMlB"}
package/dist/index.js CHANGED
@@ -44,7 +44,7 @@ const HELP_ALL = [
44
44
  " workbench show REF[:PATH] [--json]",
45
45
  " workbench diff [A..B] [--json]",
46
46
  " workbench switch VERSION [--json]",
47
- " workbench open [--host HOST] [--port PORT] [--no-open] [--json]",
47
+ " workbench open [--host HOST] [--port PORT] [--no-open]",
48
48
  "",
49
49
  "Configure:",
50
50
  " workbench case add RUN_ID [--json]",
@@ -162,9 +162,9 @@ const COMMAND_HELP = {
162
162
  ].join("\n"),
163
163
  open: [
164
164
  "Usage:",
165
- " workbench open [--host HOST] [--port PORT] [--no-open] [--json]",
165
+ " workbench open [--host HOST] [--port PORT] [--no-open]",
166
166
  "",
167
- "Serves or emits the read-only Workbench inspection snapshot.",
167
+ "Serves the read-only Workbench inspection UI.",
168
168
  "",
169
169
  "Example:",
170
170
  " workbench open --no-open",
@@ -221,9 +221,12 @@ const COMMAND_HELP = {
221
221
  const COMMON_FLAGS = {
222
222
  json: "boolean",
223
223
  };
224
+ const DIR_FLAG = {
225
+ dir: "string",
226
+ };
224
227
  const PROJECT_FLAGS = {
225
228
  ...COMMON_FLAGS,
226
- dir: "string",
229
+ ...DIR_FLAG,
227
230
  };
228
231
  const HELP_FLAG = {
229
232
  help: "boolean",
@@ -270,7 +273,7 @@ const COMMAND_FLAGS = {
270
273
  },
271
274
  logout: { ...COMMON_FLAGS, ...HELP_FLAG },
272
275
  new: { ...PROJECT_FLAGS, ...HELP_FLAG },
273
- open: { ...PROJECT_FLAGS, ...HELP_FLAG, host: "string", "no-open": "boolean", port: "positive-integer" },
276
+ open: { ...DIR_FLAG, ...HELP_FLAG, host: "string", "no-open": "boolean", port: "port" },
274
277
  publish: {
275
278
  ...PROJECT_FLAGS,
276
279
  ...HELP_FLAG,
@@ -359,13 +362,16 @@ export async function runCli(argv, io = {
359
362
  return emitEvalFailure(runs, failedRuns, artifactIds, parsed, io);
360
363
  }
361
364
  const deltas = await evalDeltas(core, runs);
365
+ const coverage = await evalCoverageSummaries(core, runs);
362
366
  const next = await evalSuccessNextCommand(core, runs);
363
367
  return emitResult("workbench.cli.eval.v1", {
364
368
  result: runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
369
+ coverage: coverage,
365
370
  deltas: deltas,
366
371
  next: next,
367
372
  }, parsed, io, () => [
368
373
  runs.map(formatRun).join("\n"),
374
+ ...coverage.map(formatEvalCoverage),
369
375
  ...deltas.map(formatEvalDelta),
370
376
  ...(next ? [`next: ${next}`] : []),
371
377
  ].filter(Boolean).join("\n"));
@@ -449,10 +455,7 @@ export async function runCli(argv, io = {
449
455
  pinnedInstallUrl: preview.pinnedInstallUrl,
450
456
  dryRun: true,
451
457
  }, parsed, io, () => [
452
- `Would publish ${displayRef(preview.version.id)} to remote ${preview.remote.name}.`,
453
- `Visibility: ${preview.visibility}`,
454
- `Install: ${preview.installUrl}`,
455
- `Pinned: ${preview.pinnedInstallUrl}`,
458
+ `Would publish ${displayRef(preview.version.id)} as ${preview.installHandle} (${preview.visibility}).`,
456
459
  `next: workbench install ${preview.installHandle}`,
457
460
  ].join("\n"));
458
461
  }
@@ -473,25 +476,18 @@ export async function runCli(argv, io = {
473
476
  pinnedInstallUrl: result.pinnedInstallUrl,
474
477
  ...(result.dryRun ? { dryRun: true } : {}),
475
478
  }, parsed, io, () => [
476
- `${result.dryRun ? "Would publish" : "Published"} ${displayRef(result.version.id)} to remote ${result.remote.name}.`,
477
- `Visibility: ${result.visibility}`,
478
- `Install: ${result.installUrl}`,
479
- `Pinned: ${result.pinnedInstallUrl}`,
479
+ `${result.dryRun ? "Would publish" : "Published"} ${displayRef(result.version.id)} as ${result.installHandle} (${result.visibility}).`,
480
480
  `next: workbench install ${result.installHandle}`,
481
481
  ].join("\n"));
482
482
  }
483
483
  if (command === "open") {
484
- if (parsed.flags.json === true) {
485
- const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
486
- return output(manifestOnly(snapshot), parsed, io, () => "Read-only Workbench inspection data is available with --json.");
487
- }
488
484
  // The browser server serves committed object state through a read-only
489
485
  // snapshot path, so long-running commands do not block page loads.
490
486
  const server = await startWorkbenchOpenServer({
491
487
  dir: dirFlag(parsed),
492
488
  authToken: core.authToken,
493
489
  host: stringFlag(parsed, "host"),
494
- port: intFlag(parsed, "port"),
490
+ port: portFlag(parsed, "port"),
495
491
  });
496
492
  io.stdout.write(`Workbench: ${server.url}\n`);
497
493
  if (parsed.flags["no-open"] !== true) {
@@ -506,9 +502,10 @@ export async function runCli(argv, io = {
506
502
  }
507
503
  }
508
504
  async function handleStatus(parsed, io) {
509
- const status = await workbenchStatusSnapshot(await coreOptions(parsed));
505
+ const core = await coreOptions(parsed);
506
+ const status = await workbenchStatusSnapshot(core);
510
507
  const auth = await workbenchCliAuthStatus();
511
- const cliStatus = statusWithCausalNext(status, auth);
508
+ const cliStatus = await statusWithCausalNext(status, auth, core);
512
509
  return emitResult("workbench.status.v1", {
513
510
  project: cliStatus.project,
514
511
  worktree: cliStatus.worktree,
@@ -557,7 +554,7 @@ async function handleLog(parsed, io) {
557
554
  versionId: run.versionId,
558
555
  skillName: run.skillName,
559
556
  agentName: run.agentName,
560
- ...(run.score !== undefined ? { score: run.score } : {}),
557
+ ...(scoredRunValue(run) !== undefined ? { score: scoredRunValue(run) } : {}),
561
558
  })) : []),
562
559
  ].sort((left, right) => right.createdAt.localeCompare(left.createdAt));
563
560
  return emitResult("workbench.cli.log.v1", {
@@ -597,7 +594,7 @@ async function handleShow(parsed, io) {
597
594
  const evidenceFiles = evidenceFilesForRunOrJob(snapshot, objectRef);
598
595
  if (details.length > 0 || evidenceFiles.length > 0) {
599
596
  return output({
600
- details: details,
597
+ details: details.map(evidenceDetailSummary),
601
598
  files: evidenceFiles.map(fileSummary),
602
599
  }, parsed, io, () => formatRunOrJobEvidence(details, evidenceFiles));
603
600
  }
@@ -733,11 +730,14 @@ function validateFlagValue(name, value, kind) {
733
730
  if (typeof value !== "string" || !value.trim()) {
734
731
  throw new WorkbenchUserError(`--${name} requires a value.`);
735
732
  }
736
- if (kind === "positive-integer") {
733
+ if (kind === "positive-integer" || kind === "port") {
737
734
  const parsedValue = Number(value);
738
- if (!Number.isInteger(parsedValue) || parsedValue <= 0) {
735
+ if (kind === "positive-integer" && (!Number.isInteger(parsedValue) || parsedValue <= 0)) {
739
736
  throw new WorkbenchUserError(`--${name} must be a positive integer.`);
740
737
  }
738
+ if (kind === "port" && (!Number.isInteger(parsedValue) || parsedValue < 0 || parsedValue > 65535)) {
739
+ throw new WorkbenchUserError(`--${name} must be an integer between 0 and 65535.`);
740
+ }
741
741
  }
742
742
  }
743
743
  const CONFIG_SCHEMA = "workbench.cli.config.v1";
@@ -926,8 +926,22 @@ async function handleInstall(parsed, io) {
926
926
  ].join("\n"));
927
927
  }
928
928
  async function handleCloudEval(parsed, io) {
929
- const started = await startCloudExecution("eval", parsed);
929
+ const started = await startCloudExecution("eval", parsed, io);
930
930
  const artifactIds = await artifactIdsByRunId(started.core, started.runs);
931
+ if (started.detached) {
932
+ const next = cloudDetachedNextCommand(started.runs);
933
+ emitResult("workbench.cli.eval.v1", {
934
+ result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
935
+ detached: true,
936
+ next: next,
937
+ cloud: cloudExecutionSummary(started),
938
+ }, parsed, io, () => [
939
+ `Detached from hosted eval on ${started.remote.url}.`,
940
+ started.runs.map(formatRun).join("\n"),
941
+ ...(next ? [`next: ${next}`] : []),
942
+ ].filter(Boolean).join("\n"));
943
+ return 130;
944
+ }
931
945
  const failedRuns = started.runs.filter((run) => run.status === "failed" || run.status === "canceled");
932
946
  if (failedRuns.length > 0) {
933
947
  return emitEvalFailure(started.runs, failedRuns, artifactIds, parsed, io);
@@ -947,8 +961,22 @@ async function handleCloudEval(parsed, io) {
947
961
  ].filter(Boolean).join("\n"));
948
962
  }
949
963
  async function handleCloudImprove(parsed, io) {
950
- const started = await startCloudExecution("improve", parsed);
964
+ const started = await startCloudExecution("improve", parsed, io);
951
965
  const artifactIds = await artifactIdsByRunId(started.core, started.runs);
966
+ if (started.detached) {
967
+ const next = cloudDetachedNextCommand(started.runs);
968
+ emitResult("workbench.cli.improve.v1", {
969
+ result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
970
+ detached: true,
971
+ next: next,
972
+ cloud: cloudExecutionSummary(started),
973
+ }, parsed, io, () => [
974
+ `Detached from hosted improve on ${started.remote.url}.`,
975
+ started.runs.map(formatRun).join("\n"),
976
+ ...(next ? [`next: ${next}`] : []),
977
+ ].filter(Boolean).join("\n"));
978
+ return 130;
979
+ }
952
980
  const failedRuns = started.runs.filter((run) => run.status === "failed" || run.status === "canceled");
953
981
  if (failedRuns.length > 0) {
954
982
  const first = failedRuns[0];
@@ -1014,7 +1042,7 @@ async function pathExists(filePath) {
1014
1042
  return false;
1015
1043
  }
1016
1044
  }
1017
- async function startCloudExecution(command, parsed) {
1045
+ async function startCloudExecution(command, parsed, io) {
1018
1046
  const root = dirFlag(parsed) ?? process.cwd();
1019
1047
  const remote = await ensureCloudRemoteForExecution(root, parsed);
1020
1048
  const source = parseWorkbenchInstallSource(remote.url);
@@ -1033,7 +1061,9 @@ async function startCloudExecution(command, parsed) {
1033
1061
  });
1034
1062
  }
1035
1063
  const core = { dir: root, authToken: token };
1064
+ writeCloudProgress(io, `workbench cloud: syncing ${remote.name} before hosted ${command}.`);
1036
1065
  const syncBefore = await syncWorkbenchRemote({ ...core, remote: remote.name });
1066
+ writeCloudProgress(io, `workbench cloud: synced ${remote.name} before hosted ${command} (pushed=${syncBefore.pushed}, pulled=${syncBefore.pulled}, up-to-date=${syncBefore.upToDate}).`);
1037
1067
  const startSnapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
1038
1068
  const skillId = await resolveCloudSkillId(source);
1039
1069
  const response = await apiRequest(`/api/workbench/skills/${encodeURIComponent(skillId)}${command === "improve" ? "/improve" : "/runs"}`, { method: "POST", body: cloudExecutionRequestBody(command, parsed) }, source.baseUrl);
@@ -1046,9 +1076,15 @@ async function startCloudExecution(command, parsed) {
1046
1076
  exitCode: 1,
1047
1077
  });
1048
1078
  }
1079
+ const initialRunIds = runs.map((run) => run.id);
1080
+ writeCloudProgress(io, `workbench cloud: scheduled hosted ${command} on ${remote.url} (${formatCloudRunStatuses(runs)}).`);
1049
1081
  const initialSyncAfter = await syncWorkbenchRemote({ ...core, remote: remote.name });
1082
+ writeCloudProgress(io, `workbench cloud: synced after scheduling hosted ${command} (pushed=${initialSyncAfter.pushed}, pulled=${initialSyncAfter.pulled}, up-to-date=${initialSyncAfter.upToDate}).`);
1083
+ writeCloudProgress(io, `workbench cloud: waiting for terminal status; press Ctrl-C to detach and resume with workbench status or workbench show ${displayRef(initialRunIds[0] ?? "run")}.`);
1050
1084
  const completed = await waitForCloudRuns({
1085
+ command,
1051
1086
  core,
1087
+ io,
1052
1088
  remote,
1053
1089
  runs,
1054
1090
  initialSync: initialSyncAfter,
@@ -1057,7 +1093,9 @@ async function startCloudExecution(command, parsed) {
1057
1093
  core,
1058
1094
  remote,
1059
1095
  skillId,
1096
+ initialRunIds,
1060
1097
  runs: completed.runs,
1098
+ ...(completed.detached ? { detached: true } : {}),
1061
1099
  startVersionId: startSnapshot.status.currentVersionId ?? startSnapshot.refs.current,
1062
1100
  source,
1063
1101
  sync: {
@@ -1081,27 +1119,59 @@ async function waitForCloudRuns(input) {
1081
1119
  const timeoutMs = positiveIntEnv("WORKBENCH_CLOUD_RUN_TIMEOUT_MS") ?? CLOUD_RUN_TIMEOUT_MS;
1082
1120
  const pollIntervalMs = positiveIntEnv("WORKBENCH_CLOUD_RUN_POLL_INTERVAL_MS") ?? CLOUD_RUN_POLL_INTERVAL_MS;
1083
1121
  const deadline = Date.now() + timeoutMs;
1084
- while (true) {
1085
- const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(input.core);
1086
- const runs = runIds
1087
- .map((id) => snapshot.runs.find((entry) => entry.id === id))
1088
- .filter((run) => Boolean(run));
1089
- if (runs.length === runIds.length && runs.every(isTerminalRun)) {
1090
- return { runs, sync };
1091
- }
1092
- if (Date.now() >= deadline) {
1093
- throw new WorkbenchCodedError("cloud_run_pending", "Hosted Workbench run is still running.", {
1094
- retryable: true,
1095
- remediation: runIds[0] ? `Run workbench show ${runIds[0]}.` : "Run workbench log --runs.",
1096
- subject: {
1097
- runIds,
1098
- statuses: Object.fromEntries(runs.map((run) => [run.id, run.status])),
1099
- },
1100
- exitCode: 1,
1101
- });
1122
+ let runs = [...input.runs];
1123
+ let interrupted = false;
1124
+ const onSigint = () => {
1125
+ interrupted = true;
1126
+ writeCloudProgress(input.io, `workbench cloud: detaching from hosted ${input.command} (${runIds.map(displayRef).join(", ")}).`);
1127
+ };
1128
+ process.once("SIGINT", onSigint);
1129
+ const seenStatuses = new Map();
1130
+ try {
1131
+ while (true) {
1132
+ const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(input.core);
1133
+ const snapshotRuns = runIds
1134
+ .map((id) => snapshot.runs.find((entry) => entry.id === id))
1135
+ .filter((run) => Boolean(run));
1136
+ if (snapshotRuns.length > 0) {
1137
+ runs = runIds.map((id) => snapshotRuns.find((entry) => entry.id === id) ?? runs.find((entry) => entry.id === id))
1138
+ .filter((run) => Boolean(run));
1139
+ }
1140
+ for (const run of runs) {
1141
+ const previous = seenStatuses.get(run.id);
1142
+ if (previous !== run.status) {
1143
+ seenStatuses.set(run.id, run.status);
1144
+ writeCloudProgress(input.io, `workbench cloud: ${displayRef(run.id)} is ${run.status}.`);
1145
+ }
1146
+ }
1147
+ if (runs.length === runIds.length && runs.every(isTerminalRun)) {
1148
+ writeCloudProgress(input.io, `workbench cloud: hosted ${input.command} finished (${formatCloudRunStatuses(runs)}).`);
1149
+ return { runs, sync };
1150
+ }
1151
+ if (interrupted) {
1152
+ return { runs, sync, detached: true };
1153
+ }
1154
+ if (Date.now() >= deadline) {
1155
+ throw new WorkbenchCodedError("cloud_run_pending", "Hosted Workbench run is still running.", {
1156
+ retryable: true,
1157
+ remediation: runIds[0] ? `Run workbench show ${runIds[0]}.` : "Run workbench log --runs.",
1158
+ subject: {
1159
+ runIds,
1160
+ statuses: Object.fromEntries(runs.map((run) => [run.id, run.status])),
1161
+ },
1162
+ exitCode: 1,
1163
+ });
1164
+ }
1165
+ await sleep(pollIntervalMs);
1166
+ if (interrupted) {
1167
+ return { runs, sync, detached: true };
1168
+ }
1169
+ sync = await syncWorkbenchRemote({ ...input.core, remote: input.remote.name });
1170
+ writeCloudProgress(input.io, `workbench cloud: synced ${input.remote.name} while waiting (${formatCloudRunStatuses(runs)}).`);
1102
1171
  }
1103
- await sleep(pollIntervalMs);
1104
- sync = await syncWorkbenchRemote({ ...input.core, remote: input.remote.name });
1172
+ }
1173
+ finally {
1174
+ process.off("SIGINT", onSigint);
1105
1175
  }
1106
1176
  }
1107
1177
  function isTerminalRun(run) {
@@ -1228,6 +1298,10 @@ function cloudExecutionRequestBody(command, parsed) {
1228
1298
  function cloudImproveNextCommand(runs) {
1229
1299
  return cloudExecutionNextCommand(runs, "workbench eval");
1230
1300
  }
1301
/**
 * Picks the command to suggest after detaching from a hosted run.
 *
 * @param {Array<{id?: string}>} runs - Runs that were scheduled before detaching.
 * @returns {string} `workbench show <ref>` for the first run when it has a
 *   truthy id, otherwise `workbench status`.
 */
function cloudDetachedNextCommand(runs) {
  const leadRun = runs[0];
  if (leadRun?.id) {
    return `workbench show ${displayRef(leadRun.id)}`;
  }
  return "workbench status";
}
1231
1305
  function cloudExecutionNextCommand(runs, successCommand) {
1232
1306
  const first = runs[0];
1233
1307
  if (!first) {
@@ -1243,9 +1317,19 @@ function cloudExecutionSummary(started) {
1243
1317
  remote: started.remote.name,
1244
1318
  url: started.remote.url,
1245
1319
  skillId: started.skillId,
1320
+ initialRunIds: started.initialRunIds,
1321
+ ...(started.detached ? { detached: true } : {}),
1246
1322
  sync: started.sync,
1247
1323
  };
1248
1324
  }
1325
/**
 * Writes one progress line to the stderr stream of the given io object.
 *
 * Progress goes to stderr rather than stdout; the message is followed by a
 * single newline.
 *
 * @param {{stderr: {write: Function}}} io - CLI io object.
 * @param {string} message - Progress text without a trailing newline.
 */
function writeCloudProgress(io, message) {
  const line = `${message}\n`;
  io.stderr.write(line);
}
1328
/**
 * Renders a compact `ref:status` list for progress messages.
 *
 * @param {Array<{id: string, status: string}>} runs - Runs to describe.
 * @returns {string} Comma-separated `displayRef(id):status` pairs, or
 *   "no runs" when the list is empty.
 */
function formatCloudRunStatuses(runs) {
  if (!(runs.length > 0)) {
    return "no runs";
  }
  const describeRun = (run) => `${displayRef(run.id)}:${run.status}`;
  return runs.map(describeRun).join(", ");
}
1249
1333
  function workbenchInstallSourceSummary(source, snapshot) {
1250
1334
  const installUrl = `${source.baseUrl}/skills/${encodeURIComponent(source.owner)}/${encodeURIComponent(source.skill)}`;
1251
1335
  return {
@@ -2204,6 +2288,17 @@ function intFlag(parsed, name) {
2204
2288
  }
2205
2289
  return parsedValue;
2206
2290
  }
2291
/**
 * Reads a TCP port number from a parsed string flag.
 *
 * @param {object} parsed - Parsed CLI arguments (passed through to stringFlag).
 * @param {string} name - Flag name without the leading dashes.
 * @returns {number|undefined} The port as a number, or undefined when
 *   stringFlag yields a falsy value (flag absent or empty).
 * @throws {WorkbenchUserError} When the value is not an integer in 0..65535.
 */
function portFlag(parsed, name) {
  const raw = stringFlag(parsed, name);
  if (!raw) {
    return undefined;
  }
  const port = Number(raw);
  // Port 0 is accepted on purpose; only non-integers and out-of-range values fail.
  const isValidPort = Number.isInteger(port) && port >= 0 && port <= 65535;
  if (!isValidPort) {
    throw new WorkbenchUserError(`--${name} must be an integer between 0 and 65535.`);
  }
  return port;
}
2207
2302
  function optionalPositional(parsed, index) {
2208
2303
  return parsed.positionals[index];
2209
2304
  }
@@ -2452,7 +2547,7 @@ function runSummary(run, artifactIds) {
2452
2547
  versionId: run.versionId,
2453
2548
  skillName: run.skillName,
2454
2549
  agentName: run.agentName,
2455
- ...(run.score !== undefined ? { score: run.score } : {}),
2550
+ ...(scoredRunValue(run) !== undefined ? { score: scoredRunValue(run) } : {}),
2456
2551
  ...(run.latencyMs !== undefined ? { latencyMs: run.latencyMs } : {}),
2457
2552
  ...(run.error ? { error: run.error } : {}),
2458
2553
  ...(run.jobIds ? { jobIds: run.jobIds } : {}),
@@ -2467,7 +2562,7 @@ function runFailureSummary(run, artifactIds) {
2467
2562
  skill: run.skillName,
2468
2563
  status: run.status,
2469
2564
  versionId: run.versionId,
2470
- ...(run.score !== undefined ? { score: run.score } : {}),
2565
+ ...(scoredRunValue(run) !== undefined ? { score: scoredRunValue(run) } : {}),
2471
2566
  ...(run.error ? { error: run.error } : {}),
2472
2567
  traceIds: run.traceIds,
2473
2568
  artifactIds: [...artifactIds],
@@ -2511,16 +2606,72 @@ async function workbenchCliAuthStatus() {
2511
2606
  })),
2512
2607
  };
2513
2608
  }
2514
- function statusWithCausalNext(status, auth) {
2609
/**
 * Returns the score of a run only when that score is meaningful.
 *
 * A score counts only for runs whose status is "succeeded". The value must
 * also be a finite number: NaN and ±Infinity pass a plain `typeof` check but
 * would render as "NaN"/"Infinity", produce NaN deltas, and serialize to
 * `null` in JSON output, so they are treated as "no score".
 *
 * @param {{status: string, score?: unknown}} run - Run record.
 * @returns {number|undefined} The finite score, or undefined.
 */
function scoredRunValue(run) {
  if (run.status !== "succeeded") {
    return undefined;
  }
  // Number.isFinite does not coerce, so non-number values are rejected too.
  return Number.isFinite(run.score) ? run.score : undefined;
}
2612
/**
 * Returns the score of a job only when that score is meaningful.
 *
 * A score counts only for jobs whose status is "succeeded". The value must
 * also be a finite number: NaN and ±Infinity pass a plain `typeof` check but
 * cannot be formatted or serialized sensibly, so they are treated as
 * "no score".
 *
 * @param {{status: string, score?: unknown}} job - Job record.
 * @returns {number|undefined} The finite score, or undefined.
 */
function scoredJobValue(job) {
  if (job.status !== "succeeded") {
    return undefined;
  }
  // Number.isFinite does not coerce, so non-number values are rejected too.
  return Number.isFinite(job.score) ? job.score : undefined;
}
2615
/**
 * Reports whether the snapshot's current version has at least one case file
 * that is not marked as a smoke case.
 *
 * Case files are text files matching `.workbench/cases/<name>/case.yml` or
 * `case.yaml`. A file counts as a smoke case when its content has a line
 * matching `smoke: true`.
 *
 * @param {object} snapshot - Read-only inspection snapshot.
 * @returns {boolean} True when a non-smoke case file exists.
 */
function snapshotHasWorkflowCase(snapshot) {
  const casePathPattern = /^\.workbench\/cases\/[^/]+\/case\.ya?ml$/u;
  const smokeMarkerPattern = /\n\s*smoke:\s*true(?:\s|$)/u;
  const currentRef = snapshot.status.currentVersionId ?? snapshot.refs.current ?? "";
  const version = snapshotVersionByRef(snapshot, currentRef);
  if (version == null) {
    return false;
  }
  const caseFiles = version.files.filter((file) => file.kind === "text" && casePathPattern.test(file.path));
  // A newline is prepended so the marker also matches on the first line.
  return caseFiles.some((file) => !smokeMarkerPattern.test(`\n${file.content}`));
}
2621
/**
 * Derives the handle shown in `workbench install <handle>` for a remote.
 *
 * Uses the publication's install URL when the remote is published, otherwise
 * the remote URL. When parseWorkbenchInstallSource recognizes that URL the
 * result is `owner/skill`; otherwise the URL itself is returned.
 *
 * @param {object} remote - Remote entry from the status snapshot.
 * @returns {string} Install handle or fallback URL.
 */
function installHandleFromStatusRemote(remote) {
  let publicationUrl;
  if (remote.publication.status === "published") {
    publicationUrl = remote.publication.installUrl;
  }
  const candidateUrl = publicationUrl ?? remote.url;
  const source = parseWorkbenchInstallSource(candidateUrl);
  if (source) {
    return `${source.owner}/${source.skill}`;
  }
  return candidateUrl;
}
2626
/**
 * Adds a `next` command suggestion to a status snapshot.
 *
 * Rules are checked in order and the first match wins:
 *   1. project not initialized: status returned unchanged;
 *   2. most recent run is running/failed/canceled: `workbench show <run>`;
 *   3. cloud auth missing and either publishing is possible or a cloud remote
 *      is out of date/unpublished: `workbench login`;
 *   4. a remote's sync is in error: `workbench sync <remote>`;
 *   5. no runs yet: `workbench eval`;
 *   6. no non-smoke case: edit cases, then eval;
 *   7. publishable with no cloud remote, or an up-to-date unpublished cloud
 *      remote: `workbench publish`;
 *   8. a published cloud remote with an install URL: `workbench install <handle>`;
 *   9. otherwise `next: null`.
 *
 * @param {object} status - Status snapshot.
 * @param {object} auth - CLI auth status (reads `auth.workbenchCloud.status`).
 * @param {object} core - Core options used to load the inspection snapshot.
 * @returns {Promise<object>} The status, with `next` set except in case 1.
 */
async function statusWithCausalNext(status, auth, core) {
  if (!status.project.initialized) {
    return status;
  }
  const withNext = (next) => ({ ...status, next });
  const isCloudRemote = (remote) => remote.kind === "workbench-cloud";
  const newestFirst = (left, right) => right.createdAt.localeCompare(left.createdAt);

  // Best effort: a snapshot that fails to load is treated as "no snapshot".
  const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core).catch(() => null);
  const lastRun = snapshot == null ? undefined : snapshot.runs.slice().sort(newestFirst)[0];
  const lastRunNeedsAttention = ["running", "failed", "canceled"].includes(lastRun?.status);
  if (lastRunNeedsAttention && lastRun.id) {
    return withNext(`workbench show ${displayRef(lastRun.id)}`);
  }

  const failedRemote = status.remotes.find((remote) => remote.sync.status === "error");
  const hasWorkflowCase = snapshot ? snapshotHasWorkflowCase(snapshot) : false;
  const hasScoredRun = snapshot == null
    ? false
    : snapshot.runs.some((run) => scoredRunValue(run) !== undefined);
  const canPublish = hasWorkflowCase && hasScoredRun;
  const cloudAuthMissing = auth.workbenchCloud.status !== "authenticated";
  const cloudRemoteNeedsAuth = status.remotes.some((remote) => isCloudRemote(remote) &&
    (remote.sync.status !== "up_to_date" || remote.publication.status === "unpublished"));

  if (cloudAuthMissing && (canPublish || cloudRemoteNeedsAuth)) {
    return withNext("workbench login");
  }
  if (failedRemote) {
    return withNext(`workbench sync ${failedRemote.name}`);
  }
  const runCount = snapshot?.runs.length ?? status.runs.total;
  if (runCount === 0) {
    return withNext("workbench eval");
  }
  if (!hasWorkflowCase) {
    return withNext("edit .workbench/cases, then run workbench eval");
  }

  const cloudRemote = status.remotes.find(isCloudRemote);
  if (canPublish && !cloudRemote) {
    return withNext("workbench publish");
  }
  const unpublishedCloudRemote = status.remotes.find((remote) => isCloudRemote(remote) &&
    remote.publication.status === "unpublished" &&
    remote.sync.status === "up_to_date");
  if (unpublishedCloudRemote) {
    return withNext("workbench publish");
  }
  const publishedCloudRemote = status.remotes.find((remote) => isCloudRemote(remote) &&
    remote.publication.status === "published" &&
    Boolean(remote.publication.installUrl));
  if (publishedCloudRemote) {
    return withNext(`workbench install ${installHandleFromStatusRemote(publishedCloudRemote)}`);
  }
  return withNext(null);
}
2526
2677
  function displayRef(id) {
@@ -2612,16 +2763,16 @@ function evidenceFilesForRunOrJob(snapshot, ref) {
2612
2763
  const traceById = new Map(snapshot.traces.map((trace) => [trace.id, trace]));
2613
2764
  const artifactById = new Map(snapshot.artifacts.map((artifact) => [artifact.id, artifact]));
2614
2765
  const files = selection.jobs.flatMap((job) => [
2615
- ...job.traceIds.flatMap((traceId) => {
2616
- const trace = traceById.get(traceId);
2617
- return trace
2618
- ? trace.files.map((file) => evidenceFileWithPath(file, `cases/${evidencePathSegment(job.caseId)}/jobs/${evidencePathSegment(job.id)}/traces/${evidencePathSegment(trace.id)}/${file.path}`))
2619
- : [];
2620
- }),
2621
2766
  ...job.artifactIds.flatMap((artifactId) => {
2622
2767
  const artifact = artifactById.get(artifactId);
2623
2768
  return artifact
2624
- ? artifact.files.map((file) => evidenceFileWithPath(file, `cases/${evidencePathSegment(job.caseId)}/jobs/${evidencePathSegment(job.id)}/artifacts/${evidencePathSegment(artifact.id)}/${file.path}`))
2769
+ ? artifact.files.filter(isUserFacingEvidenceFile).map((file) => evidenceFileWithPath(file, `cases/${evidencePathSegment(job.caseId)}/jobs/${evidencePathSegment(job.id)}/${file.path}`))
2770
+ : [];
2771
+ }),
2772
+ ...job.traceIds.flatMap((traceId) => {
2773
+ const trace = traceById.get(traceId);
2774
+ return trace
2775
+ ? trace.files.filter(isUserFacingEvidenceFile).map((file) => evidenceFileWithPath(file, `cases/${evidencePathSegment(job.caseId)}/jobs/${evidencePathSegment(job.id)}/traces/${evidencePathSegment(trace.id)}/${file.path}`))
2625
2776
  : [];
2626
2777
  }),
2627
2778
  ]);
@@ -2640,6 +2791,10 @@ function evidenceFileWithPath(file, filePath) {
2640
2791
  path: filePath.replace(/\\/gu, "/").replace(/^\/+/u, ""),
2641
2792
  };
2642
2793
  }
2794
/**
 * Tells whether an evidence file should be listed for users.
 *
 * The path is normalized (backslashes to forward slashes, leading slashes
 * removed); a file is hidden when any path segment is exactly ".workbench".
 *
 * @param {{path: string}} file - Evidence file entry.
 * @returns {boolean} False when the file lives under a `.workbench` segment.
 */
function isUserFacingEvidenceFile(file) {
  const segments = file.path
    .replace(/\\/gu, "/")
    .replace(/^\/+/u, "")
    .split("/");
  return !segments.includes(".workbench");
}
2643
2798
  function evidencePathSegment(value) {
2644
2799
  return value.replace(/[^A-Za-z0-9._-]+/gu, "-") || "_";
2645
2800
  }
@@ -2648,6 +2803,24 @@ function formatRunOrJobEvidence(details, files) {
2648
2803
  const fileLines = files.length > 0 ? ["Files:", ...files.map((file) => file.path)] : [];
2649
2804
  return [...detailLines, ...fileLines].join("\n") || "No evidence.";
2650
2805
  }
2806
/**
 * Reduces a run evidence detail to a compact summary.
 *
 * Keeps the run id and, per execution, its id, status, job ids, session
 * labels, and the counts of trace events, spans, and summaries. All other
 * fields are dropped.
 *
 * @param {object} detail - Evidence detail with `runId` and `executions`.
 * @returns {object} Summary object.
 */
function evidenceDetailSummary(detail) {
  const summarizeSession = ({ label }) => ({ label });
  const summarizeExecution = ({ id, status, jobIds, sessions, trace }) => ({
    id,
    status,
    jobIds,
    sessions: sessions.map((session) => summarizeSession(session)),
    trace: {
      events: trace.events.length,
      spans: trace.spans.length,
      summaries: trace.summaries.length,
    },
  });
  return {
    runId: detail.runId,
    executions: detail.executions.map((execution) => summarizeExecution(execution)),
  };
}
2651
2824
  function manifestOnly(value) {
2652
2825
  if (value === null || typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
2653
2826
  return value;
@@ -2770,6 +2943,17 @@ function findShowFile(files, requestedPath, objectRef) {
2770
2943
  const candidates = normalized === "stderr.log"
2771
2944
  ? suffixCandidates.filter((file) => file.content.length > 0)
2772
2945
  : suffixCandidates;
2946
+ const canonicalCandidates = candidates.filter(isCanonicalEvidenceFileCandidate);
2947
+ if (canonicalCandidates.length === 1) {
2948
+ return canonicalCandidates[0];
2949
+ }
2950
+ const equivalentCanonicalCandidate = singleEquivalentShowFile(canonicalCandidates);
2951
+ if (equivalentCanonicalCandidate) {
2952
+ return equivalentCanonicalCandidate;
2953
+ }
2954
+ if (canonicalCandidates.length > 1) {
2955
+ throw ambiguousShowPath(objectRef, requestedPath, canonicalCandidates);
2956
+ }
2773
2957
  if (candidates.length === 1) {
2774
2958
  return candidates[0];
2775
2959
  }
@@ -2786,6 +2970,9 @@ function findShowFile(files, requestedPath, objectRef) {
2786
2970
  }
2787
2971
  throw ambiguousShowPath(objectRef, requestedPath, candidates.length > 0 ? candidates : suffixCandidates);
2788
2972
  }
2973
/**
 * Tells whether an evidence file path is outside the per-trace and
 * per-artifact subtrees.
 *
 * @param {{path: string}} file - Evidence file entry.
 * @returns {boolean} True when the path contains neither "/traces/" nor
 *   "/artifacts/".
 */
function isCanonicalEvidenceFileCandidate(file) {
  const nestedMarkers = ["/traces/", "/artifacts/"];
  return !nestedMarkers.some((marker) => file.path.includes(marker));
}
2789
2976
  function singleEquivalentShowFile(files) {
2790
2977
  if (files.length <= 1) {
2791
2978
  return null;
@@ -2831,24 +3018,56 @@ async function traceIdForCaseSource(core, ref) {
2831
3018
  exitCode: 1,
2832
3019
  });
2833
3020
  }
3021
/**
 * Builds one coverage summary per run from the jobs in the current snapshot.
 *
 * @param {object} core - Core options used to load the inspection snapshot.
 * @param {Array<{id: string}>} runs - Runs to summarize.
 * @returns {Promise<Array<object>>} For each run: `runId`, the number of
 *   distinct cases, distinct case/sample pairs, total jobs, succeeded jobs,
 *   and failed jobs (status "failed" or "canceled").
 */
async function evalCoverageSummaries(core, runs) {
  const { jobs: allJobs } = await createWorkbenchReadOnlyInspectionSnapshot(core);
  const jobsByRun = new Map();
  for (const job of allJobs) {
    const bucket = jobsByRun.get(job.runId);
    if (bucket) {
      bucket.push(job);
    } else {
      jobsByRun.set(job.runId, [job]);
    }
  }
  return runs.map((run) => {
    const runJobs = jobsByRun.get(run.id) ?? [];
    const caseIds = new Set();
    const sampleKeys = new Set();
    let succeeded = 0;
    let failed = 0;
    for (const job of runJobs) {
      caseIds.add(job.caseId);
      // NUL separates case id and sample in the composite key.
      sampleKeys.add(`${job.caseId}\0${job.sample}`);
      if (job.status === "succeeded") {
        succeeded += 1;
      } else if (job.status === "failed" || job.status === "canceled") {
        failed += 1;
      }
    }
    return {
      runId: run.id,
      cases: caseIds.size,
      samples: sampleKeys.size,
      jobs: runJobs.length,
      succeeded,
      failed,
    };
  });
}
3043
/**
 * Formats one coverage summary as a single human-readable line.
 *
 * @param {{cases: number, samples: number, jobs: number, failed: number}} coverage
 * @returns {string} `coverage cases=… samples=… jobs=…`, with ` failed=…`
 *   appended only when the failed count is above zero.
 */
function formatEvalCoverage(coverage) {
  const parts = [
    `coverage cases=${coverage.cases}`,
    `samples=${coverage.samples}`,
    `jobs=${coverage.jobs}`,
  ];
  if (coverage.failed > 0) {
    parts.push(`failed=${coverage.failed}`);
  }
  return parts.join(" ");
}
2834
3051
  async function evalDeltas(core, runs) {
2835
3052
  const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
2836
3053
  return runs.map((run) => {
3054
+ const score = scoredRunValue(run);
2837
3055
  const previous = snapshot.runs
2838
3056
  .filter((candidate) => candidate.id !== run.id &&
2839
3057
  candidate.skillName === run.skillName &&
2840
3058
  candidate.agentName === run.agentName &&
2841
- typeof candidate.score === "number" &&
3059
+ scoredRunValue(candidate) !== undefined &&
2842
3060
  candidate.createdAt < run.createdAt)
2843
3061
  .sort((left, right) => right.createdAt.localeCompare(left.createdAt))[0];
3062
+ const previousScore = previous ? scoredRunValue(previous) : undefined;
2844
3063
  return {
2845
3064
  runId: run.id,
2846
3065
  versionId: run.versionId,
2847
3066
  skillName: run.skillName,
2848
3067
  agentName: run.agentName,
2849
- ...(run.score !== undefined ? { score: run.score } : {}),
2850
- ...(previous?.score !== undefined ? { previousScore: previous.score } : {}),
2851
- ...(run.score !== undefined && previous?.score !== undefined ? { delta: run.score - previous.score } : {}),
3068
+ ...(score !== undefined ? { score } : {}),
3069
+ ...(previousScore !== undefined ? { previousScore } : {}),
3070
+ ...(score !== undefined && previousScore !== undefined ? { delta: score - previousScore } : {}),
2852
3071
  };
2853
3072
  });
2854
3073
  }
@@ -2867,15 +3086,11 @@ async function evalSuccessNextCommand(core, runs) {
2867
3086
  if (runs.length === 0) {
2868
3087
  return "workbench eval";
2869
3088
  }
2870
- if (!runs.some((run) => typeof run.score === "number")) {
3089
+ if (!runs.some((run) => scoredRunValue(run) !== undefined)) {
2871
3090
  return "edit .workbench/cases, then run workbench eval";
2872
3091
  }
2873
3092
  const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
2874
- const currentVersion = snapshotVersionByRef(snapshot, snapshot.status.currentVersionId ?? snapshot.refs.current ?? "");
2875
- const caseFiles = currentVersion?.files.filter((file) => file.kind === "text" &&
2876
- /^\.workbench\/cases\/[^/]+\/case\.ya?ml$/u.test(file.path)) ?? [];
2877
- const hasWorkflowCase = caseFiles.some((file) => file.kind === "text" && !/\n\s*smoke:\s*true(?:\s|$)/u.test(`\n${file.content}`));
2878
- return hasWorkflowCase ? "workbench publish" : "edit .workbench/cases, then run workbench eval";
3093
+ return snapshotHasWorkflowCase(snapshot) ? "workbench publish" : "edit .workbench/cases, then run workbench eval";
2879
3094
  }
2880
3095
  function formatStatusSnapshot(status) {
2881
3096
  const lines = [
@@ -2927,7 +3142,8 @@ function formatAgent(agent) {
2927
3142
  return `${agent.name}\t${agent.adapter}${agent.model ? `\t${agent.model}` : ""}`;
2928
3143
  }
2929
3144
  function formatRun(run) {
2930
- const score = run.score === undefined ? "n/a" : run.score.toFixed(3);
3145
+ const scoreValue = scoredRunValue(run);
3146
+ const score = scoreValue === undefined ? "n/a" : scoreValue.toFixed(3);
2931
3147
  const latency = run.latencyMs === undefined ? "n/a" : `${run.latencyMs}ms`;
2932
3148
  return `${displayRef(run.id)}\t${run.kind}\t${run.status}\tversion=${displayRef(run.versionId)}\tskill=${run.skillName}\tagent=${run.agentName}\tscore=${score}\tlatency=${latency}`;
2933
3149
  }
@@ -2940,7 +3156,8 @@ function formatImproveResult(result) {
2940
3156
  ].join("\n");
2941
3157
  }
2942
3158
  function formatJob(job) {
2943
- const score = job.score === undefined ? "n/a" : job.score.toFixed(3);
3159
+ const scoreValue = scoredJobValue(job);
3160
+ const score = scoreValue === undefined ? "n/a" : scoreValue.toFixed(3);
2944
3161
  const duration = job.durationMs === undefined ? "n/a" : `${job.durationMs}ms`;
2945
3162
  return `${displayRef(job.id)}\trun=${displayRef(job.runId)}\tcase=${job.caseId}\tsample=${job.sample}\t${job.status}\tscore=${score}\tduration=${duration}`;
2946
3163
  }
@@ -2966,7 +3183,7 @@ function shortObjectId(id) {
2966
3183
  function formatTrace(trace) {
2967
3184
  const result = asRecord(trace.result);
2968
3185
  const status = typeof result?.status === "string" ? result.status : undefined;
2969
- const score = typeof result?.score === "number" ? result.score.toFixed(3) : undefined;
3186
+ const score = status === "succeeded" && typeof result?.score === "number" ? result.score.toFixed(3) : undefined;
2970
3187
  const error = typeof result?.error === "string" ? result.error.split(/\r?\n/u)[0] : undefined;
2971
3188
  const files = trace.files.slice(0, 5).map((file) => file.path).join(",");
2972
3189
  return [
@@ -2979,6 +3196,7 @@ function formatTrace(trace) {
2979
3196
  }
2980
3197
  function traceSummary(trace) {
2981
3198
  const result = asRecord(trace.result);
3199
+ const status = typeof result?.status === "string" ? result.status : undefined;
2982
3200
  return {
2983
3201
  id: trace.id,
2984
3202
  runId: trace.runId,
@@ -2987,8 +3205,8 @@ function traceSummary(trace) {
2987
3205
  skillName: trace.skillName,
2988
3206
  agentName: trace.agentName,
2989
3207
  createdAt: trace.createdAt,
2990
- ...(typeof result?.status === "string" ? { status: result.status } : {}),
2991
- ...(typeof result?.score === "number" ? { score: result.score } : {}),
3208
+ ...(status ? { status } : {}),
3209
+ ...(status === "succeeded" && typeof result?.score === "number" ? { score: result.score } : {}),
2992
3210
  ...(typeof result?.error === "string" ? { error: singleLine(result.error) } : {}),
2993
3211
  fileCount: trace.files.length,
2994
3212
  files: trace.files.map(fileSummary),
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@workbench-ai/workbench",
3
- "version": "0.0.71",
3
+ "version": "0.0.72",
4
4
  "repository": {
5
5
  "type": "git",
6
6
  "url": "git+https://github.com/workbench-ai/workbench.git",
@@ -21,10 +21,10 @@
21
21
  ],
22
22
  "dependencies": {
23
23
  "yaml": "^2.8.2",
24
- "@workbench-ai/workbench-built-in-adapters": "0.0.71",
25
- "@workbench-ai/workbench-core": "0.0.71",
26
- "@workbench-ai/workbench-contract": "0.0.71",
27
- "@workbench-ai/workbench-protocol": "0.0.71"
24
+ "@workbench-ai/workbench-built-in-adapters": "0.0.72",
25
+ "@workbench-ai/workbench-contract": "0.0.72",
26
+ "@workbench-ai/workbench-core": "0.0.72",
27
+ "@workbench-ai/workbench-protocol": "0.0.72"
28
28
  },
29
29
  "devDependencies": {
30
30
  "@tailwindcss/postcss": "^4.2.2",
@@ -35,7 +35,7 @@
35
35
  "react-dom": "^19.2.0",
36
36
  "typescript": "^5.9.2",
37
37
  "vitest": "^3.2.4",
38
- "@workbench-ai/workbench-ui": "0.0.71"
38
+ "@workbench-ai/workbench-ui": "0.0.72"
39
39
  },
40
40
  "scripts": {
41
41
  "build": "rm -rf dist && tsc -p tsconfig.json && chmod 755 dist/workbench.js && node ./scripts/build-dev-open-assets.mjs",