@workbench-ai/workbench 0.0.70 → 0.0.72

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -4,10 +4,10 @@ import { createRequire } from "node:module";
4
4
  import os from "node:os";
5
5
  import path from "node:path";
6
6
  import { gzipSync } from "node:zlib";
7
- import { addWorkbenchCase, addWorkbenchRemote, addWorkbenchAgent, compareWorkbench, createWorkbenchAdapterAuthBundle, createWorkbenchReadOnlyInspectionSnapshot, diffWorkbenchVersions, evalWorkbenchSkill, improveWorkbenchSkill, initWorkbenchSkill, listWorkbenchCases, listWorkbenchAgents, listWorkbenchVersions, localWorkbenchAdapterAuthStore, parseWorkbenchAdapterAuthTarget, publishWorkbenchVersion, removeWorkbenchCase, removeWorkbenchAgent, showWorkbenchRef, switchWorkbenchVersion, syncWorkbenchRemote, workbenchJobEvidenceForSnapshot, workbenchStatusSnapshot, WorkbenchCodedError, WorkbenchUserError, } from "@workbench-ai/workbench-core";
7
+ import { addWorkbenchCase, addWorkbenchRemote, addWorkbenchAgent, compareWorkbench, createWorkbenchInspectionSnapshot, createWorkbenchAdapterAuthBundle, createWorkbenchReadOnlyInspectionSnapshot, diffWorkbenchVersions, evalWorkbenchSkill, improveWorkbenchSkill, initWorkbenchSkill, listWorkbenchAgents, listWorkbenchVersions, localWorkbenchAdapterAuthStore, parseWorkbenchAdapterAuthTarget, publishWorkbenchVersion, removeWorkbenchAgent, showWorkbenchRef, switchWorkbenchVersion, syncWorkbenchRemote, workbenchJobEvidenceForSnapshot, workbenchSkillImproveCanUseQueuedAdapter, workbenchStatusSnapshot, WorkbenchCodedError, WorkbenchUserError, } from "@workbench-ai/workbench-core";
8
8
  import { normalizeWorkbenchSkillName } from "@workbench-ai/workbench-contract";
9
9
  import { emitError, emitResult } from "./output.js";
10
- import { installSnapshotToTargets, installTargetsToJson, normalizeInstallSnapshotPath, resolveInstallTargets, supportedInstallTargets, } from "./install-targets.js";
10
+ import { installSnapshotToTargets, normalizeInstallSnapshotPath, resolveInstallTargets, supportedInstallTargets, } from "./install-targets.js";
11
11
  import { startWorkbenchOpenServer } from "./open-server.js";
12
12
  const require = createRequire(import.meta.url);
13
13
  const HELP = [
@@ -23,7 +23,7 @@ const HELP = [
23
23
  " workbench improve [VERSION] [--skills LIST] [--agents LIST] [--budget N] [-n N|--samples N] [--cloud] [--json]",
24
24
  " workbench compare [--skills all|LIST] [--agents all|LIST] [--versions all|A..B|LIST] [--json]",
25
25
  " workbench publish [VERSION] [--as OWNER/SKILL] [--private|--team|--public] [--dry-run] [--json]",
26
- " workbench install HANDLE_OR_URL [--to codex|claude|local]... [--yes] [--list] [--dry-run] [--json]",
26
+ " workbench install HANDLE_OR_URL [--to codex|claude|local]... [--yes] [--dry-run] [--json]",
27
27
  "",
28
28
  "More:",
29
29
  " workbench help --all",
@@ -36,7 +36,7 @@ const HELP_ALL = [
36
36
  " workbench compare [--skills all|LIST] [--agents all|LIST] [--versions all|A..B|LIST] [--json]",
37
37
  " workbench improve [VERSION] [--skills LIST] [--agents LIST] [--budget N] [-n N|--samples N] [--cloud] [--json]",
38
38
  " workbench publish [VERSION] [--as OWNER/SKILL] [--private|--team|--public] [--dry-run] [--json]",
39
- " workbench install HANDLE_OR_URL [--to codex|claude|local]... [--yes] [--list] [--dry-run] [--json]",
39
+ " workbench install HANDLE_OR_URL [--to codex|claude|local]... [--yes] [--dry-run] [--json]",
40
40
  "",
41
41
  "Inspect:",
42
42
  " workbench status [--dir DIR] [--json]",
@@ -44,10 +44,10 @@ const HELP_ALL = [
44
44
  " workbench show REF[:PATH] [--json]",
45
45
  " workbench diff [A..B] [--json]",
46
46
  " workbench switch VERSION [--json]",
47
- " workbench open [--host HOST] [--port PORT] [--no-open] [--json]",
47
+ " workbench open [--host HOST] [--port PORT] [--no-open]",
48
48
  "",
49
49
  "Configure:",
50
- " workbench case add [RUN_ID] | list | rm ID [--json]",
50
+ " workbench case add RUN_ID [--json]",
51
51
  " workbench agent add NAME --adapter X [--model M] [--with k=v]... | list | rm NAME [--json]",
52
52
  "",
53
53
  "Share and auth:",
@@ -65,28 +65,40 @@ const COMMAND_HELP = {
65
65
  " workbench new [DIR] [--json]",
66
66
  "",
67
67
  "Creates a Workbench skill project.",
68
+ "",
69
+ "Example:",
70
+ " workbench new earnings-prep",
68
71
  ].join("\n"),
69
72
  eval: [
70
73
  "Usage:",
71
74
  " workbench eval [VERSION] [--skills all|LIST] [--agents all|LIST] [-n N|--samples N] [--rerun] [--cloud] [--json]",
72
75
  "",
73
76
  "Runs eval jobs for the selected version, measured skills, and agents. Omitted selectors use manifest defaults.",
77
+ "",
78
+ "Example:",
79
+ " workbench eval -n 5",
74
80
  ].join("\n"),
75
81
  improve: [
76
82
  "Usage:",
77
83
  " workbench improve [VERSION] [--skills LIST] [--agents LIST] [--budget N] [-n N|--samples N] [--cloud] [--json]",
78
84
  "",
79
85
  "Creates one improved child version from evidence. The selected skills and agents must resolve to exactly one entry each.",
86
+ "",
87
+ "Example:",
88
+ " workbench improve --budget 1 -n 1",
80
89
  ].join("\n"),
81
90
  compare: [
82
91
  "Usage:",
83
92
  " workbench compare [--skills all|LIST] [--agents all|LIST] [--versions all|A..B|LIST] [--json]",
84
93
  "",
85
94
  "Compares recorded eval evidence across selected skills, agents, and versions.",
95
+ "",
96
+ "Example:",
97
+ " workbench compare --agents all",
86
98
  ].join("\n"),
87
99
  install: [
88
100
  "Usage:",
89
- " workbench install HANDLE_OR_URL [--to codex|claude|local]... [--yes] [--list] [--dry-run] [--json]",
101
+ " workbench install HANDLE_OR_URL [--to codex|claude|local]... [--yes] [--dry-run] [--json]",
90
102
  "",
91
103
  "Installs published Workbench Cloud source into local agent targets.",
92
104
  "",
@@ -98,12 +110,18 @@ const COMMAND_HELP = {
98
110
  " workbench status [--dir DIR] [--json]",
99
111
  "",
100
112
  "Reports project, worktree, run, per-remote sync/publication, and auth state. --json emits the workbench.status.v1 dashboard.",
113
+ "",
114
+ "Example:",
115
+ " workbench status --json",
101
116
  ].join("\n"),
102
117
  logout: [
103
118
  "Usage:",
104
119
  " workbench logout [PROVIDER] [--json]",
105
120
  "",
106
121
  "With no provider, logs out of Workbench Cloud. With a provider such as codex or claude, removes local adapter auth.",
122
+ "",
123
+ "Example:",
124
+ " workbench logout claude",
107
125
  ].join("\n"),
108
126
  show: [
109
127
  "Usage:",
@@ -111,38 +129,54 @@ const COMMAND_HELP = {
111
129
  " workbench show REF:PATH [--json]",
112
130
  "",
113
131
  "Shows a Workbench object, lists files for file-backed objects, or prints one file.",
132
+ "",
133
+ "Example:",
134
+ " workbench show run_abc12345:result.json",
114
135
  ].join("\n"),
115
136
  log: [
116
137
  "Usage:",
117
138
  " workbench log [--runs|--versions] [--json]",
118
139
  "",
119
140
  "Shows one reverse-chronological timeline of versions and runs.",
141
+ "",
142
+ "Example:",
143
+ " workbench log --runs",
120
144
  ].join("\n"),
121
145
  diff: [
122
146
  "Usage:",
123
147
  " workbench diff [A..B] [--json]",
124
148
  "",
125
149
  "Shows changed files between two Workbench source versions.",
150
+ "",
151
+ "Example:",
152
+ " workbench diff 26059f9a..eac5699c",
126
153
  ].join("\n"),
127
154
  switch: [
128
155
  "Usage:",
129
156
  " workbench switch VERSION [--json]",
130
157
  "",
131
158
  "Switches the working skill source to a recorded Workbench version.",
159
+ "",
160
+ "Example:",
161
+ " workbench switch 26059f9a",
132
162
  ].join("\n"),
133
163
  open: [
134
164
  "Usage:",
135
- " workbench open [--host HOST] [--port PORT] [--no-open] [--json]",
165
+ " workbench open [--host HOST] [--port PORT] [--no-open]",
136
166
  "",
137
- "Serves or emits the read-only Workbench inspection snapshot.",
167
+ "Serves the read-only Workbench inspection UI.",
168
+ "",
169
+ "Example:",
170
+ " workbench open --no-open",
138
171
  ].join("\n"),
139
172
  case: [
140
173
  "Usage:",
141
- " workbench case list [--json]",
142
- " workbench case add [RUN_ID] [--json]",
143
- " workbench case rm ID [--json]",
174
+ " workbench case add RUN_ID [--json]",
175
+ "",
176
+ "Captures a regression case from a recorded run.",
144
177
  "",
145
- "Lists cases, creates a draft case, or removes a case.",
178
+ "Example:",
179
+ " workbench case add run_abc12345",
146
180
  ].join("\n"),
147
181
  agent: [
148
182
  "Usage:",
@@ -151,18 +185,27 @@ const COMMAND_HELP = {
151
185
  " workbench agent rm NAME [--json]",
152
186
  "",
153
187
  "Lists, adds, or removes eval agent configurations.",
188
+ "",
189
+ "Example:",
190
+ " workbench agent add claude --adapter claude --model sonnet",
154
191
  ].join("\n"),
155
192
  sync: [
156
193
  "Usage:",
157
194
  " workbench sync [REMOTE] [--dry-run] [--dir DIR] [--json]",
158
195
  "",
159
196
  "Plumbing command: synchronizes local evidence and version objects with a Workbench remote.",
197
+ "",
198
+ "Example:",
199
+ " workbench sync cloud --dry-run",
160
200
  ].join("\n"),
161
201
  publish: [
162
202
  "Usage:",
163
203
  " workbench publish [VERSION] [--as OWNER/SKILL] [--private|--team|--public] [--dry-run] [--dir DIR] [--json]",
164
204
  "",
165
205
  "Publishes installable skill source to Workbench Cloud. --as sets the linked OWNER/SKILL handle.",
206
+ "",
207
+ "Example:",
208
+ " workbench publish --as acme/earnings-prep --dry-run",
166
209
  ].join("\n"),
167
210
  login: [
168
211
  "Usage:",
@@ -170,14 +213,20 @@ const COMMAND_HELP = {
170
213
  " workbench logout [PROVIDER] [--json]",
171
214
  "",
172
215
  "Connects the CLI to Workbench Cloud or captures local adapter auth for a provider.",
216
+ "",
217
+ "Example:",
218
+ " workbench login --start-only --no-open",
173
219
  ].join("\n"),
174
220
  };
175
221
  const COMMON_FLAGS = {
176
222
  json: "boolean",
177
223
  };
224
+ const DIR_FLAG = {
225
+ dir: "string",
226
+ };
178
227
  const PROJECT_FLAGS = {
179
228
  ...COMMON_FLAGS,
180
- dir: "string",
229
+ ...DIR_FLAG,
181
230
  };
182
231
  const HELP_FLAG = {
183
232
  help: "boolean",
@@ -207,7 +256,7 @@ const COMMAND_FLAGS = {
207
256
  samples: "positive-integer",
208
257
  skills: "string",
209
258
  },
210
- install: { ...COMMON_FLAGS, ...HELP_FLAG, "dry-run": "boolean", list: "boolean", to: "repeat-string", yes: "boolean" },
259
+ install: { ...COMMON_FLAGS, ...HELP_FLAG, "dry-run": "boolean", to: "repeat-string", yes: "boolean" },
211
260
  log: { ...PROJECT_FLAGS, ...HELP_FLAG, runs: "boolean", versions: "boolean" },
212
261
  login: {
213
262
  ...COMMON_FLAGS,
@@ -224,7 +273,7 @@ const COMMAND_FLAGS = {
224
273
  },
225
274
  logout: { ...COMMON_FLAGS, ...HELP_FLAG },
226
275
  new: { ...PROJECT_FLAGS, ...HELP_FLAG },
227
- open: { ...PROJECT_FLAGS, ...HELP_FLAG, host: "string", "no-open": "boolean", port: "positive-integer" },
276
+ open: { ...DIR_FLAG, ...HELP_FLAG, host: "string", "no-open": "boolean", port: "port" },
228
277
  publish: {
229
278
  ...PROJECT_FLAGS,
230
279
  ...HELP_FLAG,
@@ -243,9 +292,7 @@ const COMMAND_FLAGS = {
243
292
  const SUBCOMMAND_FLAGS = {
244
293
  case: {
245
294
  flags: {
246
- list: { ...PROJECT_FLAGS, ...HELP_FLAG },
247
295
  add: { ...PROJECT_FLAGS, ...HELP_FLAG },
248
- rm: { ...PROJECT_FLAGS, ...HELP_FLAG },
249
296
  },
250
297
  },
251
298
  agent: {
@@ -315,26 +362,31 @@ export async function runCli(argv, io = {
315
362
  return emitEvalFailure(runs, failedRuns, artifactIds, parsed, io);
316
363
  }
317
364
  const deltas = await evalDeltas(core, runs);
318
- const nextCommands = evalSuccessNextCommands(runs);
365
+ const coverage = await evalCoverageSummaries(core, runs);
366
+ const next = await evalSuccessNextCommand(core, runs);
319
367
  return emitResult("workbench.cli.eval.v1", {
320
368
  result: runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
369
+ coverage: coverage,
321
370
  deltas: deltas,
322
- nextCommands: nextCommands,
371
+ next: next,
323
372
  }, parsed, io, () => [
324
373
  runs.map(formatRun).join("\n"),
374
+ ...coverage.map(formatEvalCoverage),
325
375
  ...deltas.map(formatEvalDelta),
326
- ...(nextCommands[0] ? [`next: ${nextCommands[0]}`] : []),
376
+ ...(next ? [`next: ${next}`] : []),
327
377
  ].filter(Boolean).join("\n"));
328
378
  }
329
379
  if (command === "improve") {
330
380
  if (parsed.flags.cloud === true) {
331
381
  return await handleCloudImprove(parsed, io);
332
382
  }
383
+ const improverAgent = await resolveLocalImproverAgent(parsed, core);
333
384
  const result = await improveWorkbenchSkill({
334
385
  ...core,
335
386
  version: optionalPositional(parsed, 1),
336
387
  skill: stringFlag(parsed, "skills"),
337
388
  agent: stringFlag(parsed, "agents"),
389
+ ...(improverAgent ? { improverAgent } : {}),
338
390
  budget: intFlag(parsed, "budget"),
339
391
  samples: intFlag(parsed, "samples"),
340
392
  });
@@ -350,12 +402,12 @@ export async function runCli(argv, io = {
350
402
  skills: stringFlag(parsed, "skills"),
351
403
  agents: stringFlag(parsed, "agents"),
352
404
  });
353
- return output(comparison, parsed, io, () => formatComparison(comparison));
405
+ return output(manifestOnly(comparison), parsed, io, () => formatComparison(comparison));
354
406
  }
355
407
  if (command === "switch") {
356
408
  const versionRef = requiredPositional(parsed, 1, "workbench switch requires VERSION.");
357
409
  const version = await switchWorkbenchVersion(versionRef, core);
358
- return output(versionSummary(version), parsed, io, () => `Switched to ${version.id}.`);
410
+ return output(versionSummary(version), parsed, io, () => `Switched to ${displayRef(version.id)}.`);
359
411
  }
360
412
  if (command === "diff") {
361
413
  const range = optionalPositional(parsed, 1) ?? await defaultDiffRange(core);
@@ -390,7 +442,7 @@ export async function runCli(argv, io = {
390
442
  }, parsed, io, () => `${result.dryRun ? "Would sync" : "Synced"} ${result.remote.name}: pushed ${result.pushed}, pulled ${result.pulled}${result.upToDate ? " (up to date)" : ""}.`);
391
443
  }
392
444
  if (command === "publish") {
393
- const preview = parsed.flags["dry-run"] === true && !stringFlag(parsed, "as")
445
+ const preview = parsed.flags["dry-run"] === true
394
446
  ? await previewPublishWithDerivedRemote(parsed)
395
447
  : undefined;
396
448
  if (preview) {
@@ -403,10 +455,7 @@ export async function runCli(argv, io = {
403
455
  pinnedInstallUrl: preview.pinnedInstallUrl,
404
456
  dryRun: true,
405
457
  }, parsed, io, () => [
406
- `Would publish ${preview.version.id} to remote ${preview.remote.name}.`,
407
- `Visibility: ${preview.visibility}`,
408
- `Install: ${preview.installUrl}`,
409
- `Pinned: ${preview.pinnedInstallUrl}`,
458
+ `Would publish ${displayRef(preview.version.id)} as ${preview.installHandle} (${preview.visibility}).`,
410
459
  `next: workbench install ${preview.installHandle}`,
411
460
  ].join("\n"));
412
461
  }
@@ -427,25 +476,18 @@ export async function runCli(argv, io = {
427
476
  pinnedInstallUrl: result.pinnedInstallUrl,
428
477
  ...(result.dryRun ? { dryRun: true } : {}),
429
478
  }, parsed, io, () => [
430
- `${result.dryRun ? "Would publish" : "Published"} ${result.version.id} to remote ${result.remote.name}.`,
431
- `Visibility: ${result.visibility}`,
432
- `Install: ${result.installUrl}`,
433
- `Pinned: ${result.pinnedInstallUrl}`,
479
+ `${result.dryRun ? "Would publish" : "Published"} ${displayRef(result.version.id)} as ${result.installHandle} (${result.visibility}).`,
434
480
  `next: workbench install ${result.installHandle}`,
435
481
  ].join("\n"));
436
482
  }
437
483
  if (command === "open") {
438
- if (parsed.flags.json === true) {
439
- const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
440
- return output(snapshot, parsed, io, () => "Read-only Workbench inspection data is available with --json.");
441
- }
442
484
  // The browser server serves committed object state through a read-only
443
485
  // snapshot path, so long-running commands do not block page loads.
444
486
  const server = await startWorkbenchOpenServer({
445
487
  dir: dirFlag(parsed),
446
488
  authToken: core.authToken,
447
489
  host: stringFlag(parsed, "host"),
448
- port: intFlag(parsed, "port"),
490
+ port: portFlag(parsed, "port"),
449
491
  });
450
492
  io.stdout.write(`Workbench: ${server.url}\n`);
451
493
  if (parsed.flags["no-open"] !== true) {
@@ -460,16 +502,18 @@ export async function runCli(argv, io = {
460
502
  }
461
503
  }
462
504
  async function handleStatus(parsed, io) {
463
- const status = await workbenchStatusSnapshot(await coreOptions(parsed));
505
+ const core = await coreOptions(parsed);
506
+ const status = await workbenchStatusSnapshot(core);
464
507
  const auth = await workbenchCliAuthStatus();
508
+ const cliStatus = await statusWithCausalNext(status, auth, core);
465
509
  return emitResult("workbench.status.v1", {
466
- project: status.project,
467
- worktree: status.worktree,
468
- runs: status.runs,
469
- remotes: status.remotes,
510
+ project: cliStatus.project,
511
+ worktree: cliStatus.worktree,
512
+ runs: cliStatus.runs,
513
+ remotes: cliStatus.remotes,
470
514
  auth: auth,
471
- next: status.next,
472
- }, parsed, io, () => formatStatusSnapshot({ ...status, auth }));
515
+ next: cliStatus.next,
516
+ }, parsed, io, () => formatStatusSnapshot({ ...cliStatus, auth }));
473
517
  }
474
518
  async function handleLog(parsed, io) {
475
519
  if (parsed.flags.runs === true && parsed.flags.versions === true) {
@@ -491,7 +535,7 @@ async function handleLog(parsed, io) {
491
535
  remediation: "Run workbench log, workbench log --runs, or workbench log --versions.",
492
536
  });
493
537
  }
494
- const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(await coreOptions(parsed));
538
+ const snapshot = await createWorkbenchInspectionSnapshot(await coreOptions(parsed));
495
539
  const includeRuns = parsed.flags.versions !== true;
496
540
  const includeVersions = parsed.flags.runs !== true;
497
541
  const entries = [
@@ -510,7 +554,7 @@ async function handleLog(parsed, io) {
510
554
  versionId: run.versionId,
511
555
  skillName: run.skillName,
512
556
  agentName: run.agentName,
513
- ...(run.score !== undefined ? { score: run.score } : {}),
557
+ ...(scoredRunValue(run) !== undefined ? { score: scoredRunValue(run) } : {}),
514
558
  })) : []),
515
559
  ].sort((left, right) => right.createdAt.localeCompare(left.createdAt));
516
560
  return emitResult("workbench.cli.log.v1", {
@@ -534,21 +578,25 @@ async function handleShow(parsed, io) {
534
578
  return output(value, parsed, io, () => formatShow(value));
535
579
  }
536
580
  const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
537
- const version = snapshot.versions.find((entry) => entry.id === objectRef);
581
+ const version = snapshotVersionByRef(snapshot, objectRef);
538
582
  if (version) {
539
583
  return output(fileListing("version", version.id, version.files), parsed, io, () => formatFileListing("version", version.id, version.files));
540
584
  }
541
- const trace = snapshot.traces.find((entry) => entry.id === objectRef);
585
+ const trace = snapshotObjectByRef(snapshot.traces, objectRef, "trace");
542
586
  if (trace) {
543
587
  return output(fileListing("trace", trace.id, trace.files), parsed, io, () => formatFileListing("trace", trace.id, trace.files));
544
588
  }
545
- const artifact = snapshot.artifacts.find((entry) => entry.id === objectRef);
589
+ const artifact = snapshotObjectByRef(snapshot.artifacts, objectRef, "artifact");
546
590
  if (artifact) {
547
591
  return output(fileListing("artifact", artifact.id, artifact.files), parsed, io, () => formatFileListing("artifact", artifact.id, artifact.files));
548
592
  }
549
593
  const details = evidenceDetailsForRunOrJob(snapshot, objectRef);
550
- if (details.length > 0) {
551
- return output(details, parsed, io, () => details.map(formatTraceDetail).join("\n"));
594
+ const evidenceFiles = evidenceFilesForRunOrJob(snapshot, objectRef);
595
+ if (details.length > 0 || evidenceFiles.length > 0) {
596
+ return output({
597
+ details: details.map(evidenceDetailSummary),
598
+ files: evidenceFiles.map(fileSummary),
599
+ }, parsed, io, () => formatRunOrJobEvidence(details, evidenceFiles));
552
600
  }
553
601
  const value = await showWorkbenchRef(ref, core);
554
602
  return output(value, parsed, io, () => formatShow(value));
@@ -581,22 +629,19 @@ async function handleAgent(parsed, io) {
581
629
  throw new WorkbenchUserError(`Unsupported agent command: ${subcommand}`);
582
630
  }
583
631
  async function handleCase(parsed, io) {
584
- const subcommand = requiredPositional(parsed, 1, "workbench case requires list|add|rm.");
585
- if (subcommand === "list") {
586
- const cases = await listWorkbenchCases(await coreOptions(parsed));
587
- return output(cases, parsed, io, () => cases.map((entry) => `${entry.id}\t${entry.path}`).join("\n") || "No cases.");
588
- }
632
+ const subcommand = requiredPositional(parsed, 1, "workbench case requires add.");
589
633
  if (subcommand === "add") {
590
634
  const core = await coreOptions(parsed);
591
- const sourceRef = optionalPositional(parsed, 2);
592
- const record = await addWorkbenchCase({ ...core, fromTraceId: sourceRef ? await traceIdForCaseSource(core, sourceRef) : undefined });
593
- return output(record, parsed, io, () => `Added case ${record.id}.`);
594
- }
595
- if (subcommand === "rm") {
596
- const result = await removeWorkbenchCase(requiredPositional(parsed, 2, "workbench case rm requires CASE_ID."), await coreOptions(parsed));
597
- return output(result, parsed, io, () => `Removed case ${result.removed}.`);
635
+ const sourceRef = requiredPositional(parsed, 2, "workbench case add requires RUN_ID.");
636
+ rejectExtraInput(parsed, {
637
+ maxPositionals: 3,
638
+ message: "workbench case add accepts one RUN_ID argument.",
639
+ remediation: "Run workbench case add RUN_ID.",
640
+ });
641
+ const record = await addWorkbenchCase({ ...core, fromTraceId: await traceIdForCaseSource(core, sourceRef) });
642
+ return output(record, parsed, io, () => `Added draft case ${record.id}. Edit .workbench/cases/${record.path}/case.yaml before using it as score evidence.`);
598
643
  }
599
- throw new WorkbenchUserError(`Unsupported case command: ${subcommand}`);
644
+ throw new WorkbenchUserError(`Unknown command: workbench case ${subcommand}`);
600
645
  }
601
646
  async function handleAdapterLogin(provider, parsed, io) {
602
647
  const target = parseAuthTarget(provider, authProfileFlag(parsed));
@@ -685,11 +730,14 @@ function validateFlagValue(name, value, kind) {
685
730
  if (typeof value !== "string" || !value.trim()) {
686
731
  throw new WorkbenchUserError(`--${name} requires a value.`);
687
732
  }
688
- if (kind === "positive-integer") {
733
+ if (kind === "positive-integer" || kind === "port") {
689
734
  const parsedValue = Number(value);
690
- if (!Number.isInteger(parsedValue) || parsedValue <= 0) {
735
+ if (kind === "positive-integer" && (!Number.isInteger(parsedValue) || parsedValue <= 0)) {
691
736
  throw new WorkbenchUserError(`--${name} must be a positive integer.`);
692
737
  }
738
+ if (kind === "port" && (!Number.isInteger(parsedValue) || parsedValue < 0 || parsedValue > 65535)) {
739
+ throw new WorkbenchUserError(`--${name} must be an integer between 0 and 65535.`);
740
+ }
693
741
  }
694
742
  }
695
743
  const CONFIG_SCHEMA = "workbench.cli.config.v1";
@@ -714,7 +762,7 @@ async function handleLogin(parsed, io) {
714
762
  }
715
763
  if (parsed.flags["start-only"] === true && parsed.flags.wait === true) {
716
764
  throw new WorkbenchCodedError("usage", "workbench login accepts only one of --start-only or --wait.", {
717
- remediation: "Run workbench login --start-only or workbench login --wait --timeout 120.",
765
+ remediation: "Run workbench login --start-only or workbench login --wait.",
718
766
  exitCode: 2,
719
767
  });
720
768
  }
@@ -723,22 +771,17 @@ async function handleLogin(parsed, io) {
723
771
  const timeoutSeconds = intFlag(parsed, "timeout");
724
772
  if (startOnly && timeoutSeconds !== undefined) {
725
773
  throw new WorkbenchCodedError("usage", "workbench login --timeout only applies with --wait.", {
726
- remediation: "Run workbench login --start-only, then workbench login --wait --timeout 120.",
727
- exitCode: 2,
728
- });
729
- }
730
- if (waitOnly && timeoutSeconds === undefined) {
731
- throw new WorkbenchCodedError("usage", "workbench login --wait requires --timeout N.", {
732
- remediation: "Run workbench login --wait --timeout 120.",
774
+ remediation: "Run workbench login --start-only, then workbench login --wait.",
733
775
  exitCode: 2,
734
776
  });
735
777
  }
736
778
  const config = await loadConfig();
737
- const baseUrl = selectWorkbenchBaseUrl({
738
- explicitBaseUrl: stringFlag(parsed, "base-url"),
779
+ const explicitBaseUrl = stringFlag(parsed, "base-url");
780
+ const pending = waitOnly ? await readPendingDeviceAuthorization(explicitBaseUrl) : null;
781
+ const baseUrl = pending?.baseUrl ?? selectWorkbenchBaseUrl({
782
+ explicitBaseUrl,
739
783
  configBaseUrl: config.baseUrl,
740
784
  });
741
- const pending = waitOnly ? await readPendingDeviceAuthorization(baseUrl) : null;
742
785
  const record = pending ?? await startDeviceAuthorization(baseUrl);
743
786
  const freshAuthorization = pending === null;
744
787
  if (startOnly) {
@@ -753,8 +796,8 @@ async function handleLogin(parsed, io) {
753
796
  verificationUriComplete: record.verification_uri_complete,
754
797
  userCode: record.user_code,
755
798
  expiresAt: record.expiresAt,
756
- resume: "workbench login --wait --timeout 120",
757
- }, parsed, io, () => `Open ${record.verification_uri_complete}\nCode: ${record.user_code}\nResume: workbench login --wait --timeout 120`);
799
+ resume: "workbench login --wait",
800
+ }, parsed, io, () => `Open ${record.verification_uri_complete}\nCode: ${record.user_code}\nResume: workbench login --wait`);
758
801
  }
759
802
  await writePendingDeviceAuthorization(record);
760
803
  if (freshAuthorization && !parsed.flags.json) {
@@ -801,9 +844,6 @@ async function handleLogout(parsed, io) {
801
844
  const config = await loadConfig();
802
845
  const baseUrl = optionalWorkbenchBaseUrl({ configBaseUrl: config.baseUrl });
803
846
  const tokenPresent = Boolean(config.accessToken);
804
- if (tokenPresent && !baseUrl) {
805
- throw new WorkbenchUserError("Missing Workbench API URL. Set WORKBENCH_API_URL or run `workbench login --base-url URL`.");
806
- }
807
847
  let revoke = "skipped";
808
848
  if (config.accessToken && baseUrl) {
809
849
  try {
@@ -856,18 +896,6 @@ async function handleInstall(parsed, io) {
856
896
  const snapshot = await fetchWorkbenchInstallSourceSnapshot(workbenchSource, source);
857
897
  const sourceSummary = workbenchInstallSourceSummary(workbenchSource, snapshot);
858
898
  const config = await loadConfig();
859
- if (parsed.flags.list === true) {
860
- return emitResult("workbench.cli.install.v1", {
861
- source: sourceSummary,
862
- skills: [snapshot.name],
863
- fileCount: snapshot.files.length,
864
- targets: installTargetsToJson(supportedInstallTargets()),
865
- }, parsed, io, () => [
866
- `${snapshot.name}\t${snapshot.versionId}\tfiles=${snapshot.files.length}`,
867
- "Targets:",
868
- ...supportedInstallTargets().map((target) => ` ${target.agent}\t${target.destination}`),
869
- ].join("\n"));
870
- }
871
899
  const toTargets = stringsFlag(parsed, "to");
872
900
  const selectedTargets = toTargets.length > 0 ? normalizeInstallTargetNames(toTargets) : await defaultInstallTargetNames(config);
873
901
  const targets = resolveInstallTargets({
@@ -898,29 +926,57 @@ async function handleInstall(parsed, io) {
898
926
  ].join("\n"));
899
927
  }
900
928
  async function handleCloudEval(parsed, io) {
901
- const started = await startCloudExecution("eval", parsed);
929
+ const started = await startCloudExecution("eval", parsed, io);
902
930
  const artifactIds = await artifactIdsByRunId(started.core, started.runs);
931
+ if (started.detached) {
932
+ const next = cloudDetachedNextCommand(started.runs);
933
+ emitResult("workbench.cli.eval.v1", {
934
+ result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
935
+ detached: true,
936
+ next: next,
937
+ cloud: cloudExecutionSummary(started),
938
+ }, parsed, io, () => [
939
+ `Detached from hosted eval on ${started.remote.url}.`,
940
+ started.runs.map(formatRun).join("\n"),
941
+ ...(next ? [`next: ${next}`] : []),
942
+ ].filter(Boolean).join("\n"));
943
+ return 130;
944
+ }
903
945
  const failedRuns = started.runs.filter((run) => run.status === "failed" || run.status === "canceled");
904
946
  if (failedRuns.length > 0) {
905
947
  return emitEvalFailure(started.runs, failedRuns, artifactIds, parsed, io);
906
948
  }
907
949
  const deltas = await evalDeltas(started.core, started.runs);
908
- const nextCommands = cloudEvalNextCommands(started.runs);
950
+ const next = await evalSuccessNextCommand(started.core, started.runs);
909
951
  return emitResult("workbench.cli.eval.v1", {
910
952
  result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
911
953
  deltas: deltas,
912
- nextCommands: nextCommands,
954
+ next: next,
913
955
  cloud: cloudExecutionSummary(started),
914
956
  }, parsed, io, () => [
915
957
  `Completed hosted eval on ${started.remote.url}.`,
916
958
  started.runs.map(formatRun).join("\n"),
917
959
  ...deltas.map(formatEvalDelta),
918
- ...(nextCommands[0] ? [`next: ${nextCommands[0]}`] : []),
960
+ ...(next ? [`next: ${next}`] : []),
919
961
  ].filter(Boolean).join("\n"));
920
962
  }
921
963
  async function handleCloudImprove(parsed, io) {
922
- const started = await startCloudExecution("improve", parsed);
964
+ const started = await startCloudExecution("improve", parsed, io);
923
965
  const artifactIds = await artifactIdsByRunId(started.core, started.runs);
966
+ if (started.detached) {
967
+ const next = cloudDetachedNextCommand(started.runs);
968
+ emitResult("workbench.cli.improve.v1", {
969
+ result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
970
+ detached: true,
971
+ next: next,
972
+ cloud: cloudExecutionSummary(started),
973
+ }, parsed, io, () => [
974
+ `Detached from hosted improve on ${started.remote.url}.`,
975
+ started.runs.map(formatRun).join("\n"),
976
+ ...(next ? [`next: ${next}`] : []),
977
+ ].filter(Boolean).join("\n"));
978
+ return 130;
979
+ }
924
980
  const failedRuns = started.runs.filter((run) => run.status === "failed" || run.status === "canceled");
925
981
  if (failedRuns.length > 0) {
926
982
  const first = failedRuns[0];
@@ -934,17 +990,17 @@ async function handleCloudImprove(parsed, io) {
934
990
  });
935
991
  }
936
992
  const switchedVersionId = await switchHostedImproveVersionIfPromoted(started);
937
- const nextCommands = cloudImproveNextCommands(started.runs);
993
+ const next = cloudImproveNextCommand(started.runs);
938
994
  return emitResult("workbench.cli.improve.v1", {
939
995
  result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
940
- nextCommands: nextCommands,
996
+ next: next,
941
997
  cloud: cloudExecutionSummary(started),
942
998
  ...(switchedVersionId ? { switchedVersionId } : {}),
943
999
  }, parsed, io, () => [
944
1000
  `Completed hosted improve on ${started.remote.url}.`,
945
1001
  started.runs.map(formatRun).join("\n"),
946
- ...(switchedVersionId ? [`Switched local source to ${switchedVersionId}.`] : []),
947
- ...(nextCommands[0] ? [`next: ${nextCommands[0]}`] : []),
1002
+ ...(switchedVersionId ? [`Switched local source to ${displayRef(switchedVersionId)}.`] : []),
1003
+ ...(next ? [`next: ${next}`] : []),
948
1004
  ].filter(Boolean).join("\n"));
949
1005
  }
950
1006
  async function defaultInstallTargetNames(config) {
@@ -986,7 +1042,7 @@ async function pathExists(filePath) {
986
1042
  return false;
987
1043
  }
988
1044
  }
989
- async function startCloudExecution(command, parsed) {
1045
+ async function startCloudExecution(command, parsed, io) {
990
1046
  const root = dirFlag(parsed) ?? process.cwd();
991
1047
  const remote = await ensureCloudRemoteForExecution(root, parsed);
992
1048
  const source = parseWorkbenchInstallSource(remote.url);
@@ -1005,7 +1061,9 @@ async function startCloudExecution(command, parsed) {
1005
1061
  });
1006
1062
  }
1007
1063
  const core = { dir: root, authToken: token };
1064
+ writeCloudProgress(io, `workbench cloud: syncing ${remote.name} before hosted ${command}.`);
1008
1065
  const syncBefore = await syncWorkbenchRemote({ ...core, remote: remote.name });
1066
+ writeCloudProgress(io, `workbench cloud: synced ${remote.name} before hosted ${command} (pushed=${syncBefore.pushed}, pulled=${syncBefore.pulled}, up-to-date=${syncBefore.upToDate}).`);
1009
1067
  const startSnapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
1010
1068
  const skillId = await resolveCloudSkillId(source);
1011
1069
  const response = await apiRequest(`/api/workbench/skills/${encodeURIComponent(skillId)}${command === "improve" ? "/improve" : "/runs"}`, { method: "POST", body: cloudExecutionRequestBody(command, parsed) }, source.baseUrl);
@@ -1018,9 +1076,15 @@ async function startCloudExecution(command, parsed) {
1018
1076
  exitCode: 1,
1019
1077
  });
1020
1078
  }
1079
+ const initialRunIds = runs.map((run) => run.id);
1080
+ writeCloudProgress(io, `workbench cloud: scheduled hosted ${command} on ${remote.url} (${formatCloudRunStatuses(runs)}).`);
1021
1081
  const initialSyncAfter = await syncWorkbenchRemote({ ...core, remote: remote.name });
1082
+ writeCloudProgress(io, `workbench cloud: synced after scheduling hosted ${command} (pushed=${initialSyncAfter.pushed}, pulled=${initialSyncAfter.pulled}, up-to-date=${initialSyncAfter.upToDate}).`);
1083
+ writeCloudProgress(io, `workbench cloud: waiting for terminal status; press Ctrl-C to detach and resume with workbench status or workbench show ${displayRef(initialRunIds[0] ?? "run")}.`);
1022
1084
  const completed = await waitForCloudRuns({
1085
+ command,
1023
1086
  core,
1087
+ io,
1024
1088
  remote,
1025
1089
  runs,
1026
1090
  initialSync: initialSyncAfter,
@@ -1029,7 +1093,9 @@ async function startCloudExecution(command, parsed) {
1029
1093
  core,
1030
1094
  remote,
1031
1095
  skillId,
1096
+ initialRunIds,
1032
1097
  runs: completed.runs,
1098
+ ...(completed.detached ? { detached: true } : {}),
1033
1099
  startVersionId: startSnapshot.status.currentVersionId ?? startSnapshot.refs.current,
1034
1100
  source,
1035
1101
  sync: {
@@ -1053,27 +1119,59 @@ async function waitForCloudRuns(input) {
1053
1119
  const timeoutMs = positiveIntEnv("WORKBENCH_CLOUD_RUN_TIMEOUT_MS") ?? CLOUD_RUN_TIMEOUT_MS;
1054
1120
  const pollIntervalMs = positiveIntEnv("WORKBENCH_CLOUD_RUN_POLL_INTERVAL_MS") ?? CLOUD_RUN_POLL_INTERVAL_MS;
1055
1121
  const deadline = Date.now() + timeoutMs;
1056
- while (true) {
1057
- const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(input.core);
1058
- const runs = runIds
1059
- .map((id) => snapshot.runs.find((entry) => entry.id === id))
1060
- .filter((run) => Boolean(run));
1061
- if (runs.length === runIds.length && runs.every(isTerminalRun)) {
1062
- return { runs, sync };
1063
- }
1064
- if (Date.now() >= deadline) {
1065
- throw new WorkbenchCodedError("cloud_run_pending", "Hosted Workbench run is still running.", {
1066
- retryable: true,
1067
- remediation: runIds[0] ? `Run workbench show ${runIds[0]}.` : "Run workbench log --runs.",
1068
- subject: {
1069
- runIds,
1070
- statuses: Object.fromEntries(runs.map((run) => [run.id, run.status])),
1071
- },
1072
- exitCode: 1,
1073
- });
1122
+ let runs = [...input.runs];
1123
+ let interrupted = false;
1124
+ const onSigint = () => {
1125
+ interrupted = true;
1126
+ writeCloudProgress(input.io, `workbench cloud: detaching from hosted ${input.command} (${runIds.map(displayRef).join(", ")}).`);
1127
+ };
1128
+ process.once("SIGINT", onSigint);
1129
+ const seenStatuses = new Map();
1130
+ try {
1131
+ while (true) {
1132
+ const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(input.core);
1133
+ const snapshotRuns = runIds
1134
+ .map((id) => snapshot.runs.find((entry) => entry.id === id))
1135
+ .filter((run) => Boolean(run));
1136
+ if (snapshotRuns.length > 0) {
1137
+ runs = runIds.map((id) => snapshotRuns.find((entry) => entry.id === id) ?? runs.find((entry) => entry.id === id))
1138
+ .filter((run) => Boolean(run));
1139
+ }
1140
+ for (const run of runs) {
1141
+ const previous = seenStatuses.get(run.id);
1142
+ if (previous !== run.status) {
1143
+ seenStatuses.set(run.id, run.status);
1144
+ writeCloudProgress(input.io, `workbench cloud: ${displayRef(run.id)} is ${run.status}.`);
1145
+ }
1146
+ }
1147
+ if (runs.length === runIds.length && runs.every(isTerminalRun)) {
1148
+ writeCloudProgress(input.io, `workbench cloud: hosted ${input.command} finished (${formatCloudRunStatuses(runs)}).`);
1149
+ return { runs, sync };
1150
+ }
1151
+ if (interrupted) {
1152
+ return { runs, sync, detached: true };
1153
+ }
1154
+ if (Date.now() >= deadline) {
1155
+ throw new WorkbenchCodedError("cloud_run_pending", "Hosted Workbench run is still running.", {
1156
+ retryable: true,
1157
+ remediation: runIds[0] ? `Run workbench show ${runIds[0]}.` : "Run workbench log --runs.",
1158
+ subject: {
1159
+ runIds,
1160
+ statuses: Object.fromEntries(runs.map((run) => [run.id, run.status])),
1161
+ },
1162
+ exitCode: 1,
1163
+ });
1164
+ }
1165
+ await sleep(pollIntervalMs);
1166
+ if (interrupted) {
1167
+ return { runs, sync, detached: true };
1168
+ }
1169
+ sync = await syncWorkbenchRemote({ ...input.core, remote: input.remote.name });
1170
+ writeCloudProgress(input.io, `workbench cloud: synced ${input.remote.name} while waiting (${formatCloudRunStatuses(runs)}).`);
1074
1171
  }
1075
- await sleep(pollIntervalMs);
1076
- sync = await syncWorkbenchRemote({ ...input.core, remote: input.remote.name });
1172
+ }
1173
+ finally {
1174
+ process.off("SIGINT", onSigint);
1077
1175
  }
1078
1176
  }
1079
1177
  function isTerminalRun(run) {
@@ -1197,30 +1295,41 @@ function cloudExecutionRequestBody(command, parsed) {
1197
1295
  ...(command === "improve" ? { budget: intFlag(parsed, "budget") } : {}),
1198
1296
  };
1199
1297
  }
1200
- function cloudEvalNextCommands(runs) {
1201
- return cloudExecutionNextCommands(runs, "workbench publish");
1298
/**
 * Picks the follow-up command to print after a hosted improve.
 *
 * Delegates to cloudExecutionNextCommand, supplying "workbench eval" as the
 * command to use when the first run does not need inspection.
 *
 * @param {Array<object>} runs - Hosted runs returned for the improve request.
 * @returns {string} The command to print after `next:`.
 */
function cloudImproveNextCommand(runs) {
  const successCommand = "workbench eval";
  return cloudExecutionNextCommand(runs, successCommand);
}
1203
- function cloudImproveNextCommands(runs) {
1204
- return cloudExecutionNextCommands(runs, "workbench eval");
1301
/**
 * Picks the command that lets the user resume after detaching from a hosted run.
 *
 * @param {Array<object>} runs - Runs known at the moment of detaching.
 * @returns {string} `workbench show <short id>` for the first run when it has
 *   an id, otherwise `workbench status`.
 */
function cloudDetachedNextCommand(runs) {
  const leadRun = runs[0];
  if (!leadRun?.id) {
    return "workbench status";
  }
  return `workbench show ${displayRef(leadRun.id)}`;
}
1206
- function cloudExecutionNextCommands(runs, successCommand) {
1305
/**
 * Chooses the single follow-up command for a hosted execution.
 *
 * @param {Array<object>} runs - Hosted runs; only the first one is inspected.
 * @param {string} successCommand - Command to suggest when the first run is
 *   neither running, failed, nor canceled.
 * @returns {string} `workbench log --runs` when there are no runs,
 *   `workbench show <short id>` when the first run is running/failed/canceled,
 *   otherwise `successCommand`.
 */
function cloudExecutionNextCommand(runs, successCommand) {
  const leadRun = runs[0];
  if (!leadRun) {
    return "workbench log --runs";
  }
  const needsInspection = ["running", "failed", "canceled"].includes(leadRun.status);
  return needsInspection
    ? `workbench show ${displayRef(leadRun.id)}`
    : successCommand;
}
1216
1315
  function cloudExecutionSummary(started) {
1217
1316
  return {
1218
1317
  remote: started.remote.name,
1219
1318
  url: started.remote.url,
1220
1319
  skillId: started.skillId,
1320
+ initialRunIds: started.initialRunIds,
1321
+ ...(started.detached ? { detached: true } : {}),
1221
1322
  sync: started.sync,
1222
1323
  };
1223
1324
  }
1325
/**
 * Writes one progress line to the stderr stream of the supplied io object.
 *
 * @param {{ stderr: { write: Function } }} io - IO streams for the current command.
 * @param {string} message - Progress text; a trailing newline is appended.
 * @returns {void}
 */
function writeCloudProgress(io, message) {
  const line = `${message}\n`;
  io.stderr.write(line);
}
1328
/**
 * Renders hosted runs as a comma-separated `shortId:status` list.
 *
 * @param {Array<{ id: string, status: string }>} runs - Runs to describe.
 * @returns {string} The joined list, or "no runs" when the array is empty.
 */
function formatCloudRunStatuses(runs) {
  if (runs.length === 0) {
    return "no runs";
  }
  const parts = [];
  for (const run of runs) {
    parts.push(`${displayRef(run.id)}:${run.status}`);
  }
  return parts.join(", ");
}
1224
1333
  function workbenchInstallSourceSummary(source, snapshot) {
1225
1334
  const installUrl = `${source.baseUrl}/skills/${encodeURIComponent(source.owner)}/${encodeURIComponent(source.skill)}`;
1226
1335
  return {
@@ -1294,12 +1403,13 @@ async function fetchWorkbenchInstallSourceSnapshot(source, displaySource) {
1294
1403
  throw new WorkbenchCodedError("auth_required", token
1295
1404
  ? `Workbench Cloud rejected the provided token while installing ${displaySource}.`
1296
1405
  : `Authentication is required to install ${displaySource}.`, {
1297
- remediation: `Run workbench login --base-url ${source.baseUrl}.`,
1406
+ remediation: "Run workbench login.",
1298
1407
  exitCode: 1,
1299
1408
  });
1300
1409
  }
1301
1410
  if (!response.ok) {
1302
- throw new WorkbenchCodedError("install_failed", `Unable to download Workbench source ${displaySource}: ${response.status} ${readResponseError(text) ?? response.statusText}`, {
1411
+ const excerpt = readResponseError(text);
1412
+ throw new WorkbenchCodedError("install_failed", `Unable to download Workbench source ${displaySource}: ${response.status}${excerpt ? ` ${excerpt}` : response.statusText ? ` ${response.statusText}` : ""}`, {
1303
1413
  subject: { source: displaySource, status: response.status },
1304
1414
  exitCode: 1,
1305
1415
  });
@@ -1420,18 +1530,15 @@ function deviceAuthPath() {
1420
1530
  return process.env.WORKBENCH_DEVICE_AUTH?.trim() || path.join(path.dirname(configPath()), "device-auth.json");
1421
1531
  }
1422
1532
  function selectWorkbenchBaseUrl(input = {}) {
1423
- const baseUrl = optionalWorkbenchBaseUrl(input);
1424
- if (!baseUrl) {
1425
- throw new WorkbenchUserError("Missing Workbench API URL. Pass --base-url URL, set WORKBENCH_API_URL, or run `workbench login --base-url URL`.");
1426
- }
1427
- return baseUrl;
1533
+ return optionalWorkbenchBaseUrl(input);
1428
1534
  }
1429
1535
  function optionalWorkbenchBaseUrl(input = {}) {
1430
1536
  const value = input.explicitBaseUrl ??
1431
1537
  input.originBaseUrl ??
1432
1538
  process.env.WORKBENCH_API_URL ??
1433
- input.configBaseUrl;
1434
- return value ? normalizeBaseUrl(value) : undefined;
1539
+ input.configBaseUrl ??
1540
+ DEFAULT_WORKBENCH_CLOUD_BASE_URL;
1541
+ return normalizeBaseUrl(value);
1435
1542
  }
1436
1543
  function normalizeBaseUrl(value) {
1437
1544
  return value.trim().replace(/\/+$/u, "");
@@ -1449,7 +1556,8 @@ async function requestDeviceAuthorization(baseUrl) {
1449
1556
  });
1450
1557
  }
1451
1558
  if (!response.ok) {
1452
- throw new WorkbenchCodedError("login_denied", `Device login failed: ${readResponseError(text) ?? response.statusText}`, {
1559
+ const excerpt = readResponseError(text);
1560
+ throw new WorkbenchCodedError("login_denied", `Device login failed: ${response.status}${excerpt ? ` ${excerpt}` : response.statusText ? ` ${response.statusText}` : ""}`, {
1453
1561
  exitCode: 1,
1454
1562
  });
1455
1563
  }
@@ -1500,7 +1608,7 @@ async function pollDeviceToken(baseUrl, authorization, timeoutSeconds) {
1500
1608
  }
1501
1609
  throw new WorkbenchCodedError("login_pending", "Device login is still waiting for browser authorization.", {
1502
1610
  retryable: true,
1503
- remediation: "Authorize the device in the browser, then run workbench login --wait --timeout 120.",
1611
+ remediation: "Authorize the device in the browser, then run workbench login --wait.",
1504
1612
  subject: {
1505
1613
  retryAfterSeconds: Math.max(1, Math.ceil(intervalMs / 1000)),
1506
1614
  verificationUri: authorization.verification_uri,
@@ -1524,7 +1632,8 @@ async function fetchWorkbenchUsername(baseUrl, accessToken) {
1524
1632
  }
1525
1633
  async function readPendingDeviceAuthorization(baseUrl) {
1526
1634
  const record = await readDeviceAuthorizationJson(deviceAuthPath());
1527
- if (!record || record.baseUrl !== baseUrl || Date.parse(record.expiresAt) <= Date.now()) {
1635
+ const expectedBaseUrl = baseUrl ? normalizeBaseUrl(baseUrl) : undefined;
1636
+ if (!record || (expectedBaseUrl && record.baseUrl !== expectedBaseUrl) || Date.parse(record.expiresAt) <= Date.now()) {
1528
1637
  return null;
1529
1638
  }
1530
1639
  return record;
@@ -1614,7 +1723,8 @@ async function apiRequest(apiPath, options = {}, baseUrlOverride) {
1614
1723
  }
1615
1724
  throw requestError;
1616
1725
  }
1617
- const requestError = new WorkbenchApiRequestError(response.status, readResponseError(text) ?? `Request failed with status ${response.status}${response.statusText ? ` ${response.statusText}` : ""}.`, text);
1726
+ const excerpt = readResponseError(text);
1727
+ const requestError = new WorkbenchApiRequestError(response.status, `Request failed with status ${response.status}${response.statusText ? ` ${response.statusText}` : ""}${excerpt ? `: ${excerpt}` : ""}.`, text);
1618
1728
  lastError = requestError;
1619
1729
  if (canRetry && attempt < API_REQUEST_MAX_ATTEMPTS && isTransientApiRequestError(requestError)) {
1620
1730
  await sleep(250 * attempt);
@@ -1707,12 +1817,22 @@ function readResponseError(text) {
1707
1817
  const parsed = JSON.parse(text);
1708
1818
  const record = asRecord(parsed);
1709
1819
  const error = record?.error ?? record?.message;
1710
- return typeof error === "string" && error.trim() ? error : null;
1820
+ return typeof error === "string" && error.trim() ? oneLineExcerpt(error) : null;
1711
1821
  }
1712
1822
  catch {
1713
- return text.trim() || null;
1823
+ if (/<(?:!doctype|html|head|body)\b/iu.test(text)) {
1824
+ return null;
1825
+ }
1826
+ return oneLineExcerpt(text);
1714
1827
  }
1715
1828
  }
1829
/**
 * Collapses text to a single line and caps it at 180 UTF-16 code units.
 *
 * Every whitespace run becomes one space and the result is trimmed. Longer
 * text is cut to at most 177 code units followed by "...".
 *
 * @param {string} text - Raw text, for example an HTTP error body.
 * @returns {string|null} The excerpt, or null when nothing but whitespace remains.
 */
function oneLineExcerpt(text) {
  const line = text.replace(/\s+/gu, " ").trim();
  if (!line) {
    return null;
  }
  if (line.length <= 180) {
    return line;
  }
  let cut = 177;
  // Do not end on a high surrogate: cutting between the two halves of an
  // astral character would leave a lone surrogate in the output.
  const lastUnit = line.charCodeAt(cut - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    cut -= 1;
  }
  return `${line.slice(0, cut)}...`;
}
1716
1836
  function parseWorkbenchCloudErrorBody(text) {
1717
1837
  try {
1718
1838
  const record = asRecord(JSON.parse(text));
@@ -2168,6 +2288,17 @@ function intFlag(parsed, name) {
2168
2288
  }
2169
2289
  return parsedValue;
2170
2290
  }
2291
/**
 * Reads a TCP port number from a string flag.
 *
 * Only plain decimal digits (surrounding whitespace allowed) are accepted.
 * `Number()` alone would also accept whitespace-only input (as 0) and
 * hex/binary/exponent spellings such as "0x50" or "1e3", none of which match
 * the "integer between 0 and 65535" contract stated in the error message.
 *
 * @param {object} parsed - Parsed command line.
 * @param {string} name - Flag name without the leading dashes.
 * @returns {number|undefined} The port, or undefined when the flag is absent or empty.
 * @throws {WorkbenchUserError} When the value is not a decimal integer in 0..65535.
 */
function portFlag(parsed, name) {
  const value = stringFlag(parsed, name);
  if (!value) {
    return undefined;
  }
  const digits = value.trim();
  const parsedValue = /^\d+$/u.test(digits) ? Number(digits) : Number.NaN;
  if (!Number.isInteger(parsedValue) || parsedValue < 0 || parsedValue > 65535) {
    throw new WorkbenchUserError(`--${name} must be an integer between 0 and 65535.`);
  }
  return parsedValue;
}
2171
2302
  function optionalPositional(parsed, index) {
2172
2303
  return parsed.positionals[index];
2173
2304
  }
@@ -2227,19 +2358,15 @@ function parsePublishVisibilityFlags(parsed) {
2227
2358
  }
2228
2359
  async function previewPublishWithDerivedRemote(parsed) {
2229
2360
  const root = path.resolve(dirFlag(parsed) ?? process.cwd());
2230
- const core = await coreOptions(parsed);
2231
- await listWorkbenchVersions(core);
2232
2361
  const reconciledSnapshot = await createWorkbenchReadOnlyInspectionSnapshot({ dir: root });
2233
2362
  const link = cloudRemoteLinkTargetFromRemotes(reconciledSnapshot.remotes);
2234
- if (link.existing) {
2235
- return undefined;
2236
- }
2237
- const remote = await derivePublishCloudRemote(parsed, "workbench publish", link.name);
2363
+ const remote = stringFlag(parsed, "as") || !link.existing
2364
+ ? await derivePublishCloudRemote(parsed, "workbench publish", link.name)
2365
+ : link.existing;
2238
2366
  const requestedVersion = optionalPositional(parsed, 1);
2239
- const versionId = requestedVersion && requestedVersion !== "current"
2240
- ? requestedVersion
2241
- : reconciledSnapshot.status.currentVersionId ?? reconciledSnapshot.refs.current;
2242
- const version = reconciledSnapshot.versions.find((entry) => entry.id === versionId);
2367
+ const version = requestedVersion && requestedVersion !== "current"
2368
+ ? snapshotVersionByRef(reconciledSnapshot, requestedVersion)
2369
+ : snapshotVersionByRef(reconciledSnapshot, reconciledSnapshot.status.currentVersionId ?? reconciledSnapshot.refs.current ?? "");
2243
2370
  if (!version) {
2244
2371
  throw new WorkbenchCodedError("version_not_found", `Version not found: ${requestedVersion ?? "current"}`, {
2245
2372
  remediation: "Run workbench log --versions.",
@@ -2390,7 +2517,7 @@ async function artifactIdsByRunId(core, runs) {
2390
2517
  return byRun;
2391
2518
  }
2392
2519
  function emitEvalFailure(runs, failedRuns, artifactIds, parsed, io) {
2393
- const nextCommands = evalFailureNextCommands(failedRuns);
2520
+ const next = evalFailureNextCommand(failedRuns);
2394
2521
  if (parsed.flags.json === true) {
2395
2522
  io.stdout.write(`${JSON.stringify({
2396
2523
  schema: "workbench.cli.eval.v1",
@@ -2401,14 +2528,14 @@ function emitEvalFailure(runs, failedRuns, artifactIds, parsed, io) {
2401
2528
  evidenceSaved: true,
2402
2529
  runs: runs.map((run) => runFailureSummary(run, artifactIds.get(run.id) ?? [])),
2403
2530
  failedRuns: failedRuns.map((run) => runFailureSummary(run, artifactIds.get(run.id) ?? [])),
2404
- nextCommands,
2531
+ next,
2405
2532
  }, null, 2)}\n`);
2406
2533
  return 1;
2407
2534
  }
2408
2535
  io.stdout.write([
2409
2536
  "Eval failed; evidence was saved.",
2410
2537
  ...failedRuns.map(formatRun),
2411
- ...(nextCommands[0] ? [`next: ${nextCommands[0]}`] : []),
2538
+ ...(next ? [`next: ${next}`] : []),
2412
2539
  ].join("\n") + "\n");
2413
2540
  return 1;
2414
2541
  }
@@ -2420,7 +2547,7 @@ function runSummary(run, artifactIds) {
2420
2547
  versionId: run.versionId,
2421
2548
  skillName: run.skillName,
2422
2549
  agentName: run.agentName,
2423
- ...(run.score !== undefined ? { score: run.score } : {}),
2550
+ ...(scoredRunValue(run) !== undefined ? { score: scoredRunValue(run) } : {}),
2424
2551
  ...(run.latencyMs !== undefined ? { latencyMs: run.latencyMs } : {}),
2425
2552
  ...(run.error ? { error: run.error } : {}),
2426
2553
  ...(run.jobIds ? { jobIds: run.jobIds } : {}),
@@ -2435,23 +2562,18 @@ function runFailureSummary(run, artifactIds) {
2435
2562
  skill: run.skillName,
2436
2563
  status: run.status,
2437
2564
  versionId: run.versionId,
2438
- ...(run.score !== undefined ? { score: run.score } : {}),
2565
+ ...(scoredRunValue(run) !== undefined ? { score: scoredRunValue(run) } : {}),
2439
2566
  ...(run.error ? { error: run.error } : {}),
2440
2567
  traceIds: run.traceIds,
2441
2568
  artifactIds: [...artifactIds],
2442
2569
  };
2443
2570
  }
2444
- function evalFailureNextCommands(failedRuns) {
2571
/**
 * Picks the follow-up command to print after a failed eval.
 *
 * @param {Array<{ id: string }>} failedRuns - Runs that failed or were canceled.
 * @returns {string} `workbench show <short id>` for the first failed run, or
 *   `workbench log --runs` when the list is empty.
 */
function evalFailureNextCommand(failedRuns) {
  const leadFailure = failedRuns[0];
  return leadFailure
    ? `workbench show ${displayRef(leadFailure.id)}`
    : "workbench log --runs";
}
2456
2578
  function output(value, parsed, io, text) {
2457
2579
  return emitResult(commandSchema(parsed), { result: value }, parsed, io, text);
@@ -2484,12 +2606,285 @@ async function workbenchCliAuthStatus() {
2484
2606
  })),
2485
2607
  };
2486
2608
  }
2609
/**
 * Returns a run's score only when that score should be reported.
 *
 * @param {{ status: string, score?: unknown }} run - Run record.
 * @returns {number|undefined} The score when the run succeeded and the score
 *   is a number; undefined otherwise.
 */
function scoredRunValue(run) {
  if (run.status !== "succeeded") {
    return undefined;
  }
  if (typeof run.score !== "number") {
    return undefined;
  }
  return run.score;
}
2612
/**
 * Returns a job's score only when that score should be reported.
 *
 * @param {{ status: string, score?: unknown }} job - Job record.
 * @returns {number|undefined} The score when the job succeeded and the score
 *   is a number; undefined otherwise.
 */
function scoredJobValue(job) {
  const { status, score } = job;
  const reportable = status === "succeeded" && typeof score === "number";
  return reportable ? score : undefined;
}
2615
/**
 * Reports whether the current version contains at least one non-smoke case.
 *
 * A case file is a text file at `.workbench/cases/<name>/case.yaml` (or
 * `.yml`). It counts unless its content has a line matching `smoke: true`.
 *
 * @param {object} snapshot - Inspection snapshot; reads `status.currentVersionId`,
 *   `refs.current` and the files of the resolved version.
 * @returns {boolean} True when such a case file exists; false when the current
 *   version cannot be resolved or has none.
 */
function snapshotHasWorkflowCase(snapshot) {
  const casePathPattern = /^\.workbench\/cases\/[^/]+\/case\.ya?ml$/u;
  const smokePattern = /\n\s*smoke:\s*true(?:\s|$)/u;
  const currentRef = snapshot.status.currentVersionId ?? snapshot.refs.current ?? "";
  const currentVersion = snapshotVersionByRef(snapshot, currentRef);
  if (currentVersion === undefined || currentVersion === null) {
    return false;
  }
  return currentVersion.files.some((file) => {
    if (file.kind !== "text" || !casePathPattern.test(file.path)) {
      return false;
    }
    // A newline is prepended so the pattern also matches a marker on line 1.
    return !smokePattern.test(`\n${file.content}`);
  });
}
2621
/**
 * Derives the handle to suggest for `workbench install` from a status remote.
 *
 * The publication's install URL is preferred when the remote is published;
 * otherwise the remote URL is used. When that URL parses as an install source
 * the handle is `owner/skill`; when it does not, the URL is returned.
 *
 * @param {object} remote - Remote entry from a status snapshot.
 * @returns {string} `owner/skill`, or the raw URL as a fallback.
 */
function installHandleFromStatusRemote(remote) {
  const { publication } = remote;
  const installUrl = publication.status === "published" ? publication.installUrl : undefined;
  const location = installUrl ?? remote.url;
  const parsedSource = parseWorkbenchInstallSource(location);
  if (parsedSource) {
    return `${parsedSource.owner}/${parsedSource.skill}`;
  }
  return location;
}
2626
/**
 * Adds a `next` suggestion to a status payload based on the project state.
 *
 * Rules are checked in order and the first match wins:
 *  1. project not initialized: status is returned untouched;
 *  2. newest run is running/failed/canceled: `workbench show <run>`;
 *  3. cloud auth missing and something needs it: `workbench login`;
 *  4. a remote whose sync status is "error": `workbench sync <remote>`;
 *  5. no runs yet: `workbench eval`;
 *  6. no non-smoke case: edit cases, then eval;
 *  7. publishable with no cloud remote, or an unpublished up-to-date cloud
 *     remote: `workbench publish`;
 *  8. a published cloud remote with an install URL: `workbench install <handle>`;
 *  9. otherwise `next: null`.
 *
 * @param {object} status - Status snapshot to extend.
 * @param {object} auth - Auth status; reads `workbenchCloud.status`.
 * @param {object} core - Core options used to load the inspection snapshot.
 * @returns {Promise<object>} The status, copied with a `next` field unless
 *   the project is uninitialized.
 */
async function statusWithCausalNext(status, auth, core) {
  if (!status.project.initialized) {
    return status;
  }
  const withNext = (next) => ({ ...status, next });
  const isCloudRemote = (remote) => remote.kind === "workbench-cloud";
  // A snapshot that cannot be loaded is treated as "no snapshot", not an error.
  const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core).catch(() => null);

  const newestRun = snapshot?.runs
    .slice()
    .sort((a, b) => b.createdAt.localeCompare(a.createdAt))[0];
  const unsettledStatuses = ["running", "failed", "canceled"];
  if (unsettledStatuses.includes(newestRun?.status) && newestRun.id) {
    return withNext(`workbench show ${displayRef(newestRun.id)}`);
  }

  const brokenRemote = status.remotes.find((remote) => remote.sync.status === "error");
  const hasWorkflowCase = snapshot ? snapshotHasWorkflowCase(snapshot) : false;
  const hasScoredRun = snapshot?.runs.some((run) => scoredRunValue(run) !== undefined) ?? false;
  const canPublish = hasWorkflowCase && hasScoredRun;
  const cloudAuthMissing = auth.workbenchCloud.status !== "authenticated";
  const cloudRemoteNeedsAuth = status.remotes.some((remote) => isCloudRemote(remote) &&
    (remote.sync.status !== "up_to_date" || remote.publication.status === "unpublished"));

  if (cloudAuthMissing && (canPublish || cloudRemoteNeedsAuth)) {
    return withNext("workbench login");
  }
  if (brokenRemote) {
    return withNext(`workbench sync ${brokenRemote.name}`);
  }
  const runCount = snapshot?.runs.length ?? status.runs.total;
  if (runCount === 0) {
    return withNext("workbench eval");
  }
  if (!hasWorkflowCase) {
    return withNext("edit .workbench/cases, then run workbench eval");
  }

  const anyCloudRemote = status.remotes.find(isCloudRemote);
  if (canPublish && !anyCloudRemote) {
    return withNext("workbench publish");
  }
  const readyToPublish = status.remotes.find((remote) => isCloudRemote(remote) &&
    remote.publication.status === "unpublished" &&
    remote.sync.status === "up_to_date");
  if (readyToPublish) {
    return withNext("workbench publish");
  }
  const installable = status.remotes.find((remote) => isCloudRemote(remote) &&
    remote.publication.status === "published" &&
    Boolean(remote.publication.installUrl));
  if (installable) {
    return withNext(`workbench install ${installHandleFromStatusRemote(installable)}`);
  }
  return withNext(null);
}
2677
/**
 * Shortens an object id for display.
 *
 * - `v_<hex of 8+ chars>` becomes the first 8 hex characters, without `v_`.
 * - `<prefix>_<suffix>` with both parts non-empty keeps the prefix and the
 *   first 8 characters of the suffix (the split is at the first underscore).
 * - Anything else is cut to its first 8 characters.
 *
 * @param {string} id - Full identifier.
 * @returns {string} Shortened identifier.
 */
function displayRef(id) {
  const versionHex = id.match(/^v_([0-9a-f]{8,})$/iu)?.[1];
  if (versionHex) {
    return versionHex.slice(0, 8);
  }
  const underscoreAt = id.indexOf("_");
  const hasPrefixAndSuffix = underscoreAt > 0 && underscoreAt < id.length - 1;
  if (hasPrefixAndSuffix) {
    // Prefix, the underscore, and up to eight characters of the suffix.
    return id.slice(0, underscoreAt + 1 + 8);
  }
  return id.slice(0, 8);
}
2690
/**
 * Replaces full object ids inside a command string with their display form.
 *
 * Matches `v_<8+ hex>` version ids and `run_`/`job_`/`trace_`/`artifact_`
 * ids (case-insensitively) and passes each match through displayRef.
 *
 * @param {string} command - Command text that may contain full ids.
 * @returns {string} The command with every matched id shortened.
 */
function shortenCommandRefs(command) {
  const fullRefPattern = /\b(?:v_[0-9a-f]{8,}|(?:run|job|trace|artifact)_[a-z0-9_-]+)/giu;
  const shorten = (fullRef) => displayRef(fullRef);
  return command.replace(fullRefPattern, shorten);
}
2693
/**
 * Resolves a version reference against the versions in a snapshot.
 *
 * The ref is trimmed; the literal "current" is replaced by
 * `snapshot.refs.current`. Matching is delegated to snapshotVersionRefMatches.
 *
 * @param {object} snapshot - Inspection snapshot with `versions` and `refs`.
 * @param {string} ref - Version id, hash, prefix, or "current".
 * @returns {object|undefined} The single matching version, or undefined when
 *   the ref is empty or nothing matches.
 * @throws {WorkbenchCodedError} `ref_ambiguous` (exit code 2) when more than
 *   one version matches.
 */
function snapshotVersionByRef(snapshot, ref) {
  const trimmed = ref.trim();
  const target = trimmed === "current" ? snapshot.refs.current ?? "" : trimmed;
  if (!target) {
    return undefined;
  }
  const matches = snapshot.versions.filter((version) => snapshotVersionRefMatches(version, target));
  if (matches.length <= 1) {
    return matches[0];
  }
  const shortIds = matches.map((version) => displayRef(version.id)).join(", ");
  throw new WorkbenchCodedError("ref_ambiguous", `Version ref is ambiguous: ${ref}. Candidates: ${shortIds}.`, {
    subject: { ref, candidates: matches.map((version) => version.id) },
    exitCode: 2,
  });
}
2708
/**
 * Tests whether a user-supplied ref identifies the given version.
 *
 * A ref matches when it equals or is a prefix of the version id or hash.
 * A leading `v_` on the ref is also ignored when comparing against the hash,
 * and re-added when comparing against the id.
 *
 * @param {{ id: string, hash: string }} version - Candidate version.
 * @param {string} ref - Already-normalized reference.
 * @returns {boolean} True when any of the comparisons succeeds.
 */
function snapshotVersionRefMatches(version, ref) {
  const bareRef = ref.startsWith("v_") ? ref.slice(2) : ref;
  // Checks run lazily and in this order; the first success wins.
  const checks = [
    () => version.id === ref,
    () => version.hash === ref,
    () => version.id.startsWith(ref),
    () => version.hash.startsWith(ref),
    () => version.hash.startsWith(bareRef),
    () => version.id.startsWith(`v_${bareRef}`),
  ];
  return checks.some((check) => check());
}
2717
/**
 * Resolves a (possibly abbreviated) ref against a list of snapshot objects.
 *
 * @param {Array<{ id: string }>} entries - Objects to search, e.g. runs or jobs.
 * @param {string} ref - Full id or prefix; surrounding whitespace is ignored.
 * @param {string} kind - Object kind, used (capitalized) in the error message.
 * @returns {object|undefined} The single match, or undefined when the ref is
 *   blank or nothing matches.
 * @throws {WorkbenchCodedError} `ref_ambiguous` (exit code 2) when several
 *   entries match. The message lists at most 8 candidates and the subject at
 *   most 20.
 */
function snapshotObjectByRef(entries, ref, kind) {
  const wanted = ref.trim();
  if (!wanted) {
    return undefined;
  }
  const matches = entries.filter((entry) => objectRefMatches(entry.id, wanted));
  if (matches.length <= 1) {
    return matches[0];
  }
  const listed = matches.map((entry) => displayRef(entry.id)).slice(0, 8).join(", ");
  const candidateIds = matches.map((entry) => entry.id).slice(0, 20);
  throw new WorkbenchCodedError("ref_ambiguous", `${capitalize(kind)} ref is ambiguous: ${ref}. Candidates: ${listed}.`, {
    subject: { ref, candidates: candidateIds },
    exitCode: 2,
  });
}
2731
/**
 * Tests whether a ref identifies an object id.
 *
 * The ref matches when it equals or prefixes the whole id, or when it prefixes
 * the part of the id after its first underscore (the kind prefix is optional).
 *
 * @param {string} id - Full object id such as `run_…`.
 * @param {string} ref - User-supplied reference.
 * @returns {boolean} True on a match.
 */
function objectRefMatches(id, ref) {
  const matchesWholeId = id === ref || id.startsWith(ref);
  if (matchesWholeId) {
    return true;
  }
  const underscoreAt = id.indexOf("_");
  if (underscoreAt <= 0) {
    return false;
  }
  const suffix = id.slice(underscoreAt + 1);
  return suffix.startsWith(ref);
}
2738
/**
 * Upper-cases the first character of a string.
 *
 * @param {string} value - Text to capitalize.
 * @returns {string} The text with its first character upper-cased; an empty
 *   value is returned unchanged.
 */
function capitalize(value) {
  if (value.length === 0) {
    return value;
  }
  const head = value[0].toUpperCase();
  const tail = value.slice(1);
  return `${head}${tail}`;
}
2741
/**
 * Resolves a ref to either a run (with all of its jobs) or a single job.
 *
 * @param {object} snapshot - Inspection snapshot with `runs` and `jobs`.
 * @param {string} ref - Run or job reference, possibly abbreviated.
 * @returns {{ run?: object, jobs: Array<object> }} For a run: the run and the
 *   jobs whose `runId` equals its id. For a job: just that job. Otherwise an
 *   empty job list.
 * @throws {WorkbenchCodedError} `ref_ambiguous` (exit code 2) when the ref
 *   resolves to both a run and a job.
 */
function runOrJobEvidenceSelection(snapshot, ref) {
  const matchedRun = snapshotObjectByRef(snapshot.runs, ref, "run");
  const matchedJob = snapshotObjectByRef(snapshot.jobs, ref, "job");
  if (matchedRun && matchedJob) {
    const shortRun = displayRef(matchedRun.id);
    const shortJob = displayRef(matchedJob.id);
    throw new WorkbenchCodedError("ref_ambiguous", `Run/job ref is ambiguous: ${ref}. Candidates: ${shortRun}, ${shortJob}.`, {
      subject: { ref, candidates: [matchedRun.id, matchedJob.id] },
      exitCode: 2,
    });
  }
  if (matchedRun) {
    const jobsOfRun = snapshot.jobs.filter((entry) => entry.runId === matchedRun.id);
    return { run: matchedRun, jobs: jobsOfRun };
  }
  return { jobs: matchedJob ? [matchedJob] : [] };
}
2758
/**
 * Collects the user-facing evidence files for a run or job.
 *
 * For every selected job, artifact files come first, placed under
 * `cases/<case>/jobs/<job>/…`, followed by trace files under
 * `cases/<case>/jobs/<job>/traces/<trace>/…`. Files rejected by
 * isUserFacingEvidenceFile are dropped, ids with no matching artifact or
 * trace are ignored, and only the first file is kept for any given path.
 *
 * @param {object} snapshot - Inspection snapshot with `traces` and `artifacts`.
 * @param {string} ref - Run or job reference.
 * @returns {Array<object>} Files with rewritten `path`; empty when the ref
 *   resolves to nothing.
 */
function evidenceFilesForRunOrJob(snapshot, ref) {
  const { run, jobs } = runOrJobEvidenceSelection(snapshot, ref);
  if (!run && jobs.length === 0) {
    return [];
  }
  const tracesById = new Map(snapshot.traces.map((trace) => [trace.id, trace]));
  const artifactsById = new Map(snapshot.artifacts.map((artifact) => [artifact.id, artifact]));

  const relocated = [];
  for (const job of jobs) {
    // Evaluated per file, so jobs without any file never touch caseId.
    const jobDir = () => `cases/${evidencePathSegment(job.caseId)}/jobs/${evidencePathSegment(job.id)}`;
    for (const artifactId of job.artifactIds) {
      const artifact = artifactsById.get(artifactId);
      if (!artifact) {
        continue;
      }
      for (const file of artifact.files) {
        if (isUserFacingEvidenceFile(file)) {
          relocated.push(evidenceFileWithPath(file, `${jobDir()}/${file.path}`));
        }
      }
    }
    for (const traceId of job.traceIds) {
      const trace = tracesById.get(traceId);
      if (!trace) {
        continue;
      }
      for (const file of trace.files) {
        if (isUserFacingEvidenceFile(file)) {
          relocated.push(evidenceFileWithPath(file, `${jobDir()}/traces/${evidencePathSegment(trace.id)}/${file.path}`));
        }
      }
    }
  }

  const takenPaths = new Set();
  const unique = [];
  for (const file of relocated) {
    if (!takenPaths.has(file.path)) {
      takenPaths.add(file.path);
      unique.push(file);
    }
  }
  return unique;
}
2788
/**
 * Copies an evidence file record under a new, normalized path.
 *
 * Backslashes become forward slashes and leading slashes are removed.
 *
 * @param {object} file - Source file record; it is not mutated.
 * @param {string} filePath - Desired path for the copy.
 * @returns {object} Shallow copy of `file` with `path` replaced.
 */
function evidenceFileWithPath(file, filePath) {
  const forwardSlashed = filePath.replace(/\\/gu, "/");
  const relativePath = forwardSlashed.replace(/^\/+/u, "");
  return { ...file, path: relativePath };
}
2794
/**
 * Reports whether an evidence file should be shown to the user.
 *
 * A file is hidden when any segment of its path is exactly `.workbench`.
 *
 * @param {{ path: string }} file - Evidence file record.
 * @returns {boolean} True when no path segment equals `.workbench`.
 */
function isUserFacingEvidenceFile(file) {
  const segments = file.path
    .replace(/\\/gu, "/")
    .replace(/^\/+/u, "")
    .split("/");
  return !segments.includes(".workbench");
}
2798
/**
 * Turns an arbitrary id into a single path segment.
 *
 * Each run of characters outside `A-Z a-z 0-9 . _ -` is replaced by one `-`.
 *
 * @param {string} value - Id to convert, e.g. a case or job id.
 * @returns {string} The converted segment, or `_` when the result is empty.
 */
function evidencePathSegment(value) {
  const cleaned = value.replace(/[^A-Za-z0-9._-]+/gu, "-");
  return cleaned === "" ? "_" : cleaned;
}
2801
/**
 * Renders evidence details and file paths as plain text.
 *
 * @param {Array<object>} details - Evidence details, each formatted with
 *   formatTraceDetail; empty results are skipped.
 * @param {Array<{ path: string }>} files - Evidence files, listed by path
 *   under a `Files:` heading when there are any.
 * @returns {string} Newline-joined text, or "No evidence." when it is empty.
 */
function formatRunOrJobEvidence(details, files) {
  const lines = details.map(formatTraceDetail).filter(Boolean);
  if (files.length > 0) {
    lines.push("Files:");
    for (const file of files) {
      lines.push(file.path);
    }
  }
  const text = lines.join("\n");
  return text || "No evidence.";
}
2806
/**
 * Reduces an evidence detail to a compact summary.
 *
 * Sessions are reduced to their labels, and each execution's trace to the
 * number of events, spans and summaries it holds.
 *
 * @param {object} detail - Evidence detail with `runId` and `executions`.
 * @returns {{ runId: *, executions: Array<object> }} The summary.
 */
function evidenceDetailSummary(detail) {
  const summarizeExecution = ({ id, status, jobIds, sessions, trace }) => ({
    id,
    status,
    jobIds,
    sessions: sessions.map(({ label }) => ({ label })),
    trace: {
      events: trace.events.length,
      spans: trace.spans.length,
      summaries: trace.summaries.length,
    },
  });
  return {
    runId: detail.runId,
    executions: detail.executions.map((execution) => summarizeExecution(execution)),
  };
}
2824
/**
 * Produces a content-free copy of a value for manifest-style output.
 *
 * - null, strings, numbers and booleans are returned unchanged;
 * - arrays are converted element by element;
 * - objects with string `path` and `content` are replaced by fileSummary;
 * - other objects are copied key by key, skipping undefined values;
 * - anything else (undefined, functions, symbols, …) becomes null.
 *
 * The object copy is built with Object.fromEntries so that every key becomes
 * an own data property. Assigning `out["__proto__"] = …` would instead change
 * the prototype of the result when the input (for example parsed JSON) has an
 * own `__proto__` key.
 *
 * @param {*} value - Value to convert.
 * @returns {*} The converted value.
 */
function manifestOnly(value) {
  if (value === null || typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
    return value;
  }
  if (Array.isArray(value)) {
    return value.map((item) => manifestOnly(item));
  }
  if (!value || typeof value !== "object") {
    return null;
  }
  const record = value;
  if (typeof record.path === "string" && typeof record.content === "string") {
    return fileSummary(record);
  }
  const copiedEntries = Object.entries(record)
    .filter(([, child]) => child !== undefined)
    .map(([key, child]) => [key, manifestOnly(child)]);
  return Object.fromEntries(copiedEntries);
}
2847
/**
 * Chooses an improver agent from locally connected adapters when needed.
 *
 * Nothing is chosen (undefined) when `--agents` was passed, or when the
 * project's default agent (falling back to the first listed agent) passes
 * workbenchSkillImproveCanUseQueuedAdapter. Otherwise the connected `claude`
 * or `codex` adapter entries are ranked: claude before codex, then most
 * recently updated first.
 *
 * Failures while listing agents, reading status, or listing adapter auth are
 * treated as "nothing available" rather than propagated.
 *
 * @param {object} parsed - Parsed command line.
 * @param {object} core - Core options for workbench-core calls.
 * @returns {Promise<object|undefined>} Agent definition `{ name, adapter,
 *   config: { auth } }`, or undefined when no override is needed.
 * @throws {WorkbenchCodedError} `auth_required` (exit code 1) when no
 *   connected claude/codex adapter exists.
 */
async function resolveLocalImproverAgent(parsed, core) {
  if (stringFlag(parsed, "agents")) {
    return undefined;
  }
  const agents = await listWorkbenchAgents(core).catch(() => []);
  const status = await workbenchStatusSnapshot(core).catch(() => undefined);
  const defaultAgentName = status?.project.defaultAgent ?? agents[0]?.name;
  const defaultAgent = agents.find((agent) => agent.name === defaultAgentName);
  if (defaultAgent && workbenchSkillImproveCanUseQueuedAdapter(defaultAgent)) {
    return undefined;
  }

  const rankOf = (adapterId) => (adapterId === "claude" ? 0 : adapterId === "codex" ? 1 : 2);
  // Missing or unparsable timestamps sort as the oldest possible value.
  const updatedAtOf = (entry) => Date.parse(entry.updatedAt ?? "") || 0;
  const isUsable = (entry) => entry.status === "connected" &&
    (entry.adapterId === "claude" || entry.adapterId === "codex");

  const connected = await localWorkbenchAdapterAuthStore(adapterAuthStoreRoot()).listStatus().catch(() => []);
  const ranked = connected
    .filter(isUsable)
    .sort((left, right) => rankOf(left.adapterId) - rankOf(right.adapterId) ||
      updatedAtOf(right) - updatedAtOf(left));
  const selected = ranked[0];
  if (!selected) {
    throw new WorkbenchCodedError("auth_required", "workbench improve needs a connected improver.", {
      remediation: "Run workbench login claude (or codex) to connect an improver.",
      exitCode: 1,
    });
  }
  const auth = selected.slot ? { [selected.slot]: selected.profile } : selected.profile;
  return {
    name: selected.adapterId,
    adapter: selected.adapterId,
    config: { auth },
  };
}
2487
2882
  function formatLogEntry(entry) {
2488
2883
  if (entry.kind === "version") {
2489
- return `${entry.createdAt}\tversion\t${entry.id}\tfiles=${entry.fileCount}\t${entry.message}`;
2884
+ return `${entry.createdAt}\tversion\t${displayRef(entry.id)}\tfiles=${entry.fileCount}\t${entry.message}`;
2490
2885
  }
2491
2886
  const score = entry.score === undefined ? "n/a" : entry.score.toFixed(3);
2492
- return `${entry.createdAt}\trun\t${entry.id}\t${entry.status}\tversion=${entry.versionId}\tskill=${entry.skillName}\tagent=${entry.agentName}\tscore=${score}`;
2887
+ return `${entry.createdAt}\trun\t${displayRef(entry.id)}\t${entry.status}\tversion=${displayRef(entry.versionId)}\tskill=${entry.skillName}\tagent=${entry.agentName}\tscore=${score}`;
2493
2888
  }
2494
2889
  function splitShowRef(ref) {
2495
2890
  const index = ref.indexOf(":");
@@ -2500,18 +2895,14 @@ function splitShowRef(ref) {
2500
2895
  }
2501
2896
  async function fileForRunOrJobRef(core, objectRef, requestedPath) {
2502
2897
  const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
2503
- const run = snapshot.runs.find((entry) => entry.id === objectRef);
2504
- const job = snapshot.jobs.find((entry) => entry.id === objectRef);
2505
- if (!run && !job) {
2898
+ const selection = runOrJobEvidenceSelection(snapshot, objectRef);
2899
+ if (!selection.run && selection.jobs.length === 0) {
2506
2900
  return null;
2507
2901
  }
2508
- const traceIds = run?.traceIds ?? job?.traceIds ?? [];
2509
- const traces = snapshot.traces.filter((trace) => traceIds.includes(trace.id));
2510
- for (const trace of traces) {
2511
- const file = findShowFile(trace.files, requestedPath);
2512
- if (file) {
2513
- return file;
2514
- }
2902
+ const files = evidenceFilesForRunOrJob(snapshot, objectRef);
2903
+ const file = findShowFile(files, requestedPath, objectRef);
2904
+ if (file) {
2905
+ return file;
2515
2906
  }
2516
2907
  throw new WorkbenchCodedError("ref_not_found", `File not found in ${objectRef}: ${requestedPath}`, {
2517
2908
  remediation: `Run workbench show ${objectRef}.`,
@@ -2520,12 +2911,8 @@ async function fileForRunOrJobRef(core, objectRef, requestedPath) {
2520
2911
  });
2521
2912
  }
2522
2913
  function evidenceDetailsForRunOrJob(snapshot, ref) {
2523
- const run = snapshot.runs.find((entry) => entry.id === ref);
2524
- const job = snapshot.jobs.find((entry) => entry.id === ref);
2525
- const jobs = run
2526
- ? snapshot.jobs.filter((entry) => entry.runId === run.id)
2527
- : job ? [job] : [];
2528
- return jobs.flatMap((entry) => {
2914
+ const selection = runOrJobEvidenceSelection(snapshot, ref);
2915
+ return selection.jobs.flatMap((entry) => {
2529
2916
  const detail = workbenchJobEvidenceForSnapshot(snapshot, {
2530
2917
  runId: entry.runId,
2531
2918
  jobId: entry.id,
@@ -2536,12 +2923,72 @@ function evidenceDetailsForRunOrJob(snapshot, ref) {
2536
2923
  execution.trace.events.length > 0 ||
2537
2924
  execution.trace.summaries.length > 0));
2538
2925
  }
2539
- function findShowFile(files, requestedPath) {
2926
+ function findShowFile(files, requestedPath, objectRef) {
2540
2927
  const normalized = requestedPath.replace(/\\/gu, "/");
2541
- return files.find((file) => file.path === normalized) ??
2542
- files.find((file) => file.path.endsWith(`/${normalized}`)) ??
2543
- files.find((file) => path.basename(file.path) === normalized) ??
2544
- null;
2928
+ const exact = files.filter((file) => file.path === normalized);
2929
+ if (exact.length === 1) {
2930
+ return exact[0];
2931
+ }
2932
+ const exactEquivalent = singleEquivalentShowFile(exact);
2933
+ if (exactEquivalent) {
2934
+ return exactEquivalent;
2935
+ }
2936
+ if (exact.length > 1) {
2937
+ throw ambiguousShowPath(objectRef, requestedPath, exact);
2938
+ }
2939
+ const suffixCandidates = files.filter((file) => file.path.endsWith(`/${normalized}`) || path.basename(file.path) === normalized);
2940
+ if (suffixCandidates.length === 0) {
2941
+ return null;
2942
+ }
2943
+ const candidates = normalized === "stderr.log"
2944
+ ? suffixCandidates.filter((file) => file.content.length > 0)
2945
+ : suffixCandidates;
2946
+ const canonicalCandidates = candidates.filter(isCanonicalEvidenceFileCandidate);
2947
+ if (canonicalCandidates.length === 1) {
2948
+ return canonicalCandidates[0];
2949
+ }
2950
+ const equivalentCanonicalCandidate = singleEquivalentShowFile(canonicalCandidates);
2951
+ if (equivalentCanonicalCandidate) {
2952
+ return equivalentCanonicalCandidate;
2953
+ }
2954
+ if (canonicalCandidates.length > 1) {
2955
+ throw ambiguousShowPath(objectRef, requestedPath, canonicalCandidates);
2956
+ }
2957
+ if (candidates.length === 1) {
2958
+ return candidates[0];
2959
+ }
2960
+ const equivalentCandidate = singleEquivalentShowFile(candidates);
2961
+ if (equivalentCandidate) {
2962
+ return equivalentCandidate;
2963
+ }
2964
+ if (candidates.length === 0 && suffixCandidates.length === 1) {
2965
+ return suffixCandidates[0];
2966
+ }
2967
+ const equivalentSuffixCandidate = singleEquivalentShowFile(suffixCandidates);
2968
+ if (equivalentSuffixCandidate) {
2969
+ return equivalentSuffixCandidate;
2970
+ }
2971
+ throw ambiguousShowPath(objectRef, requestedPath, candidates.length > 0 ? candidates : suffixCandidates);
2972
+ }
2973
+ function isCanonicalEvidenceFileCandidate(file) {
2974
+ return !file.path.includes("/traces/") && !file.path.includes("/artifacts/");
2975
+ }
2976
+ function singleEquivalentShowFile(files) {
2977
+ if (files.length <= 1) {
2978
+ return null;
2979
+ }
2980
+ const first = files[0];
2981
+ return files.every((file) => file.kind === first.kind && file.encoding === first.encoding && file.content === first.content)
2982
+ ? first
2983
+ : null;
2984
+ }
2985
+ function ambiguousShowPath(objectRef, requestedPath, candidates) {
2986
+ const candidatePaths = candidates.map((file) => file.path);
2987
+ return new WorkbenchCodedError("ref_ambiguous", `File path is ambiguous in ${objectRef}: ${requestedPath}. Candidates: ${candidatePaths.join(", ")}.`, {
2988
+ remediation: `Run workbench show ${objectRef}.`,
2989
+ subject: { ref: objectRef, path: requestedPath, candidates: candidatePaths },
2990
+ exitCode: 2,
2991
+ });
2545
2992
  }
2546
2993
  function fileListing(kind, id, files) {
2547
2994
  return {
@@ -2552,17 +2999,16 @@ function fileListing(kind, id, files) {
2552
2999
  };
2553
3000
  }
2554
3001
  function formatFileListing(kind, id, files) {
2555
- return [`${kind}\t${id}\tfiles=${files.length}`, ...files.map((file) => file.path)].join("\n");
3002
+ return [`${kind}\t${displayRef(id)}\tfiles=${files.length}`, ...files.map((file) => file.path)].join("\n");
2556
3003
  }
2557
3004
  async function traceIdForCaseSource(core, ref) {
2558
3005
  const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
2559
- const trace = snapshot.traces.find((entry) => entry.id === ref);
3006
+ const trace = snapshotObjectByRef(snapshot.traces, ref, "trace");
2560
3007
  if (trace) {
2561
3008
  return trace.id;
2562
3009
  }
2563
- const run = snapshot.runs.find((entry) => entry.id === ref);
2564
- const job = snapshot.jobs.find((entry) => entry.id === ref);
2565
- const traceId = run?.traceIds[0] ?? job?.traceIds[0];
3010
+ const selection = runOrJobEvidenceSelection(snapshot, ref);
3011
+ const traceId = selection.run?.traceIds[0] ?? selection.jobs[0]?.traceIds[0];
2566
3012
  if (traceId) {
2567
3013
  return traceId;
2568
3014
  }
@@ -2572,43 +3018,85 @@ async function traceIdForCaseSource(core, ref) {
2572
3018
  exitCode: 1,
2573
3019
  });
2574
3020
  }
3021
+ async function evalCoverageSummaries(core, runs) {
3022
+ const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
3023
+ const jobsByRun = new Map();
3024
+ for (const job of snapshot.jobs) {
3025
+ const existing = jobsByRun.get(job.runId) ?? [];
3026
+ existing.push(job);
3027
+ jobsByRun.set(job.runId, existing);
3028
+ }
3029
+ return runs.map((run) => {
3030
+ const jobs = jobsByRun.get(run.id) ?? [];
3031
+ const cases = new Set(jobs.map((job) => job.caseId));
3032
+ const samples = new Set(jobs.map((job) => `${job.caseId}\0${job.sample}`));
3033
+ return {
3034
+ runId: run.id,
3035
+ cases: cases.size,
3036
+ samples: samples.size,
3037
+ jobs: jobs.length,
3038
+ succeeded: jobs.filter((job) => job.status === "succeeded").length,
3039
+ failed: jobs.filter((job) => job.status === "failed" || job.status === "canceled").length,
3040
+ };
3041
+ });
3042
+ }
3043
+ function formatEvalCoverage(coverage) {
3044
+ return [
3045
+ `coverage cases=${coverage.cases}`,
3046
+ `samples=${coverage.samples}`,
3047
+ `jobs=${coverage.jobs}`,
3048
+ coverage.failed > 0 ? `failed=${coverage.failed}` : undefined,
3049
+ ].filter(Boolean).join(" ");
3050
+ }
2575
3051
  async function evalDeltas(core, runs) {
2576
3052
  const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
2577
3053
  return runs.map((run) => {
3054
+ const score = scoredRunValue(run);
2578
3055
  const previous = snapshot.runs
2579
3056
  .filter((candidate) => candidate.id !== run.id &&
2580
3057
  candidate.skillName === run.skillName &&
2581
3058
  candidate.agentName === run.agentName &&
2582
- typeof candidate.score === "number" &&
3059
+ scoredRunValue(candidate) !== undefined &&
2583
3060
  candidate.createdAt < run.createdAt)
2584
3061
  .sort((left, right) => right.createdAt.localeCompare(left.createdAt))[0];
3062
+ const previousScore = previous ? scoredRunValue(previous) : undefined;
2585
3063
  return {
2586
3064
  runId: run.id,
2587
3065
  versionId: run.versionId,
2588
3066
  skillName: run.skillName,
2589
3067
  agentName: run.agentName,
2590
- ...(run.score !== undefined ? { score: run.score } : {}),
2591
- ...(previous?.score !== undefined ? { previousScore: previous.score } : {}),
2592
- ...(run.score !== undefined && previous?.score !== undefined ? { delta: run.score - previous.score } : {}),
3068
+ ...(score !== undefined ? { score } : {}),
3069
+ ...(previousScore !== undefined ? { previousScore } : {}),
3070
+ ...(score !== undefined && previousScore !== undefined ? { delta: score - previousScore } : {}),
2593
3071
  };
2594
3072
  });
2595
3073
  }
2596
3074
  function formatEvalDelta(delta) {
2597
- const score = delta.score === undefined ? "n/a" : delta.score.toFixed(3);
3075
+ if (delta.score === undefined) {
3076
+ return "";
3077
+ }
3078
+ const score = delta.score.toFixed(3);
2598
3079
  if (delta.previousScore === undefined || delta.delta === undefined) {
2599
- return `${delta.skillName} ${delta.versionId} ${score} (was n/a)`;
3080
+ return `${delta.skillName} ${displayRef(delta.versionId)} ${score}`;
2600
3081
  }
2601
3082
  const sign = delta.delta >= 0 ? "+" : "";
2602
- return `${delta.skillName} ${delta.versionId} ${score} (was ${delta.previousScore.toFixed(3)}, ${sign}${delta.delta.toFixed(3)})`;
3083
+ return `${delta.skillName} ${displayRef(delta.versionId)} ${score} (was ${delta.previousScore.toFixed(3)}, ${sign}${delta.delta.toFixed(3)})`;
2603
3084
  }
2604
- function evalSuccessNextCommands(runs) {
2605
- return runs.length > 0 ? ["workbench publish"] : ["workbench eval"];
3085
+ async function evalSuccessNextCommand(core, runs) {
3086
+ if (runs.length === 0) {
3087
+ return "workbench eval";
3088
+ }
3089
+ if (!runs.some((run) => scoredRunValue(run) !== undefined)) {
3090
+ return "edit .workbench/cases, then run workbench eval";
3091
+ }
3092
+ const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
3093
+ return snapshotHasWorkflowCase(snapshot) ? "workbench publish" : "edit .workbench/cases, then run workbench eval";
2606
3094
  }
2607
3095
  function formatStatusSnapshot(status) {
2608
3096
  const lines = [
2609
3097
  `Root: ${status.project.root}`,
2610
3098
  `Initialized: ${status.project.initialized ? "yes" : "no"}`,
2611
- ...(status.project.currentVersionId ? [`Current version: ${status.project.currentVersionId}`] : []),
3099
+ ...(status.project.currentVersionId ? [`Current version: ${displayRef(status.project.currentVersionId)}`] : []),
2612
3100
  ...(status.project.defaultSkill ? [`Default skill: ${status.project.defaultSkill}`] : []),
2613
3101
  ...(status.project.defaultAgent ? [`Default agent: ${status.project.defaultAgent}`] : []),
2614
3102
  `Runs: ${status.runs.total}${status.runs.lastStatus ? ` (last ${status.runs.lastStatus})` : ""}`,
@@ -2618,7 +3106,7 @@ function formatStatusSnapshot(status) {
2618
3106
  ? [
2619
3107
  "publication=published",
2620
3108
  remote.publication.visibility ? `visibility=${remote.publication.visibility}` : undefined,
2621
- remote.publication.versionId ? `version=${remote.publication.versionId}` : undefined,
3109
+ remote.publication.versionId ? `version=${displayRef(remote.publication.versionId)}` : undefined,
2622
3110
  remote.publication.installUrl ? `install=${remote.publication.installUrl}` : undefined,
2623
3111
  remote.publication.pinnedInstallUrl ? `pinned=${remote.publication.pinnedInstallUrl}` : undefined,
2624
3112
  ].filter(Boolean).join("\t")
@@ -2629,17 +3117,16 @@ function formatStatusSnapshot(status) {
2629
3117
  ? [
2630
3118
  ` error[${remote.sync.lastError.code}]: ${remote.sync.lastError.message}`,
2631
3119
  ...(remote.sync.lastAttemptAt ? [` last attempt: ${remote.sync.lastAttemptAt}`] : []),
2632
- ...(remote.sync.nextCommand ? [` next: ${remote.sync.nextCommand}`] : []),
2633
3120
  ]
2634
3121
  : []),
2635
3122
  ];
2636
3123
  })] : ["Remotes: none"]),
2637
- ...(status.next[0] ? [`next: ${status.next[0]}`] : []),
3124
+ ...(status.next ? [`next: ${shortenCommandRefs(status.next)}`] : []),
2638
3125
  ];
2639
3126
  return lines.join("\n");
2640
3127
  }
2641
3128
  function formatVersion(version) {
2642
- return `${version.id}\t${version.hash.slice(0, 12)}\t${version.message}`;
3129
+ return `${displayRef(version.id)}\t${version.hash.slice(0, 12)}\t${version.message}`;
2643
3130
  }
2644
3131
  function versionSummary(version) {
2645
3132
  return {
@@ -2655,50 +3142,52 @@ function formatAgent(agent) {
2655
3142
  return `${agent.name}\t${agent.adapter}${agent.model ? `\t${agent.model}` : ""}`;
2656
3143
  }
2657
3144
  function formatRun(run) {
2658
- const score = run.score === undefined ? "n/a" : run.score.toFixed(3);
3145
+ const scoreValue = scoredRunValue(run);
3146
+ const score = scoreValue === undefined ? "n/a" : scoreValue.toFixed(3);
2659
3147
  const latency = run.latencyMs === undefined ? "n/a" : `${run.latencyMs}ms`;
2660
- return `${run.id}\t${run.kind}\t${run.status}\tversion=${run.versionId}\tskill=${run.skillName}\tagent=${run.agentName}\tscore=${score}\tlatency=${latency}`;
3148
+ return `${displayRef(run.id)}\t${run.kind}\t${run.status}\tversion=${displayRef(run.versionId)}\tskill=${run.skillName}\tagent=${run.agentName}\tscore=${score}\tlatency=${latency}`;
2661
3149
  }
2662
3150
  function formatImproveResult(result) {
2663
3151
  return [
2664
- `Improved ${result.version.parentIds[0] ?? "current"} -> ${result.version.id}. ${formatRun(result.run)}`,
3152
+ `Improved ${result.version.parentIds[0] ? displayRef(result.version.parentIds[0]) : "current"} -> ${displayRef(result.version.id)}. ${formatRun(result.run)}`,
2665
3153
  result.switched
2666
3154
  ? "Switched to improved version."
2667
3155
  : `Did not switch: ${result.promotionReason}`,
2668
3156
  ].join("\n");
2669
3157
  }
2670
3158
  function formatJob(job) {
2671
- const score = job.score === undefined ? "n/a" : job.score.toFixed(3);
3159
+ const scoreValue = scoredJobValue(job);
3160
+ const score = scoreValue === undefined ? "n/a" : scoreValue.toFixed(3);
2672
3161
  const duration = job.durationMs === undefined ? "n/a" : `${job.durationMs}ms`;
2673
- return `${job.id}\trun=${job.runId}\tcase=${job.caseId}\tsample=${job.sample}\t${job.status}\tscore=${score}\tduration=${duration}`;
3162
+ return `${displayRef(job.id)}\trun=${displayRef(job.runId)}\tcase=${job.caseId}\tsample=${job.sample}\t${job.status}\tscore=${score}\tduration=${duration}`;
2674
3163
  }
2675
3164
  function formatComparison(comparison) {
2676
3165
  const lines = ["version\tskill\tagent\tstatus\tscore\tcost\tlatency\trun"];
2677
3166
  for (const cell of comparison.cells) {
2678
3167
  lines.push([
2679
- cell.versionId,
3168
+ displayRef(cell.versionId),
2680
3169
  cell.skillName,
2681
3170
  `${cell.agentName}@${shortObjectId(cell.agentHash)}`,
2682
3171
  cell.status ?? "not-run",
2683
3172
  cell.score === undefined ? "n/a" : cell.score.toFixed(3),
2684
3173
  cell.costUsd === undefined ? "n/a" : `$${cell.costUsd.toFixed(4)}`,
2685
3174
  cell.latencyMs === undefined ? "n/a" : `${cell.latencyMs}ms`,
2686
- cell.runId ?? "n/a",
3175
+ cell.runId ? displayRef(cell.runId) : "n/a",
2687
3176
  ].join("\t"));
2688
3177
  }
2689
3178
  return lines.join("\n");
2690
3179
  }
2691
3180
  function shortObjectId(id) {
2692
- return id.length > 12 ? id.slice(0, 12) : id;
3181
+ return id.length > 8 ? id.slice(0, 8) : id;
2693
3182
  }
2694
3183
  function formatTrace(trace) {
2695
3184
  const result = asRecord(trace.result);
2696
3185
  const status = typeof result?.status === "string" ? result.status : undefined;
2697
- const score = typeof result?.score === "number" ? result.score.toFixed(3) : undefined;
3186
+ const score = status === "succeeded" && typeof result?.score === "number" ? result.score.toFixed(3) : undefined;
2698
3187
  const error = typeof result?.error === "string" ? result.error.split(/\r?\n/u)[0] : undefined;
2699
3188
  const files = trace.files.slice(0, 5).map((file) => file.path).join(",");
2700
3189
  return [
2701
- `${trace.id}\trun=${trace.runId}\tjob=${trace.jobId ?? "n/a"}\tversion=${trace.versionId}\tskill=${trace.skillName}\tagent=${trace.agentName}`,
3190
+ `${displayRef(trace.id)}\trun=${displayRef(trace.runId)}\tjob=${trace.jobId ? displayRef(trace.jobId) : "n/a"}\tversion=${displayRef(trace.versionId)}\tskill=${trace.skillName}\tagent=${trace.agentName}`,
2702
3191
  status ? `status=${status}` : undefined,
2703
3192
  score ? `score=${score}` : undefined,
2704
3193
  error ? `error=${error}` : undefined,
@@ -2707,6 +3196,7 @@ function formatTrace(trace) {
2707
3196
  }
2708
3197
  function traceSummary(trace) {
2709
3198
  const result = asRecord(trace.result);
3199
+ const status = typeof result?.status === "string" ? result.status : undefined;
2710
3200
  return {
2711
3201
  id: trace.id,
2712
3202
  runId: trace.runId,
@@ -2715,8 +3205,8 @@ function traceSummary(trace) {
2715
3205
  skillName: trace.skillName,
2716
3206
  agentName: trace.agentName,
2717
3207
  createdAt: trace.createdAt,
2718
- ...(typeof result?.status === "string" ? { status: result.status } : {}),
2719
- ...(typeof result?.score === "number" ? { score: result.score } : {}),
3208
+ ...(status ? { status } : {}),
3209
+ ...(status === "succeeded" && typeof result?.score === "number" ? { score: result.score } : {}),
2720
3210
  ...(typeof result?.error === "string" ? { error: singleLine(result.error) } : {}),
2721
3211
  fileCount: trace.files.length,
2722
3212
  files: trace.files.map(fileSummary),
@@ -2726,7 +3216,7 @@ function formatTraceDetail(detail) {
2726
3216
  return detail.executions.map((execution) => {
2727
3217
  const sessionLabels = execution.sessions.map((session) => session.label).join(",");
2728
3218
  return [
2729
- `${execution.id}\trun=${detail.runId}\tjobs=${execution.jobIds.join(",")}\tstatus=${execution.status}`,
3219
+ `${execution.id}\trun=${displayRef(detail.runId)}\tjobs=${execution.jobIds.map(displayRef).join(",")}\tstatus=${execution.status}`,
2730
3220
  `events=${execution.trace.events.length}`,
2731
3221
  `spans=${execution.trace.spans.length}`,
2732
3222
  `summaries=${execution.trace.summaries.length}`,
@@ -2735,7 +3225,7 @@ function formatTraceDetail(detail) {
2735
3225
  }).join("\n");
2736
3226
  }
2737
3227
  function formatArtifact(artifact) {
2738
- return `${artifact.id}\trun=${artifact.runId}\tjob=${artifact.jobId}\t${artifact.kind}\tfiles=${artifact.files.length}`;
3228
+ return `${displayRef(artifact.id)}\trun=${displayRef(artifact.runId)}\tjob=${displayRef(artifact.jobId)}\t${artifact.kind}\tfiles=${artifact.files.length}`;
2739
3229
  }
2740
3230
  function artifactSummary(artifact) {
2741
3231
  return {