@workbench-ai/workbench 0.0.70 → 0.0.72
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +716 -226
- package/package.json +6 -6
package/dist/index.js
CHANGED
|
@@ -4,10 +4,10 @@ import { createRequire } from "node:module";
|
|
|
4
4
|
import os from "node:os";
|
|
5
5
|
import path from "node:path";
|
|
6
6
|
import { gzipSync } from "node:zlib";
|
|
7
|
-
import { addWorkbenchCase, addWorkbenchRemote, addWorkbenchAgent, compareWorkbench, createWorkbenchAdapterAuthBundle, createWorkbenchReadOnlyInspectionSnapshot, diffWorkbenchVersions, evalWorkbenchSkill, improveWorkbenchSkill, initWorkbenchSkill,
|
|
7
|
+
import { addWorkbenchCase, addWorkbenchRemote, addWorkbenchAgent, compareWorkbench, createWorkbenchInspectionSnapshot, createWorkbenchAdapterAuthBundle, createWorkbenchReadOnlyInspectionSnapshot, diffWorkbenchVersions, evalWorkbenchSkill, improveWorkbenchSkill, initWorkbenchSkill, listWorkbenchAgents, listWorkbenchVersions, localWorkbenchAdapterAuthStore, parseWorkbenchAdapterAuthTarget, publishWorkbenchVersion, removeWorkbenchAgent, showWorkbenchRef, switchWorkbenchVersion, syncWorkbenchRemote, workbenchJobEvidenceForSnapshot, workbenchSkillImproveCanUseQueuedAdapter, workbenchStatusSnapshot, WorkbenchCodedError, WorkbenchUserError, } from "@workbench-ai/workbench-core";
|
|
8
8
|
import { normalizeWorkbenchSkillName } from "@workbench-ai/workbench-contract";
|
|
9
9
|
import { emitError, emitResult } from "./output.js";
|
|
10
|
-
import { installSnapshotToTargets,
|
|
10
|
+
import { installSnapshotToTargets, normalizeInstallSnapshotPath, resolveInstallTargets, supportedInstallTargets, } from "./install-targets.js";
|
|
11
11
|
import { startWorkbenchOpenServer } from "./open-server.js";
|
|
12
12
|
const require = createRequire(import.meta.url);
|
|
13
13
|
const HELP = [
|
|
@@ -23,7 +23,7 @@ const HELP = [
|
|
|
23
23
|
" workbench improve [VERSION] [--skills LIST] [--agents LIST] [--budget N] [-n N|--samples N] [--cloud] [--json]",
|
|
24
24
|
" workbench compare [--skills all|LIST] [--agents all|LIST] [--versions all|A..B|LIST] [--json]",
|
|
25
25
|
" workbench publish [VERSION] [--as OWNER/SKILL] [--private|--team|--public] [--dry-run] [--json]",
|
|
26
|
-
" workbench install HANDLE_OR_URL [--to codex|claude|local]... [--yes] [--
|
|
26
|
+
" workbench install HANDLE_OR_URL [--to codex|claude|local]... [--yes] [--dry-run] [--json]",
|
|
27
27
|
"",
|
|
28
28
|
"More:",
|
|
29
29
|
" workbench help --all",
|
|
@@ -36,7 +36,7 @@ const HELP_ALL = [
|
|
|
36
36
|
" workbench compare [--skills all|LIST] [--agents all|LIST] [--versions all|A..B|LIST] [--json]",
|
|
37
37
|
" workbench improve [VERSION] [--skills LIST] [--agents LIST] [--budget N] [-n N|--samples N] [--cloud] [--json]",
|
|
38
38
|
" workbench publish [VERSION] [--as OWNER/SKILL] [--private|--team|--public] [--dry-run] [--json]",
|
|
39
|
-
" workbench install HANDLE_OR_URL [--to codex|claude|local]... [--yes] [--
|
|
39
|
+
" workbench install HANDLE_OR_URL [--to codex|claude|local]... [--yes] [--dry-run] [--json]",
|
|
40
40
|
"",
|
|
41
41
|
"Inspect:",
|
|
42
42
|
" workbench status [--dir DIR] [--json]",
|
|
@@ -44,10 +44,10 @@ const HELP_ALL = [
|
|
|
44
44
|
" workbench show REF[:PATH] [--json]",
|
|
45
45
|
" workbench diff [A..B] [--json]",
|
|
46
46
|
" workbench switch VERSION [--json]",
|
|
47
|
-
" workbench open [--host HOST] [--port PORT] [--no-open]
|
|
47
|
+
" workbench open [--host HOST] [--port PORT] [--no-open]",
|
|
48
48
|
"",
|
|
49
49
|
"Configure:",
|
|
50
|
-
" workbench case add
|
|
50
|
+
" workbench case add RUN_ID [--json]",
|
|
51
51
|
" workbench agent add NAME --adapter X [--model M] [--with k=v]... | list | rm NAME [--json]",
|
|
52
52
|
"",
|
|
53
53
|
"Share and auth:",
|
|
@@ -65,28 +65,40 @@ const COMMAND_HELP = {
|
|
|
65
65
|
" workbench new [DIR] [--json]",
|
|
66
66
|
"",
|
|
67
67
|
"Creates a Workbench skill project.",
|
|
68
|
+
"",
|
|
69
|
+
"Example:",
|
|
70
|
+
" workbench new earnings-prep",
|
|
68
71
|
].join("\n"),
|
|
69
72
|
eval: [
|
|
70
73
|
"Usage:",
|
|
71
74
|
" workbench eval [VERSION] [--skills all|LIST] [--agents all|LIST] [-n N|--samples N] [--rerun] [--cloud] [--json]",
|
|
72
75
|
"",
|
|
73
76
|
"Runs eval jobs for the selected version, measured skills, and agents. Omitted selectors use manifest defaults.",
|
|
77
|
+
"",
|
|
78
|
+
"Example:",
|
|
79
|
+
" workbench eval -n 5",
|
|
74
80
|
].join("\n"),
|
|
75
81
|
improve: [
|
|
76
82
|
"Usage:",
|
|
77
83
|
" workbench improve [VERSION] [--skills LIST] [--agents LIST] [--budget N] [-n N|--samples N] [--cloud] [--json]",
|
|
78
84
|
"",
|
|
79
85
|
"Creates one improved child version from evidence. The selected skills and agents must resolve to exactly one entry each.",
|
|
86
|
+
"",
|
|
87
|
+
"Example:",
|
|
88
|
+
" workbench improve --budget 1 -n 1",
|
|
80
89
|
].join("\n"),
|
|
81
90
|
compare: [
|
|
82
91
|
"Usage:",
|
|
83
92
|
" workbench compare [--skills all|LIST] [--agents all|LIST] [--versions all|A..B|LIST] [--json]",
|
|
84
93
|
"",
|
|
85
94
|
"Compares recorded eval evidence across selected skills, agents, and versions.",
|
|
95
|
+
"",
|
|
96
|
+
"Example:",
|
|
97
|
+
" workbench compare --agents all",
|
|
86
98
|
].join("\n"),
|
|
87
99
|
install: [
|
|
88
100
|
"Usage:",
|
|
89
|
-
" workbench install HANDLE_OR_URL [--to codex|claude|local]... [--yes] [--
|
|
101
|
+
" workbench install HANDLE_OR_URL [--to codex|claude|local]... [--yes] [--dry-run] [--json]",
|
|
90
102
|
"",
|
|
91
103
|
"Installs published Workbench Cloud source into local agent targets.",
|
|
92
104
|
"",
|
|
@@ -98,12 +110,18 @@ const COMMAND_HELP = {
|
|
|
98
110
|
" workbench status [--dir DIR] [--json]",
|
|
99
111
|
"",
|
|
100
112
|
"Reports project, worktree, run, per-remote sync/publication, and auth state. --json emits the workbench.status.v1 dashboard.",
|
|
113
|
+
"",
|
|
114
|
+
"Example:",
|
|
115
|
+
" workbench status --json",
|
|
101
116
|
].join("\n"),
|
|
102
117
|
logout: [
|
|
103
118
|
"Usage:",
|
|
104
119
|
" workbench logout [PROVIDER] [--json]",
|
|
105
120
|
"",
|
|
106
121
|
"With no provider, logs out of Workbench Cloud. With a provider such as codex or claude, removes local adapter auth.",
|
|
122
|
+
"",
|
|
123
|
+
"Example:",
|
|
124
|
+
" workbench logout claude",
|
|
107
125
|
].join("\n"),
|
|
108
126
|
show: [
|
|
109
127
|
"Usage:",
|
|
@@ -111,38 +129,54 @@ const COMMAND_HELP = {
|
|
|
111
129
|
" workbench show REF:PATH [--json]",
|
|
112
130
|
"",
|
|
113
131
|
"Shows a Workbench object, lists files for file-backed objects, or prints one file.",
|
|
132
|
+
"",
|
|
133
|
+
"Example:",
|
|
134
|
+
" workbench show run_abc12345:result.json",
|
|
114
135
|
].join("\n"),
|
|
115
136
|
log: [
|
|
116
137
|
"Usage:",
|
|
117
138
|
" workbench log [--runs|--versions] [--json]",
|
|
118
139
|
"",
|
|
119
140
|
"Shows one reverse-chronological timeline of versions and runs.",
|
|
141
|
+
"",
|
|
142
|
+
"Example:",
|
|
143
|
+
" workbench log --runs",
|
|
120
144
|
].join("\n"),
|
|
121
145
|
diff: [
|
|
122
146
|
"Usage:",
|
|
123
147
|
" workbench diff [A..B] [--json]",
|
|
124
148
|
"",
|
|
125
149
|
"Shows changed files between two Workbench source versions.",
|
|
150
|
+
"",
|
|
151
|
+
"Example:",
|
|
152
|
+
" workbench diff 26059f9a..eac5699c",
|
|
126
153
|
].join("\n"),
|
|
127
154
|
switch: [
|
|
128
155
|
"Usage:",
|
|
129
156
|
" workbench switch VERSION [--json]",
|
|
130
157
|
"",
|
|
131
158
|
"Switches the working skill source to a recorded Workbench version.",
|
|
159
|
+
"",
|
|
160
|
+
"Example:",
|
|
161
|
+
" workbench switch 26059f9a",
|
|
132
162
|
].join("\n"),
|
|
133
163
|
open: [
|
|
134
164
|
"Usage:",
|
|
135
|
-
" workbench open [--host HOST] [--port PORT] [--no-open]
|
|
165
|
+
" workbench open [--host HOST] [--port PORT] [--no-open]",
|
|
136
166
|
"",
|
|
137
|
-
"Serves
|
|
167
|
+
"Serves the read-only Workbench inspection UI.",
|
|
168
|
+
"",
|
|
169
|
+
"Example:",
|
|
170
|
+
" workbench open --no-open",
|
|
138
171
|
].join("\n"),
|
|
139
172
|
case: [
|
|
140
173
|
"Usage:",
|
|
141
|
-
" workbench case
|
|
142
|
-
"
|
|
143
|
-
"
|
|
174
|
+
" workbench case add RUN_ID [--json]",
|
|
175
|
+
"",
|
|
176
|
+
"Captures a regression case from a recorded run.",
|
|
144
177
|
"",
|
|
145
|
-
"
|
|
178
|
+
"Example:",
|
|
179
|
+
" workbench case add run_abc12345",
|
|
146
180
|
].join("\n"),
|
|
147
181
|
agent: [
|
|
148
182
|
"Usage:",
|
|
@@ -151,18 +185,27 @@ const COMMAND_HELP = {
|
|
|
151
185
|
" workbench agent rm NAME [--json]",
|
|
152
186
|
"",
|
|
153
187
|
"Lists, adds, or removes eval agent configurations.",
|
|
188
|
+
"",
|
|
189
|
+
"Example:",
|
|
190
|
+
" workbench agent add claude --adapter claude --model sonnet",
|
|
154
191
|
].join("\n"),
|
|
155
192
|
sync: [
|
|
156
193
|
"Usage:",
|
|
157
194
|
" workbench sync [REMOTE] [--dry-run] [--dir DIR] [--json]",
|
|
158
195
|
"",
|
|
159
196
|
"Plumbing command: synchronizes local evidence and version objects with a Workbench remote.",
|
|
197
|
+
"",
|
|
198
|
+
"Example:",
|
|
199
|
+
" workbench sync cloud --dry-run",
|
|
160
200
|
].join("\n"),
|
|
161
201
|
publish: [
|
|
162
202
|
"Usage:",
|
|
163
203
|
" workbench publish [VERSION] [--as OWNER/SKILL] [--private|--team|--public] [--dry-run] [--dir DIR] [--json]",
|
|
164
204
|
"",
|
|
165
205
|
"Publishes installable skill source to Workbench Cloud. --as sets the linked OWNER/SKILL handle.",
|
|
206
|
+
"",
|
|
207
|
+
"Example:",
|
|
208
|
+
" workbench publish --as acme/earnings-prep --dry-run",
|
|
166
209
|
].join("\n"),
|
|
167
210
|
login: [
|
|
168
211
|
"Usage:",
|
|
@@ -170,14 +213,20 @@ const COMMAND_HELP = {
|
|
|
170
213
|
" workbench logout [PROVIDER] [--json]",
|
|
171
214
|
"",
|
|
172
215
|
"Connects the CLI to Workbench Cloud or captures local adapter auth for a provider.",
|
|
216
|
+
"",
|
|
217
|
+
"Example:",
|
|
218
|
+
" workbench login --start-only --no-open",
|
|
173
219
|
].join("\n"),
|
|
174
220
|
};
|
|
175
221
|
const COMMON_FLAGS = {
|
|
176
222
|
json: "boolean",
|
|
177
223
|
};
|
|
224
|
+
const DIR_FLAG = {
|
|
225
|
+
dir: "string",
|
|
226
|
+
};
|
|
178
227
|
const PROJECT_FLAGS = {
|
|
179
228
|
...COMMON_FLAGS,
|
|
180
|
-
|
|
229
|
+
...DIR_FLAG,
|
|
181
230
|
};
|
|
182
231
|
const HELP_FLAG = {
|
|
183
232
|
help: "boolean",
|
|
@@ -207,7 +256,7 @@ const COMMAND_FLAGS = {
|
|
|
207
256
|
samples: "positive-integer",
|
|
208
257
|
skills: "string",
|
|
209
258
|
},
|
|
210
|
-
install: { ...COMMON_FLAGS, ...HELP_FLAG, "dry-run": "boolean",
|
|
259
|
+
install: { ...COMMON_FLAGS, ...HELP_FLAG, "dry-run": "boolean", to: "repeat-string", yes: "boolean" },
|
|
211
260
|
log: { ...PROJECT_FLAGS, ...HELP_FLAG, runs: "boolean", versions: "boolean" },
|
|
212
261
|
login: {
|
|
213
262
|
...COMMON_FLAGS,
|
|
@@ -224,7 +273,7 @@ const COMMAND_FLAGS = {
|
|
|
224
273
|
},
|
|
225
274
|
logout: { ...COMMON_FLAGS, ...HELP_FLAG },
|
|
226
275
|
new: { ...PROJECT_FLAGS, ...HELP_FLAG },
|
|
227
|
-
open: { ...
|
|
276
|
+
open: { ...DIR_FLAG, ...HELP_FLAG, host: "string", "no-open": "boolean", port: "port" },
|
|
228
277
|
publish: {
|
|
229
278
|
...PROJECT_FLAGS,
|
|
230
279
|
...HELP_FLAG,
|
|
@@ -243,9 +292,7 @@ const COMMAND_FLAGS = {
|
|
|
243
292
|
const SUBCOMMAND_FLAGS = {
|
|
244
293
|
case: {
|
|
245
294
|
flags: {
|
|
246
|
-
list: { ...PROJECT_FLAGS, ...HELP_FLAG },
|
|
247
295
|
add: { ...PROJECT_FLAGS, ...HELP_FLAG },
|
|
248
|
-
rm: { ...PROJECT_FLAGS, ...HELP_FLAG },
|
|
249
296
|
},
|
|
250
297
|
},
|
|
251
298
|
agent: {
|
|
@@ -315,26 +362,31 @@ export async function runCli(argv, io = {
|
|
|
315
362
|
return emitEvalFailure(runs, failedRuns, artifactIds, parsed, io);
|
|
316
363
|
}
|
|
317
364
|
const deltas = await evalDeltas(core, runs);
|
|
318
|
-
const
|
|
365
|
+
const coverage = await evalCoverageSummaries(core, runs);
|
|
366
|
+
const next = await evalSuccessNextCommand(core, runs);
|
|
319
367
|
return emitResult("workbench.cli.eval.v1", {
|
|
320
368
|
result: runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
|
|
369
|
+
coverage: coverage,
|
|
321
370
|
deltas: deltas,
|
|
322
|
-
|
|
371
|
+
next: next,
|
|
323
372
|
}, parsed, io, () => [
|
|
324
373
|
runs.map(formatRun).join("\n"),
|
|
374
|
+
...coverage.map(formatEvalCoverage),
|
|
325
375
|
...deltas.map(formatEvalDelta),
|
|
326
|
-
...(
|
|
376
|
+
...(next ? [`next: ${next}`] : []),
|
|
327
377
|
].filter(Boolean).join("\n"));
|
|
328
378
|
}
|
|
329
379
|
if (command === "improve") {
|
|
330
380
|
if (parsed.flags.cloud === true) {
|
|
331
381
|
return await handleCloudImprove(parsed, io);
|
|
332
382
|
}
|
|
383
|
+
const improverAgent = await resolveLocalImproverAgent(parsed, core);
|
|
333
384
|
const result = await improveWorkbenchSkill({
|
|
334
385
|
...core,
|
|
335
386
|
version: optionalPositional(parsed, 1),
|
|
336
387
|
skill: stringFlag(parsed, "skills"),
|
|
337
388
|
agent: stringFlag(parsed, "agents"),
|
|
389
|
+
...(improverAgent ? { improverAgent } : {}),
|
|
338
390
|
budget: intFlag(parsed, "budget"),
|
|
339
391
|
samples: intFlag(parsed, "samples"),
|
|
340
392
|
});
|
|
@@ -350,12 +402,12 @@ export async function runCli(argv, io = {
|
|
|
350
402
|
skills: stringFlag(parsed, "skills"),
|
|
351
403
|
agents: stringFlag(parsed, "agents"),
|
|
352
404
|
});
|
|
353
|
-
return output(comparison, parsed, io, () => formatComparison(comparison));
|
|
405
|
+
return output(manifestOnly(comparison), parsed, io, () => formatComparison(comparison));
|
|
354
406
|
}
|
|
355
407
|
if (command === "switch") {
|
|
356
408
|
const versionRef = requiredPositional(parsed, 1, "workbench switch requires VERSION.");
|
|
357
409
|
const version = await switchWorkbenchVersion(versionRef, core);
|
|
358
|
-
return output(versionSummary(version), parsed, io, () => `Switched to ${version.id}.`);
|
|
410
|
+
return output(versionSummary(version), parsed, io, () => `Switched to ${displayRef(version.id)}.`);
|
|
359
411
|
}
|
|
360
412
|
if (command === "diff") {
|
|
361
413
|
const range = optionalPositional(parsed, 1) ?? await defaultDiffRange(core);
|
|
@@ -390,7 +442,7 @@ export async function runCli(argv, io = {
|
|
|
390
442
|
}, parsed, io, () => `${result.dryRun ? "Would sync" : "Synced"} ${result.remote.name}: pushed ${result.pushed}, pulled ${result.pulled}${result.upToDate ? " (up to date)" : ""}.`);
|
|
391
443
|
}
|
|
392
444
|
if (command === "publish") {
|
|
393
|
-
const preview = parsed.flags["dry-run"] === true
|
|
445
|
+
const preview = parsed.flags["dry-run"] === true
|
|
394
446
|
? await previewPublishWithDerivedRemote(parsed)
|
|
395
447
|
: undefined;
|
|
396
448
|
if (preview) {
|
|
@@ -403,10 +455,7 @@ export async function runCli(argv, io = {
|
|
|
403
455
|
pinnedInstallUrl: preview.pinnedInstallUrl,
|
|
404
456
|
dryRun: true,
|
|
405
457
|
}, parsed, io, () => [
|
|
406
|
-
`Would publish ${preview.version.id}
|
|
407
|
-
`Visibility: ${preview.visibility}`,
|
|
408
|
-
`Install: ${preview.installUrl}`,
|
|
409
|
-
`Pinned: ${preview.pinnedInstallUrl}`,
|
|
458
|
+
`Would publish ${displayRef(preview.version.id)} as ${preview.installHandle} (${preview.visibility}).`,
|
|
410
459
|
`next: workbench install ${preview.installHandle}`,
|
|
411
460
|
].join("\n"));
|
|
412
461
|
}
|
|
@@ -427,25 +476,18 @@ export async function runCli(argv, io = {
|
|
|
427
476
|
pinnedInstallUrl: result.pinnedInstallUrl,
|
|
428
477
|
...(result.dryRun ? { dryRun: true } : {}),
|
|
429
478
|
}, parsed, io, () => [
|
|
430
|
-
`${result.dryRun ? "Would publish" : "Published"} ${result.version.id}
|
|
431
|
-
`Visibility: ${result.visibility}`,
|
|
432
|
-
`Install: ${result.installUrl}`,
|
|
433
|
-
`Pinned: ${result.pinnedInstallUrl}`,
|
|
479
|
+
`${result.dryRun ? "Would publish" : "Published"} ${displayRef(result.version.id)} as ${result.installHandle} (${result.visibility}).`,
|
|
434
480
|
`next: workbench install ${result.installHandle}`,
|
|
435
481
|
].join("\n"));
|
|
436
482
|
}
|
|
437
483
|
if (command === "open") {
|
|
438
|
-
if (parsed.flags.json === true) {
|
|
439
|
-
const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
|
|
440
|
-
return output(snapshot, parsed, io, () => "Read-only Workbench inspection data is available with --json.");
|
|
441
|
-
}
|
|
442
484
|
// The browser server serves committed object state through a read-only
|
|
443
485
|
// snapshot path, so long-running commands do not block page loads.
|
|
444
486
|
const server = await startWorkbenchOpenServer({
|
|
445
487
|
dir: dirFlag(parsed),
|
|
446
488
|
authToken: core.authToken,
|
|
447
489
|
host: stringFlag(parsed, "host"),
|
|
448
|
-
port:
|
|
490
|
+
port: portFlag(parsed, "port"),
|
|
449
491
|
});
|
|
450
492
|
io.stdout.write(`Workbench: ${server.url}\n`);
|
|
451
493
|
if (parsed.flags["no-open"] !== true) {
|
|
@@ -460,16 +502,18 @@ export async function runCli(argv, io = {
|
|
|
460
502
|
}
|
|
461
503
|
}
|
|
462
504
|
async function handleStatus(parsed, io) {
|
|
463
|
-
const
|
|
505
|
+
const core = await coreOptions(parsed);
|
|
506
|
+
const status = await workbenchStatusSnapshot(core);
|
|
464
507
|
const auth = await workbenchCliAuthStatus();
|
|
508
|
+
const cliStatus = await statusWithCausalNext(status, auth, core);
|
|
465
509
|
return emitResult("workbench.status.v1", {
|
|
466
|
-
project:
|
|
467
|
-
worktree:
|
|
468
|
-
runs:
|
|
469
|
-
remotes:
|
|
510
|
+
project: cliStatus.project,
|
|
511
|
+
worktree: cliStatus.worktree,
|
|
512
|
+
runs: cliStatus.runs,
|
|
513
|
+
remotes: cliStatus.remotes,
|
|
470
514
|
auth: auth,
|
|
471
|
-
next:
|
|
472
|
-
}, parsed, io, () => formatStatusSnapshot({ ...
|
|
515
|
+
next: cliStatus.next,
|
|
516
|
+
}, parsed, io, () => formatStatusSnapshot({ ...cliStatus, auth }));
|
|
473
517
|
}
|
|
474
518
|
async function handleLog(parsed, io) {
|
|
475
519
|
if (parsed.flags.runs === true && parsed.flags.versions === true) {
|
|
@@ -491,7 +535,7 @@ async function handleLog(parsed, io) {
|
|
|
491
535
|
remediation: "Run workbench log, workbench log --runs, or workbench log --versions.",
|
|
492
536
|
});
|
|
493
537
|
}
|
|
494
|
-
const snapshot = await
|
|
538
|
+
const snapshot = await createWorkbenchInspectionSnapshot(await coreOptions(parsed));
|
|
495
539
|
const includeRuns = parsed.flags.versions !== true;
|
|
496
540
|
const includeVersions = parsed.flags.runs !== true;
|
|
497
541
|
const entries = [
|
|
@@ -510,7 +554,7 @@ async function handleLog(parsed, io) {
|
|
|
510
554
|
versionId: run.versionId,
|
|
511
555
|
skillName: run.skillName,
|
|
512
556
|
agentName: run.agentName,
|
|
513
|
-
...(run
|
|
557
|
+
...(scoredRunValue(run) !== undefined ? { score: scoredRunValue(run) } : {}),
|
|
514
558
|
})) : []),
|
|
515
559
|
].sort((left, right) => right.createdAt.localeCompare(left.createdAt));
|
|
516
560
|
return emitResult("workbench.cli.log.v1", {
|
|
@@ -534,21 +578,25 @@ async function handleShow(parsed, io) {
|
|
|
534
578
|
return output(value, parsed, io, () => formatShow(value));
|
|
535
579
|
}
|
|
536
580
|
const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
|
|
537
|
-
const version = snapshot
|
|
581
|
+
const version = snapshotVersionByRef(snapshot, objectRef);
|
|
538
582
|
if (version) {
|
|
539
583
|
return output(fileListing("version", version.id, version.files), parsed, io, () => formatFileListing("version", version.id, version.files));
|
|
540
584
|
}
|
|
541
|
-
const trace = snapshot.traces
|
|
585
|
+
const trace = snapshotObjectByRef(snapshot.traces, objectRef, "trace");
|
|
542
586
|
if (trace) {
|
|
543
587
|
return output(fileListing("trace", trace.id, trace.files), parsed, io, () => formatFileListing("trace", trace.id, trace.files));
|
|
544
588
|
}
|
|
545
|
-
const artifact = snapshot.artifacts
|
|
589
|
+
const artifact = snapshotObjectByRef(snapshot.artifacts, objectRef, "artifact");
|
|
546
590
|
if (artifact) {
|
|
547
591
|
return output(fileListing("artifact", artifact.id, artifact.files), parsed, io, () => formatFileListing("artifact", artifact.id, artifact.files));
|
|
548
592
|
}
|
|
549
593
|
const details = evidenceDetailsForRunOrJob(snapshot, objectRef);
|
|
550
|
-
|
|
551
|
-
|
|
594
|
+
const evidenceFiles = evidenceFilesForRunOrJob(snapshot, objectRef);
|
|
595
|
+
if (details.length > 0 || evidenceFiles.length > 0) {
|
|
596
|
+
return output({
|
|
597
|
+
details: details.map(evidenceDetailSummary),
|
|
598
|
+
files: evidenceFiles.map(fileSummary),
|
|
599
|
+
}, parsed, io, () => formatRunOrJobEvidence(details, evidenceFiles));
|
|
552
600
|
}
|
|
553
601
|
const value = await showWorkbenchRef(ref, core);
|
|
554
602
|
return output(value, parsed, io, () => formatShow(value));
|
|
@@ -581,22 +629,19 @@ async function handleAgent(parsed, io) {
|
|
|
581
629
|
throw new WorkbenchUserError(`Unsupported agent command: ${subcommand}`);
|
|
582
630
|
}
|
|
583
631
|
async function handleCase(parsed, io) {
|
|
584
|
-
const subcommand = requiredPositional(parsed, 1, "workbench case requires
|
|
585
|
-
if (subcommand === "list") {
|
|
586
|
-
const cases = await listWorkbenchCases(await coreOptions(parsed));
|
|
587
|
-
return output(cases, parsed, io, () => cases.map((entry) => `${entry.id}\t${entry.path}`).join("\n") || "No cases.");
|
|
588
|
-
}
|
|
632
|
+
const subcommand = requiredPositional(parsed, 1, "workbench case requires add.");
|
|
589
633
|
if (subcommand === "add") {
|
|
590
634
|
const core = await coreOptions(parsed);
|
|
591
|
-
const sourceRef =
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
635
|
+
const sourceRef = requiredPositional(parsed, 2, "workbench case add requires RUN_ID.");
|
|
636
|
+
rejectExtraInput(parsed, {
|
|
637
|
+
maxPositionals: 3,
|
|
638
|
+
message: "workbench case add accepts one RUN_ID argument.",
|
|
639
|
+
remediation: "Run workbench case add RUN_ID.",
|
|
640
|
+
});
|
|
641
|
+
const record = await addWorkbenchCase({ ...core, fromTraceId: await traceIdForCaseSource(core, sourceRef) });
|
|
642
|
+
return output(record, parsed, io, () => `Added draft case ${record.id}. Edit .workbench/cases/${record.path}/case.yaml before using it as score evidence.`);
|
|
598
643
|
}
|
|
599
|
-
throw new WorkbenchUserError(`
|
|
644
|
+
throw new WorkbenchUserError(`Unknown command: workbench case ${subcommand}`);
|
|
600
645
|
}
|
|
601
646
|
async function handleAdapterLogin(provider, parsed, io) {
|
|
602
647
|
const target = parseAuthTarget(provider, authProfileFlag(parsed));
|
|
@@ -685,11 +730,14 @@ function validateFlagValue(name, value, kind) {
|
|
|
685
730
|
if (typeof value !== "string" || !value.trim()) {
|
|
686
731
|
throw new WorkbenchUserError(`--${name} requires a value.`);
|
|
687
732
|
}
|
|
688
|
-
if (kind === "positive-integer") {
|
|
733
|
+
if (kind === "positive-integer" || kind === "port") {
|
|
689
734
|
const parsedValue = Number(value);
|
|
690
|
-
if (!Number.isInteger(parsedValue) || parsedValue <= 0) {
|
|
735
|
+
if (kind === "positive-integer" && (!Number.isInteger(parsedValue) || parsedValue <= 0)) {
|
|
691
736
|
throw new WorkbenchUserError(`--${name} must be a positive integer.`);
|
|
692
737
|
}
|
|
738
|
+
if (kind === "port" && (!Number.isInteger(parsedValue) || parsedValue < 0 || parsedValue > 65535)) {
|
|
739
|
+
throw new WorkbenchUserError(`--${name} must be an integer between 0 and 65535.`);
|
|
740
|
+
}
|
|
693
741
|
}
|
|
694
742
|
}
|
|
695
743
|
const CONFIG_SCHEMA = "workbench.cli.config.v1";
|
|
@@ -714,7 +762,7 @@ async function handleLogin(parsed, io) {
|
|
|
714
762
|
}
|
|
715
763
|
if (parsed.flags["start-only"] === true && parsed.flags.wait === true) {
|
|
716
764
|
throw new WorkbenchCodedError("usage", "workbench login accepts only one of --start-only or --wait.", {
|
|
717
|
-
remediation: "Run workbench login --start-only or workbench login --wait
|
|
765
|
+
remediation: "Run workbench login --start-only or workbench login --wait.",
|
|
718
766
|
exitCode: 2,
|
|
719
767
|
});
|
|
720
768
|
}
|
|
@@ -723,22 +771,17 @@ async function handleLogin(parsed, io) {
|
|
|
723
771
|
const timeoutSeconds = intFlag(parsed, "timeout");
|
|
724
772
|
if (startOnly && timeoutSeconds !== undefined) {
|
|
725
773
|
throw new WorkbenchCodedError("usage", "workbench login --timeout only applies with --wait.", {
|
|
726
|
-
remediation: "Run workbench login --start-only, then workbench login --wait
|
|
727
|
-
exitCode: 2,
|
|
728
|
-
});
|
|
729
|
-
}
|
|
730
|
-
if (waitOnly && timeoutSeconds === undefined) {
|
|
731
|
-
throw new WorkbenchCodedError("usage", "workbench login --wait requires --timeout N.", {
|
|
732
|
-
remediation: "Run workbench login --wait --timeout 120.",
|
|
774
|
+
remediation: "Run workbench login --start-only, then workbench login --wait.",
|
|
733
775
|
exitCode: 2,
|
|
734
776
|
});
|
|
735
777
|
}
|
|
736
778
|
const config = await loadConfig();
|
|
737
|
-
const
|
|
738
|
-
|
|
779
|
+
const explicitBaseUrl = stringFlag(parsed, "base-url");
|
|
780
|
+
const pending = waitOnly ? await readPendingDeviceAuthorization(explicitBaseUrl) : null;
|
|
781
|
+
const baseUrl = pending?.baseUrl ?? selectWorkbenchBaseUrl({
|
|
782
|
+
explicitBaseUrl,
|
|
739
783
|
configBaseUrl: config.baseUrl,
|
|
740
784
|
});
|
|
741
|
-
const pending = waitOnly ? await readPendingDeviceAuthorization(baseUrl) : null;
|
|
742
785
|
const record = pending ?? await startDeviceAuthorization(baseUrl);
|
|
743
786
|
const freshAuthorization = pending === null;
|
|
744
787
|
if (startOnly) {
|
|
@@ -753,8 +796,8 @@ async function handleLogin(parsed, io) {
|
|
|
753
796
|
verificationUriComplete: record.verification_uri_complete,
|
|
754
797
|
userCode: record.user_code,
|
|
755
798
|
expiresAt: record.expiresAt,
|
|
756
|
-
resume: "workbench login --wait
|
|
757
|
-
}, parsed, io, () => `Open ${record.verification_uri_complete}\nCode: ${record.user_code}\nResume: workbench login --wait
|
|
799
|
+
resume: "workbench login --wait",
|
|
800
|
+
}, parsed, io, () => `Open ${record.verification_uri_complete}\nCode: ${record.user_code}\nResume: workbench login --wait`);
|
|
758
801
|
}
|
|
759
802
|
await writePendingDeviceAuthorization(record);
|
|
760
803
|
if (freshAuthorization && !parsed.flags.json) {
|
|
@@ -801,9 +844,6 @@ async function handleLogout(parsed, io) {
|
|
|
801
844
|
const config = await loadConfig();
|
|
802
845
|
const baseUrl = optionalWorkbenchBaseUrl({ configBaseUrl: config.baseUrl });
|
|
803
846
|
const tokenPresent = Boolean(config.accessToken);
|
|
804
|
-
if (tokenPresent && !baseUrl) {
|
|
805
|
-
throw new WorkbenchUserError("Missing Workbench API URL. Set WORKBENCH_API_URL or run `workbench login --base-url URL`.");
|
|
806
|
-
}
|
|
807
847
|
let revoke = "skipped";
|
|
808
848
|
if (config.accessToken && baseUrl) {
|
|
809
849
|
try {
|
|
@@ -856,18 +896,6 @@ async function handleInstall(parsed, io) {
|
|
|
856
896
|
const snapshot = await fetchWorkbenchInstallSourceSnapshot(workbenchSource, source);
|
|
857
897
|
const sourceSummary = workbenchInstallSourceSummary(workbenchSource, snapshot);
|
|
858
898
|
const config = await loadConfig();
|
|
859
|
-
if (parsed.flags.list === true) {
|
|
860
|
-
return emitResult("workbench.cli.install.v1", {
|
|
861
|
-
source: sourceSummary,
|
|
862
|
-
skills: [snapshot.name],
|
|
863
|
-
fileCount: snapshot.files.length,
|
|
864
|
-
targets: installTargetsToJson(supportedInstallTargets()),
|
|
865
|
-
}, parsed, io, () => [
|
|
866
|
-
`${snapshot.name}\t${snapshot.versionId}\tfiles=${snapshot.files.length}`,
|
|
867
|
-
"Targets:",
|
|
868
|
-
...supportedInstallTargets().map((target) => ` ${target.agent}\t${target.destination}`),
|
|
869
|
-
].join("\n"));
|
|
870
|
-
}
|
|
871
899
|
const toTargets = stringsFlag(parsed, "to");
|
|
872
900
|
const selectedTargets = toTargets.length > 0 ? normalizeInstallTargetNames(toTargets) : await defaultInstallTargetNames(config);
|
|
873
901
|
const targets = resolveInstallTargets({
|
|
@@ -898,29 +926,57 @@ async function handleInstall(parsed, io) {
|
|
|
898
926
|
].join("\n"));
|
|
899
927
|
}
|
|
900
928
|
async function handleCloudEval(parsed, io) {
|
|
901
|
-
const started = await startCloudExecution("eval", parsed);
|
|
929
|
+
const started = await startCloudExecution("eval", parsed, io);
|
|
902
930
|
const artifactIds = await artifactIdsByRunId(started.core, started.runs);
|
|
931
|
+
if (started.detached) {
|
|
932
|
+
const next = cloudDetachedNextCommand(started.runs);
|
|
933
|
+
emitResult("workbench.cli.eval.v1", {
|
|
934
|
+
result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
|
|
935
|
+
detached: true,
|
|
936
|
+
next: next,
|
|
937
|
+
cloud: cloudExecutionSummary(started),
|
|
938
|
+
}, parsed, io, () => [
|
|
939
|
+
`Detached from hosted eval on ${started.remote.url}.`,
|
|
940
|
+
started.runs.map(formatRun).join("\n"),
|
|
941
|
+
...(next ? [`next: ${next}`] : []),
|
|
942
|
+
].filter(Boolean).join("\n"));
|
|
943
|
+
return 130;
|
|
944
|
+
}
|
|
903
945
|
const failedRuns = started.runs.filter((run) => run.status === "failed" || run.status === "canceled");
|
|
904
946
|
if (failedRuns.length > 0) {
|
|
905
947
|
return emitEvalFailure(started.runs, failedRuns, artifactIds, parsed, io);
|
|
906
948
|
}
|
|
907
949
|
const deltas = await evalDeltas(started.core, started.runs);
|
|
908
|
-
const
|
|
950
|
+
const next = await evalSuccessNextCommand(started.core, started.runs);
|
|
909
951
|
return emitResult("workbench.cli.eval.v1", {
|
|
910
952
|
result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
|
|
911
953
|
deltas: deltas,
|
|
912
|
-
|
|
954
|
+
next: next,
|
|
913
955
|
cloud: cloudExecutionSummary(started),
|
|
914
956
|
}, parsed, io, () => [
|
|
915
957
|
`Completed hosted eval on ${started.remote.url}.`,
|
|
916
958
|
started.runs.map(formatRun).join("\n"),
|
|
917
959
|
...deltas.map(formatEvalDelta),
|
|
918
|
-
...(
|
|
960
|
+
...(next ? [`next: ${next}`] : []),
|
|
919
961
|
].filter(Boolean).join("\n"));
|
|
920
962
|
}
|
|
921
963
|
async function handleCloudImprove(parsed, io) {
|
|
922
|
-
const started = await startCloudExecution("improve", parsed);
|
|
964
|
+
const started = await startCloudExecution("improve", parsed, io);
|
|
923
965
|
const artifactIds = await artifactIdsByRunId(started.core, started.runs);
|
|
966
|
+
if (started.detached) {
|
|
967
|
+
const next = cloudDetachedNextCommand(started.runs);
|
|
968
|
+
emitResult("workbench.cli.improve.v1", {
|
|
969
|
+
result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
|
|
970
|
+
detached: true,
|
|
971
|
+
next: next,
|
|
972
|
+
cloud: cloudExecutionSummary(started),
|
|
973
|
+
}, parsed, io, () => [
|
|
974
|
+
`Detached from hosted improve on ${started.remote.url}.`,
|
|
975
|
+
started.runs.map(formatRun).join("\n"),
|
|
976
|
+
...(next ? [`next: ${next}`] : []),
|
|
977
|
+
].filter(Boolean).join("\n"));
|
|
978
|
+
return 130;
|
|
979
|
+
}
|
|
924
980
|
const failedRuns = started.runs.filter((run) => run.status === "failed" || run.status === "canceled");
|
|
925
981
|
if (failedRuns.length > 0) {
|
|
926
982
|
const first = failedRuns[0];
|
|
@@ -934,17 +990,17 @@ async function handleCloudImprove(parsed, io) {
|
|
|
934
990
|
});
|
|
935
991
|
}
|
|
936
992
|
const switchedVersionId = await switchHostedImproveVersionIfPromoted(started);
|
|
937
|
-
const
|
|
993
|
+
const next = cloudImproveNextCommand(started.runs);
|
|
938
994
|
return emitResult("workbench.cli.improve.v1", {
|
|
939
995
|
result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
|
|
940
|
-
|
|
996
|
+
next: next,
|
|
941
997
|
cloud: cloudExecutionSummary(started),
|
|
942
998
|
...(switchedVersionId ? { switchedVersionId } : {}),
|
|
943
999
|
}, parsed, io, () => [
|
|
944
1000
|
`Completed hosted improve on ${started.remote.url}.`,
|
|
945
1001
|
started.runs.map(formatRun).join("\n"),
|
|
946
|
-
...(switchedVersionId ? [`Switched local source to ${switchedVersionId}.`] : []),
|
|
947
|
-
...(
|
|
1002
|
+
...(switchedVersionId ? [`Switched local source to ${displayRef(switchedVersionId)}.`] : []),
|
|
1003
|
+
...(next ? [`next: ${next}`] : []),
|
|
948
1004
|
].filter(Boolean).join("\n"));
|
|
949
1005
|
}
|
|
950
1006
|
async function defaultInstallTargetNames(config) {
|
|
@@ -986,7 +1042,7 @@ async function pathExists(filePath) {
|
|
|
986
1042
|
return false;
|
|
987
1043
|
}
|
|
988
1044
|
}
|
|
989
|
-
async function startCloudExecution(command, parsed) {
|
|
1045
|
+
async function startCloudExecution(command, parsed, io) {
|
|
990
1046
|
const root = dirFlag(parsed) ?? process.cwd();
|
|
991
1047
|
const remote = await ensureCloudRemoteForExecution(root, parsed);
|
|
992
1048
|
const source = parseWorkbenchInstallSource(remote.url);
|
|
@@ -1005,7 +1061,9 @@ async function startCloudExecution(command, parsed) {
|
|
|
1005
1061
|
});
|
|
1006
1062
|
}
|
|
1007
1063
|
const core = { dir: root, authToken: token };
|
|
1064
|
+
writeCloudProgress(io, `workbench cloud: syncing ${remote.name} before hosted ${command}.`);
|
|
1008
1065
|
const syncBefore = await syncWorkbenchRemote({ ...core, remote: remote.name });
|
|
1066
|
+
writeCloudProgress(io, `workbench cloud: synced ${remote.name} before hosted ${command} (pushed=${syncBefore.pushed}, pulled=${syncBefore.pulled}, up-to-date=${syncBefore.upToDate}).`);
|
|
1009
1067
|
const startSnapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
|
|
1010
1068
|
const skillId = await resolveCloudSkillId(source);
|
|
1011
1069
|
const response = await apiRequest(`/api/workbench/skills/${encodeURIComponent(skillId)}${command === "improve" ? "/improve" : "/runs"}`, { method: "POST", body: cloudExecutionRequestBody(command, parsed) }, source.baseUrl);
|
|
@@ -1018,9 +1076,15 @@ async function startCloudExecution(command, parsed) {
|
|
|
1018
1076
|
exitCode: 1,
|
|
1019
1077
|
});
|
|
1020
1078
|
}
|
|
1079
|
+
const initialRunIds = runs.map((run) => run.id);
|
|
1080
|
+
writeCloudProgress(io, `workbench cloud: scheduled hosted ${command} on ${remote.url} (${formatCloudRunStatuses(runs)}).`);
|
|
1021
1081
|
const initialSyncAfter = await syncWorkbenchRemote({ ...core, remote: remote.name });
|
|
1082
|
+
writeCloudProgress(io, `workbench cloud: synced after scheduling hosted ${command} (pushed=${initialSyncAfter.pushed}, pulled=${initialSyncAfter.pulled}, up-to-date=${initialSyncAfter.upToDate}).`);
|
|
1083
|
+
writeCloudProgress(io, `workbench cloud: waiting for terminal status; press Ctrl-C to detach and resume with workbench status or workbench show ${displayRef(initialRunIds[0] ?? "run")}.`);
|
|
1022
1084
|
const completed = await waitForCloudRuns({
|
|
1085
|
+
command,
|
|
1023
1086
|
core,
|
|
1087
|
+
io,
|
|
1024
1088
|
remote,
|
|
1025
1089
|
runs,
|
|
1026
1090
|
initialSync: initialSyncAfter,
|
|
@@ -1029,7 +1093,9 @@ async function startCloudExecution(command, parsed) {
|
|
|
1029
1093
|
core,
|
|
1030
1094
|
remote,
|
|
1031
1095
|
skillId,
|
|
1096
|
+
initialRunIds,
|
|
1032
1097
|
runs: completed.runs,
|
|
1098
|
+
...(completed.detached ? { detached: true } : {}),
|
|
1033
1099
|
startVersionId: startSnapshot.status.currentVersionId ?? startSnapshot.refs.current,
|
|
1034
1100
|
source,
|
|
1035
1101
|
sync: {
|
|
@@ -1053,27 +1119,59 @@ async function waitForCloudRuns(input) {
|
|
|
1053
1119
|
const timeoutMs = positiveIntEnv("WORKBENCH_CLOUD_RUN_TIMEOUT_MS") ?? CLOUD_RUN_TIMEOUT_MS;
|
|
1054
1120
|
const pollIntervalMs = positiveIntEnv("WORKBENCH_CLOUD_RUN_POLL_INTERVAL_MS") ?? CLOUD_RUN_POLL_INTERVAL_MS;
|
|
1055
1121
|
const deadline = Date.now() + timeoutMs;
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
}
|
|
1122
|
+
let runs = [...input.runs];
|
|
1123
|
+
let interrupted = false;
|
|
1124
|
+
const onSigint = () => {
|
|
1125
|
+
interrupted = true;
|
|
1126
|
+
writeCloudProgress(input.io, `workbench cloud: detaching from hosted ${input.command} (${runIds.map(displayRef).join(", ")}).`);
|
|
1127
|
+
};
|
|
1128
|
+
process.once("SIGINT", onSigint);
|
|
1129
|
+
const seenStatuses = new Map();
|
|
1130
|
+
try {
|
|
1131
|
+
while (true) {
|
|
1132
|
+
const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(input.core);
|
|
1133
|
+
const snapshotRuns = runIds
|
|
1134
|
+
.map((id) => snapshot.runs.find((entry) => entry.id === id))
|
|
1135
|
+
.filter((run) => Boolean(run));
|
|
1136
|
+
if (snapshotRuns.length > 0) {
|
|
1137
|
+
runs = runIds.map((id) => snapshotRuns.find((entry) => entry.id === id) ?? runs.find((entry) => entry.id === id))
|
|
1138
|
+
.filter((run) => Boolean(run));
|
|
1139
|
+
}
|
|
1140
|
+
for (const run of runs) {
|
|
1141
|
+
const previous = seenStatuses.get(run.id);
|
|
1142
|
+
if (previous !== run.status) {
|
|
1143
|
+
seenStatuses.set(run.id, run.status);
|
|
1144
|
+
writeCloudProgress(input.io, `workbench cloud: ${displayRef(run.id)} is ${run.status}.`);
|
|
1145
|
+
}
|
|
1146
|
+
}
|
|
1147
|
+
if (runs.length === runIds.length && runs.every(isTerminalRun)) {
|
|
1148
|
+
writeCloudProgress(input.io, `workbench cloud: hosted ${input.command} finished (${formatCloudRunStatuses(runs)}).`);
|
|
1149
|
+
return { runs, sync };
|
|
1150
|
+
}
|
|
1151
|
+
if (interrupted) {
|
|
1152
|
+
return { runs, sync, detached: true };
|
|
1153
|
+
}
|
|
1154
|
+
if (Date.now() >= deadline) {
|
|
1155
|
+
throw new WorkbenchCodedError("cloud_run_pending", "Hosted Workbench run is still running.", {
|
|
1156
|
+
retryable: true,
|
|
1157
|
+
remediation: runIds[0] ? `Run workbench show ${runIds[0]}.` : "Run workbench log --runs.",
|
|
1158
|
+
subject: {
|
|
1159
|
+
runIds,
|
|
1160
|
+
statuses: Object.fromEntries(runs.map((run) => [run.id, run.status])),
|
|
1161
|
+
},
|
|
1162
|
+
exitCode: 1,
|
|
1163
|
+
});
|
|
1164
|
+
}
|
|
1165
|
+
await sleep(pollIntervalMs);
|
|
1166
|
+
if (interrupted) {
|
|
1167
|
+
return { runs, sync, detached: true };
|
|
1168
|
+
}
|
|
1169
|
+
sync = await syncWorkbenchRemote({ ...input.core, remote: input.remote.name });
|
|
1170
|
+
writeCloudProgress(input.io, `workbench cloud: synced ${input.remote.name} while waiting (${formatCloudRunStatuses(runs)}).`);
|
|
1074
1171
|
}
|
|
1075
|
-
|
|
1076
|
-
|
|
1172
|
+
}
|
|
1173
|
+
finally {
|
|
1174
|
+
process.off("SIGINT", onSigint);
|
|
1077
1175
|
}
|
|
1078
1176
|
}
|
|
1079
1177
|
function isTerminalRun(run) {
|
|
@@ -1197,30 +1295,41 @@ function cloudExecutionRequestBody(command, parsed) {
|
|
|
1197
1295
|
...(command === "improve" ? { budget: intFlag(parsed, "budget") } : {}),
|
|
1198
1296
|
};
|
|
1199
1297
|
}
|
|
1200
|
-
function
|
|
1201
|
-
return
|
|
1298
|
+
/**
 * Suggests the follow-up command after a hosted improve.
 *
 * Delegates to cloudExecutionNextCommand with "workbench eval" as the
 * command to suggest when the first run needs no inspection.
 *
 * @param {Array<{id: string, status: string}>} runs - Runs of the hosted improve.
 * @returns {string} The command line to suggest next.
 */
function cloudImproveNextCommand(runs) {
    return cloudExecutionNextCommand(runs, "workbench eval");
}
|
|
1203
|
-
function
|
|
1204
|
-
|
|
1301
|
+
/**
 * Suggests how to resume after detaching from a hosted execution.
 *
 * @param {Array<{id?: string}>} runs - Runs known at detach time.
 * @returns {string} `workbench show <short ref>` for the first run when it
 *   has an id, otherwise `workbench status`.
 */
function cloudDetachedNextCommand(runs) {
    const first = runs[0];
    return first?.id ? `workbench show ${displayRef(first.id)}` : "workbench status";
}
|
|
1206
|
-
function
|
|
1305
|
+
/**
 * Picks the next command to suggest after a hosted execution.
 *
 * @param {Array<{id: string, status: string}>} runs - Runs of the execution.
 * @param {string} successCommand - Command to suggest when the first run is
 *   not running, failed, or canceled.
 * @returns {string} `workbench log --runs` when there are no runs,
 *   `workbench show <short ref>` when the first run is running, failed, or
 *   canceled, and `successCommand` otherwise.
 */
function cloudExecutionNextCommand(runs, successCommand) {
    const first = runs[0];
    if (!first) {
        return "workbench log --runs";
    }
    if (first.status === "running" || first.status === "failed" || first.status === "canceled") {
        return `workbench show ${displayRef(first.id)}`;
    }
    return successCommand;
}
|
|
1216
1315
|
/**
 * Builds the `cloud` section of a hosted eval/improve result.
 *
 * @param {object} started - Result of starting a hosted execution; reads
 *   `remote.name`, `remote.url`, `skillId`, `initialRunIds`, `detached`
 *   and `sync`.
 * @returns {object} Summary object; `detached: true` is present only when
 *   `started.detached` is truthy.
 */
function cloudExecutionSummary(started) {
    return {
        remote: started.remote.name,
        url: started.remote.url,
        skillId: started.skillId,
        initialRunIds: started.initialRunIds,
        ...(started.detached ? { detached: true } : {}),
        sync: started.sync,
    };
}
|
|
1325
|
+
/**
 * Writes one progress line to the CLI's stderr stream.
 *
 * @param {{stderr: {write: (chunk: string) => unknown}}} io - CLI I/O handles.
 * @param {string} message - Progress text; a trailing newline is appended.
 */
function writeCloudProgress(io, message) {
    io.stderr.write(`${message}\n`);
}
|
|
1328
|
+
/**
 * Formats run statuses for a progress line.
 *
 * @param {Array<{id: string, status: string}>} runs - Runs to describe.
 * @returns {string} Comma-separated `<short ref>:<status>` pairs, or
 *   "no runs" for an empty list.
 */
function formatCloudRunStatuses(runs) {
    return runs.length > 0
        ? runs.map((run) => `${displayRef(run.id)}:${run.status}`).join(", ")
        : "no runs";
}
|
|
1224
1333
|
function workbenchInstallSourceSummary(source, snapshot) {
|
|
1225
1334
|
const installUrl = `${source.baseUrl}/skills/${encodeURIComponent(source.owner)}/${encodeURIComponent(source.skill)}`;
|
|
1226
1335
|
return {
|
|
@@ -1294,12 +1403,13 @@ async function fetchWorkbenchInstallSourceSnapshot(source, displaySource) {
|
|
|
1294
1403
|
throw new WorkbenchCodedError("auth_required", token
|
|
1295
1404
|
? `Workbench Cloud rejected the provided token while installing ${displaySource}.`
|
|
1296
1405
|
: `Authentication is required to install ${displaySource}.`, {
|
|
1297
|
-
remediation:
|
|
1406
|
+
remediation: "Run workbench login.",
|
|
1298
1407
|
exitCode: 1,
|
|
1299
1408
|
});
|
|
1300
1409
|
}
|
|
1301
1410
|
if (!response.ok) {
|
|
1302
|
-
|
|
1411
|
+
const excerpt = readResponseError(text);
|
|
1412
|
+
throw new WorkbenchCodedError("install_failed", `Unable to download Workbench source ${displaySource}: ${response.status}${excerpt ? ` ${excerpt}` : response.statusText ? ` ${response.statusText}` : ""}`, {
|
|
1303
1413
|
subject: { source: displaySource, status: response.status },
|
|
1304
1414
|
exitCode: 1,
|
|
1305
1415
|
});
|
|
@@ -1420,18 +1530,15 @@ function deviceAuthPath() {
|
|
|
1420
1530
|
return process.env.WORKBENCH_DEVICE_AUTH?.trim() || path.join(path.dirname(configPath()), "device-auth.json");
|
|
1421
1531
|
}
|
|
1422
1532
|
/**
 * Resolves the Workbench API base URL for a command.
 *
 * Delegates entirely to optionalWorkbenchBaseUrl.
 *
 * @param {object} [input] - Base URL candidates, forwarded unchanged.
 * @returns {string} The value returned by optionalWorkbenchBaseUrl.
 */
function selectWorkbenchBaseUrl(input = {}) {
    return optionalWorkbenchBaseUrl(input);
}
|
|
1429
1535
|
/**
 * Picks the Workbench API base URL from the available candidates.
 *
 * Precedence: explicit flag value, origin base URL, the WORKBENCH_API_URL
 * environment variable, the configured base URL, then
 * DEFAULT_WORKBENCH_CLOUD_BASE_URL.
 *
 * Blank candidates are skipped, so an empty value such as
 * `WORKBENCH_API_URL=` no longer yields an empty base URL. This matches how
 * deviceAuthPath treats a blank WORKBENCH_DEVICE_AUTH.
 *
 * @param {{explicitBaseUrl?: string, originBaseUrl?: string, configBaseUrl?: string}} [input]
 * @returns {string} The chosen base URL after normalizeBaseUrl.
 */
function optionalWorkbenchBaseUrl(input = {}) {
    const candidates = [
        input.explicitBaseUrl,
        input.originBaseUrl,
        process.env.WORKBENCH_API_URL,
        input.configBaseUrl,
    ];
    const value = candidates.find((candidate) => typeof candidate === "string" && candidate.trim() !== "") ??
        DEFAULT_WORKBENCH_CLOUD_BASE_URL;
    return normalizeBaseUrl(value);
}
|
|
1436
1543
|
function normalizeBaseUrl(value) {
|
|
1437
1544
|
return value.trim().replace(/\/+$/u, "");
|
|
@@ -1449,7 +1556,8 @@ async function requestDeviceAuthorization(baseUrl) {
|
|
|
1449
1556
|
});
|
|
1450
1557
|
}
|
|
1451
1558
|
if (!response.ok) {
|
|
1452
|
-
|
|
1559
|
+
const excerpt = readResponseError(text);
|
|
1560
|
+
throw new WorkbenchCodedError("login_denied", `Device login failed: ${response.status}${excerpt ? ` ${excerpt}` : response.statusText ? ` ${response.statusText}` : ""}`, {
|
|
1453
1561
|
exitCode: 1,
|
|
1454
1562
|
});
|
|
1455
1563
|
}
|
|
@@ -1500,7 +1608,7 @@ async function pollDeviceToken(baseUrl, authorization, timeoutSeconds) {
|
|
|
1500
1608
|
}
|
|
1501
1609
|
throw new WorkbenchCodedError("login_pending", "Device login is still waiting for browser authorization.", {
|
|
1502
1610
|
retryable: true,
|
|
1503
|
-
remediation: "Authorize the device in the browser, then run workbench login --wait
|
|
1611
|
+
remediation: "Authorize the device in the browser, then run workbench login --wait.",
|
|
1504
1612
|
subject: {
|
|
1505
1613
|
retryAfterSeconds: Math.max(1, Math.ceil(intervalMs / 1000)),
|
|
1506
1614
|
verificationUri: authorization.verification_uri,
|
|
@@ -1524,7 +1632,8 @@ async function fetchWorkbenchUsername(baseUrl, accessToken) {
|
|
|
1524
1632
|
}
|
|
1525
1633
|
async function readPendingDeviceAuthorization(baseUrl) {
|
|
1526
1634
|
const record = await readDeviceAuthorizationJson(deviceAuthPath());
|
|
1527
|
-
|
|
1635
|
+
const expectedBaseUrl = baseUrl ? normalizeBaseUrl(baseUrl) : undefined;
|
|
1636
|
+
if (!record || (expectedBaseUrl && record.baseUrl !== expectedBaseUrl) || Date.parse(record.expiresAt) <= Date.now()) {
|
|
1528
1637
|
return null;
|
|
1529
1638
|
}
|
|
1530
1639
|
return record;
|
|
@@ -1614,7 +1723,8 @@ async function apiRequest(apiPath, options = {}, baseUrlOverride) {
|
|
|
1614
1723
|
}
|
|
1615
1724
|
throw requestError;
|
|
1616
1725
|
}
|
|
1617
|
-
const
|
|
1726
|
+
const excerpt = readResponseError(text);
|
|
1727
|
+
const requestError = new WorkbenchApiRequestError(response.status, `Request failed with status ${response.status}${response.statusText ? ` ${response.statusText}` : ""}${excerpt ? `: ${excerpt}` : ""}.`, text);
|
|
1618
1728
|
lastError = requestError;
|
|
1619
1729
|
if (canRetry && attempt < API_REQUEST_MAX_ATTEMPTS && isTransientApiRequestError(requestError)) {
|
|
1620
1730
|
await sleep(250 * attempt);
|
|
@@ -1707,12 +1817,22 @@ function readResponseError(text) {
|
|
|
1707
1817
|
const parsed = JSON.parse(text);
|
|
1708
1818
|
const record = asRecord(parsed);
|
|
1709
1819
|
const error = record?.error ?? record?.message;
|
|
1710
|
-
return typeof error === "string" && error.trim() ? error : null;
|
|
1820
|
+
return typeof error === "string" && error.trim() ? oneLineExcerpt(error) : null;
|
|
1711
1821
|
}
|
|
1712
1822
|
catch {
|
|
1713
|
-
|
|
1823
|
+
if (/<(?:!doctype|html|head|body)\b/iu.test(text)) {
|
|
1824
|
+
return null;
|
|
1825
|
+
}
|
|
1826
|
+
return oneLineExcerpt(text);
|
|
1714
1827
|
}
|
|
1715
1828
|
}
|
|
1829
|
+
/**
 * Collapses text to a single line of at most 180 UTF-16 code units.
 *
 * Whitespace runs become one space and the result is trimmed. Longer text
 * is cut to 177 code units plus "...". The cut never leaves a dangling high
 * surrogate, so the excerpt stays well-formed when it falls inside an
 * astral character such as an emoji.
 *
 * @param {string} text - Raw text, e.g. an HTTP error body.
 * @returns {string | null} The excerpt, or null when nothing but whitespace remains.
 */
function oneLineExcerpt(text) {
    const line = text.replace(/\s+/gu, " ").trim();
    if (!line) {
        return null;
    }
    if (line.length <= 180) {
        return line;
    }
    let head = line.slice(0, 177);
    const lastUnit = head.charCodeAt(head.length - 1);
    // A high surrogate at the cut point means slice() split a surrogate pair.
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        head = head.slice(0, -1);
    }
    return `${head}...`;
}
|
|
1716
1836
|
function parseWorkbenchCloudErrorBody(text) {
|
|
1717
1837
|
try {
|
|
1718
1838
|
const record = asRecord(JSON.parse(text));
|
|
@@ -2168,6 +2288,17 @@ function intFlag(parsed, name) {
|
|
|
2168
2288
|
}
|
|
2169
2289
|
return parsedValue;
|
|
2170
2290
|
}
|
|
2291
|
+
/**
 * Reads a TCP port number from a string flag.
 *
 * Only plain decimal digits are accepted. Number() alone would also accept
 * whitespace-only values (as 0) and hex or exponent forms such as "0x50"
 * or "1e3".
 *
 * @param {object} parsed - Parsed CLI arguments.
 * @param {string} name - Flag name without the leading dashes.
 * @returns {number | undefined} The port, or undefined when the flag is absent.
 * @throws {WorkbenchUserError} When the value is not an integer in 0..65535.
 */
function portFlag(parsed, name) {
    const value = stringFlag(parsed, name);
    if (!value) {
        return undefined;
    }
    const text = value.trim();
    const parsedValue = /^\d+$/u.test(text) ? Number(text) : Number.NaN;
    if (!Number.isInteger(parsedValue) || parsedValue < 0 || parsedValue > 65535) {
        throw new WorkbenchUserError(`--${name} must be an integer between 0 and 65535.`);
    }
    return parsedValue;
}
|
|
2171
2302
|
function optionalPositional(parsed, index) {
|
|
2172
2303
|
return parsed.positionals[index];
|
|
2173
2304
|
}
|
|
@@ -2227,19 +2358,15 @@ function parsePublishVisibilityFlags(parsed) {
|
|
|
2227
2358
|
}
|
|
2228
2359
|
async function previewPublishWithDerivedRemote(parsed) {
|
|
2229
2360
|
const root = path.resolve(dirFlag(parsed) ?? process.cwd());
|
|
2230
|
-
const core = await coreOptions(parsed);
|
|
2231
|
-
await listWorkbenchVersions(core);
|
|
2232
2361
|
const reconciledSnapshot = await createWorkbenchReadOnlyInspectionSnapshot({ dir: root });
|
|
2233
2362
|
const link = cloudRemoteLinkTargetFromRemotes(reconciledSnapshot.remotes);
|
|
2234
|
-
|
|
2235
|
-
|
|
2236
|
-
|
|
2237
|
-
const remote = await derivePublishCloudRemote(parsed, "workbench publish", link.name);
|
|
2363
|
+
const remote = stringFlag(parsed, "as") || !link.existing
|
|
2364
|
+
? await derivePublishCloudRemote(parsed, "workbench publish", link.name)
|
|
2365
|
+
: link.existing;
|
|
2238
2366
|
const requestedVersion = optionalPositional(parsed, 1);
|
|
2239
|
-
const
|
|
2240
|
-
? requestedVersion
|
|
2241
|
-
: reconciledSnapshot.status.currentVersionId ?? reconciledSnapshot.refs.current;
|
|
2242
|
-
const version = reconciledSnapshot.versions.find((entry) => entry.id === versionId);
|
|
2367
|
+
const version = requestedVersion && requestedVersion !== "current"
|
|
2368
|
+
? snapshotVersionByRef(reconciledSnapshot, requestedVersion)
|
|
2369
|
+
: snapshotVersionByRef(reconciledSnapshot, reconciledSnapshot.status.currentVersionId ?? reconciledSnapshot.refs.current ?? "");
|
|
2243
2370
|
if (!version) {
|
|
2244
2371
|
throw new WorkbenchCodedError("version_not_found", `Version not found: ${requestedVersion ?? "current"}`, {
|
|
2245
2372
|
remediation: "Run workbench log --versions.",
|
|
@@ -2390,7 +2517,7 @@ async function artifactIdsByRunId(core, runs) {
|
|
|
2390
2517
|
return byRun;
|
|
2391
2518
|
}
|
|
2392
2519
|
function emitEvalFailure(runs, failedRuns, artifactIds, parsed, io) {
|
|
2393
|
-
const
|
|
2520
|
+
const next = evalFailureNextCommand(failedRuns);
|
|
2394
2521
|
if (parsed.flags.json === true) {
|
|
2395
2522
|
io.stdout.write(`${JSON.stringify({
|
|
2396
2523
|
schema: "workbench.cli.eval.v1",
|
|
@@ -2401,14 +2528,14 @@ function emitEvalFailure(runs, failedRuns, artifactIds, parsed, io) {
|
|
|
2401
2528
|
evidenceSaved: true,
|
|
2402
2529
|
runs: runs.map((run) => runFailureSummary(run, artifactIds.get(run.id) ?? [])),
|
|
2403
2530
|
failedRuns: failedRuns.map((run) => runFailureSummary(run, artifactIds.get(run.id) ?? [])),
|
|
2404
|
-
|
|
2531
|
+
next,
|
|
2405
2532
|
}, null, 2)}\n`);
|
|
2406
2533
|
return 1;
|
|
2407
2534
|
}
|
|
2408
2535
|
io.stdout.write([
|
|
2409
2536
|
"Eval failed; evidence was saved.",
|
|
2410
2537
|
...failedRuns.map(formatRun),
|
|
2411
|
-
...(
|
|
2538
|
+
...(next ? [`next: ${next}`] : []),
|
|
2412
2539
|
].join("\n") + "\n");
|
|
2413
2540
|
return 1;
|
|
2414
2541
|
}
|
|
@@ -2420,7 +2547,7 @@ function runSummary(run, artifactIds) {
|
|
|
2420
2547
|
versionId: run.versionId,
|
|
2421
2548
|
skillName: run.skillName,
|
|
2422
2549
|
agentName: run.agentName,
|
|
2423
|
-
...(run
|
|
2550
|
+
...(scoredRunValue(run) !== undefined ? { score: scoredRunValue(run) } : {}),
|
|
2424
2551
|
...(run.latencyMs !== undefined ? { latencyMs: run.latencyMs } : {}),
|
|
2425
2552
|
...(run.error ? { error: run.error } : {}),
|
|
2426
2553
|
...(run.jobIds ? { jobIds: run.jobIds } : {}),
|
|
@@ -2435,23 +2562,18 @@ function runFailureSummary(run, artifactIds) {
|
|
|
2435
2562
|
skill: run.skillName,
|
|
2436
2563
|
status: run.status,
|
|
2437
2564
|
versionId: run.versionId,
|
|
2438
|
-
...(run
|
|
2565
|
+
...(scoredRunValue(run) !== undefined ? { score: scoredRunValue(run) } : {}),
|
|
2439
2566
|
...(run.error ? { error: run.error } : {}),
|
|
2440
2567
|
traceIds: run.traceIds,
|
|
2441
2568
|
artifactIds: [...artifactIds],
|
|
2442
2569
|
};
|
|
2443
2570
|
}
|
|
2444
|
-
function
|
|
2571
|
+
/**
 * Suggests the next command after a failed eval.
 *
 * @param {Array<{id: string}>} failedRuns - Runs that failed or were canceled.
 * @returns {string} `workbench show <short ref>` for the first failed run,
 *   or `workbench log --runs` when the list is empty.
 */
function evalFailureNextCommand(failedRuns) {
    const first = failedRuns[0];
    if (!first) {
        return "workbench log --runs";
    }
    return `workbench show ${displayRef(first.id)}`;
}
|
|
2456
2578
|
function output(value, parsed, io, text) {
|
|
2457
2579
|
return emitResult(commandSchema(parsed), { result: value }, parsed, io, text);
|
|
@@ -2484,12 +2606,285 @@ async function workbenchCliAuthStatus() {
|
|
|
2484
2606
|
})),
|
|
2485
2607
|
};
|
|
2486
2608
|
}
|
|
2609
|
+
/**
 * Returns a run's score only when the run succeeded.
 *
 * @param {{status: string, score?: unknown}} run
 * @returns {number | undefined} The numeric score of a succeeded run, else undefined.
 */
function scoredRunValue(run) {
    return run.status === "succeeded" && typeof run.score === "number" ? run.score : undefined;
}
|
|
2612
|
+
/**
 * Returns a job's score only when the job succeeded.
 *
 * @param {{status: string, score?: unknown}} job
 * @returns {number | undefined} The numeric score of a succeeded job, else undefined.
 */
function scoredJobValue(job) {
    return job.status === "succeeded" && typeof job.score === "number" ? job.score : undefined;
}
|
|
2615
|
+
/**
 * Tells whether the current version contains at least one non-smoke case.
 *
 * Looks at text files named `.workbench/cases/<name>/case.yaml` (or `.yml`)
 * in the current version and reports true when any of them lacks a
 * `smoke: true` line.
 *
 * @param {object} snapshot - Inspection snapshot; reads `status.currentVersionId`,
 *   `refs.current` and the matching version's `files`.
 * @returns {boolean} False when the current version cannot be resolved or
 *   only contains smoke cases.
 */
function snapshotHasWorkflowCase(snapshot) {
    const currentVersion = snapshotVersionByRef(snapshot, snapshot.status.currentVersionId ?? snapshot.refs.current ?? "");
    const caseFiles = currentVersion?.files.filter((file) => file.kind === "text" &&
        /^\.workbench\/cases\/[^/]+\/case\.ya?ml$/u.test(file.path)) ?? [];
    // The content is prefixed with "\n" so a `smoke: true` on the first line also matches.
    return caseFiles.some((file) => !/\n\s*smoke:\s*true(?:\s|$)/u.test(`\n${file.content}`));
}
|
|
2621
|
+
/**
 * Derives the argument to suggest for `workbench install` from a status remote.
 *
 * Prefers the publication install URL when the remote is published,
 * otherwise uses the remote URL.
 *
 * @param {object} remote - Remote entry from the status snapshot.
 * @returns {string} `<owner>/<skill>` when the URL parses as an install
 *   source, otherwise the URL itself.
 */
function installHandleFromStatusRemote(remote) {
    const publicationUrl = remote.publication.status === "published" ? remote.publication.installUrl : undefined;
    const source = parseWorkbenchInstallSource(publicationUrl ?? remote.url);
    return source ? `${source.owner}/${source.skill}` : publicationUrl ?? remote.url;
}
|
|
2626
|
+
/**
 * Adds a `next` suggestion to a status snapshot.
 *
 * The first matching rule wins, in this order:
 *   1. project not initialized: status is returned unchanged;
 *   2. most recent run is running, failed, or canceled: `workbench show <run>`;
 *   3. cloud auth is missing and either publishing is possible or a cloud
 *      remote is out of date or unpublished: `workbench login`;
 *   4. a remote's sync is in error: `workbench sync <remote>`;
 *   5. no runs yet: `workbench eval`;
 *   6. no non-smoke case: edit cases, then eval;
 *   7. publishable with no cloud remote, or an up-to-date unpublished cloud
 *      remote: `workbench publish`;
 *   8. a published cloud remote with an install URL: `workbench install <handle>`;
 *   9. otherwise `next: null`.
 *
 * @param {object} status - Status snapshot to extend.
 * @param {object} auth - Auth status; reads `workbenchCloud.status`.
 * @param {object} core - Core options used to load the inspection snapshot.
 * @returns {Promise<object>} The status, with `next` set unless the project
 *   is uninitialized.
 */
async function statusWithCausalNext(status, auth, core) {
    if (!status.project.initialized) {
        return status;
    }
    // Best effort: a snapshot failure falls back to status-only checks.
    const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core).catch(() => null);
    const lastRun = snapshot?.runs
        .slice()
        .sort((left, right) => right.createdAt.localeCompare(left.createdAt))[0];
    if ((lastRun?.status === "running" || lastRun?.status === "failed" || lastRun?.status === "canceled") && lastRun.id) {
        return { ...status, next: `workbench show ${displayRef(lastRun.id)}` };
    }
    const failedRemote = status.remotes.find((remote) => remote.sync.status === "error");
    const hasWorkflowCase = snapshot ? snapshotHasWorkflowCase(snapshot) : false;
    const hasScoredRun = snapshot?.runs.some((run) => scoredRunValue(run) !== undefined) ?? false;
    const canPublish = hasWorkflowCase && hasScoredRun;
    const cloudAuthMissing = auth.workbenchCloud.status !== "authenticated";
    const cloudRemoteNeedsAuth = status.remotes.some((remote) => remote.kind === "workbench-cloud" &&
        (remote.sync.status !== "up_to_date" || remote.publication.status === "unpublished"));
    if (cloudAuthMissing && (canPublish || cloudRemoteNeedsAuth)) {
        return { ...status, next: "workbench login" };
    }
    if (failedRemote) {
        return { ...status, next: `workbench sync ${failedRemote.name}` };
    }
    if ((snapshot?.runs.length ?? status.runs.total) === 0) {
        return { ...status, next: "workbench eval" };
    }
    if (!hasWorkflowCase) {
        return { ...status, next: "edit .workbench/cases, then run workbench eval" };
    }
    const cloudRemote = status.remotes.find((remote) => remote.kind === "workbench-cloud");
    if (canPublish && !cloudRemote) {
        return { ...status, next: "workbench publish" };
    }
    const unpublishedCloudRemote = status.remotes.find((remote) => remote.kind === "workbench-cloud" &&
        remote.publication.status === "unpublished" &&
        remote.sync.status === "up_to_date");
    if (unpublishedCloudRemote) {
        return { ...status, next: "workbench publish" };
    }
    const publishedCloudRemote = status.remotes.find((remote) => remote.kind === "workbench-cloud" &&
        remote.publication.status === "published" &&
        Boolean(remote.publication.installUrl));
    if (publishedCloudRemote) {
        return { ...status, next: `workbench install ${installHandleFromStatusRemote(publishedCloudRemote)}` };
    }
    return {
        ...status,
        next: null,
    };
}
|
|
2677
|
+
/**
 * Shortens an object id for display.
 *
 * - `v_<hex>` with at least 8 hex digits becomes the first 8 hex digits,
 *   without the `v_` prefix.
 * - `<prefix>_<suffix>` becomes `<prefix>_` plus the first 8 characters of
 *   the suffix (the split is at the first underscore).
 * - Anything else is cut to its first 8 characters.
 *
 * @param {string} id - Full object id.
 * @returns {string} The shortened ref.
 */
function displayRef(id) {
    const version = /^v_([0-9a-f]{8,})$/iu.exec(id);
    if (version?.[1]) {
        return version[1].slice(0, 8);
    }
    const separator = id.indexOf("_");
    // Only split when both sides of the first underscore are non-empty.
    if (separator > 0 && separator < id.length - 1) {
        const prefix = id.slice(0, separator);
        const suffix = id.slice(separator + 1);
        return `${prefix}_${suffix.slice(0, 8)}`;
    }
    return id.length > 8 ? id.slice(0, 8) : id;
}
|
|
2690
|
+
/**
 * Replaces full object ids inside a command string with their short refs.
 *
 * Matches `v_<8+ hex>` and `run_`/`job_`/`trace_`/`artifact_` ids
 * (case-insensitive) and passes each match through displayRef.
 *
 * @param {string} command - Command text that may contain ids.
 * @returns {string} The command with each matched id shortened.
 */
function shortenCommandRefs(command) {
    return command.replace(/\b(?:v_[0-9a-f]{8,}|(?:run|job|trace|artifact)_[a-z0-9_-]+)/giu, (match) => displayRef(match));
}
|
|
2693
|
+
/**
 * Resolves a version ref (full id, hash, prefix, or "current") in a snapshot.
 *
 * @param {object} snapshot - Inspection snapshot; reads `refs.current` and `versions`.
 * @param {string} ref - Requested ref; surrounding whitespace is ignored.
 * @returns {object | undefined} The single matching version, or undefined
 *   when the ref is blank or nothing matches.
 * @throws {WorkbenchCodedError} `ref_ambiguous` (exit code 2) when more
 *   than one version matches.
 */
function snapshotVersionByRef(snapshot, ref) {
    const requested = ref.trim();
    const normalized = requested === "current" ? snapshot.refs.current ?? "" : requested;
    if (!normalized) {
        return undefined;
    }
    const candidates = snapshot.versions.filter((version) => snapshotVersionRefMatches(version, normalized));
    if (candidates.length > 1) {
        throw new WorkbenchCodedError("ref_ambiguous", `Version ref is ambiguous: ${ref}. Candidates: ${candidates.map((version) => displayRef(version.id)).join(", ")}.`, {
            subject: { ref, candidates: candidates.map((version) => version.id) },
            exitCode: 2,
        });
    }
    return candidates[0];
}
|
|
2708
|
+
/**
 * Tests whether a ref identifies a version by id or hash, exactly or by prefix.
 *
 * The ref may be given with or without the `v_` prefix. A prefixed ref is
 * also compared against the hash, and an unprefixed ref against the `v_` id.
 *
 * @param {{id: string, hash: string}} version - Candidate version.
 * @param {string} ref - Non-empty, already trimmed ref.
 * @returns {boolean} True when the ref matches the version.
 */
function snapshotVersionRefMatches(version, ref) {
    const withoutVersionPrefix = ref.startsWith("v_") ? ref.slice(2) : ref;
    return version.id === ref ||
        version.hash === ref ||
        version.id.startsWith(ref) ||
        version.hash.startsWith(ref) ||
        version.hash.startsWith(withoutVersionPrefix) ||
        version.id.startsWith(`v_${withoutVersionPrefix}`);
}
|
|
2717
|
+
/**
 * Resolves a run/job-style ref among snapshot entries by id or id prefix.
 *
 * @param {Array<{id: string}>} entries - Candidate objects.
 * @param {string} ref - Requested ref; surrounding whitespace is ignored.
 * @param {string} kind - Object kind, used only in the error message.
 * @returns {object | undefined} The single matching entry, or undefined
 *   when the ref is blank or nothing matches.
 * @throws {WorkbenchCodedError} `ref_ambiguous` (exit code 2) when several
 *   entries match. The message lists up to 8 candidates and the subject up to 20.
 */
function snapshotObjectByRef(entries, ref, kind) {
    const normalized = ref.trim();
    if (!normalized) {
        return undefined;
    }
    const candidates = entries.filter((entry) => objectRefMatches(entry.id, normalized));
    if (candidates.length > 1) {
        throw new WorkbenchCodedError("ref_ambiguous", `${capitalize(kind)} ref is ambiguous: ${ref}. Candidates: ${candidates.map((entry) => displayRef(entry.id)).slice(0, 8).join(", ")}.`, {
            subject: { ref, candidates: candidates.map((entry) => entry.id).slice(0, 20) },
            exitCode: 2,
        });
    }
    return candidates[0];
}
|
|
2731
|
+
/**
 * Tests whether a ref matches an object id.
 *
 * A ref matches when it equals the id, is a prefix of the id, or is a
 * prefix of the part after the id's first underscore (so `abc` matches
 * `run_abc123`).
 *
 * @param {string} id - Full object id.
 * @param {string} ref - Non-empty, already trimmed ref.
 * @returns {boolean} True when the ref matches.
 */
function objectRefMatches(id, ref) {
    if (id === ref || id.startsWith(ref)) {
        return true;
    }
    const separator = id.indexOf("_");
    return separator > 0 && id.slice(separator + 1).startsWith(ref);
}
|
|
2738
|
+
/**
 * Upper-cases the first UTF-16 code unit of a string.
 *
 * @param {string} value - Text to capitalize.
 * @returns {string} The text with its first character upper-cased; an
 *   empty string is returned unchanged.
 */
function capitalize(value) {
    return value.length > 0 ? `${value[0].toUpperCase()}${value.slice(1)}` : value;
}
|
|
2741
|
+
/**
 * Resolves a ref to either a run (with all of its jobs) or a single job.
 *
 * @param {object} snapshot - Inspection snapshot; reads `runs` and `jobs`.
 * @param {string} ref - Run or job ref.
 * @returns {{run?: object, jobs: object[]}} `{ run, jobs }` for a run,
 *   `{ jobs: [job] }` for a job, `{ jobs: [] }` when nothing matches.
 * @throws {WorkbenchCodedError} `ref_ambiguous` (exit code 2) when the ref
 *   matches both a run and a job.
 */
function runOrJobEvidenceSelection(snapshot, ref) {
    const run = snapshotObjectByRef(snapshot.runs, ref, "run");
    const job = snapshotObjectByRef(snapshot.jobs, ref, "job");
    if (run && job) {
        throw new WorkbenchCodedError("ref_ambiguous", `Run/job ref is ambiguous: ${ref}. Candidates: ${displayRef(run.id)}, ${displayRef(job.id)}.`, {
            subject: { ref, candidates: [run.id, job.id] },
            exitCode: 2,
        });
    }
    if (run) {
        return {
            run,
            jobs: snapshot.jobs.filter((entry) => entry.runId === run.id),
        };
    }
    return job ? { jobs: [job] } : { jobs: [] };
}
|
|
2758
|
+
/**
 * Collects the user-facing evidence files for a run or job ref.
 *
 * For each selected job, artifact files are placed under
 * `cases/<case>/jobs/<job>/` and trace files under
 * `cases/<case>/jobs/<job>/traces/<trace>/`. Files rejected by
 * isUserFacingEvidenceFile are dropped. When several files map to the same
 * path, only the first is kept.
 *
 * @param {object} snapshot - Inspection snapshot; reads `runs`, `jobs`,
 *   `traces` and `artifacts`.
 * @param {string} ref - Run or job ref.
 * @returns {object[]} Evidence files with rewritten `path`; empty when the
 *   ref matches nothing.
 */
function evidenceFilesForRunOrJob(snapshot, ref) {
    const selection = runOrJobEvidenceSelection(snapshot, ref);
    if (!selection.run && selection.jobs.length === 0) {
        return [];
    }
    const traceById = new Map(snapshot.traces.map((trace) => [trace.id, trace]));
    const artifactById = new Map(snapshot.artifacts.map((artifact) => [artifact.id, artifact]));
    const files = selection.jobs.flatMap((job) => [
        ...job.artifactIds.flatMap((artifactId) => {
            const artifact = artifactById.get(artifactId);
            return artifact
                ? artifact.files.filter(isUserFacingEvidenceFile).map((file) => evidenceFileWithPath(file, `cases/${evidencePathSegment(job.caseId)}/jobs/${evidencePathSegment(job.id)}/${file.path}`))
                : [];
        }),
        ...job.traceIds.flatMap((traceId) => {
            const trace = traceById.get(traceId);
            return trace
                ? trace.files.filter(isUserFacingEvidenceFile).map((file) => evidenceFileWithPath(file, `cases/${evidencePathSegment(job.caseId)}/jobs/${evidencePathSegment(job.id)}/traces/${evidencePathSegment(trace.id)}/${file.path}`))
                : [];
        }),
    ]);
    // Keep the first file seen for each destination path.
    const seen = new Set();
    return files.filter((file) => {
        if (seen.has(file.path)) {
            return false;
        }
        seen.add(file.path);
        return true;
    });
}
|
|
2788
|
+
/**
 * Returns a copy of an evidence file with a new, normalized path.
 *
 * Backslashes become forward slashes and leading slashes are removed.
 *
 * @param {object} file - Source file record; not mutated.
 * @param {string} filePath - Destination path for the copy.
 * @returns {object} Shallow copy of `file` with `path` replaced.
 */
function evidenceFileWithPath(file, filePath) {
    return {
        ...file,
        path: filePath.replace(/\\/gu, "/").replace(/^\/+/u, ""),
    };
}
|
|
2794
|
+
/**
 * Tells whether an evidence file should be shown to the user.
 *
 * A file is hidden when any segment of its path is exactly `.workbench`.
 * Backslashes are treated as path separators.
 *
 * @param {{path: string}} file - Evidence file record.
 * @returns {boolean} True when no path segment is `.workbench`.
 */
function isUserFacingEvidenceFile(file) {
    const normalized = file.path.replace(/\\/gu, "/").replace(/^\/+/u, "");
    return normalized.split("/").every((segment) => segment !== ".workbench");
}
|
|
2798
|
+
/**
 * Turns an id into a single safe path segment.
 *
 * Runs of characters outside `[A-Za-z0-9._-]` become one `-`. An empty
 * result becomes `_`. The segments `.` and `..` also become `_`, because
 * they would otherwise act as directory references in the evidence paths
 * built from this value.
 *
 * @param {string} value - Case, job, or trace id.
 * @returns {string} A non-empty segment that is never `.` or `..`.
 */
function evidencePathSegment(value) {
    const segment = value.replace(/[^A-Za-z0-9._-]+/gu, "-");
    if (segment === "" || segment === "." || segment === "..") {
        return "_";
    }
    return segment;
}
|
|
2801
|
+
/**
 * Renders run/job evidence as plain text.
 *
 * @param {object[]} details - Evidence details, each formatted with formatTraceDetail.
 * @param {Array<{path: string}>} files - Evidence files, listed under a `Files:` heading.
 * @returns {string} Newline-joined detail and file lines, or "No evidence."
 *   when both are empty.
 */
function formatRunOrJobEvidence(details, files) {
    const detailLines = details.map(formatTraceDetail).filter(Boolean);
    const fileLines = files.length > 0 ? ["Files:", ...files.map((file) => file.path)] : [];
    return [...detailLines, ...fileLines].join("\n") || "No evidence.";
}
|
|
2806
|
+
/**
 * Reduces an evidence detail to a compact summary.
 *
 * Sessions keep only their label, and trace events, spans and summaries
 * are replaced by their counts.
 *
 * @param {object} detail - Evidence detail with `runId` and `executions`.
 * @returns {object} `{ runId, executions: [{ id, status, jobIds, sessions, trace }] }`.
 */
function evidenceDetailSummary(detail) {
    return {
        runId: detail.runId,
        executions: detail.executions.map((execution) => ({
            id: execution.id,
            status: execution.status,
            jobIds: execution.jobIds,
            sessions: execution.sessions.map((session) => ({
                label: session.label,
            })),
            trace: {
                events: execution.trace.events.length,
                spans: execution.trace.spans.length,
                summaries: execution.trace.summaries.length,
            },
        })),
    };
}
|
|
2824
|
+
/**
 * Recursively copies a value, replacing file records with fileSummary output.
 *
 * - null, strings, numbers and booleans are returned as-is;
 * - arrays are mapped element by element;
 * - other non-objects (undefined, functions, ...) become null;
 * - an object with string `path` and string `content` goes through fileSummary;
 * - any other object is copied key by key, skipping undefined values.
 *
 * @param {unknown} value - Value to copy.
 * @returns {unknown} The copied value.
 */
function manifestOnly(value) {
    if (value === null || typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
        return value;
    }
    if (Array.isArray(value)) {
        return value.map(manifestOnly);
    }
    if (!value || typeof value !== "object") {
        return null;
    }
    const record = value;
    if (typeof record.path === "string" && typeof record.content === "string") {
        return fileSummary(record);
    }
    const out = {};
    for (const [key, child] of Object.entries(record)) {
        if (child === undefined) {
            continue;
        }
        out[key] = manifestOnly(child);
    }
    return out;
}
|
|
2847
|
+
/**
 * Picks a locally connected adapter to act as the improver agent, if one is needed.
 *
 * Resolves to `undefined` (nothing to override) when:
 *  - `stringFlag(parsed, "agents")` yields a truthy value, or
 *  - the default agent (the project's `defaultAgent`, else the first listed
 *    agent) exists and `workbenchSkillImproveCanUseQueuedAdapter` accepts it.
 *
 * Otherwise the local adapter auth store is consulted: connected "claude" and
 * "codex" entries are kept, ordered claude first, then by most recent
 * `updatedAt`, and the first one is turned into an agent definition.
 *
 * Failures while listing agents, reading status, or listing auth status are
 * tolerated and treated as "nothing found".
 *
 * @param {object} parsed - Parsed CLI arguments (read through `stringFlag`).
 * @param {object} core - Core handle passed to the workbench-core helpers.
 * @returns {Promise<{ name: string, adapter: string, config: { auth: * } } | undefined>}
 * @throws {WorkbenchCodedError} `auth_required` when no connected claude/codex entry exists.
 */
async function resolveLocalImproverAgent(parsed, core) {
    if (stringFlag(parsed, "agents")) {
        return undefined;
    }
    const agents = await listWorkbenchAgents(core).catch(() => []);
    const status = await workbenchStatusSnapshot(core).catch(() => undefined);
    const defaultAgentName = status?.project.defaultAgent ?? agents[0]?.name;
    const defaultAgent = agents.find(({ name }) => name === defaultAgentName);
    if (defaultAgent && workbenchSkillImproveCanUseQueuedAdapter(defaultAgent)) {
        return undefined;
    }
    const rankOf = (adapterId) => {
        switch (adapterId) {
            case "claude":
                return 0;
            case "codex":
                return 1;
            default:
                return 2;
        }
    };
    // Missing or unparsable timestamps count as 0, i.e. oldest.
    const timestampOf = (entry) => Date.parse(entry.updatedAt ?? "") || 0;
    const byPreference = (left, right) => {
        const rankDelta = rankOf(left.adapterId) - rankOf(right.adapterId);
        if (rankDelta !== 0) {
            return rankDelta;
        }
        return timestampOf(right) - timestampOf(left);
    };
    const isUsable = (entry) => entry.status === "connected" &&
        (entry.adapterId === "claude" || entry.adapterId === "codex");
    const statuses = await localWorkbenchAdapterAuthStore(adapterAuthStoreRoot()).listStatus().catch(() => []);
    const selected = statuses.filter(isUsable).sort(byPreference)[0];
    if (!selected) {
        throw new WorkbenchCodedError("auth_required", "workbench improve needs a connected improver.", {
            remediation: "Run workbench login claude (or codex) to connect an improver.",
            exitCode: 1,
        });
    }
    // A slotted entry is keyed by its slot; otherwise the profile is used directly.
    const auth = selected.slot ? { [selected.slot]: selected.profile } : selected.profile;
    return {
        name: selected.adapterId,
        adapter: selected.adapterId,
        config: { auth },
    };
}
|
|
2487
2882
|
/**
 * Formats one log entry as a tab-separated line.
 *
 * Version entries show creation time, display ref, file count and message.
 * All other entries are rendered as runs: creation time, display ref, status,
 * version ref, skill, agent and score ("n/a" when the score is undefined).
 *
 * @param {object} entry - Log entry; `kind === "version"` selects the version layout.
 * @returns {string} The formatted line.
 */
function formatLogEntry(entry) {
    if (entry.kind === "version") {
        const versionFields = [
            `${entry.createdAt}`,
            "version",
            `${displayRef(entry.id)}`,
            `files=${entry.fileCount}`,
            `${entry.message}`,
        ];
        return versionFields.join("\t");
    }
    const scoreText = entry.score !== undefined ? entry.score.toFixed(3) : "n/a";
    const runFields = [
        `${entry.createdAt}`,
        "run",
        `${displayRef(entry.id)}`,
        `${entry.status}`,
        `version=${displayRef(entry.versionId)}`,
        `skill=${entry.skillName}`,
        `agent=${entry.agentName}`,
        `score=${scoreText}`,
    ];
    return runFields.join("\t");
}
|
|
2494
2889
|
function splitShowRef(ref) {
|
|
2495
2890
|
const index = ref.indexOf(":");
|
|
@@ -2500,18 +2895,14 @@ function splitShowRef(ref) {
|
|
|
2500
2895
|
}
|
|
2501
2896
|
async function fileForRunOrJobRef(core, objectRef, requestedPath) {
|
|
2502
2897
|
const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
|
|
2503
|
-
const
|
|
2504
|
-
|
|
2505
|
-
if (!run && !job) {
|
|
2898
|
+
const selection = runOrJobEvidenceSelection(snapshot, objectRef);
|
|
2899
|
+
if (!selection.run && selection.jobs.length === 0) {
|
|
2506
2900
|
return null;
|
|
2507
2901
|
}
|
|
2508
|
-
const
|
|
2509
|
-
const
|
|
2510
|
-
|
|
2511
|
-
|
|
2512
|
-
if (file) {
|
|
2513
|
-
return file;
|
|
2514
|
-
}
|
|
2902
|
+
const files = evidenceFilesForRunOrJob(snapshot, objectRef);
|
|
2903
|
+
const file = findShowFile(files, requestedPath, objectRef);
|
|
2904
|
+
if (file) {
|
|
2905
|
+
return file;
|
|
2515
2906
|
}
|
|
2516
2907
|
throw new WorkbenchCodedError("ref_not_found", `File not found in ${objectRef}: ${requestedPath}`, {
|
|
2517
2908
|
remediation: `Run workbench show ${objectRef}.`,
|
|
@@ -2520,12 +2911,8 @@ async function fileForRunOrJobRef(core, objectRef, requestedPath) {
|
|
|
2520
2911
|
});
|
|
2521
2912
|
}
|
|
2522
2913
|
function evidenceDetailsForRunOrJob(snapshot, ref) {
|
|
2523
|
-
const
|
|
2524
|
-
|
|
2525
|
-
const jobs = run
|
|
2526
|
-
? snapshot.jobs.filter((entry) => entry.runId === run.id)
|
|
2527
|
-
: job ? [job] : [];
|
|
2528
|
-
return jobs.flatMap((entry) => {
|
|
2914
|
+
const selection = runOrJobEvidenceSelection(snapshot, ref);
|
|
2915
|
+
return selection.jobs.flatMap((entry) => {
|
|
2529
2916
|
const detail = workbenchJobEvidenceForSnapshot(snapshot, {
|
|
2530
2917
|
runId: entry.runId,
|
|
2531
2918
|
jobId: entry.id,
|
|
@@ -2536,12 +2923,72 @@ function evidenceDetailsForRunOrJob(snapshot, ref) {
|
|
|
2536
2923
|
execution.trace.events.length > 0 ||
|
|
2537
2924
|
execution.trace.summaries.length > 0));
|
|
2538
2925
|
}
|
|
2539
|
-
function findShowFile(files, requestedPath) {
|
|
2926
|
+
/**
 * Resolves a requested path to a single evidence file.
 *
 * Resolution order (backslashes in the request are treated as "/"):
 *  1. Exact path matches.
 *  2. Suffix / basename matches. For "stderr.log" only non-empty files are
 *     considered candidates. Among candidates, files outside "/traces/" and
 *     "/artifacts/" (per `isCanonicalEvidenceFileCandidate`) are preferred.
 *  3. If every "stderr.log" candidate was filtered out as empty, the lone
 *     suffix match (or an equivalent group of them) is used.
 *
 * At each stage a group resolves when it has exactly one file or when
 * `singleEquivalentShowFile` reports that all of its files are equivalent.
 *
 * @param {Array<object>} files - Available files (`path`, `content`, ...).
 * @param {string} requestedPath - Path the user asked for.
 * @param {string} objectRef - Ref used in the ambiguity error.
 * @returns {object | null} The matching file, or `null` when nothing matches.
 * @throws The error built by `ambiguousShowPath` when several distinct files match.
 */
function findShowFile(files, requestedPath, objectRef) {
    const wanted = requestedPath.replace(/\\/gu, "/");
    // The only member of a group, or the shared file when all members are equivalent; otherwise null.
    const soleOrEquivalent = (group) => (group.length === 1 ? group[0] : singleEquivalentShowFile(group));

    const exactMatches = files.filter(({ path: filePath }) => filePath === wanted);
    const exactPick = soleOrEquivalent(exactMatches);
    if (exactPick) {
        return exactPick;
    }
    if (exactMatches.length > 1) {
        throw ambiguousShowPath(objectRef, requestedPath, exactMatches);
    }

    const suffixMatches = files.filter((file) => file.path.endsWith(`/${wanted}`) || path.basename(file.path) === wanted);
    if (suffixMatches.length === 0) {
        return null;
    }
    const candidates = wanted === "stderr.log"
        ? suffixMatches.filter((file) => file.content.length > 0)
        : suffixMatches;

    const canonical = candidates.filter(isCanonicalEvidenceFileCandidate);
    const canonicalPick = soleOrEquivalent(canonical);
    if (canonicalPick) {
        return canonicalPick;
    }
    if (canonical.length > 1) {
        throw ambiguousShowPath(objectRef, requestedPath, canonical);
    }

    const candidatePick = soleOrEquivalent(candidates);
    if (candidatePick) {
        return candidatePick;
    }
    if (candidates.length === 0 && suffixMatches.length === 1) {
        return suffixMatches[0];
    }
    const suffixPick = singleEquivalentShowFile(suffixMatches);
    if (suffixPick) {
        return suffixPick;
    }
    throw ambiguousShowPath(objectRef, requestedPath, candidates.length > 0 ? candidates : suffixMatches);
}
|
|
2973
|
+
/**
 * Reports whether a file counts as a canonical evidence candidate, i.e. its
 * path contains neither "/traces/" nor "/artifacts/".
 *
 * @param {{ path: string }} file - File record; only `path` is read.
 * @returns {boolean} `true` when neither marker occurs in the path.
 */
function isCanonicalEvidenceFileCandidate(file) {
    const nonCanonicalMarkers = ["/traces/", "/artifacts/"];
    return nonCanonicalMarkers.every((marker) => !file.path.includes(marker));
}
|
|
2976
|
+
/**
 * Collapses a group of files into one representative when they are all
 * equivalent (same `kind`, `encoding` and `content`).
 *
 * @param {Array<object>} files - Files to compare.
 * @returns {object | null} The first file when there are at least two files and
 *   all of them are equivalent; `null` otherwise (including for 0 or 1 files).
 */
function singleEquivalentShowFile(files) {
    if (files.length < 2) {
        return null;
    }
    const reference = files[0];
    const differsFromReference = (file) => file.kind !== reference.kind ||
        file.encoding !== reference.encoding ||
        file.content !== reference.content;
    if (files.some(differsFromReference)) {
        return null;
    }
    return reference;
}
|
|
2985
|
+
/**
 * Builds (does not throw) the `ref_ambiguous` error used when a requested
 * path matches several distinct files.
 *
 * @param {string} objectRef - Ref the path was looked up in.
 * @param {string} requestedPath - Path as requested by the user.
 * @param {Array<{ path: string }>} candidates - Files that matched.
 * @returns {WorkbenchCodedError} Error listing the candidate paths, exit code 2.
 */
function ambiguousShowPath(objectRef, requestedPath, candidates) {
    const candidatePaths = candidates.map(({ path: candidatePath }) => candidatePath);
    const message = `File path is ambiguous in ${objectRef}: ${requestedPath}. Candidates: ${candidatePaths.join(", ")}.`;
    const details = {
        remediation: `Run workbench show ${objectRef}.`,
        subject: { ref: objectRef, path: requestedPath, candidates: candidatePaths },
        exitCode: 2,
    };
    return new WorkbenchCodedError("ref_ambiguous", message, details);
}
|
|
2546
2993
|
function fileListing(kind, id, files) {
|
|
2547
2994
|
return {
|
|
@@ -2552,17 +2999,16 @@ function fileListing(kind, id, files) {
|
|
|
2552
2999
|
};
|
|
2553
3000
|
}
|
|
2554
3001
|
/**
 * Formats a file listing: a tab-separated header (kind, display ref, file
 * count) followed by one file path per line.
 *
 * @param {string} kind - Object kind shown in the header.
 * @param {string} id - Object id, rendered through `displayRef`.
 * @param {Array<{ path: string }>} files - Files to list.
 * @returns {string} Newline-joined listing.
 */
function formatFileListing(kind, id, files) {
    const header = `${kind}\t${displayRef(id)}\tfiles=${files.length}`;
    const paths = files.map((file) => file.path);
    return [header, ...paths].join("\n");
}
|
|
2557
3004
|
async function traceIdForCaseSource(core, ref) {
|
|
2558
3005
|
const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
|
|
2559
|
-
const trace = snapshot.traces
|
|
3006
|
+
const trace = snapshotObjectByRef(snapshot.traces, ref, "trace");
|
|
2560
3007
|
if (trace) {
|
|
2561
3008
|
return trace.id;
|
|
2562
3009
|
}
|
|
2563
|
-
const
|
|
2564
|
-
const
|
|
2565
|
-
const traceId = run?.traceIds[0] ?? job?.traceIds[0];
|
|
3010
|
+
const selection = runOrJobEvidenceSelection(snapshot, ref);
|
|
3011
|
+
const traceId = selection.run?.traceIds[0] ?? selection.jobs[0]?.traceIds[0];
|
|
2566
3012
|
if (traceId) {
|
|
2567
3013
|
return traceId;
|
|
2568
3014
|
}
|
|
@@ -2572,43 +3018,85 @@ async function traceIdForCaseSource(core, ref) {
|
|
|
2572
3018
|
exitCode: 1,
|
|
2573
3019
|
});
|
|
2574
3020
|
}
|
|
3021
|
+
/**
 * Computes per-run coverage counts from the jobs in a read-only inspection snapshot.
 *
 * For every run the summary reports the number of distinct cases, distinct
 * (case, sample) pairs, total jobs, succeeded jobs, and failed jobs (jobs
 * whose status is "failed" or "canceled").
 *
 * @param {object} core - Core handle used to create the snapshot.
 * @param {Array<{ id: string }>} runs - Runs to summarize.
 * @returns {Promise<Array<{ runId: string, cases: number, samples: number, jobs: number, succeeded: number, failed: number }>>}
 */
async function evalCoverageSummaries(core, runs) {
    const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
    // Group the snapshot's jobs by the run they belong to.
    const jobsByRun = new Map();
    for (const job of snapshot.jobs) {
        let bucket = jobsByRun.get(job.runId);
        if (bucket === undefined) {
            bucket = [];
            jobsByRun.set(job.runId, bucket);
        }
        bucket.push(job);
    }
    return runs.map(({ id: runId }) => {
        const jobs = jobsByRun.get(runId) ?? [];
        const caseIds = new Set();
        const sampleKeys = new Set();
        let succeeded = 0;
        let failed = 0;
        for (const job of jobs) {
            caseIds.add(job.caseId);
            // NUL-separated so the case id and sample form one composite key.
            sampleKeys.add(`${job.caseId}\0${job.sample}`);
            if (job.status === "succeeded") {
                succeeded += 1;
            }
            else if (job.status === "failed" || job.status === "canceled") {
                failed += 1;
            }
        }
        return {
            runId,
            cases: caseIds.size,
            samples: sampleKeys.size,
            jobs: jobs.length,
            succeeded,
            failed,
        };
    });
}
|
|
3043
|
+
/**
 * Formats a coverage summary as a single space-separated line.
 * The `failed=` part is only included when at least one job failed.
 *
 * @param {{ cases: number, samples: number, jobs: number, failed: number }} coverage
 * @returns {string} e.g. "coverage cases=2 samples=4 jobs=4 failed=1".
 */
function formatEvalCoverage(coverage) {
    const parts = [
        `coverage cases=${coverage.cases}`,
        `samples=${coverage.samples}`,
        `jobs=${coverage.jobs}`,
    ];
    if (coverage.failed > 0) {
        parts.push(`failed=${coverage.failed}`);
    }
    return parts.join(" ");
}
|
|
2575
3051
|
/**
 * Pairs each run with the most recent earlier scored run of the same skill
 * and agent, and reports the score change.
 *
 * Scores are read through `scoredRunValue`. `score`, `previousScore` and
 * `delta` are only present on a result when the corresponding values exist.
 *
 * @param {object} core - Core handle used to create the inspection snapshot.
 * @param {Array<object>} runs - Runs to compute deltas for.
 * @returns {Promise<Array<object>>} One delta record per input run.
 */
async function evalDeltas(core, runs) {
    const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
    return runs.map((run) => {
        const score = scoredRunValue(run);
        // Earlier scored runs of the same skill/agent, excluding the run itself.
        const isComparable = (candidate) => candidate.id !== run.id &&
            candidate.skillName === run.skillName &&
            candidate.agentName === run.agentName &&
            scoredRunValue(candidate) !== undefined &&
            candidate.createdAt < run.createdAt;
        const newestFirst = (left, right) => right.createdAt.localeCompare(left.createdAt);
        const previous = snapshot.runs.filter(isComparable).sort(newestFirst)[0];
        const previousScore = previous ? scoredRunValue(previous) : undefined;
        const result = {
            runId: run.id,
            versionId: run.versionId,
            skillName: run.skillName,
            agentName: run.agentName,
        };
        if (score !== undefined) {
            result.score = score;
        }
        if (previousScore !== undefined) {
            result.previousScore = previousScore;
        }
        if (score !== undefined && previousScore !== undefined) {
            result.delta = score - previousScore;
        }
        return result;
    });
}
|
|
2596
3074
|
/**
 * Formats a delta record as "<skill> <version ref> <score>", followed by
 * " (was <previous>, <signed change>)" when a previous score is known.
 *
 * @param {object} delta - Record with `skillName`, `versionId` and optional
 *   `score`, `previousScore`, `delta`.
 * @returns {string} The formatted line, or "" when the record has no score.
 */
function formatEvalDelta(delta) {
    const { score, previousScore, delta: change } = delta;
    if (score === undefined) {
        return "";
    }
    const headline = `${delta.skillName} ${displayRef(delta.versionId)} ${score.toFixed(3)}`;
    if (previousScore === undefined || change === undefined) {
        return headline;
    }
    // Negative numbers already carry their sign; only non-negative ones get "+".
    const sign = change >= 0 ? "+" : "";
    return `${headline} (was ${previousScore.toFixed(3)}, ${sign}${change.toFixed(3)})`;
}
|
|
2604
|
-
function
|
|
2605
|
-
|
|
3085
|
+
/**
 * Chooses the "next" hint shown after a successful eval.
 *
 *  - no runs: "workbench eval"
 *  - no run with a score (per `scoredRunValue`): edit cases, then eval again
 *  - otherwise: "workbench publish" when `snapshotHasWorkflowCase` accepts the
 *    inspection snapshot, else the edit-cases hint
 *
 * @param {object} core - Core handle used to create the inspection snapshot.
 * @param {Array<object>} runs - Runs produced by the eval.
 * @returns {Promise<string>} The suggested next command or instruction.
 */
async function evalSuccessNextCommand(core, runs) {
    const editCasesHint = "edit .workbench/cases, then run workbench eval";
    if (runs.length === 0) {
        return "workbench eval";
    }
    const hasScoredRun = runs.some((run) => scoredRunValue(run) !== undefined);
    if (!hasScoredRun) {
        return editCasesHint;
    }
    const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
    if (snapshotHasWorkflowCase(snapshot)) {
        return "workbench publish";
    }
    return editCasesHint;
}
|
|
2607
3095
|
function formatStatusSnapshot(status) {
|
|
2608
3096
|
const lines = [
|
|
2609
3097
|
`Root: ${status.project.root}`,
|
|
2610
3098
|
`Initialized: ${status.project.initialized ? "yes" : "no"}`,
|
|
2611
|
-
...(status.project.currentVersionId ? [`Current version: ${status.project.currentVersionId}`] : []),
|
|
3099
|
+
...(status.project.currentVersionId ? [`Current version: ${displayRef(status.project.currentVersionId)}`] : []),
|
|
2612
3100
|
...(status.project.defaultSkill ? [`Default skill: ${status.project.defaultSkill}`] : []),
|
|
2613
3101
|
...(status.project.defaultAgent ? [`Default agent: ${status.project.defaultAgent}`] : []),
|
|
2614
3102
|
`Runs: ${status.runs.total}${status.runs.lastStatus ? ` (last ${status.runs.lastStatus})` : ""}`,
|
|
@@ -2618,7 +3106,7 @@ function formatStatusSnapshot(status) {
|
|
|
2618
3106
|
? [
|
|
2619
3107
|
"publication=published",
|
|
2620
3108
|
remote.publication.visibility ? `visibility=${remote.publication.visibility}` : undefined,
|
|
2621
|
-
remote.publication.versionId ? `version=${remote.publication.versionId}` : undefined,
|
|
3109
|
+
remote.publication.versionId ? `version=${displayRef(remote.publication.versionId)}` : undefined,
|
|
2622
3110
|
remote.publication.installUrl ? `install=${remote.publication.installUrl}` : undefined,
|
|
2623
3111
|
remote.publication.pinnedInstallUrl ? `pinned=${remote.publication.pinnedInstallUrl}` : undefined,
|
|
2624
3112
|
].filter(Boolean).join("\t")
|
|
@@ -2629,17 +3117,16 @@ function formatStatusSnapshot(status) {
|
|
|
2629
3117
|
? [
|
|
2630
3118
|
` error[${remote.sync.lastError.code}]: ${remote.sync.lastError.message}`,
|
|
2631
3119
|
...(remote.sync.lastAttemptAt ? [` last attempt: ${remote.sync.lastAttemptAt}`] : []),
|
|
2632
|
-
...(remote.sync.nextCommand ? [` next: ${remote.sync.nextCommand}`] : []),
|
|
2633
3120
|
]
|
|
2634
3121
|
: []),
|
|
2635
3122
|
];
|
|
2636
3123
|
})] : ["Remotes: none"]),
|
|
2637
|
-
...(status.next
|
|
3124
|
+
...(status.next ? [`next: ${shortenCommandRefs(status.next)}`] : []),
|
|
2638
3125
|
];
|
|
2639
3126
|
return lines.join("\n");
|
|
2640
3127
|
}
|
|
2641
3128
|
/**
 * Formats a version as "<display ref>\t<first 12 hash chars>\t<message>".
 *
 * @param {{ id: string, hash: string, message: string }} version
 * @returns {string} Tab-separated line.
 */
function formatVersion(version) {
    const ref = displayRef(version.id);
    const shortHash = version.hash.slice(0, 12);
    return `${ref}\t${shortHash}\t${version.message}`;
}
|
|
2644
3131
|
function versionSummary(version) {
|
|
2645
3132
|
return {
|
|
@@ -2655,50 +3142,52 @@ function formatAgent(agent) {
|
|
|
2655
3142
|
return `${agent.name}\t${agent.adapter}${agent.model ? `\t${agent.model}` : ""}`;
|
|
2656
3143
|
}
|
|
2657
3144
|
/**
 * Formats a run as one tab-separated line: display ref, kind, status,
 * version ref, skill, agent, score and latency.
 *
 * The score comes from `scoredRunValue`; an undefined score or latency is
 * shown as "n/a".
 *
 * @param {object} run - Run record.
 * @returns {string} Tab-separated line.
 */
function formatRun(run) {
    const rawScore = scoredRunValue(run);
    const scoreText = rawScore !== undefined ? rawScore.toFixed(3) : "n/a";
    const latencyText = run.latencyMs !== undefined ? `${run.latencyMs}ms` : "n/a";
    const fields = [
        `${displayRef(run.id)}`,
        `${run.kind}`,
        `${run.status}`,
        `version=${displayRef(run.versionId)}`,
        `skill=${run.skillName}`,
        `agent=${run.agentName}`,
        `score=${scoreText}`,
        `latency=${latencyText}`,
    ];
    return fields.join("\t");
}
|
|
2662
3150
|
/**
 * Formats the outcome of an improve run as two lines: which version was
 * improved into which (with the formatted run), and whether the improved
 * version was switched to.
 *
 * The source version is the first parent id when present, otherwise "current".
 *
 * @param {object} result - Improve result with `version`, `run`, `switched`
 *   and `promotionReason`.
 * @returns {string} Two newline-separated lines.
 */
function formatImproveResult(result) {
    const { version, run } = result;
    const parentId = version.parentIds[0];
    const fromRef = parentId ? displayRef(parentId) : "current";
    const headline = `Improved ${fromRef} -> ${displayRef(version.id)}. ${formatRun(run)}`;
    const outcome = result.switched
        ? "Switched to improved version."
        : `Did not switch: ${result.promotionReason}`;
    return `${headline}\n${outcome}`;
}
|
|
2670
3158
|
/**
 * Formats a job as one tab-separated line: display ref, run ref, case,
 * sample, status, score and duration.
 *
 * The score comes from `scoredJobValue`; an undefined score or duration is
 * shown as "n/a".
 *
 * @param {object} job - Job record.
 * @returns {string} Tab-separated line.
 */
function formatJob(job) {
    const rawScore = scoredJobValue(job);
    const scoreText = rawScore !== undefined ? rawScore.toFixed(3) : "n/a";
    const durationText = job.durationMs !== undefined ? `${job.durationMs}ms` : "n/a";
    const fields = [
        `${displayRef(job.id)}`,
        `run=${displayRef(job.runId)}`,
        `case=${job.caseId}`,
        `sample=${job.sample}`,
        `${job.status}`,
        `score=${scoreText}`,
        `duration=${durationText}`,
    ];
    return fields.join("\t");
}
|
|
2675
3164
|
/**
 * Renders a comparison as a tab-separated table: a fixed header row followed
 * by one row per cell.
 *
 * Missing status is shown as "not-run"; missing score, cost, latency and run
 * id are shown as "n/a". The agent column is "<name>@<short agent hash>".
 *
 * @param {{ cells: Iterable<object> }} comparison - Comparison with its cells.
 * @returns {string} Newline-joined table.
 */
function formatComparison(comparison) {
    const header = "version\tskill\tagent\tstatus\tscore\tcost\tlatency\trun";
    const rows = Array.from(comparison.cells, (cell) => {
        const versionText = displayRef(cell.versionId);
        const agentText = `${cell.agentName}@${shortObjectId(cell.agentHash)}`;
        const statusText = cell.status ?? "not-run";
        const scoreText = cell.score === undefined ? "n/a" : cell.score.toFixed(3);
        const costText = cell.costUsd === undefined ? "n/a" : `$${cell.costUsd.toFixed(4)}`;
        const latencyText = cell.latencyMs === undefined ? "n/a" : `${cell.latencyMs}ms`;
        const runText = cell.runId ? displayRef(cell.runId) : "n/a";
        return [
            versionText,
            cell.skillName,
            agentText,
            statusText,
            scoreText,
            costText,
            latencyText,
            runText,
        ].join("\t");
    });
    return [header, ...rows].join("\n");
}
|
|
2691
3180
|
/**
 * Shortens an object id to at most its first 8 characters.
 *
 * @param {string} id - Full object id.
 * @returns {string} The id itself when it has 8 or fewer characters, otherwise its first 8.
 */
function shortObjectId(id) {
    if (id.length <= 8) {
        return id;
    }
    return id.slice(0, 8);
}
|
|
2694
3183
|
function formatTrace(trace) {
|
|
2695
3184
|
const result = asRecord(trace.result);
|
|
2696
3185
|
const status = typeof result?.status === "string" ? result.status : undefined;
|
|
2697
|
-
const score = typeof result?.score === "number" ? result.score.toFixed(3) : undefined;
|
|
3186
|
+
const score = status === "succeeded" && typeof result?.score === "number" ? result.score.toFixed(3) : undefined;
|
|
2698
3187
|
const error = typeof result?.error === "string" ? result.error.split(/\r?\n/u)[0] : undefined;
|
|
2699
3188
|
const files = trace.files.slice(0, 5).map((file) => file.path).join(",");
|
|
2700
3189
|
return [
|
|
2701
|
-
`${trace.id}\trun=${trace.runId}\tjob=${trace.jobId
|
|
3190
|
+
`${displayRef(trace.id)}\trun=${displayRef(trace.runId)}\tjob=${trace.jobId ? displayRef(trace.jobId) : "n/a"}\tversion=${displayRef(trace.versionId)}\tskill=${trace.skillName}\tagent=${trace.agentName}`,
|
|
2702
3191
|
status ? `status=${status}` : undefined,
|
|
2703
3192
|
score ? `score=${score}` : undefined,
|
|
2704
3193
|
error ? `error=${error}` : undefined,
|
|
@@ -2707,6 +3196,7 @@ function formatTrace(trace) {
|
|
|
2707
3196
|
}
|
|
2708
3197
|
function traceSummary(trace) {
|
|
2709
3198
|
const result = asRecord(trace.result);
|
|
3199
|
+
const status = typeof result?.status === "string" ? result.status : undefined;
|
|
2710
3200
|
return {
|
|
2711
3201
|
id: trace.id,
|
|
2712
3202
|
runId: trace.runId,
|
|
@@ -2715,8 +3205,8 @@ function traceSummary(trace) {
|
|
|
2715
3205
|
skillName: trace.skillName,
|
|
2716
3206
|
agentName: trace.agentName,
|
|
2717
3207
|
createdAt: trace.createdAt,
|
|
2718
|
-
...(
|
|
2719
|
-
...(typeof result?.score === "number" ? { score: result.score } : {}),
|
|
3208
|
+
...(status ? { status } : {}),
|
|
3209
|
+
...(status === "succeeded" && typeof result?.score === "number" ? { score: result.score } : {}),
|
|
2720
3210
|
...(typeof result?.error === "string" ? { error: singleLine(result.error) } : {}),
|
|
2721
3211
|
fileCount: trace.files.length,
|
|
2722
3212
|
files: trace.files.map(fileSummary),
|
|
@@ -2726,7 +3216,7 @@ function formatTraceDetail(detail) {
|
|
|
2726
3216
|
return detail.executions.map((execution) => {
|
|
2727
3217
|
const sessionLabels = execution.sessions.map((session) => session.label).join(",");
|
|
2728
3218
|
return [
|
|
2729
|
-
`${execution.id}\trun=${detail.runId}\tjobs=${execution.jobIds.join(",")}\tstatus=${execution.status}`,
|
|
3219
|
+
`${execution.id}\trun=${displayRef(detail.runId)}\tjobs=${execution.jobIds.map(displayRef).join(",")}\tstatus=${execution.status}`,
|
|
2730
3220
|
`events=${execution.trace.events.length}`,
|
|
2731
3221
|
`spans=${execution.trace.spans.length}`,
|
|
2732
3222
|
`summaries=${execution.trace.summaries.length}`,
|
|
@@ -2735,7 +3225,7 @@ function formatTraceDetail(detail) {
|
|
|
2735
3225
|
}).join("\n");
|
|
2736
3226
|
}
|
|
2737
3227
|
/**
 * Formats an artifact as one tab-separated line: display ref, run ref,
 * job ref, kind and file count.
 *
 * @param {object} artifact - Artifact with `id`, `runId`, `jobId`, `kind`, `files`.
 * @returns {string} Tab-separated line.
 */
function formatArtifact(artifact) {
    const fields = [
        `${displayRef(artifact.id)}`,
        `run=${displayRef(artifact.runId)}`,
        `job=${displayRef(artifact.jobId)}`,
        `${artifact.kind}`,
        `files=${artifact.files.length}`,
    ];
    return fields.join("\t");
}
|
|
2740
3230
|
function artifactSummary(artifact) {
|
|
2741
3231
|
return {
|