@workbench-ai/workbench 0.0.79 → 0.0.81

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAiEA,MAAM,WAAW,KAAK;IACpB,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;IAC9B,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;CAC/B;AAuTD,wBAAsB,MAAM,CAAC,IAAI,EAAE,SAAS,MAAM,EAAE,EAAE,EAAE,GAAE,KAGzD,GAAG,OAAO,CAAC,MAAM,CAAC,CAoMlB"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAkEA,MAAM,WAAW,KAAK;IACpB,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;IAC9B,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;CAC/B;AAuTD,wBAAsB,MAAM,CAAC,IAAI,EAAE,SAAS,MAAM,EAAE,EAAE,EAAE,GAAE,KAGzD,GAAG,OAAO,CAAC,MAAM,CAAC,CAoMlB"}
package/dist/index.js CHANGED
@@ -4,7 +4,7 @@ import { createRequire } from "node:module";
4
4
  import os from "node:os";
5
5
  import path from "node:path";
6
6
  import { gzipSync } from "node:zlib";
7
- import { addWorkbenchRemote, addWorkbenchAgent, compareWorkbench, createWorkbenchInspectionSnapshot, createWorkbenchAdapterAuthBundle, createWorkbenchReadOnlyInspectionSnapshot, diffWorkbenchVersions, evalWorkbenchSkill, improveWorkbenchSkill, initWorkbenchSkill, listWorkbenchAgents, listWorkbenchVersions, localWorkbenchAdapterAuthStore, parseWorkbenchAdapterAuthTarget, prepareWorkbenchCloudEvalRequest, prepareWorkbenchCloudImproveRequest, publishWorkbenchVersion, removeWorkbenchAgent, showWorkbenchRef, switchWorkbenchVersion, syncWorkbenchRemote, workbenchJobEvidenceForSnapshot, workbenchStatusSnapshot, WorkbenchCodedError, WorkbenchUserError, } from "@workbench-ai/workbench-core";
7
+ import { addWorkbenchRemote, addWorkbenchAgent, compareWorkbench, createWorkbenchVersionRuntimeSnapshot, createWorkbenchInspectionSnapshot, createWorkbenchAdapterAuthBundle, createWorkbenchReadOnlyInspectionSnapshot, diffWorkbenchVersions, evalWorkbenchSkill, improveWorkbenchSkill, initWorkbenchSkill, listWorkbenchAgents, listWorkbenchVersions, localWorkbenchAdapterAuthStore, parseWorkbenchAdapterAuthTarget, prepareWorkbenchCloudEvalRequest, prepareWorkbenchCloudImproveRequest, publishWorkbenchVersion, removeWorkbenchAgent, showWorkbenchRef, switchWorkbenchVersion, syncWorkbenchRemote, workbenchJobEvidenceForSnapshot, workbenchStatusSnapshot, WorkbenchCodedError, WorkbenchUserError, } from "@workbench-ai/workbench-core";
8
8
  import { normalizeWorkbenchSkillName } from "@workbench-ai/workbench-contract";
9
9
  import { emitError, emitResult } from "./output.js";
10
10
  import { fanOutSkill, manualFanOutCommand } from "./fanout.js";
@@ -348,11 +348,11 @@ export async function runCli(argv, io = {
348
348
  });
349
349
  const artifactIds = await artifactIdsByRunId(core, runs);
350
350
  const failedRuns = runs.filter((run) => run.status === "failed" || run.status === "canceled");
351
+ const coverage = await evalCoverageSummaries(core, runs);
352
+ const deltas = await evalDeltas(core, runs);
351
353
  if (failedRuns.length > 0) {
352
- return emitEvalFailure(runs, failedRuns, artifactIds, parsed, io);
354
+ return emitEvalFailure(runs, failedRuns, artifactIds, coverage, deltas, parsed, io);
353
355
  }
354
- const deltas = await evalDeltas(core, runs);
355
- const coverage = await evalCoverageSummaries(core, runs);
356
356
  const next = await evalSuccessNextCommand(core, runs);
357
357
  return emitResult("workbench.cli.eval.v1", {
358
358
  result: runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
@@ -361,8 +361,8 @@ export async function runCli(argv, io = {
361
361
  next: next,
362
362
  }, parsed, io, () => [
363
363
  runs.map(formatRun).join("\n"),
364
- ...coverage.map(formatEvalCoverage),
365
- ...deltas.map(formatEvalDelta),
364
+ ...formatEvalCoverageLines(coverage),
365
+ ...formatEvalDeltaLines(deltas),
366
366
  ...(next ? [`next: ${next}`] : []),
367
367
  ].filter(Boolean).join("\n"));
368
368
  }
@@ -582,13 +582,15 @@ async function handleShow(parsed, io) {
582
582
  if (artifact) {
583
583
  return output(fileListing("artifact", artifact.id, artifact.files), parsed, io, () => formatFileListing("artifact", artifact.id, artifact.files));
584
584
  }
585
- const details = evidenceDetailsForRunOrJob(snapshot, objectRef);
586
- const evidenceFiles = evidenceFilesForRunOrJob(snapshot, objectRef);
587
- if (details.length > 0 || evidenceFiles.length > 0) {
585
+ const selection = runOrJobEvidenceSelection(snapshot, objectRef);
586
+ const details = evidenceDetailsForSelection(snapshot, selection);
587
+ const evidenceFiles = evidenceFilesForSelection(snapshot, selection);
588
+ if (selection.run || selection.jobs.length > 0 || details.length > 0 || evidenceFiles.length > 0) {
588
589
  return output({
590
+ jobs: selection.jobs.map(jobEvidenceSummary),
589
591
  details: details.map(evidenceDetailSummary),
590
592
  files: evidenceFiles.map(fileSummary),
591
- }, parsed, io, () => formatRunOrJobEvidence(details, evidenceFiles));
593
+ }, parsed, io, () => formatRunOrJobEvidence(selection.jobs, details, evidenceFiles));
592
594
  }
593
595
  const value = await showWorkbenchRef(ref, core);
594
596
  return output(value, parsed, io, () => formatShow(value));
@@ -744,7 +746,8 @@ async function handleLogin(parsed, io) {
744
746
  exitCode: 2,
745
747
  });
746
748
  }
747
- const startOnly = parsed.flags["start-only"] === true;
749
+ const startOnly = parsed.flags["start-only"] === true ||
750
+ (parsed.flags["no-open"] === true && parsed.flags.wait !== true && parsed.flags.timeout === undefined);
748
751
  const waitOnly = parsed.flags.wait === true;
749
752
  const timeoutSeconds = intFlag(parsed, "timeout");
750
753
  if (startOnly && timeoutSeconds !== undefined) {
@@ -944,20 +947,23 @@ async function handleCloudEval(parsed, io) {
944
947
  return 130;
945
948
  }
946
949
  const failedRuns = started.runs.filter((run) => run.status === "failed" || run.status === "canceled");
950
+ const coverage = await evalCoverageSummaries(started.core, started.runs);
951
+ const deltas = await evalDeltas(started.core, started.runs);
947
952
  if (failedRuns.length > 0) {
948
- return emitEvalFailure(started.runs, failedRuns, artifactIds, parsed, io);
953
+ return emitEvalFailure(started.runs, failedRuns, artifactIds, coverage, deltas, parsed, io);
949
954
  }
950
- const deltas = await evalDeltas(started.core, started.runs);
951
955
  const next = await evalSuccessNextCommand(started.core, started.runs);
952
956
  return emitResult("workbench.cli.eval.v1", {
953
957
  result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
958
+ coverage: coverage,
954
959
  deltas: deltas,
955
960
  next: next,
956
961
  cloud: cloudExecutionSummary(started),
957
962
  }, parsed, io, () => [
958
963
  `Completed hosted eval on ${started.remote.url}.`,
959
964
  started.runs.map(formatRun).join("\n"),
960
- ...deltas.map(formatEvalDelta),
965
+ ...formatEvalCoverageLines(coverage),
966
+ ...formatEvalDeltaLines(deltas),
961
967
  ...(next ? [`next: ${next}`] : []),
962
968
  ].filter(Boolean).join("\n"));
963
969
  }
@@ -1019,6 +1025,9 @@ function installNextCommand(fanout) {
1019
1025
  }
1020
1026
  function formatInstallOutcome(result, dryRun) {
1021
1027
  if (dryRun) {
1028
+ if (result.result === "unchanged") {
1029
+ return `Already installed ${result.directoryName} at ${result.destination} (unchanged; dry run made no changes).`;
1030
+ }
1022
1031
  return `Would install ${result.directoryName} to ${result.destination} (${formatFileCount(result.filesCopied)}).`;
1023
1032
  }
1024
1033
  if (result.result === "unchanged") {
@@ -1082,7 +1091,7 @@ function withTimeout(promise, timeoutMs) {
1082
1091
  }
1083
1092
  async function startCloudExecution(command, parsed, io) {
1084
1093
  const root = dirFlag(parsed) ?? process.cwd();
1085
- const showProgress = parsed.flags.json !== true;
1094
+ const showProgress = true;
1086
1095
  const interrupt = createCloudInterruptController(command, io, showProgress);
1087
1096
  try {
1088
1097
  writeCloudProgress(io, `workbench cloud: preparing hosted ${command}.`, showProgress);
@@ -1118,6 +1127,19 @@ async function startCloudExecution(command, parsed, io) {
1118
1127
  samples: intFlag(parsed, "samples"),
1119
1128
  budget: intFlag(parsed, "budget"),
1120
1129
  }));
1130
+ const adapterAuthTargets = await cloudPreScheduleStep(command, interrupt, resolveCloudAdapterAuthTargets({
1131
+ root,
1132
+ versionId: request.versionId,
1133
+ parsed,
1134
+ authToken: token,
1135
+ }));
1136
+ if (adapterAuthTargets.length > 0) {
1137
+ writeCloudProgress(io, "workbench cloud: checking provider auth.", showProgress);
1138
+ await cloudPreScheduleStep(command, interrupt, assertCloudAdapterAuthConnected({
1139
+ baseUrl: source.baseUrl,
1140
+ targets: adapterAuthTargets,
1141
+ }));
1142
+ }
1121
1143
  writeCloudProgress(io, "workbench cloud: syncing source to cloud.", showProgress);
1122
1144
  const syncBefore = await cloudPreScheduleStep(command, interrupt, syncWorkbenchRemote({ ...core, remote: remote.name }));
1123
1145
  writeCloudProgress(io, `workbench cloud: scheduling hosted ${command}.`, showProgress);
@@ -1216,6 +1238,82 @@ function cloudCanceledBeforeRunIdError(command) {
1216
1238
  exitCode: 130,
1217
1239
  });
1218
1240
  }
1241
+ async function resolveCloudAdapterAuthTargets(input) {
1242
+ const snapshot = await createWorkbenchReadOnlyInspectionSnapshot({ dir: input.root, authToken: input.authToken });
1243
+ const version = snapshotVersionByRef(snapshot, input.versionId);
1244
+ if (!version) {
1245
+ throw new WorkbenchCodedError("version_not_found", `Version not found: ${input.versionId}`, {
1246
+ remediation: "Run workbench status.",
1247
+ subject: { versionId: input.versionId },
1248
+ exitCode: 1,
1249
+ });
1250
+ }
1251
+ const runtime = await createWorkbenchVersionRuntimeSnapshot(version, {
1252
+ skill: stringFlag(input.parsed, "skills"),
1253
+ agent: stringFlag(input.parsed, "agents"),
1254
+ authToken: input.authToken,
1255
+ });
1256
+ return uniqueAdapterAuthTargets(runtime.selectedAgents.flatMap(cloudAdapterAuthTargetsForAgent));
1257
+ }
1258
+ async function assertCloudAdapterAuthConnected(input) {
1259
+ const targets = uniqueAdapterAuthTargets(input.targets);
1260
+ if (targets.length === 0) {
1261
+ return;
1262
+ }
1263
+ const statuses = await fetchCloudAdapterAuthStatuses(input.baseUrl);
1264
+ const missing = targets.find((target) => !statuses.some((status) => adapterAuthStatusMatchesTarget(status, target)));
1265
+ if (!missing) {
1266
+ return;
1267
+ }
1268
+ throw new WorkbenchCodedError("adapter_auth_required", `${formatCloudAdapterAuthTarget(missing)} disconnected. Run workbench login ${missing.adapterId}.`, {
1269
+ remediation: `Run workbench login ${missing.adapterId}.`,
1270
+ subject: {
1271
+ adapterId: missing.adapterId,
1272
+ profile: missing.profile,
1273
+ ...(missing.slot ? { slot: missing.slot } : {}),
1274
+ },
1275
+ exitCode: 1,
1276
+ });
1277
+ }
1278
+ function cloudAdapterAuthTargetsForAgent(agent) {
1279
+ const adapterId = agent.adapter.trim().toLowerCase();
1280
+ if (adapterId !== "codex" && adapterId !== "claude") {
1281
+ return [];
1282
+ }
1283
+ const auth = agent.config.auth;
1284
+ if (typeof auth === "string" && auth.trim()) {
1285
+ return [{ adapterId, profile: auth.trim() }];
1286
+ }
1287
+ if (auth && typeof auth === "object" && !Array.isArray(auth)) {
1288
+ return Object.entries(auth)
1289
+ .filter((entry) => typeof entry[1] === "string" && entry[1].trim().length > 0)
1290
+ .map(([slot, profile]) => ({ adapterId, slot, profile: profile.trim() }));
1291
+ }
1292
+ return [{ adapterId, profile: "default" }];
1293
+ }
1294
+ function uniqueAdapterAuthTargets(targets) {
1295
+ const byKey = new Map();
1296
+ for (const target of targets) {
1297
+ byKey.set(adapterAuthTargetKey(target), target);
1298
+ }
1299
+ return [...byKey.values()].sort((left, right) => adapterAuthTargetKey(left).localeCompare(adapterAuthTargetKey(right)));
1300
+ }
1301
+ async function fetchCloudAdapterAuthStatuses(baseUrl) {
1302
+ const response = await apiRequest("/api/workbench/auth/adapters", {}, baseUrl);
1303
+ return response.adapters ?? [];
1304
+ }
1305
+ function adapterAuthStatusMatchesTarget(status, target) {
1306
+ return status.status === "connected" &&
1307
+ status.adapterId === target.adapterId &&
1308
+ status.profile === target.profile &&
1309
+ (status.slot ?? undefined) === (target.slot ?? undefined);
1310
+ }
1311
+ function adapterAuthTargetKey(target) {
1312
+ return `${target.adapterId}/${target.slot ?? "_"}/${target.profile}`;
1313
+ }
1314
+ function formatCloudAdapterAuthTarget(target) {
1315
+ return `${target.adapterId}${target.slot ? `/${target.slot}` : ""}`;
1316
+ }
1219
1317
  async function waitForCloudRuns(input) {
1220
1318
  const runIds = input.runs
1221
1319
  .map((run) => run.id)
@@ -2641,7 +2739,7 @@ async function artifactIdsByRunId(core, runs) {
2641
2739
  }
2642
2740
  return byRun;
2643
2741
  }
2644
- function emitEvalFailure(runs, failedRuns, artifactIds, parsed, io) {
2742
+ function emitEvalFailure(runs, failedRuns, artifactIds, coverage, deltas, parsed, io) {
2645
2743
  const next = evalFailureNextCommand(failedRuns);
2646
2744
  if (parsed.flags.json === true) {
2647
2745
  io.stdout.write(`${JSON.stringify({
@@ -2653,13 +2751,17 @@ function emitEvalFailure(runs, failedRuns, artifactIds, parsed, io) {
2653
2751
  evidenceSaved: true,
2654
2752
  runs: runs.map((run) => runFailureSummary(run, artifactIds.get(run.id) ?? [])),
2655
2753
  failedRuns: failedRuns.map((run) => runFailureSummary(run, artifactIds.get(run.id) ?? [])),
2754
+ coverage: coverage,
2755
+ deltas: deltas,
2656
2756
  next,
2657
2757
  }, null, 2)}\n`);
2658
2758
  return 1;
2659
2759
  }
2660
2760
  io.stdout.write([
2661
2761
  "Eval failed; evidence was saved.",
2662
- ...failedRuns.map(formatRun),
2762
+ runs.map(formatRun).join("\n"),
2763
+ ...formatEvalCoverageLines(coverage),
2764
+ ...formatEvalDeltaLines(deltas),
2663
2765
  ...(next ? [`next: ${next}`] : []),
2664
2766
  ].join("\n") + "\n");
2665
2767
  return 1;
@@ -2876,6 +2978,16 @@ function displayRef(id) {
2876
2978
  function shortenCommandRefs(command) {
2877
2979
  return command.replace(/\b(?:v_[0-9a-f]{8,}|(?:run|job|trace|artifact)_[a-z0-9_-]+)/giu, (match) => displayRef(match));
2878
2980
  }
2981
+ function displayCandidateRefs(ids) {
2982
+ const uniqueIds = [...ids];
2983
+ for (let length = 8; length <= 32; length += 1) {
2984
+ const refs = uniqueIds.map((id) => id.length > length ? id.slice(0, length) : id);
2985
+ if (new Set(refs).size === refs.length) {
2986
+ return refs;
2987
+ }
2988
+ }
2989
+ return uniqueIds;
2990
+ }
2879
2991
  function snapshotVersionByRef(snapshot, ref) {
2880
2992
  const requested = ref.trim();
2881
2993
  const normalized = requested === "current" ? snapshot.refs.current ?? "" : requested;
@@ -2884,7 +2996,7 @@ function snapshotVersionByRef(snapshot, ref) {
2884
2996
  }
2885
2997
  const candidates = snapshot.versions.filter((version) => snapshotVersionRefMatches(version, normalized));
2886
2998
  if (candidates.length > 1) {
2887
- throw new WorkbenchCodedError("ref_ambiguous", `Version ref is ambiguous: ${ref}. Candidates: ${candidates.map((version) => displayRef(version.id)).join(", ")}.`, {
2999
+ throw new WorkbenchCodedError("ref_ambiguous", `Version ref is ambiguous: ${ref}. Candidates: ${displayCandidateRefs(candidates.map((version) => version.id)).join(", ")}.`, {
2888
3000
  subject: { ref, candidates: candidates.map((version) => version.id) },
2889
3001
  exitCode: 2,
2890
3002
  });
@@ -2907,7 +3019,7 @@ function snapshotObjectByRef(entries, ref, kind) {
2907
3019
  }
2908
3020
  const candidates = entries.filter((entry) => objectRefMatches(entry.id, normalized));
2909
3021
  if (candidates.length > 1) {
2910
- throw new WorkbenchCodedError("ref_ambiguous", `${capitalize(kind)} ref is ambiguous: ${ref}. Candidates: ${candidates.map((entry) => displayRef(entry.id)).slice(0, 8).join(", ")}.`, {
3022
+ throw new WorkbenchCodedError("ref_ambiguous", `${capitalize(kind)} ref is ambiguous: ${ref}. Candidates: ${displayCandidateRefs(candidates.map((entry) => entry.id)).slice(0, 8).join(", ")}.`, {
2911
3023
  subject: { ref, candidates: candidates.map((entry) => entry.id).slice(0, 20) },
2912
3024
  exitCode: 2,
2913
3025
  });
@@ -2928,7 +3040,7 @@ function runOrJobEvidenceSelection(snapshot, ref) {
2928
3040
  const run = snapshotObjectByRef(snapshot.runs, ref, "run");
2929
3041
  const job = snapshotObjectByRef(snapshot.jobs, ref, "job");
2930
3042
  if (run && job) {
2931
- throw new WorkbenchCodedError("ref_ambiguous", `Run/job ref is ambiguous: ${ref}. Candidates: ${displayRef(run.id)}, ${displayRef(job.id)}.`, {
3043
+ throw new WorkbenchCodedError("ref_ambiguous", `Run/job ref is ambiguous: ${ref}. Candidates: ${displayCandidateRefs([run.id, job.id]).join(", ")}.`, {
2932
3044
  subject: { ref, candidates: [run.id, job.id] },
2933
3045
  exitCode: 2,
2934
3046
  });
@@ -2943,6 +3055,9 @@ function runOrJobEvidenceSelection(snapshot, ref) {
2943
3055
  }
2944
3056
  function evidenceFilesForRunOrJob(snapshot, ref) {
2945
3057
  const selection = runOrJobEvidenceSelection(snapshot, ref);
3058
+ return evidenceFilesForSelection(snapshot, selection);
3059
+ }
3060
+ function evidenceFilesForSelection(snapshot, selection) {
2946
3061
  if (!selection.run && selection.jobs.length === 0) {
2947
3062
  return [];
2948
3063
  }
@@ -3024,10 +3139,32 @@ function isUserFacingTraceEvidenceFile(file) {
3024
3139
  function evidencePathSegment(value) {
3025
3140
  return value.replace(/[^A-Za-z0-9._-]+/gu, "-") || "_";
3026
3141
  }
3027
- function formatRunOrJobEvidence(details, files) {
3142
+ function formatRunOrJobEvidence(jobs, details, files) {
3143
+ const jobLines = jobs.length > 0 ? ["Jobs:", ...jobs.map(formatJobEvidenceSummary)] : [];
3028
3144
  const detailLines = details.map(formatTraceDetail).filter(Boolean);
3029
3145
  const fileLines = files.length > 0 ? ["Files:", ...files.map((file) => file.path)] : [];
3030
- return [...detailLines, ...fileLines].join("\n") || "No evidence.";
3146
+ return [...jobLines, ...detailLines, ...fileLines].join("\n") || "No evidence.";
3147
+ }
3148
+ function jobEvidenceSummary(job) {
3149
+ return {
3150
+ id: job.id,
3151
+ runId: job.runId,
3152
+ caseId: job.caseId,
3153
+ sample: job.sample,
3154
+ status: job.status,
3155
+ ...(job.score !== undefined ? { score: job.score } : {}),
3156
+ ...(job.error ? { error: job.error } : {}),
3157
+ };
3158
+ }
3159
+ function formatJobEvidenceSummary(job) {
3160
+ return [
3161
+ displayRef(job.id),
3162
+ `case=${job.caseId}`,
3163
+ `sample=${job.sample}`,
3164
+ job.status,
3165
+ job.score !== undefined ? `score=${job.score.toFixed(3)}` : undefined,
3166
+ job.error ? `error=${singleLine(job.error)}` : undefined,
3167
+ ].filter(Boolean).join("\t");
3031
3168
  }
3032
3169
  function evidenceDetailSummary(detail) {
3033
3170
  return {
@@ -3103,6 +3240,9 @@ async function fileForRunOrJobRef(core, objectRef, requestedPath) {
3103
3240
  }
3104
3241
  function evidenceDetailsForRunOrJob(snapshot, ref) {
3105
3242
  const selection = runOrJobEvidenceSelection(snapshot, ref);
3243
+ return evidenceDetailsForSelection(snapshot, selection);
3244
+ }
3245
+ function evidenceDetailsForSelection(snapshot, selection) {
3106
3246
  return selection.jobs.flatMap((entry) => {
3107
3247
  const detail = workbenchJobEvidenceForSnapshot(snapshot, {
3108
3248
  runId: entry.runId,
@@ -3192,6 +3332,8 @@ async function evalCoverageSummaries(core, runs) {
3192
3332
  const samples = new Set(jobs.map((job) => `${job.caseId}\0${job.sample}`));
3193
3333
  return {
3194
3334
  runId: run.id,
3335
+ skillName: run.skillName,
3336
+ agentName: run.agentName,
3195
3337
  cases: cases.size,
3196
3338
  samples: samples.size,
3197
3339
  jobs: jobs.length,
@@ -3200,12 +3342,19 @@ async function evalCoverageSummaries(core, runs) {
3200
3342
  };
3201
3343
  });
3202
3344
  }
3203
- function formatEvalCoverage(coverage) {
3345
+ function formatEvalCoverageLines(coverage) {
3346
+ const includeRunLabels = coverage.length > 1;
3347
+ return coverage.map((entry) => formatEvalCoverage(entry, includeRunLabels));
3348
+ }
3349
+ function formatEvalCoverage(coverage, includeRunLabels = false) {
3204
3350
  return [
3205
3351
  `coverage cases=${coverage.cases}`,
3206
3352
  `samples=${coverage.samples}`,
3207
3353
  `jobs=${coverage.jobs}`,
3208
3354
  coverage.failed > 0 ? `failed=${coverage.failed}` : undefined,
3355
+ includeRunLabels ? `run=${displayRef(coverage.runId)}` : undefined,
3356
+ includeRunLabels ? `skill=${coverage.skillName}` : undefined,
3357
+ includeRunLabels ? `agent=${coverage.agentName}` : undefined,
3209
3358
  ].filter(Boolean).join(" ");
3210
3359
  }
3211
3360
  async function evalDeltas(core, runs) {
@@ -3231,16 +3380,23 @@ async function evalDeltas(core, runs) {
3231
3380
  };
3232
3381
  });
3233
3382
  }
3234
- function formatEvalDelta(delta) {
3383
+ function formatEvalDeltaLines(deltas) {
3384
+ const includeRunLabels = deltas.length > 1;
3385
+ return deltas
3386
+ .map((delta) => formatEvalDelta(delta, includeRunLabels))
3387
+ .filter((line) => line.length > 0);
3388
+ }
3389
+ function formatEvalDelta(delta, includeRunLabels = false) {
3235
3390
  if (delta.score === undefined) {
3236
3391
  return "";
3237
3392
  }
3393
+ const label = includeRunLabels ? `${delta.skillName}/${delta.agentName}` : delta.skillName;
3238
3394
  const score = delta.score.toFixed(3);
3239
3395
  if (delta.previousScore === undefined || delta.delta === undefined) {
3240
- return `${delta.skillName} ${displayRef(delta.versionId)} ${score}`;
3396
+ return `${label} ${displayRef(delta.versionId)} ${score}`;
3241
3397
  }
3242
3398
  const sign = delta.delta >= 0 ? "+" : "";
3243
- return `${delta.skillName} ${displayRef(delta.versionId)} ${score} (was ${delta.previousScore.toFixed(3)}, ${sign}${delta.delta.toFixed(3)})`;
3399
+ return `${label} ${displayRef(delta.versionId)} ${score} (was ${delta.previousScore.toFixed(3)}, ${sign}${delta.delta.toFixed(3)})`;
3244
3400
  }
3245
3401
  async function evalSuccessNextCommand(core, runs) {
3246
3402
  if (runs.length === 0) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@workbench-ai/workbench",
3
- "version": "0.0.79",
3
+ "version": "0.0.81",
4
4
  "repository": {
5
5
  "type": "git",
6
6
  "url": "git+https://github.com/workbench-ai/workbench.git",
@@ -22,10 +22,10 @@
22
22
  "dependencies": {
23
23
  "skills": "1.5.11",
24
24
  "yaml": "^2.8.2",
25
- "@workbench-ai/workbench-built-in-adapters": "0.0.79",
26
- "@workbench-ai/workbench-core": "0.0.79",
27
- "@workbench-ai/workbench-contract": "0.0.79",
28
- "@workbench-ai/workbench-protocol": "0.0.79"
25
+ "@workbench-ai/workbench-contract": "0.0.81",
26
+ "@workbench-ai/workbench-built-in-adapters": "0.0.81",
27
+ "@workbench-ai/workbench-core": "0.0.81",
28
+ "@workbench-ai/workbench-protocol": "0.0.81"
29
29
  },
30
30
  "devDependencies": {
31
31
  "@tailwindcss/postcss": "^4.2.2",
@@ -36,7 +36,7 @@
36
36
  "react-dom": "^19.2.0",
37
37
  "typescript": "^5.9.2",
38
38
  "vitest": "^3.2.4",
39
- "@workbench-ai/workbench-ui": "0.0.79"
39
+ "@workbench-ai/workbench-ui": "0.0.81"
40
40
  },
41
41
  "scripts": {
42
42
  "build": "rm -rf dist && tsc -p tsconfig.json && chmod 755 dist/workbench.js && node ./scripts/build-dev-open-assets.mjs",