@workbench-ai/workbench 0.0.80 → 0.0.82

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (2):
  1. package/dist/index.js +50 -19
  2. package/package.json +6 -6
package/dist/index.js CHANGED
@@ -348,11 +348,11 @@ export async function runCli(argv, io = {
348
348
  });
349
349
  const artifactIds = await artifactIdsByRunId(core, runs);
350
350
  const failedRuns = runs.filter((run) => run.status === "failed" || run.status === "canceled");
351
+ const coverage = await evalCoverageSummaries(core, runs);
352
+ const deltas = await evalDeltas(core, runs);
351
353
  if (failedRuns.length > 0) {
352
- return emitEvalFailure(runs, failedRuns, artifactIds, parsed, io);
354
+ return emitEvalFailure(runs, failedRuns, artifactIds, coverage, deltas, parsed, io);
353
355
  }
354
- const deltas = await evalDeltas(core, runs);
355
- const coverage = await evalCoverageSummaries(core, runs);
356
356
  const next = await evalSuccessNextCommand(core, runs);
357
357
  return emitResult("workbench.cli.eval.v1", {
358
358
  result: runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
@@ -361,8 +361,8 @@ export async function runCli(argv, io = {
361
361
  next: next,
362
362
  }, parsed, io, () => [
363
363
  runs.map(formatRun).join("\n"),
364
- ...coverage.map(formatEvalCoverage),
365
- ...deltas.map(formatEvalDelta),
364
+ ...formatEvalCoverageLines(coverage),
365
+ ...formatEvalDeltaLines(deltas),
366
366
  ...(next ? [`next: ${next}`] : []),
367
367
  ].filter(Boolean).join("\n"));
368
368
  }
@@ -947,20 +947,23 @@ async function handleCloudEval(parsed, io) {
947
947
  return 130;
948
948
  }
949
949
  const failedRuns = started.runs.filter((run) => run.status === "failed" || run.status === "canceled");
950
+ const coverage = await evalCoverageSummaries(started.core, started.runs);
951
+ const deltas = await evalDeltas(started.core, started.runs);
950
952
  if (failedRuns.length > 0) {
951
- return emitEvalFailure(started.runs, failedRuns, artifactIds, parsed, io);
953
+ return emitEvalFailure(started.runs, failedRuns, artifactIds, coverage, deltas, parsed, io);
952
954
  }
953
- const deltas = await evalDeltas(started.core, started.runs);
954
955
  const next = await evalSuccessNextCommand(started.core, started.runs);
955
956
  return emitResult("workbench.cli.eval.v1", {
956
957
  result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
958
+ coverage: coverage,
957
959
  deltas: deltas,
958
960
  next: next,
959
961
  cloud: cloudExecutionSummary(started),
960
962
  }, parsed, io, () => [
961
963
  `Completed hosted eval on ${started.remote.url}.`,
962
964
  started.runs.map(formatRun).join("\n"),
963
- ...deltas.map(formatEvalDelta),
965
+ ...formatEvalCoverageLines(coverage),
966
+ ...formatEvalDeltaLines(deltas),
964
967
  ...(next ? [`next: ${next}`] : []),
965
968
  ].filter(Boolean).join("\n"));
966
969
  }
@@ -1124,14 +1127,19 @@ async function startCloudExecution(command, parsed, io) {
1124
1127
  samples: intFlag(parsed, "samples"),
1125
1128
  budget: intFlag(parsed, "budget"),
1126
1129
  }));
1127
- writeCloudProgress(io, "workbench cloud: checking provider auth.", showProgress);
1128
- await cloudPreScheduleStep(command, interrupt, preflightCloudAdapterAuth({
1130
+ const adapterAuthTargets = await cloudPreScheduleStep(command, interrupt, resolveCloudAdapterAuthTargets({
1129
1131
  root,
1130
1132
  versionId: request.versionId,
1131
1133
  parsed,
1132
- baseUrl: source.baseUrl,
1133
1134
  authToken: token,
1134
1135
  }));
1136
+ if (adapterAuthTargets.length > 0) {
1137
+ writeCloudProgress(io, "workbench cloud: checking provider auth.", showProgress);
1138
+ await cloudPreScheduleStep(command, interrupt, assertCloudAdapterAuthConnected({
1139
+ baseUrl: source.baseUrl,
1140
+ targets: adapterAuthTargets,
1141
+ }));
1142
+ }
1135
1143
  writeCloudProgress(io, "workbench cloud: syncing source to cloud.", showProgress);
1136
1144
  const syncBefore = await cloudPreScheduleStep(command, interrupt, syncWorkbenchRemote({ ...core, remote: remote.name }));
1137
1145
  writeCloudProgress(io, `workbench cloud: scheduling hosted ${command}.`, showProgress);
@@ -1230,7 +1238,7 @@ function cloudCanceledBeforeRunIdError(command) {
1230
1238
  exitCode: 130,
1231
1239
  });
1232
1240
  }
1233
- async function preflightCloudAdapterAuth(input) {
1241
+ async function resolveCloudAdapterAuthTargets(input) {
1234
1242
  const snapshot = await createWorkbenchReadOnlyInspectionSnapshot({ dir: input.root, authToken: input.authToken });
1235
1243
  const version = snapshotVersionByRef(snapshot, input.versionId);
1236
1244
  if (!version) {
@@ -1245,7 +1253,10 @@ async function preflightCloudAdapterAuth(input) {
1245
1253
  agent: stringFlag(input.parsed, "agents"),
1246
1254
  authToken: input.authToken,
1247
1255
  });
1248
- const targets = uniqueAdapterAuthTargets(runtime.selectedAgents.flatMap(cloudAdapterAuthTargetsForAgent));
1256
+ return uniqueAdapterAuthTargets(runtime.selectedAgents.flatMap(cloudAdapterAuthTargetsForAgent));
1257
+ }
1258
+ async function assertCloudAdapterAuthConnected(input) {
1259
+ const targets = uniqueAdapterAuthTargets(input.targets);
1249
1260
  if (targets.length === 0) {
1250
1261
  return;
1251
1262
  }
@@ -2728,7 +2739,7 @@ async function artifactIdsByRunId(core, runs) {
2728
2739
  }
2729
2740
  return byRun;
2730
2741
  }
2731
- function emitEvalFailure(runs, failedRuns, artifactIds, parsed, io) {
2742
+ function emitEvalFailure(runs, failedRuns, artifactIds, coverage, deltas, parsed, io) {
2732
2743
  const next = evalFailureNextCommand(failedRuns);
2733
2744
  if (parsed.flags.json === true) {
2734
2745
  io.stdout.write(`${JSON.stringify({
@@ -2740,13 +2751,17 @@ function emitEvalFailure(runs, failedRuns, artifactIds, parsed, io) {
2740
2751
  evidenceSaved: true,
2741
2752
  runs: runs.map((run) => runFailureSummary(run, artifactIds.get(run.id) ?? [])),
2742
2753
  failedRuns: failedRuns.map((run) => runFailureSummary(run, artifactIds.get(run.id) ?? [])),
2754
+ coverage: coverage,
2755
+ deltas: deltas,
2743
2756
  next,
2744
2757
  }, null, 2)}\n`);
2745
2758
  return 1;
2746
2759
  }
2747
2760
  io.stdout.write([
2748
2761
  "Eval failed; evidence was saved.",
2749
- ...failedRuns.map(formatRun),
2762
+ runs.map(formatRun).join("\n"),
2763
+ ...formatEvalCoverageLines(coverage),
2764
+ ...formatEvalDeltaLines(deltas),
2750
2765
  ...(next ? [`next: ${next}`] : []),
2751
2766
  ].join("\n") + "\n");
2752
2767
  return 1;
@@ -3317,6 +3332,8 @@ async function evalCoverageSummaries(core, runs) {
3317
3332
  const samples = new Set(jobs.map((job) => `${job.caseId}\0${job.sample}`));
3318
3333
  return {
3319
3334
  runId: run.id,
3335
+ skillName: run.skillName,
3336
+ agentName: run.agentName,
3320
3337
  cases: cases.size,
3321
3338
  samples: samples.size,
3322
3339
  jobs: jobs.length,
@@ -3325,12 +3342,19 @@ async function evalCoverageSummaries(core, runs) {
3325
3342
  };
3326
3343
  });
3327
3344
  }
3328
- function formatEvalCoverage(coverage) {
3345
+ function formatEvalCoverageLines(coverage) {
3346
+ const includeRunLabels = coverage.length > 1;
3347
+ return coverage.map((entry) => formatEvalCoverage(entry, includeRunLabels));
3348
+ }
3349
+ function formatEvalCoverage(coverage, includeRunLabels = false) {
3329
3350
  return [
3330
3351
  `coverage cases=${coverage.cases}`,
3331
3352
  `samples=${coverage.samples}`,
3332
3353
  `jobs=${coverage.jobs}`,
3333
3354
  coverage.failed > 0 ? `failed=${coverage.failed}` : undefined,
3355
+ includeRunLabels ? `run=${displayRef(coverage.runId)}` : undefined,
3356
+ includeRunLabels ? `skill=${coverage.skillName}` : undefined,
3357
+ includeRunLabels ? `agent=${coverage.agentName}` : undefined,
3334
3358
  ].filter(Boolean).join(" ");
3335
3359
  }
3336
3360
  async function evalDeltas(core, runs) {
@@ -3356,16 +3380,23 @@ async function evalDeltas(core, runs) {
3356
3380
  };
3357
3381
  });
3358
3382
  }
3359
- function formatEvalDelta(delta) {
3383
+ function formatEvalDeltaLines(deltas) {
3384
+ const includeRunLabels = deltas.length > 1;
3385
+ return deltas
3386
+ .map((delta) => formatEvalDelta(delta, includeRunLabels))
3387
+ .filter((line) => line.length > 0);
3388
+ }
3389
+ function formatEvalDelta(delta, includeRunLabels = false) {
3360
3390
  if (delta.score === undefined) {
3361
3391
  return "";
3362
3392
  }
3393
+ const label = includeRunLabels ? `${delta.skillName}/${delta.agentName}` : delta.skillName;
3363
3394
  const score = delta.score.toFixed(3);
3364
3395
  if (delta.previousScore === undefined || delta.delta === undefined) {
3365
- return `${delta.skillName} ${displayRef(delta.versionId)} ${score}`;
3396
+ return `${label} ${displayRef(delta.versionId)} ${score}`;
3366
3397
  }
3367
3398
  const sign = delta.delta >= 0 ? "+" : "";
3368
- return `${delta.skillName} ${displayRef(delta.versionId)} ${score} (was ${delta.previousScore.toFixed(3)}, ${sign}${delta.delta.toFixed(3)})`;
3399
+ return `${label} ${displayRef(delta.versionId)} ${score} (was ${delta.previousScore.toFixed(3)}, ${sign}${delta.delta.toFixed(3)})`;
3369
3400
  }
3370
3401
  async function evalSuccessNextCommand(core, runs) {
3371
3402
  if (runs.length === 0) {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@workbench-ai/workbench",
3
- "version": "0.0.80",
3
+ "version": "0.0.82",
4
4
  "repository": {
5
5
  "type": "git",
6
6
  "url": "git+https://github.com/workbench-ai/workbench.git",
@@ -22,10 +22,10 @@
22
22
  "dependencies": {
23
23
  "skills": "1.5.11",
24
24
  "yaml": "^2.8.2",
25
- "@workbench-ai/workbench-built-in-adapters": "0.0.80",
26
- "@workbench-ai/workbench-protocol": "0.0.80",
27
- "@workbench-ai/workbench-contract": "0.0.80",
28
- "@workbench-ai/workbench-core": "0.0.80"
25
+ "@workbench-ai/workbench-built-in-adapters": "0.0.82",
26
+ "@workbench-ai/workbench-contract": "0.0.82",
27
+ "@workbench-ai/workbench-protocol": "0.0.82",
28
+ "@workbench-ai/workbench-core": "0.0.82"
29
29
  },
30
30
  "devDependencies": {
31
31
  "@tailwindcss/postcss": "^4.2.2",
@@ -36,7 +36,7 @@
36
36
  "react-dom": "^19.2.0",
37
37
  "typescript": "^5.9.2",
38
38
  "vitest": "^3.2.4",
39
- "@workbench-ai/workbench-ui": "0.0.80"
39
+ "@workbench-ai/workbench-ui": "0.0.82"
40
40
  },
41
41
  "scripts": {
42
42
  "build": "rm -rf dist && tsc -p tsconfig.json && chmod 755 dist/workbench.js && node ./scripts/build-dev-open-assets.mjs",