@workbench-ai/workbench 0.0.80 → 0.0.82
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.js +50 -19
- package/package.json +6 -6
package/dist/index.js
CHANGED
|
@@ -348,11 +348,11 @@ export async function runCli(argv, io = {
|
|
|
348
348
|
});
|
|
349
349
|
const artifactIds = await artifactIdsByRunId(core, runs);
|
|
350
350
|
const failedRuns = runs.filter((run) => run.status === "failed" || run.status === "canceled");
|
|
351
|
+
const coverage = await evalCoverageSummaries(core, runs);
|
|
352
|
+
const deltas = await evalDeltas(core, runs);
|
|
351
353
|
if (failedRuns.length > 0) {
|
|
352
|
-
return emitEvalFailure(runs, failedRuns, artifactIds, parsed, io);
|
|
354
|
+
return emitEvalFailure(runs, failedRuns, artifactIds, coverage, deltas, parsed, io);
|
|
353
355
|
}
|
|
354
|
-
const deltas = await evalDeltas(core, runs);
|
|
355
|
-
const coverage = await evalCoverageSummaries(core, runs);
|
|
356
356
|
const next = await evalSuccessNextCommand(core, runs);
|
|
357
357
|
return emitResult("workbench.cli.eval.v1", {
|
|
358
358
|
result: runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
|
|
@@ -361,8 +361,8 @@ export async function runCli(argv, io = {
|
|
|
361
361
|
next: next,
|
|
362
362
|
}, parsed, io, () => [
|
|
363
363
|
runs.map(formatRun).join("\n"),
|
|
364
|
-
...coverage
|
|
365
|
-
...deltas
|
|
364
|
+
...formatEvalCoverageLines(coverage),
|
|
365
|
+
...formatEvalDeltaLines(deltas),
|
|
366
366
|
...(next ? [`next: ${next}`] : []),
|
|
367
367
|
].filter(Boolean).join("\n"));
|
|
368
368
|
}
|
|
@@ -947,20 +947,23 @@ async function handleCloudEval(parsed, io) {
|
|
|
947
947
|
return 130;
|
|
948
948
|
}
|
|
949
949
|
const failedRuns = started.runs.filter((run) => run.status === "failed" || run.status === "canceled");
|
|
950
|
+
const coverage = await evalCoverageSummaries(started.core, started.runs);
|
|
951
|
+
const deltas = await evalDeltas(started.core, started.runs);
|
|
950
952
|
if (failedRuns.length > 0) {
|
|
951
|
-
return emitEvalFailure(started.runs, failedRuns, artifactIds, parsed, io);
|
|
953
|
+
return emitEvalFailure(started.runs, failedRuns, artifactIds, coverage, deltas, parsed, io);
|
|
952
954
|
}
|
|
953
|
-
const deltas = await evalDeltas(started.core, started.runs);
|
|
954
955
|
const next = await evalSuccessNextCommand(started.core, started.runs);
|
|
955
956
|
return emitResult("workbench.cli.eval.v1", {
|
|
956
957
|
result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
|
|
958
|
+
coverage: coverage,
|
|
957
959
|
deltas: deltas,
|
|
958
960
|
next: next,
|
|
959
961
|
cloud: cloudExecutionSummary(started),
|
|
960
962
|
}, parsed, io, () => [
|
|
961
963
|
`Completed hosted eval on ${started.remote.url}.`,
|
|
962
964
|
started.runs.map(formatRun).join("\n"),
|
|
963
|
-
...
|
|
965
|
+
...formatEvalCoverageLines(coverage),
|
|
966
|
+
...formatEvalDeltaLines(deltas),
|
|
964
967
|
...(next ? [`next: ${next}`] : []),
|
|
965
968
|
].filter(Boolean).join("\n"));
|
|
966
969
|
}
|
|
@@ -1124,14 +1127,19 @@ async function startCloudExecution(command, parsed, io) {
|
|
|
1124
1127
|
samples: intFlag(parsed, "samples"),
|
|
1125
1128
|
budget: intFlag(parsed, "budget"),
|
|
1126
1129
|
}));
|
|
1127
|
-
|
|
1128
|
-
await cloudPreScheduleStep(command, interrupt, preflightCloudAdapterAuth({
|
|
1130
|
+
const adapterAuthTargets = await cloudPreScheduleStep(command, interrupt, resolveCloudAdapterAuthTargets({
|
|
1129
1131
|
root,
|
|
1130
1132
|
versionId: request.versionId,
|
|
1131
1133
|
parsed,
|
|
1132
|
-
baseUrl: source.baseUrl,
|
|
1133
1134
|
authToken: token,
|
|
1134
1135
|
}));
|
|
1136
|
+
if (adapterAuthTargets.length > 0) {
|
|
1137
|
+
writeCloudProgress(io, "workbench cloud: checking provider auth.", showProgress);
|
|
1138
|
+
await cloudPreScheduleStep(command, interrupt, assertCloudAdapterAuthConnected({
|
|
1139
|
+
baseUrl: source.baseUrl,
|
|
1140
|
+
targets: adapterAuthTargets,
|
|
1141
|
+
}));
|
|
1142
|
+
}
|
|
1135
1143
|
writeCloudProgress(io, "workbench cloud: syncing source to cloud.", showProgress);
|
|
1136
1144
|
const syncBefore = await cloudPreScheduleStep(command, interrupt, syncWorkbenchRemote({ ...core, remote: remote.name }));
|
|
1137
1145
|
writeCloudProgress(io, `workbench cloud: scheduling hosted ${command}.`, showProgress);
|
|
@@ -1230,7 +1238,7 @@ function cloudCanceledBeforeRunIdError(command) {
|
|
|
1230
1238
|
exitCode: 130,
|
|
1231
1239
|
});
|
|
1232
1240
|
}
|
|
1233
|
-
async function
|
|
1241
|
+
async function resolveCloudAdapterAuthTargets(input) {
|
|
1234
1242
|
const snapshot = await createWorkbenchReadOnlyInspectionSnapshot({ dir: input.root, authToken: input.authToken });
|
|
1235
1243
|
const version = snapshotVersionByRef(snapshot, input.versionId);
|
|
1236
1244
|
if (!version) {
|
|
@@ -1245,7 +1253,10 @@ async function preflightCloudAdapterAuth(input) {
|
|
|
1245
1253
|
agent: stringFlag(input.parsed, "agents"),
|
|
1246
1254
|
authToken: input.authToken,
|
|
1247
1255
|
});
|
|
1248
|
-
|
|
1256
|
+
return uniqueAdapterAuthTargets(runtime.selectedAgents.flatMap(cloudAdapterAuthTargetsForAgent));
|
|
1257
|
+
}
|
|
1258
|
+
async function assertCloudAdapterAuthConnected(input) {
|
|
1259
|
+
const targets = uniqueAdapterAuthTargets(input.targets);
|
|
1249
1260
|
if (targets.length === 0) {
|
|
1250
1261
|
return;
|
|
1251
1262
|
}
|
|
@@ -2728,7 +2739,7 @@ async function artifactIdsByRunId(core, runs) {
|
|
|
2728
2739
|
}
|
|
2729
2740
|
return byRun;
|
|
2730
2741
|
}
|
|
2731
|
-
function emitEvalFailure(runs, failedRuns, artifactIds, parsed, io) {
|
|
2742
|
+
function emitEvalFailure(runs, failedRuns, artifactIds, coverage, deltas, parsed, io) {
|
|
2732
2743
|
const next = evalFailureNextCommand(failedRuns);
|
|
2733
2744
|
if (parsed.flags.json === true) {
|
|
2734
2745
|
io.stdout.write(`${JSON.stringify({
|
|
@@ -2740,13 +2751,17 @@ function emitEvalFailure(runs, failedRuns, artifactIds, parsed, io) {
|
|
|
2740
2751
|
evidenceSaved: true,
|
|
2741
2752
|
runs: runs.map((run) => runFailureSummary(run, artifactIds.get(run.id) ?? [])),
|
|
2742
2753
|
failedRuns: failedRuns.map((run) => runFailureSummary(run, artifactIds.get(run.id) ?? [])),
|
|
2754
|
+
coverage: coverage,
|
|
2755
|
+
deltas: deltas,
|
|
2743
2756
|
next,
|
|
2744
2757
|
}, null, 2)}\n`);
|
|
2745
2758
|
return 1;
|
|
2746
2759
|
}
|
|
2747
2760
|
io.stdout.write([
|
|
2748
2761
|
"Eval failed; evidence was saved.",
|
|
2749
|
-
|
|
2762
|
+
runs.map(formatRun).join("\n"),
|
|
2763
|
+
...formatEvalCoverageLines(coverage),
|
|
2764
|
+
...formatEvalDeltaLines(deltas),
|
|
2750
2765
|
...(next ? [`next: ${next}`] : []),
|
|
2751
2766
|
].join("\n") + "\n");
|
|
2752
2767
|
return 1;
|
|
@@ -3317,6 +3332,8 @@ async function evalCoverageSummaries(core, runs) {
|
|
|
3317
3332
|
const samples = new Set(jobs.map((job) => `${job.caseId}\0${job.sample}`));
|
|
3318
3333
|
return {
|
|
3319
3334
|
runId: run.id,
|
|
3335
|
+
skillName: run.skillName,
|
|
3336
|
+
agentName: run.agentName,
|
|
3320
3337
|
cases: cases.size,
|
|
3321
3338
|
samples: samples.size,
|
|
3322
3339
|
jobs: jobs.length,
|
|
@@ -3325,12 +3342,19 @@ async function evalCoverageSummaries(core, runs) {
|
|
|
3325
3342
|
};
|
|
3326
3343
|
});
|
|
3327
3344
|
}
|
|
3328
|
-
function
|
|
3345
|
+
function formatEvalCoverageLines(coverage) {
|
|
3346
|
+
const includeRunLabels = coverage.length > 1;
|
|
3347
|
+
return coverage.map((entry) => formatEvalCoverage(entry, includeRunLabels));
|
|
3348
|
+
}
|
|
3349
|
+
function formatEvalCoverage(coverage, includeRunLabels = false) {
|
|
3329
3350
|
return [
|
|
3330
3351
|
`coverage cases=${coverage.cases}`,
|
|
3331
3352
|
`samples=${coverage.samples}`,
|
|
3332
3353
|
`jobs=${coverage.jobs}`,
|
|
3333
3354
|
coverage.failed > 0 ? `failed=${coverage.failed}` : undefined,
|
|
3355
|
+
includeRunLabels ? `run=${displayRef(coverage.runId)}` : undefined,
|
|
3356
|
+
includeRunLabels ? `skill=${coverage.skillName}` : undefined,
|
|
3357
|
+
includeRunLabels ? `agent=${coverage.agentName}` : undefined,
|
|
3334
3358
|
].filter(Boolean).join(" ");
|
|
3335
3359
|
}
|
|
3336
3360
|
async function evalDeltas(core, runs) {
|
|
@@ -3356,16 +3380,23 @@ async function evalDeltas(core, runs) {
|
|
|
3356
3380
|
};
|
|
3357
3381
|
});
|
|
3358
3382
|
}
|
|
3359
|
-
function
|
|
3383
|
+
function formatEvalDeltaLines(deltas) {
|
|
3384
|
+
const includeRunLabels = deltas.length > 1;
|
|
3385
|
+
return deltas
|
|
3386
|
+
.map((delta) => formatEvalDelta(delta, includeRunLabels))
|
|
3387
|
+
.filter((line) => line.length > 0);
|
|
3388
|
+
}
|
|
3389
|
+
function formatEvalDelta(delta, includeRunLabels = false) {
|
|
3360
3390
|
if (delta.score === undefined) {
|
|
3361
3391
|
return "";
|
|
3362
3392
|
}
|
|
3393
|
+
const label = includeRunLabels ? `${delta.skillName}/${delta.agentName}` : delta.skillName;
|
|
3363
3394
|
const score = delta.score.toFixed(3);
|
|
3364
3395
|
if (delta.previousScore === undefined || delta.delta === undefined) {
|
|
3365
|
-
return `${
|
|
3396
|
+
return `${label} ${displayRef(delta.versionId)} ${score}`;
|
|
3366
3397
|
}
|
|
3367
3398
|
const sign = delta.delta >= 0 ? "+" : "";
|
|
3368
|
-
return `${
|
|
3399
|
+
return `${label} ${displayRef(delta.versionId)} ${score} (was ${delta.previousScore.toFixed(3)}, ${sign}${delta.delta.toFixed(3)})`;
|
|
3369
3400
|
}
|
|
3370
3401
|
async function evalSuccessNextCommand(core, runs) {
|
|
3371
3402
|
if (runs.length === 0) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@workbench-ai/workbench",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.82",
|
|
4
4
|
"repository": {
|
|
5
5
|
"type": "git",
|
|
6
6
|
"url": "git+https://github.com/workbench-ai/workbench.git",
|
|
@@ -22,10 +22,10 @@
|
|
|
22
22
|
"dependencies": {
|
|
23
23
|
"skills": "1.5.11",
|
|
24
24
|
"yaml": "^2.8.2",
|
|
25
|
-
"@workbench-ai/workbench-built-in-adapters": "0.0.
|
|
26
|
-
"@workbench-ai/workbench-
|
|
27
|
-
"@workbench-ai/workbench-
|
|
28
|
-
"@workbench-ai/workbench-core": "0.0.
|
|
25
|
+
"@workbench-ai/workbench-built-in-adapters": "0.0.82",
|
|
26
|
+
"@workbench-ai/workbench-contract": "0.0.82",
|
|
27
|
+
"@workbench-ai/workbench-protocol": "0.0.82",
|
|
28
|
+
"@workbench-ai/workbench-core": "0.0.82"
|
|
29
29
|
},
|
|
30
30
|
"devDependencies": {
|
|
31
31
|
"@tailwindcss/postcss": "^4.2.2",
|
|
@@ -36,7 +36,7 @@
|
|
|
36
36
|
"react-dom": "^19.2.0",
|
|
37
37
|
"typescript": "^5.9.2",
|
|
38
38
|
"vitest": "^3.2.4",
|
|
39
|
-
"@workbench-ai/workbench-ui": "0.0.
|
|
39
|
+
"@workbench-ai/workbench-ui": "0.0.82"
|
|
40
40
|
},
|
|
41
41
|
"scripts": {
|
|
42
42
|
"build": "rm -rf dist && tsc -p tsconfig.json && chmod 755 dist/workbench.js && node ./scripts/build-dev-open-assets.mjs",
|