@workbench-ai/workbench 0.0.79 → 0.0.81
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +182 -26
- package/package.json +6 -6
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAkEA,MAAM,WAAW,KAAK;IACpB,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;IAC9B,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;CAC/B;AAuTD,wBAAsB,MAAM,CAAC,IAAI,EAAE,SAAS,MAAM,EAAE,EAAE,EAAE,GAAE,KAGzD,GAAG,OAAO,CAAC,MAAM,CAAC,CAoMlB"}
|
package/dist/index.js
CHANGED
|
@@ -4,7 +4,7 @@ import { createRequire } from "node:module";
|
|
|
4
4
|
import os from "node:os";
|
|
5
5
|
import path from "node:path";
|
|
6
6
|
import { gzipSync } from "node:zlib";
|
|
7
|
-
import { addWorkbenchRemote, addWorkbenchAgent, compareWorkbench, createWorkbenchInspectionSnapshot, createWorkbenchAdapterAuthBundle, createWorkbenchReadOnlyInspectionSnapshot, diffWorkbenchVersions, evalWorkbenchSkill, improveWorkbenchSkill, initWorkbenchSkill, listWorkbenchAgents, listWorkbenchVersions, localWorkbenchAdapterAuthStore, parseWorkbenchAdapterAuthTarget, prepareWorkbenchCloudEvalRequest, prepareWorkbenchCloudImproveRequest, publishWorkbenchVersion, removeWorkbenchAgent, showWorkbenchRef, switchWorkbenchVersion, syncWorkbenchRemote, workbenchJobEvidenceForSnapshot, workbenchStatusSnapshot, WorkbenchCodedError, WorkbenchUserError, } from "@workbench-ai/workbench-core";
|
|
7
|
+
import { addWorkbenchRemote, addWorkbenchAgent, compareWorkbench, createWorkbenchVersionRuntimeSnapshot, createWorkbenchInspectionSnapshot, createWorkbenchAdapterAuthBundle, createWorkbenchReadOnlyInspectionSnapshot, diffWorkbenchVersions, evalWorkbenchSkill, improveWorkbenchSkill, initWorkbenchSkill, listWorkbenchAgents, listWorkbenchVersions, localWorkbenchAdapterAuthStore, parseWorkbenchAdapterAuthTarget, prepareWorkbenchCloudEvalRequest, prepareWorkbenchCloudImproveRequest, publishWorkbenchVersion, removeWorkbenchAgent, showWorkbenchRef, switchWorkbenchVersion, syncWorkbenchRemote, workbenchJobEvidenceForSnapshot, workbenchStatusSnapshot, WorkbenchCodedError, WorkbenchUserError, } from "@workbench-ai/workbench-core";
|
|
8
8
|
import { normalizeWorkbenchSkillName } from "@workbench-ai/workbench-contract";
|
|
9
9
|
import { emitError, emitResult } from "./output.js";
|
|
10
10
|
import { fanOutSkill, manualFanOutCommand } from "./fanout.js";
|
|
@@ -348,11 +348,11 @@ export async function runCli(argv, io = {
|
|
|
348
348
|
});
|
|
349
349
|
const artifactIds = await artifactIdsByRunId(core, runs);
|
|
350
350
|
const failedRuns = runs.filter((run) => run.status === "failed" || run.status === "canceled");
|
|
351
|
+
const coverage = await evalCoverageSummaries(core, runs);
|
|
352
|
+
const deltas = await evalDeltas(core, runs);
|
|
351
353
|
if (failedRuns.length > 0) {
|
|
352
|
-
return emitEvalFailure(runs, failedRuns, artifactIds, parsed, io);
|
|
354
|
+
return emitEvalFailure(runs, failedRuns, artifactIds, coverage, deltas, parsed, io);
|
|
353
355
|
}
|
|
354
|
-
const deltas = await evalDeltas(core, runs);
|
|
355
|
-
const coverage = await evalCoverageSummaries(core, runs);
|
|
356
356
|
const next = await evalSuccessNextCommand(core, runs);
|
|
357
357
|
return emitResult("workbench.cli.eval.v1", {
|
|
358
358
|
result: runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
|
|
@@ -361,8 +361,8 @@ export async function runCli(argv, io = {
|
|
|
361
361
|
next: next,
|
|
362
362
|
}, parsed, io, () => [
|
|
363
363
|
runs.map(formatRun).join("\n"),
|
|
364
|
-
...coverage
|
|
365
|
-
...deltas
|
|
364
|
+
...formatEvalCoverageLines(coverage),
|
|
365
|
+
...formatEvalDeltaLines(deltas),
|
|
366
366
|
...(next ? [`next: ${next}`] : []),
|
|
367
367
|
].filter(Boolean).join("\n"));
|
|
368
368
|
}
|
|
@@ -582,13 +582,15 @@ async function handleShow(parsed, io) {
|
|
|
582
582
|
if (artifact) {
|
|
583
583
|
return output(fileListing("artifact", artifact.id, artifact.files), parsed, io, () => formatFileListing("artifact", artifact.id, artifact.files));
|
|
584
584
|
}
|
|
585
|
-
const
|
|
586
|
-
const
|
|
587
|
-
|
|
585
|
+
const selection = runOrJobEvidenceSelection(snapshot, objectRef);
|
|
586
|
+
const details = evidenceDetailsForSelection(snapshot, selection);
|
|
587
|
+
const evidenceFiles = evidenceFilesForSelection(snapshot, selection);
|
|
588
|
+
if (selection.run || selection.jobs.length > 0 || details.length > 0 || evidenceFiles.length > 0) {
|
|
588
589
|
return output({
|
|
590
|
+
jobs: selection.jobs.map(jobEvidenceSummary),
|
|
589
591
|
details: details.map(evidenceDetailSummary),
|
|
590
592
|
files: evidenceFiles.map(fileSummary),
|
|
591
|
-
}, parsed, io, () => formatRunOrJobEvidence(details, evidenceFiles));
|
|
593
|
+
}, parsed, io, () => formatRunOrJobEvidence(selection.jobs, details, evidenceFiles));
|
|
592
594
|
}
|
|
593
595
|
const value = await showWorkbenchRef(ref, core);
|
|
594
596
|
return output(value, parsed, io, () => formatShow(value));
|
|
@@ -744,7 +746,8 @@ async function handleLogin(parsed, io) {
|
|
|
744
746
|
exitCode: 2,
|
|
745
747
|
});
|
|
746
748
|
}
|
|
747
|
-
const startOnly = parsed.flags["start-only"] === true
|
|
749
|
+
const startOnly = parsed.flags["start-only"] === true ||
|
|
750
|
+
(parsed.flags["no-open"] === true && parsed.flags.wait !== true && parsed.flags.timeout === undefined);
|
|
748
751
|
const waitOnly = parsed.flags.wait === true;
|
|
749
752
|
const timeoutSeconds = intFlag(parsed, "timeout");
|
|
750
753
|
if (startOnly && timeoutSeconds !== undefined) {
|
|
@@ -944,20 +947,23 @@ async function handleCloudEval(parsed, io) {
|
|
|
944
947
|
return 130;
|
|
945
948
|
}
|
|
946
949
|
const failedRuns = started.runs.filter((run) => run.status === "failed" || run.status === "canceled");
|
|
950
|
+
const coverage = await evalCoverageSummaries(started.core, started.runs);
|
|
951
|
+
const deltas = await evalDeltas(started.core, started.runs);
|
|
947
952
|
if (failedRuns.length > 0) {
|
|
948
|
-
return emitEvalFailure(started.runs, failedRuns, artifactIds, parsed, io);
|
|
953
|
+
return emitEvalFailure(started.runs, failedRuns, artifactIds, coverage, deltas, parsed, io);
|
|
949
954
|
}
|
|
950
|
-
const deltas = await evalDeltas(started.core, started.runs);
|
|
951
955
|
const next = await evalSuccessNextCommand(started.core, started.runs);
|
|
952
956
|
return emitResult("workbench.cli.eval.v1", {
|
|
953
957
|
result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
|
|
958
|
+
coverage: coverage,
|
|
954
959
|
deltas: deltas,
|
|
955
960
|
next: next,
|
|
956
961
|
cloud: cloudExecutionSummary(started),
|
|
957
962
|
}, parsed, io, () => [
|
|
958
963
|
`Completed hosted eval on ${started.remote.url}.`,
|
|
959
964
|
started.runs.map(formatRun).join("\n"),
|
|
960
|
-
...
|
|
965
|
+
...formatEvalCoverageLines(coverage),
|
|
966
|
+
...formatEvalDeltaLines(deltas),
|
|
961
967
|
...(next ? [`next: ${next}`] : []),
|
|
962
968
|
].filter(Boolean).join("\n"));
|
|
963
969
|
}
|
|
@@ -1019,6 +1025,9 @@ function installNextCommand(fanout) {
|
|
|
1019
1025
|
}
|
|
1020
1026
|
function formatInstallOutcome(result, dryRun) {
|
|
1021
1027
|
if (dryRun) {
|
|
1028
|
+
if (result.result === "unchanged") {
|
|
1029
|
+
return `Already installed ${result.directoryName} at ${result.destination} (unchanged; dry run made no changes).`;
|
|
1030
|
+
}
|
|
1022
1031
|
return `Would install ${result.directoryName} to ${result.destination} (${formatFileCount(result.filesCopied)}).`;
|
|
1023
1032
|
}
|
|
1024
1033
|
if (result.result === "unchanged") {
|
|
@@ -1082,7 +1091,7 @@ function withTimeout(promise, timeoutMs) {
|
|
|
1082
1091
|
}
|
|
1083
1092
|
async function startCloudExecution(command, parsed, io) {
|
|
1084
1093
|
const root = dirFlag(parsed) ?? process.cwd();
|
|
1085
|
-
const showProgress =
|
|
1094
|
+
const showProgress = true;
|
|
1086
1095
|
const interrupt = createCloudInterruptController(command, io, showProgress);
|
|
1087
1096
|
try {
|
|
1088
1097
|
writeCloudProgress(io, `workbench cloud: preparing hosted ${command}.`, showProgress);
|
|
@@ -1118,6 +1127,19 @@ async function startCloudExecution(command, parsed, io) {
|
|
|
1118
1127
|
samples: intFlag(parsed, "samples"),
|
|
1119
1128
|
budget: intFlag(parsed, "budget"),
|
|
1120
1129
|
}));
|
|
1130
|
+
const adapterAuthTargets = await cloudPreScheduleStep(command, interrupt, resolveCloudAdapterAuthTargets({
|
|
1131
|
+
root,
|
|
1132
|
+
versionId: request.versionId,
|
|
1133
|
+
parsed,
|
|
1134
|
+
authToken: token,
|
|
1135
|
+
}));
|
|
1136
|
+
if (adapterAuthTargets.length > 0) {
|
|
1137
|
+
writeCloudProgress(io, "workbench cloud: checking provider auth.", showProgress);
|
|
1138
|
+
await cloudPreScheduleStep(command, interrupt, assertCloudAdapterAuthConnected({
|
|
1139
|
+
baseUrl: source.baseUrl,
|
|
1140
|
+
targets: adapterAuthTargets,
|
|
1141
|
+
}));
|
|
1142
|
+
}
|
|
1121
1143
|
writeCloudProgress(io, "workbench cloud: syncing source to cloud.", showProgress);
|
|
1122
1144
|
const syncBefore = await cloudPreScheduleStep(command, interrupt, syncWorkbenchRemote({ ...core, remote: remote.name }));
|
|
1123
1145
|
writeCloudProgress(io, `workbench cloud: scheduling hosted ${command}.`, showProgress);
|
|
@@ -1216,6 +1238,82 @@ function cloudCanceledBeforeRunIdError(command) {
|
|
|
1216
1238
|
exitCode: 130,
|
|
1217
1239
|
});
|
|
1218
1240
|
}
|
|
1241
|
+
async function resolveCloudAdapterAuthTargets(input) {
|
|
1242
|
+
const snapshot = await createWorkbenchReadOnlyInspectionSnapshot({ dir: input.root, authToken: input.authToken });
|
|
1243
|
+
const version = snapshotVersionByRef(snapshot, input.versionId);
|
|
1244
|
+
if (!version) {
|
|
1245
|
+
throw new WorkbenchCodedError("version_not_found", `Version not found: ${input.versionId}`, {
|
|
1246
|
+
remediation: "Run workbench status.",
|
|
1247
|
+
subject: { versionId: input.versionId },
|
|
1248
|
+
exitCode: 1,
|
|
1249
|
+
});
|
|
1250
|
+
}
|
|
1251
|
+
const runtime = await createWorkbenchVersionRuntimeSnapshot(version, {
|
|
1252
|
+
skill: stringFlag(input.parsed, "skills"),
|
|
1253
|
+
agent: stringFlag(input.parsed, "agents"),
|
|
1254
|
+
authToken: input.authToken,
|
|
1255
|
+
});
|
|
1256
|
+
return uniqueAdapterAuthTargets(runtime.selectedAgents.flatMap(cloudAdapterAuthTargetsForAgent));
|
|
1257
|
+
}
|
|
1258
|
+
async function assertCloudAdapterAuthConnected(input) {
|
|
1259
|
+
const targets = uniqueAdapterAuthTargets(input.targets);
|
|
1260
|
+
if (targets.length === 0) {
|
|
1261
|
+
return;
|
|
1262
|
+
}
|
|
1263
|
+
const statuses = await fetchCloudAdapterAuthStatuses(input.baseUrl);
|
|
1264
|
+
const missing = targets.find((target) => !statuses.some((status) => adapterAuthStatusMatchesTarget(status, target)));
|
|
1265
|
+
if (!missing) {
|
|
1266
|
+
return;
|
|
1267
|
+
}
|
|
1268
|
+
throw new WorkbenchCodedError("adapter_auth_required", `${formatCloudAdapterAuthTarget(missing)} disconnected. Run workbench login ${missing.adapterId}.`, {
|
|
1269
|
+
remediation: `Run workbench login ${missing.adapterId}.`,
|
|
1270
|
+
subject: {
|
|
1271
|
+
adapterId: missing.adapterId,
|
|
1272
|
+
profile: missing.profile,
|
|
1273
|
+
...(missing.slot ? { slot: missing.slot } : {}),
|
|
1274
|
+
},
|
|
1275
|
+
exitCode: 1,
|
|
1276
|
+
});
|
|
1277
|
+
}
|
|
1278
|
+
function cloudAdapterAuthTargetsForAgent(agent) {
|
|
1279
|
+
const adapterId = agent.adapter.trim().toLowerCase();
|
|
1280
|
+
if (adapterId !== "codex" && adapterId !== "claude") {
|
|
1281
|
+
return [];
|
|
1282
|
+
}
|
|
1283
|
+
const auth = agent.config.auth;
|
|
1284
|
+
if (typeof auth === "string" && auth.trim()) {
|
|
1285
|
+
return [{ adapterId, profile: auth.trim() }];
|
|
1286
|
+
}
|
|
1287
|
+
if (auth && typeof auth === "object" && !Array.isArray(auth)) {
|
|
1288
|
+
return Object.entries(auth)
|
|
1289
|
+
.filter((entry) => typeof entry[1] === "string" && entry[1].trim().length > 0)
|
|
1290
|
+
.map(([slot, profile]) => ({ adapterId, slot, profile: profile.trim() }));
|
|
1291
|
+
}
|
|
1292
|
+
return [{ adapterId, profile: "default" }];
|
|
1293
|
+
}
|
|
1294
|
+
function uniqueAdapterAuthTargets(targets) {
|
|
1295
|
+
const byKey = new Map();
|
|
1296
|
+
for (const target of targets) {
|
|
1297
|
+
byKey.set(adapterAuthTargetKey(target), target);
|
|
1298
|
+
}
|
|
1299
|
+
return [...byKey.values()].sort((left, right) => adapterAuthTargetKey(left).localeCompare(adapterAuthTargetKey(right)));
|
|
1300
|
+
}
|
|
1301
|
+
async function fetchCloudAdapterAuthStatuses(baseUrl) {
|
|
1302
|
+
const response = await apiRequest("/api/workbench/auth/adapters", {}, baseUrl);
|
|
1303
|
+
return response.adapters ?? [];
|
|
1304
|
+
}
|
|
1305
|
+
function adapterAuthStatusMatchesTarget(status, target) {
|
|
1306
|
+
return status.status === "connected" &&
|
|
1307
|
+
status.adapterId === target.adapterId &&
|
|
1308
|
+
status.profile === target.profile &&
|
|
1309
|
+
(status.slot ?? undefined) === (target.slot ?? undefined);
|
|
1310
|
+
}
|
|
1311
|
+
function adapterAuthTargetKey(target) {
|
|
1312
|
+
return `${target.adapterId}/${target.slot ?? "_"}/${target.profile}`;
|
|
1313
|
+
}
|
|
1314
|
+
function formatCloudAdapterAuthTarget(target) {
|
|
1315
|
+
return `${target.adapterId}${target.slot ? `/${target.slot}` : ""}`;
|
|
1316
|
+
}
|
|
1219
1317
|
async function waitForCloudRuns(input) {
|
|
1220
1318
|
const runIds = input.runs
|
|
1221
1319
|
.map((run) => run.id)
|
|
@@ -2641,7 +2739,7 @@ async function artifactIdsByRunId(core, runs) {
|
|
|
2641
2739
|
}
|
|
2642
2740
|
return byRun;
|
|
2643
2741
|
}
|
|
2644
|
-
function emitEvalFailure(runs, failedRuns, artifactIds, parsed, io) {
|
|
2742
|
+
function emitEvalFailure(runs, failedRuns, artifactIds, coverage, deltas, parsed, io) {
|
|
2645
2743
|
const next = evalFailureNextCommand(failedRuns);
|
|
2646
2744
|
if (parsed.flags.json === true) {
|
|
2647
2745
|
io.stdout.write(`${JSON.stringify({
|
|
@@ -2653,13 +2751,17 @@ function emitEvalFailure(runs, failedRuns, artifactIds, parsed, io) {
|
|
|
2653
2751
|
evidenceSaved: true,
|
|
2654
2752
|
runs: runs.map((run) => runFailureSummary(run, artifactIds.get(run.id) ?? [])),
|
|
2655
2753
|
failedRuns: failedRuns.map((run) => runFailureSummary(run, artifactIds.get(run.id) ?? [])),
|
|
2754
|
+
coverage: coverage,
|
|
2755
|
+
deltas: deltas,
|
|
2656
2756
|
next,
|
|
2657
2757
|
}, null, 2)}\n`);
|
|
2658
2758
|
return 1;
|
|
2659
2759
|
}
|
|
2660
2760
|
io.stdout.write([
|
|
2661
2761
|
"Eval failed; evidence was saved.",
|
|
2662
|
-
|
|
2762
|
+
runs.map(formatRun).join("\n"),
|
|
2763
|
+
...formatEvalCoverageLines(coverage),
|
|
2764
|
+
...formatEvalDeltaLines(deltas),
|
|
2663
2765
|
...(next ? [`next: ${next}`] : []),
|
|
2664
2766
|
].join("\n") + "\n");
|
|
2665
2767
|
return 1;
|
|
@@ -2876,6 +2978,16 @@ function displayRef(id) {
|
|
|
2876
2978
|
function shortenCommandRefs(command) {
|
|
2877
2979
|
return command.replace(/\b(?:v_[0-9a-f]{8,}|(?:run|job|trace|artifact)_[a-z0-9_-]+)/giu, (match) => displayRef(match));
|
|
2878
2980
|
}
|
|
2981
|
+
function displayCandidateRefs(ids) {
|
|
2982
|
+
const uniqueIds = [...ids];
|
|
2983
|
+
for (let length = 8; length <= 32; length += 1) {
|
|
2984
|
+
const refs = uniqueIds.map((id) => id.length > length ? id.slice(0, length) : id);
|
|
2985
|
+
if (new Set(refs).size === refs.length) {
|
|
2986
|
+
return refs;
|
|
2987
|
+
}
|
|
2988
|
+
}
|
|
2989
|
+
return uniqueIds;
|
|
2990
|
+
}
|
|
2879
2991
|
function snapshotVersionByRef(snapshot, ref) {
|
|
2880
2992
|
const requested = ref.trim();
|
|
2881
2993
|
const normalized = requested === "current" ? snapshot.refs.current ?? "" : requested;
|
|
@@ -2884,7 +2996,7 @@ function snapshotVersionByRef(snapshot, ref) {
|
|
|
2884
2996
|
}
|
|
2885
2997
|
const candidates = snapshot.versions.filter((version) => snapshotVersionRefMatches(version, normalized));
|
|
2886
2998
|
if (candidates.length > 1) {
|
|
2887
|
-
throw new WorkbenchCodedError("ref_ambiguous", `Version ref is ambiguous: ${ref}. Candidates: ${candidates.map((version) =>
|
|
2999
|
+
throw new WorkbenchCodedError("ref_ambiguous", `Version ref is ambiguous: ${ref}. Candidates: ${displayCandidateRefs(candidates.map((version) => version.id)).join(", ")}.`, {
|
|
2888
3000
|
subject: { ref, candidates: candidates.map((version) => version.id) },
|
|
2889
3001
|
exitCode: 2,
|
|
2890
3002
|
});
|
|
@@ -2907,7 +3019,7 @@ function snapshotObjectByRef(entries, ref, kind) {
|
|
|
2907
3019
|
}
|
|
2908
3020
|
const candidates = entries.filter((entry) => objectRefMatches(entry.id, normalized));
|
|
2909
3021
|
if (candidates.length > 1) {
|
|
2910
|
-
throw new WorkbenchCodedError("ref_ambiguous", `${capitalize(kind)} ref is ambiguous: ${ref}. Candidates: ${candidates.map((entry) =>
|
|
3022
|
+
throw new WorkbenchCodedError("ref_ambiguous", `${capitalize(kind)} ref is ambiguous: ${ref}. Candidates: ${displayCandidateRefs(candidates.map((entry) => entry.id)).slice(0, 8).join(", ")}.`, {
|
|
2911
3023
|
subject: { ref, candidates: candidates.map((entry) => entry.id).slice(0, 20) },
|
|
2912
3024
|
exitCode: 2,
|
|
2913
3025
|
});
|
|
@@ -2928,7 +3040,7 @@ function runOrJobEvidenceSelection(snapshot, ref) {
|
|
|
2928
3040
|
const run = snapshotObjectByRef(snapshot.runs, ref, "run");
|
|
2929
3041
|
const job = snapshotObjectByRef(snapshot.jobs, ref, "job");
|
|
2930
3042
|
if (run && job) {
|
|
2931
|
-
throw new WorkbenchCodedError("ref_ambiguous", `Run/job ref is ambiguous: ${ref}. Candidates: ${
|
|
3043
|
+
throw new WorkbenchCodedError("ref_ambiguous", `Run/job ref is ambiguous: ${ref}. Candidates: ${displayCandidateRefs([run.id, job.id]).join(", ")}.`, {
|
|
2932
3044
|
subject: { ref, candidates: [run.id, job.id] },
|
|
2933
3045
|
exitCode: 2,
|
|
2934
3046
|
});
|
|
@@ -2943,6 +3055,9 @@ function runOrJobEvidenceSelection(snapshot, ref) {
|
|
|
2943
3055
|
}
|
|
2944
3056
|
function evidenceFilesForRunOrJob(snapshot, ref) {
|
|
2945
3057
|
const selection = runOrJobEvidenceSelection(snapshot, ref);
|
|
3058
|
+
return evidenceFilesForSelection(snapshot, selection);
|
|
3059
|
+
}
|
|
3060
|
+
function evidenceFilesForSelection(snapshot, selection) {
|
|
2946
3061
|
if (!selection.run && selection.jobs.length === 0) {
|
|
2947
3062
|
return [];
|
|
2948
3063
|
}
|
|
@@ -3024,10 +3139,32 @@ function isUserFacingTraceEvidenceFile(file) {
|
|
|
3024
3139
|
function evidencePathSegment(value) {
|
|
3025
3140
|
return value.replace(/[^A-Za-z0-9._-]+/gu, "-") || "_";
|
|
3026
3141
|
}
|
|
3027
|
-
function formatRunOrJobEvidence(details, files) {
|
|
3142
|
+
function formatRunOrJobEvidence(jobs, details, files) {
|
|
3143
|
+
const jobLines = jobs.length > 0 ? ["Jobs:", ...jobs.map(formatJobEvidenceSummary)] : [];
|
|
3028
3144
|
const detailLines = details.map(formatTraceDetail).filter(Boolean);
|
|
3029
3145
|
const fileLines = files.length > 0 ? ["Files:", ...files.map((file) => file.path)] : [];
|
|
3030
|
-
return [...detailLines, ...fileLines].join("\n") || "No evidence.";
|
|
3146
|
+
return [...jobLines, ...detailLines, ...fileLines].join("\n") || "No evidence.";
|
|
3147
|
+
}
|
|
3148
|
+
function jobEvidenceSummary(job) {
|
|
3149
|
+
return {
|
|
3150
|
+
id: job.id,
|
|
3151
|
+
runId: job.runId,
|
|
3152
|
+
caseId: job.caseId,
|
|
3153
|
+
sample: job.sample,
|
|
3154
|
+
status: job.status,
|
|
3155
|
+
...(job.score !== undefined ? { score: job.score } : {}),
|
|
3156
|
+
...(job.error ? { error: job.error } : {}),
|
|
3157
|
+
};
|
|
3158
|
+
}
|
|
3159
|
+
function formatJobEvidenceSummary(job) {
|
|
3160
|
+
return [
|
|
3161
|
+
displayRef(job.id),
|
|
3162
|
+
`case=${job.caseId}`,
|
|
3163
|
+
`sample=${job.sample}`,
|
|
3164
|
+
job.status,
|
|
3165
|
+
job.score !== undefined ? `score=${job.score.toFixed(3)}` : undefined,
|
|
3166
|
+
job.error ? `error=${singleLine(job.error)}` : undefined,
|
|
3167
|
+
].filter(Boolean).join("\t");
|
|
3031
3168
|
}
|
|
3032
3169
|
function evidenceDetailSummary(detail) {
|
|
3033
3170
|
return {
|
|
@@ -3103,6 +3240,9 @@ async function fileForRunOrJobRef(core, objectRef, requestedPath) {
|
|
|
3103
3240
|
}
|
|
3104
3241
|
function evidenceDetailsForRunOrJob(snapshot, ref) {
|
|
3105
3242
|
const selection = runOrJobEvidenceSelection(snapshot, ref);
|
|
3243
|
+
return evidenceDetailsForSelection(snapshot, selection);
|
|
3244
|
+
}
|
|
3245
|
+
function evidenceDetailsForSelection(snapshot, selection) {
|
|
3106
3246
|
return selection.jobs.flatMap((entry) => {
|
|
3107
3247
|
const detail = workbenchJobEvidenceForSnapshot(snapshot, {
|
|
3108
3248
|
runId: entry.runId,
|
|
@@ -3192,6 +3332,8 @@ async function evalCoverageSummaries(core, runs) {
|
|
|
3192
3332
|
const samples = new Set(jobs.map((job) => `${job.caseId}\0${job.sample}`));
|
|
3193
3333
|
return {
|
|
3194
3334
|
runId: run.id,
|
|
3335
|
+
skillName: run.skillName,
|
|
3336
|
+
agentName: run.agentName,
|
|
3195
3337
|
cases: cases.size,
|
|
3196
3338
|
samples: samples.size,
|
|
3197
3339
|
jobs: jobs.length,
|
|
@@ -3200,12 +3342,19 @@ async function evalCoverageSummaries(core, runs) {
|
|
|
3200
3342
|
};
|
|
3201
3343
|
});
|
|
3202
3344
|
}
|
|
3203
|
-
function
|
|
3345
|
+
function formatEvalCoverageLines(coverage) {
|
|
3346
|
+
const includeRunLabels = coverage.length > 1;
|
|
3347
|
+
return coverage.map((entry) => formatEvalCoverage(entry, includeRunLabels));
|
|
3348
|
+
}
|
|
3349
|
+
function formatEvalCoverage(coverage, includeRunLabels = false) {
|
|
3204
3350
|
return [
|
|
3205
3351
|
`coverage cases=${coverage.cases}`,
|
|
3206
3352
|
`samples=${coverage.samples}`,
|
|
3207
3353
|
`jobs=${coverage.jobs}`,
|
|
3208
3354
|
coverage.failed > 0 ? `failed=${coverage.failed}` : undefined,
|
|
3355
|
+
includeRunLabels ? `run=${displayRef(coverage.runId)}` : undefined,
|
|
3356
|
+
includeRunLabels ? `skill=${coverage.skillName}` : undefined,
|
|
3357
|
+
includeRunLabels ? `agent=${coverage.agentName}` : undefined,
|
|
3209
3358
|
].filter(Boolean).join(" ");
|
|
3210
3359
|
}
|
|
3211
3360
|
async function evalDeltas(core, runs) {
|
|
@@ -3231,16 +3380,23 @@ async function evalDeltas(core, runs) {
|
|
|
3231
3380
|
};
|
|
3232
3381
|
});
|
|
3233
3382
|
}
|
|
3234
|
-
function
|
|
3383
|
+
function formatEvalDeltaLines(deltas) {
|
|
3384
|
+
const includeRunLabels = deltas.length > 1;
|
|
3385
|
+
return deltas
|
|
3386
|
+
.map((delta) => formatEvalDelta(delta, includeRunLabels))
|
|
3387
|
+
.filter((line) => line.length > 0);
|
|
3388
|
+
}
|
|
3389
|
+
function formatEvalDelta(delta, includeRunLabels = false) {
|
|
3235
3390
|
if (delta.score === undefined) {
|
|
3236
3391
|
return "";
|
|
3237
3392
|
}
|
|
3393
|
+
const label = includeRunLabels ? `${delta.skillName}/${delta.agentName}` : delta.skillName;
|
|
3238
3394
|
const score = delta.score.toFixed(3);
|
|
3239
3395
|
if (delta.previousScore === undefined || delta.delta === undefined) {
|
|
3240
|
-
return `${
|
|
3396
|
+
return `${label} ${displayRef(delta.versionId)} ${score}`;
|
|
3241
3397
|
}
|
|
3242
3398
|
const sign = delta.delta >= 0 ? "+" : "";
|
|
3243
|
-
return `${
|
|
3399
|
+
return `${label} ${displayRef(delta.versionId)} ${score} (was ${delta.previousScore.toFixed(3)}, ${sign}${delta.delta.toFixed(3)})`;
|
|
3244
3400
|
}
|
|
3245
3401
|
async function evalSuccessNextCommand(core, runs) {
|
|
3246
3402
|
if (runs.length === 0) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@workbench-ai/workbench",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.81",
|
|
4
4
|
"repository": {
|
|
5
5
|
"type": "git",
|
|
6
6
|
"url": "git+https://github.com/workbench-ai/workbench.git",
|
|
@@ -22,10 +22,10 @@
|
|
|
22
22
|
"dependencies": {
|
|
23
23
|
"skills": "1.5.11",
|
|
24
24
|
"yaml": "^2.8.2",
|
|
25
|
-
"@workbench-ai/workbench-
|
|
26
|
-
"@workbench-ai/workbench-
|
|
27
|
-
"@workbench-ai/workbench-
|
|
28
|
-
"@workbench-ai/workbench-protocol": "0.0.
|
|
25
|
+
"@workbench-ai/workbench-contract": "0.0.81",
|
|
26
|
+
"@workbench-ai/workbench-built-in-adapters": "0.0.81",
|
|
27
|
+
"@workbench-ai/workbench-core": "0.0.81",
|
|
28
|
+
"@workbench-ai/workbench-protocol": "0.0.81"
|
|
29
29
|
},
|
|
30
30
|
"devDependencies": {
|
|
31
31
|
"@tailwindcss/postcss": "^4.2.2",
|
|
@@ -36,7 +36,7 @@
|
|
|
36
36
|
"react-dom": "^19.2.0",
|
|
37
37
|
"typescript": "^5.9.2",
|
|
38
38
|
"vitest": "^3.2.4",
|
|
39
|
-
"@workbench-ai/workbench-ui": "0.0.
|
|
39
|
+
"@workbench-ai/workbench-ui": "0.0.81"
|
|
40
40
|
},
|
|
41
41
|
"scripts": {
|
|
42
42
|
"build": "rm -rf dist && tsc -p tsconfig.json && chmod 755 dist/workbench.js && node ./scripts/build-dev-open-assets.mjs",
|