npm - @workbench-ai/workbench - Versions diffs - 0.0.71 → 0.0.72 - Mend

@workbench-ai/workbench 0.0.71 → 0.0.72

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/index.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AA2DA,MAAM,WAAW,KAAK;IACpB,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;IAC9B,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;CAC/B;AAkUD,wBAAsB,MAAM,CAAC,IAAI,EAAE,SAAS,MAAM,EAAE,EAAE,EAAE,GAAE,KAGzD,GAAG,OAAO,CAAC,MAAM,CAAC,CAwMlB"}
1	+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AA2DA,MAAM,WAAW,KAAK;IACpB,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;IAC9B,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;CAC/B;AAsUD,wBAAsB,MAAM,CAAC,IAAI,EAAE,SAAS,MAAM,EAAE,EAAE,EAAE,GAAE,KAGzD,GAAG,OAAO,CAAC,MAAM,CAAC,CAiMlB"}

package/dist/index.js CHANGED Viewed

@@ -44,7 +44,7 @@ const HELP_ALL = [
     "  workbench show REF[:PATH] [--json]",
     "  workbench diff [A..B] [--json]",
     "  workbench switch VERSION [--json]",
-    "  workbench open [--host HOST] [--port PORT] [--no-open] [--json]",
+    "  workbench open [--host HOST] [--port PORT] [--no-open]",
     "",
     "Configure:",
     "  workbench case add RUN_ID [--json]",
@@ -162,9 +162,9 @@ const COMMAND_HELP = {
     ].join("\n"),
     open: [
         "Usage:",
-        "  workbench open [--host HOST] [--port PORT] [--no-open] [--json]",
+        "  workbench open [--host HOST] [--port PORT] [--no-open]",
         "",
-        "Serves or emits the read-only Workbench inspection snapshot.",
+        "Serves the read-only Workbench inspection UI.",
         "",
         "Example:",
         "  workbench open --no-open",
@@ -221,9 +221,12 @@ const COMMAND_HELP = {
 const COMMON_FLAGS = {
     json: "boolean",
 };
+const DIR_FLAG = {
+    dir: "string",
+};
 const PROJECT_FLAGS = {
     ...COMMON_FLAGS,
-    dir: "string",
+    ...DIR_FLAG,
 };
 const HELP_FLAG = {
     help: "boolean",
@@ -270,7 +273,7 @@ const COMMAND_FLAGS = {
     },
     logout: { ...COMMON_FLAGS, ...HELP_FLAG },
     new: { ...PROJECT_FLAGS, ...HELP_FLAG },
-    open: { ...PROJECT_FLAGS, ...HELP_FLAG, host: "string", "no-open": "boolean", port: "positive-integer" },
+    open: { ...DIR_FLAG, ...HELP_FLAG, host: "string", "no-open": "boolean", port: "port" },
     publish: {
         ...PROJECT_FLAGS,
         ...HELP_FLAG,
@@ -359,13 +362,16 @@ export async function runCli(argv, io = {
                 return emitEvalFailure(runs, failedRuns, artifactIds, parsed, io);
             }
             const deltas = await evalDeltas(core, runs);
+            const coverage = await evalCoverageSummaries(core, runs);
             const next = await evalSuccessNextCommand(core, runs);
             return emitResult("workbench.cli.eval.v1", {
                 result: runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
+                coverage: coverage,
                 deltas: deltas,
                 next: next,
             }, parsed, io, () => [
                 runs.map(formatRun).join("\n"),
+                ...coverage.map(formatEvalCoverage),
                 ...deltas.map(formatEvalDelta),
                 ...(next ? [`next: ${next}`] : []),
             ].filter(Boolean).join("\n"));
@@ -449,10 +455,7 @@ export async function runCli(argv, io = {
                     pinnedInstallUrl: preview.pinnedInstallUrl,
                     dryRun: true,
                 }, parsed, io, () => [
-                    `Would publish ${displayRef(preview.version.id)} to remote ${preview.remote.name}.`,
-                    `Visibility: ${preview.visibility}`,
-                    `Install: ${preview.installUrl}`,
-                    `Pinned: ${preview.pinnedInstallUrl}`,
+                    `Would publish ${displayRef(preview.version.id)} as ${preview.installHandle} (${preview.visibility}).`,
                     `next: workbench install ${preview.installHandle}`,
                 ].join("\n"));
             }
@@ -473,25 +476,18 @@ export async function runCli(argv, io = {
                 pinnedInstallUrl: result.pinnedInstallUrl,
                 ...(result.dryRun ? { dryRun: true } : {}),
             }, parsed, io, () => [
-                `${result.dryRun ? "Would publish" : "Published"} ${displayRef(result.version.id)} to remote ${result.remote.name}.`,
-                `Visibility: ${result.visibility}`,
-                `Install: ${result.installUrl}`,
-                `Pinned: ${result.pinnedInstallUrl}`,
+                `${result.dryRun ? "Would publish" : "Published"} ${displayRef(result.version.id)} as ${result.installHandle} (${result.visibility}).`,
                 `next: workbench install ${result.installHandle}`,
             ].join("\n"));
         }
         if (command === "open") {
-            if (parsed.flags.json === true) {
-                const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
-                return output(manifestOnly(snapshot), parsed, io, () => "Read-only Workbench inspection data is available with --json.");
-            }
             // The browser server serves committed object state through a read-only
             // snapshot path, so long-running commands do not block page loads.
             const server = await startWorkbenchOpenServer({
                 dir: dirFlag(parsed),
                 authToken: core.authToken,
                 host: stringFlag(parsed, "host"),
-                port: intFlag(parsed, "port"),
+                port: portFlag(parsed, "port"),
             });
             io.stdout.write(`Workbench: ${server.url}\n`);
             if (parsed.flags["no-open"] !== true) {
@@ -506,9 +502,10 @@ export async function runCli(argv, io = {
     }
 }
 async function handleStatus(parsed, io) {
-    const status = await workbenchStatusSnapshot(await coreOptions(parsed));
+    const core = await coreOptions(parsed);
+    const status = await workbenchStatusSnapshot(core);
     const auth = await workbenchCliAuthStatus();
-    const cliStatus = statusWithCausalNext(status, auth);
+    const cliStatus = await statusWithCausalNext(status, auth, core);
     return emitResult("workbench.status.v1", {
         project: cliStatus.project,
         worktree: cliStatus.worktree,
@@ -557,7 +554,7 @@ async function handleLog(parsed, io) {
             versionId: run.versionId,
             skillName: run.skillName,
             agentName: run.agentName,
-            ...(run.score !== undefined ? { score: run.score } : {}),
+            ...(scoredRunValue(run) !== undefined ? { score: scoredRunValue(run) } : {}),
         })) : []),
     ].sort((left, right) => right.createdAt.localeCompare(left.createdAt));
     return emitResult("workbench.cli.log.v1", {
@@ -597,7 +594,7 @@ async function handleShow(parsed, io) {
     const evidenceFiles = evidenceFilesForRunOrJob(snapshot, objectRef);
     if (details.length > 0 || evidenceFiles.length > 0) {
         return output({
-            details: details,
+            details: details.map(evidenceDetailSummary),
             files: evidenceFiles.map(fileSummary),
         }, parsed, io, () => formatRunOrJobEvidence(details, evidenceFiles));
     }
@@ -733,11 +730,14 @@ function validateFlagValue(name, value, kind) {
     if (typeof value !== "string" || !value.trim()) {
         throw new WorkbenchUserError(`--${name} requires a value.`);
     }
-    if (kind === "positive-integer") {
+    if (kind === "positive-integer" || kind === "port") {
         const parsedValue = Number(value);
-        if (!Number.isInteger(parsedValue) || parsedValue <= 0) {
+        if (kind === "positive-integer" && (!Number.isInteger(parsedValue) || parsedValue <= 0)) {
             throw new WorkbenchUserError(`--${name} must be a positive integer.`);
         }
+        if (kind === "port" && (!Number.isInteger(parsedValue) || parsedValue < 0 || parsedValue > 65535)) {
+            throw new WorkbenchUserError(`--${name} must be an integer between 0 and 65535.`);
+        }
     }
 }
 const CONFIG_SCHEMA = "workbench.cli.config.v1";
@@ -926,8 +926,22 @@ async function handleInstall(parsed, io) {
     ].join("\n"));
 }
 async function handleCloudEval(parsed, io) {
-    const started = await startCloudExecution("eval", parsed);
+    const started = await startCloudExecution("eval", parsed, io);
     const artifactIds = await artifactIdsByRunId(started.core, started.runs);
+    if (started.detached) {
+        const next = cloudDetachedNextCommand(started.runs);
+        emitResult("workbench.cli.eval.v1", {
+            result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
+            detached: true,
+            next: next,
+            cloud: cloudExecutionSummary(started),
+        }, parsed, io, () => [
+            `Detached from hosted eval on ${started.remote.url}.`,
+            started.runs.map(formatRun).join("\n"),
+            ...(next ? [`next: ${next}`] : []),
+        ].filter(Boolean).join("\n"));
+        return 130;
+    }
     const failedRuns = started.runs.filter((run) => run.status === "failed" || run.status === "canceled");
     if (failedRuns.length > 0) {
         return emitEvalFailure(started.runs, failedRuns, artifactIds, parsed, io);
@@ -947,8 +961,22 @@ async function handleCloudEval(parsed, io) {
     ].filter(Boolean).join("\n"));
 }
 async function handleCloudImprove(parsed, io) {
-    const started = await startCloudExecution("improve", parsed);
+    const started = await startCloudExecution("improve", parsed, io);
     const artifactIds = await artifactIdsByRunId(started.core, started.runs);
+    if (started.detached) {
+        const next = cloudDetachedNextCommand(started.runs);
+        emitResult("workbench.cli.improve.v1", {
+            result: started.runs.map((run) => runSummary(run, artifactIds.get(run.id) ?? [])),
+            detached: true,
+            next: next,
+            cloud: cloudExecutionSummary(started),
+        }, parsed, io, () => [
+            `Detached from hosted improve on ${started.remote.url}.`,
+            started.runs.map(formatRun).join("\n"),
+            ...(next ? [`next: ${next}`] : []),
+        ].filter(Boolean).join("\n"));
+        return 130;
+    }
     const failedRuns = started.runs.filter((run) => run.status === "failed" || run.status === "canceled");
     if (failedRuns.length > 0) {
         const first = failedRuns[0];
@@ -1014,7 +1042,7 @@ async function pathExists(filePath) {
         return false;
     }
 }
-async function startCloudExecution(command, parsed) {
+async function startCloudExecution(command, parsed, io) {
     const root = dirFlag(parsed) ?? process.cwd();
     const remote = await ensureCloudRemoteForExecution(root, parsed);
     const source = parseWorkbenchInstallSource(remote.url);
@@ -1033,7 +1061,9 @@ async function startCloudExecution(command, parsed) {
         });
     }
     const core = { dir: root, authToken: token };
+    writeCloudProgress(io, `workbench cloud: syncing ${remote.name} before hosted ${command}.`);
     const syncBefore = await syncWorkbenchRemote({ ...core, remote: remote.name });
+    writeCloudProgress(io, `workbench cloud: synced ${remote.name} before hosted ${command} (pushed=${syncBefore.pushed}, pulled=${syncBefore.pulled}, up-to-date=${syncBefore.upToDate}).`);
     const startSnapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
     const skillId = await resolveCloudSkillId(source);
     const response = await apiRequest(`/api/workbench/skills/${encodeURIComponent(skillId)}${command === "improve" ? "/improve" : "/runs"}`, { method: "POST", body: cloudExecutionRequestBody(command, parsed) }, source.baseUrl);
@@ -1046,9 +1076,15 @@ async function startCloudExecution(command, parsed) {
             exitCode: 1,
         });
     }
+    const initialRunIds = runs.map((run) => run.id);
+    writeCloudProgress(io, `workbench cloud: scheduled hosted ${command} on ${remote.url} (${formatCloudRunStatuses(runs)}).`);
     const initialSyncAfter = await syncWorkbenchRemote({ ...core, remote: remote.name });
+    writeCloudProgress(io, `workbench cloud: synced after scheduling hosted ${command} (pushed=${initialSyncAfter.pushed}, pulled=${initialSyncAfter.pulled}, up-to-date=${initialSyncAfter.upToDate}).`);
+    writeCloudProgress(io, `workbench cloud: waiting for terminal status; press Ctrl-C to detach and resume with workbench status or workbench show ${displayRef(initialRunIds[0] ?? "run")}.`);
     const completed = await waitForCloudRuns({
+        command,
         core,
+        io,
         remote,
         runs,
         initialSync: initialSyncAfter,
@@ -1057,7 +1093,9 @@ async function startCloudExecution(command, parsed) {
         core,
         remote,
         skillId,
+        initialRunIds,
         runs: completed.runs,
+        ...(completed.detached ? { detached: true } : {}),
         startVersionId: startSnapshot.status.currentVersionId ?? startSnapshot.refs.current,
         source,
         sync: {
@@ -1081,27 +1119,59 @@ async function waitForCloudRuns(input) {
     const timeoutMs = positiveIntEnv("WORKBENCH_CLOUD_RUN_TIMEOUT_MS") ?? CLOUD_RUN_TIMEOUT_MS;
     const pollIntervalMs = positiveIntEnv("WORKBENCH_CLOUD_RUN_POLL_INTERVAL_MS") ?? CLOUD_RUN_POLL_INTERVAL_MS;
     const deadline = Date.now() + timeoutMs;
-    while (true) {
-        const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(input.core);
-        const runs = runIds
-            .map((id) => snapshot.runs.find((entry) => entry.id === id))
-            .filter((run) => Boolean(run));
-        if (runs.length === runIds.length && runs.every(isTerminalRun)) {
-            return { runs, sync };
-        }
-        if (Date.now() >= deadline) {
-            throw new WorkbenchCodedError("cloud_run_pending", "Hosted Workbench run is still running.", {
-                retryable: true,
-                remediation: runIds[0] ? `Run workbench show ${runIds[0]}.` : "Run workbench log --runs.",
-                subject: {
-                    runIds,
-                    statuses: Object.fromEntries(runs.map((run) => [run.id, run.status])),
-                },
-                exitCode: 1,
-            });
+    let runs = [...input.runs];
+    let interrupted = false;
+    const onSigint = () => {
+        interrupted = true;
+        writeCloudProgress(input.io, `workbench cloud: detaching from hosted ${input.command} (${runIds.map(displayRef).join(", ")}).`);
+    };
+    process.once("SIGINT", onSigint);
+    const seenStatuses = new Map();
+    try {
+        while (true) {
+            const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(input.core);
+            const snapshotRuns = runIds
+                .map((id) => snapshot.runs.find((entry) => entry.id === id))
+                .filter((run) => Boolean(run));
+            if (snapshotRuns.length > 0) {
+                runs = runIds.map((id) => snapshotRuns.find((entry) => entry.id === id) ?? runs.find((entry) => entry.id === id))
+                    .filter((run) => Boolean(run));
+            }
+            for (const run of runs) {
+                const previous = seenStatuses.get(run.id);
+                if (previous !== run.status) {
+                    seenStatuses.set(run.id, run.status);
+                    writeCloudProgress(input.io, `workbench cloud: ${displayRef(run.id)} is ${run.status}.`);
+                }
+            }
+            if (runs.length === runIds.length && runs.every(isTerminalRun)) {
+                writeCloudProgress(input.io, `workbench cloud: hosted ${input.command} finished (${formatCloudRunStatuses(runs)}).`);
+                return { runs, sync };
+            }
+            if (interrupted) {
+                return { runs, sync, detached: true };
+            }
+            if (Date.now() >= deadline) {
+                throw new WorkbenchCodedError("cloud_run_pending", "Hosted Workbench run is still running.", {
+                    retryable: true,
+                    remediation: runIds[0] ? `Run workbench show ${runIds[0]}.` : "Run workbench log --runs.",
+                    subject: {
+                        runIds,
+                        statuses: Object.fromEntries(runs.map((run) => [run.id, run.status])),
+                    },
+                    exitCode: 1,
+                });
+            }
+            await sleep(pollIntervalMs);
+            if (interrupted) {
+                return { runs, sync, detached: true };
+            }
+            sync = await syncWorkbenchRemote({ ...input.core, remote: input.remote.name });
+            writeCloudProgress(input.io, `workbench cloud: synced ${input.remote.name} while waiting (${formatCloudRunStatuses(runs)}).`);
         }
-        await sleep(pollIntervalMs);
-        sync = await syncWorkbenchRemote({ ...input.core, remote: input.remote.name });
+    }
+    finally {
+        process.off("SIGINT", onSigint);
     }
 }
 function isTerminalRun(run) {
@@ -1228,6 +1298,10 @@ function cloudExecutionRequestBody(command, parsed) {
 function cloudImproveNextCommand(runs) {
     return cloudExecutionNextCommand(runs, "workbench eval");
 }
+function cloudDetachedNextCommand(runs) {
+    const first = runs[0];
+    return first?.id ? `workbench show ${displayRef(first.id)}` : "workbench status";
+}
 function cloudExecutionNextCommand(runs, successCommand) {
     const first = runs[0];
     if (!first) {
@@ -1243,9 +1317,19 @@ function cloudExecutionSummary(started) {
         remote: started.remote.name,
         url: started.remote.url,
         skillId: started.skillId,
+        initialRunIds: started.initialRunIds,
+        ...(started.detached ? { detached: true } : {}),
         sync: started.sync,
     };
 }
+function writeCloudProgress(io, message) {
+    io.stderr.write(`${message}\n`);
+}
+function formatCloudRunStatuses(runs) {
+    return runs.length > 0
+        ? runs.map((run) => `${displayRef(run.id)}:${run.status}`).join(", ")
+        : "no runs";
+}
 function workbenchInstallSourceSummary(source, snapshot) {
     const installUrl = `${source.baseUrl}/skills/${encodeURIComponent(source.owner)}/${encodeURIComponent(source.skill)}`;
     return {
@@ -2204,6 +2288,17 @@ function intFlag(parsed, name) {
     }
     return parsedValue;
 }
+function portFlag(parsed, name) {
+    const value = stringFlag(parsed, name);
+    if (!value) {
+        return undefined;
+    }
+    const parsedValue = Number(value);
+    if (!Number.isInteger(parsedValue) || parsedValue < 0 || parsedValue > 65535) {
+        throw new WorkbenchUserError(`--${name} must be an integer between 0 and 65535.`);
+    }
+    return parsedValue;
+}
 function optionalPositional(parsed, index) {
     return parsed.positionals[index];
 }
@@ -2452,7 +2547,7 @@ function runSummary(run, artifactIds) {
         versionId: run.versionId,
         skillName: run.skillName,
         agentName: run.agentName,
-        ...(run.score !== undefined ? { score: run.score } : {}),
+        ...(scoredRunValue(run) !== undefined ? { score: scoredRunValue(run) } : {}),
         ...(run.latencyMs !== undefined ? { latencyMs: run.latencyMs } : {}),
         ...(run.error ? { error: run.error } : {}),
         ...(run.jobIds ? { jobIds: run.jobIds } : {}),
@@ -2467,7 +2562,7 @@ function runFailureSummary(run, artifactIds) {
         skill: run.skillName,
         status: run.status,
         versionId: run.versionId,
-        ...(run.score !== undefined ? { score: run.score } : {}),
+        ...(scoredRunValue(run) !== undefined ? { score: scoredRunValue(run) } : {}),
         ...(run.error ? { error: run.error } : {}),
         traceIds: run.traceIds,
         artifactIds: [...artifactIds],
@@ -2511,16 +2606,72 @@ async function workbenchCliAuthStatus() {
         })),
     };
 }
-function statusWithCausalNext(status, auth) {
+function scoredRunValue(run) {
+    return run.status === "succeeded" && typeof run.score === "number" ? run.score : undefined;
+}
+function scoredJobValue(job) {
+    return job.status === "succeeded" && typeof job.score === "number" ? job.score : undefined;
+}
+function snapshotHasWorkflowCase(snapshot) {
+    const currentVersion = snapshotVersionByRef(snapshot, snapshot.status.currentVersionId ?? snapshot.refs.current ?? "");
+    const caseFiles = currentVersion?.files.filter((file) => file.kind === "text" &&
+        /^\.workbench\/cases\/[^/]+\/case\.ya?ml$/u.test(file.path)) ?? [];
+    return caseFiles.some((file) => file.kind === "text" && !/\n\s*smoke:\s*true(?:\s|$)/u.test(`\n${file.content}`));
+}
+function installHandleFromStatusRemote(remote) {
+    const publicationUrl = remote.publication.status === "published" ? remote.publication.installUrl : undefined;
+    const source = parseWorkbenchInstallSource(publicationUrl ?? remote.url);
+    return source ? `${source.owner}/${source.skill}` : publicationUrl ?? remote.url;
+}
+async function statusWithCausalNext(status, auth, core) {
+    if (!status.project.initialized) {
+        return status;
+    }
+    const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core).catch(() => null);
+    const lastRun = snapshot?.runs
+        .slice()
+        .sort((left, right) => right.createdAt.localeCompare(left.createdAt))[0];
+    if ((lastRun?.status === "running" || lastRun?.status === "failed" || lastRun?.status === "canceled") && lastRun.id) {
+        return { ...status, next: `workbench show ${displayRef(lastRun.id)}` };
+    }
+    const failedRemote = status.remotes.find((remote) => remote.sync.status === "error");
+    const hasWorkflowCase = snapshot ? snapshotHasWorkflowCase(snapshot) : false;
+    const hasScoredRun = snapshot?.runs.some((run) => scoredRunValue(run) !== undefined) ?? false;
+    const canPublish = hasWorkflowCase && hasScoredRun;
     const cloudAuthMissing = auth.workbenchCloud.status !== "authenticated";
-    const needsCloudAuth = cloudAuthMissing && status.remotes.some((remote) => remote.kind === "workbench-cloud" &&
+    const cloudRemoteNeedsAuth = status.remotes.some((remote) => remote.kind === "workbench-cloud" &&
         (remote.sync.status !== "up_to_date" || remote.publication.status === "unpublished"));
-    if (!needsCloudAuth) {
-        return status;
+    if (cloudAuthMissing && (canPublish || cloudRemoteNeedsAuth)) {
+        return { ...status, next: "workbench login" };
+    }
+    if (failedRemote) {
+        return { ...status, next: `workbench sync ${failedRemote.name}` };
+    }
+    if ((snapshot?.runs.length ?? status.runs.total) === 0) {
+        return { ...status, next: "workbench eval" };
+    }
+    if (!hasWorkflowCase) {
+        return { ...status, next: "edit .workbench/cases, then run workbench eval" };
+    }
+    const cloudRemote = status.remotes.find((remote) => remote.kind === "workbench-cloud");
+    if (canPublish && !cloudRemote) {
+        return { ...status, next: "workbench publish" };
+    }
+    const unpublishedCloudRemote = status.remotes.find((remote) => remote.kind === "workbench-cloud" &&
+        remote.publication.status === "unpublished" &&
+        remote.sync.status === "up_to_date");
+    if (unpublishedCloudRemote) {
+        return { ...status, next: "workbench publish" };
+    }
+    const publishedCloudRemote = status.remotes.find((remote) => remote.kind === "workbench-cloud" &&
+        remote.publication.status === "published" &&
+        Boolean(remote.publication.installUrl));
+    if (publishedCloudRemote) {
+        return { ...status, next: `workbench install ${installHandleFromStatusRemote(publishedCloudRemote)}` };
     }
     return {
         ...status,
-        next: "workbench login",
+        next: null,
     };
 }
 function displayRef(id) {
@@ -2612,16 +2763,16 @@ function evidenceFilesForRunOrJob(snapshot, ref) {
     const traceById = new Map(snapshot.traces.map((trace) => [trace.id, trace]));
     const artifactById = new Map(snapshot.artifacts.map((artifact) => [artifact.id, artifact]));
     const files = selection.jobs.flatMap((job) => [
-        ...job.traceIds.flatMap((traceId) => {
-            const trace = traceById.get(traceId);
-            return trace
-                ? trace.files.map((file) => evidenceFileWithPath(file, `cases/${evidencePathSegment(job.caseId)}/jobs/${evidencePathSegment(job.id)}/traces/${evidencePathSegment(trace.id)}/${file.path}`))
-                : [];
-        }),
         ...job.artifactIds.flatMap((artifactId) => {
             const artifact = artifactById.get(artifactId);
             return artifact
-                ? artifact.files.map((file) => evidenceFileWithPath(file, `cases/${evidencePathSegment(job.caseId)}/jobs/${evidencePathSegment(job.id)}/artifacts/${evidencePathSegment(artifact.id)}/${file.path}`))
+                ? artifact.files.filter(isUserFacingEvidenceFile).map((file) => evidenceFileWithPath(file, `cases/${evidencePathSegment(job.caseId)}/jobs/${evidencePathSegment(job.id)}/${file.path}`))
+                : [];
+        }),
+        ...job.traceIds.flatMap((traceId) => {
+            const trace = traceById.get(traceId);
+            return trace
+                ? trace.files.filter(isUserFacingEvidenceFile).map((file) => evidenceFileWithPath(file, `cases/${evidencePathSegment(job.caseId)}/jobs/${evidencePathSegment(job.id)}/traces/${evidencePathSegment(trace.id)}/${file.path}`))
                 : [];
         }),
     ]);
@@ -2640,6 +2791,10 @@ function evidenceFileWithPath(file, filePath) {
         path: filePath.replace(/\\/gu, "/").replace(/^\/+/u, ""),
     };
 }
+function isUserFacingEvidenceFile(file) {
+    const normalized = file.path.replace(/\\/gu, "/").replace(/^\/+/u, "");
+    return normalized.split("/").every((segment) => segment !== ".workbench");
+}
 function evidencePathSegment(value) {
     return value.replace(/[^A-Za-z0-9._-]+/gu, "-") || "_";
 }
@@ -2648,6 +2803,24 @@ function formatRunOrJobEvidence(details, files) {
     const fileLines = files.length > 0 ? ["Files:", ...files.map((file) => file.path)] : [];
     return [...detailLines, ...fileLines].join("\n") || "No evidence.";
 }
+function evidenceDetailSummary(detail) {
+    return {
+        runId: detail.runId,
+        executions: detail.executions.map((execution) => ({
+            id: execution.id,
+            status: execution.status,
+            jobIds: execution.jobIds,
+            sessions: execution.sessions.map((session) => ({
+                label: session.label,
+            })),
+            trace: {
+                events: execution.trace.events.length,
+                spans: execution.trace.spans.length,
+                summaries: execution.trace.summaries.length,
+            },
+        })),
+    };
+}
 function manifestOnly(value) {
     if (value === null || typeof value === "string" || typeof value === "number" || typeof value === "boolean") {
         return value;
@@ -2770,6 +2943,17 @@ function findShowFile(files, requestedPath, objectRef) {
     const candidates = normalized === "stderr.log"
         ? suffixCandidates.filter((file) => file.content.length > 0)
         : suffixCandidates;
+    const canonicalCandidates = candidates.filter(isCanonicalEvidenceFileCandidate);
+    if (canonicalCandidates.length === 1) {
+        return canonicalCandidates[0];
+    }
+    const equivalentCanonicalCandidate = singleEquivalentShowFile(canonicalCandidates);
+    if (equivalentCanonicalCandidate) {
+        return equivalentCanonicalCandidate;
+    }
+    if (canonicalCandidates.length > 1) {
+        throw ambiguousShowPath(objectRef, requestedPath, canonicalCandidates);
+    }
     if (candidates.length === 1) {
         return candidates[0];
     }
@@ -2786,6 +2970,9 @@ function findShowFile(files, requestedPath, objectRef) {
     }
     throw ambiguousShowPath(objectRef, requestedPath, candidates.length > 0 ? candidates : suffixCandidates);
 }
+function isCanonicalEvidenceFileCandidate(file) {
+    return !file.path.includes("/traces/") && !file.path.includes("/artifacts/");
+}
 function singleEquivalentShowFile(files) {
     if (files.length <= 1) {
         return null;
@@ -2831,24 +3018,56 @@ async function traceIdForCaseSource(core, ref) {
         exitCode: 1,
     });
 }
+async function evalCoverageSummaries(core, runs) {
+    const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
+    const jobsByRun = new Map();
+    for (const job of snapshot.jobs) {
+        const existing = jobsByRun.get(job.runId) ?? [];
+        existing.push(job);
+        jobsByRun.set(job.runId, existing);
+    }
+    return runs.map((run) => {
+        const jobs = jobsByRun.get(run.id) ?? [];
+        const cases = new Set(jobs.map((job) => job.caseId));
+        const samples = new Set(jobs.map((job) => `${job.caseId}\0${job.sample}`));
+        return {
+            runId: run.id,
+            cases: cases.size,
+            samples: samples.size,
+            jobs: jobs.length,
+            succeeded: jobs.filter((job) => job.status === "succeeded").length,
+            failed: jobs.filter((job) => job.status === "failed" || job.status === "canceled").length,
+        };
+    });
+}
+function formatEvalCoverage(coverage) {
+    return [
+        `coverage cases=${coverage.cases}`,
+        `samples=${coverage.samples}`,
+        `jobs=${coverage.jobs}`,
+        coverage.failed > 0 ? `failed=${coverage.failed}` : undefined,
+    ].filter(Boolean).join(" ");
+}
 async function evalDeltas(core, runs) {
     const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
     return runs.map((run) => {
+        const score = scoredRunValue(run);
         const previous = snapshot.runs
             .filter((candidate) => candidate.id !== run.id &&
             candidate.skillName === run.skillName &&
             candidate.agentName === run.agentName &&
-            typeof candidate.score === "number" &&
+            scoredRunValue(candidate) !== undefined &&
             candidate.createdAt < run.createdAt)
             .sort((left, right) => right.createdAt.localeCompare(left.createdAt))[0];
+        const previousScore = previous ? scoredRunValue(previous) : undefined;
         return {
             runId: run.id,
             versionId: run.versionId,
             skillName: run.skillName,
             agentName: run.agentName,
-            ...(run.score !== undefined ? { score: run.score } : {}),
-            ...(previous?.score !== undefined ? { previousScore: previous.score } : {}),
-            ...(run.score !== undefined && previous?.score !== undefined ? { delta: run.score - previous.score } : {}),
+            ...(score !== undefined ? { score } : {}),
+            ...(previousScore !== undefined ? { previousScore } : {}),
+            ...(score !== undefined && previousScore !== undefined ? { delta: score - previousScore } : {}),
         };
     });
 }
@@ -2867,15 +3086,11 @@ async function evalSuccessNextCommand(core, runs) {
     if (runs.length === 0) {
         return "workbench eval";
     }
-    if (!runs.some((run) => typeof run.score === "number")) {
+    if (!runs.some((run) => scoredRunValue(run) !== undefined)) {
         return "edit .workbench/cases, then run workbench eval";
     }
     const snapshot = await createWorkbenchReadOnlyInspectionSnapshot(core);
-    const currentVersion = snapshotVersionByRef(snapshot, snapshot.status.currentVersionId ?? snapshot.refs.current ?? "");
-    const caseFiles = currentVersion?.files.filter((file) => file.kind === "text" &&
-        /^\.workbench\/cases\/[^/]+\/case\.ya?ml$/u.test(file.path)) ?? [];
-    const hasWorkflowCase = caseFiles.some((file) => file.kind === "text" && !/\n\s*smoke:\s*true(?:\s|$)/u.test(`\n${file.content}`));
-    return hasWorkflowCase ? "workbench publish" : "edit .workbench/cases, then run workbench eval";
+    return snapshotHasWorkflowCase(snapshot) ? "workbench publish" : "edit .workbench/cases, then run workbench eval";
 }
 function formatStatusSnapshot(status) {
     const lines = [
@@ -2927,7 +3142,8 @@ function formatAgent(agent) {
     return `${agent.name}\t${agent.adapter}${agent.model ? `\t${agent.model}` : ""}`;
 }
 function formatRun(run) {
-    const score = run.score === undefined ? "n/a" : run.score.toFixed(3);
+    const scoreValue = scoredRunValue(run);
+    const score = scoreValue === undefined ? "n/a" : scoreValue.toFixed(3);
     const latency = run.latencyMs === undefined ? "n/a" : `${run.latencyMs}ms`;
     return `${displayRef(run.id)}\t${run.kind}\t${run.status}\tversion=${displayRef(run.versionId)}\tskill=${run.skillName}\tagent=${run.agentName}\tscore=${score}\tlatency=${latency}`;
 }
@@ -2940,7 +3156,8 @@ function formatImproveResult(result) {
     ].join("\n");
 }
 function formatJob(job) {
-    const score = job.score === undefined ? "n/a" : job.score.toFixed(3);
+    const scoreValue = scoredJobValue(job);
+    const score = scoreValue === undefined ? "n/a" : scoreValue.toFixed(3);
     const duration = job.durationMs === undefined ? "n/a" : `${job.durationMs}ms`;
     return `${displayRef(job.id)}\trun=${displayRef(job.runId)}\tcase=${job.caseId}\tsample=${job.sample}\t${job.status}\tscore=${score}\tduration=${duration}`;
 }
@@ -2966,7 +3183,7 @@ function shortObjectId(id) {
 function formatTrace(trace) {
     const result = asRecord(trace.result);
     const status = typeof result?.status === "string" ? result.status : undefined;
-    const score = typeof result?.score === "number" ? result.score.toFixed(3) : undefined;
+    const score = status === "succeeded" && typeof result?.score === "number" ? result.score.toFixed(3) : undefined;
     const error = typeof result?.error === "string" ? result.error.split(/\r?\n/u)[0] : undefined;
     const files = trace.files.slice(0, 5).map((file) => file.path).join(",");
     return [
@@ -2979,6 +3196,7 @@ function formatTrace(trace) {
 }
 function traceSummary(trace) {
     const result = asRecord(trace.result);
+    const status = typeof result?.status === "string" ? result.status : undefined;
     return {
         id: trace.id,
         runId: trace.runId,
@@ -2987,8 +3205,8 @@ function traceSummary(trace) {
         skillName: trace.skillName,
         agentName: trace.agentName,
         createdAt: trace.createdAt,
-        ...(typeof result?.status === "string" ? { status: result.status } : {}),
-        ...(typeof result?.score === "number" ? { score: result.score } : {}),
+        ...(status ? { status } : {}),
+        ...(status === "succeeded" && typeof result?.score === "number" ? { score: result.score } : {}),
         ...(typeof result?.error === "string" ? { error: singleLine(result.error) } : {}),
         fileCount: trace.files.length,
         files: trace.files.map(fileSummary),

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@workbench-ai/workbench",
-  "version": "0.0.71",
+  "version": "0.0.72",
   "repository": {
     "type": "git",
     "url": "git+https://github.com/workbench-ai/workbench.git",
@@ -21,10 +21,10 @@
   ],
   "dependencies": {
     "yaml": "^2.8.2",
-    "@workbench-ai/workbench-built-in-adapters": "0.0.71",
-    "@workbench-ai/workbench-core": "0.0.71",
-    "@workbench-ai/workbench-contract": "0.0.71",
-    "@workbench-ai/workbench-protocol": "0.0.71"
+    "@workbench-ai/workbench-built-in-adapters": "0.0.72",
+    "@workbench-ai/workbench-contract": "0.0.72",
+    "@workbench-ai/workbench-core": "0.0.72",
+    "@workbench-ai/workbench-protocol": "0.0.72"
   },
   "devDependencies": {
     "@tailwindcss/postcss": "^4.2.2",
@@ -35,7 +35,7 @@
     "react-dom": "^19.2.0",
     "typescript": "^5.9.2",
     "vitest": "^3.2.4",
-    "@workbench-ai/workbench-ui": "0.0.71"
+    "@workbench-ai/workbench-ui": "0.0.72"
   },
   "scripts": {
     "build": "rm -rf dist && tsc -p tsconfig.json && chmod 755 dist/workbench.js && node ./scripts/build-dev-open-assets.mjs",