npm - @workbench-ai/workbench - Versions diffs - 0.0.64 → 0.0.66 - Mend

@workbench-ai/workbench 0.0.64 → 0.0.66

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information in this diff is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Files changed (19)

package/dist/benchmark-fingerprint.js +2 -2
package/dist/command-model.d.ts +1 -1
package/dist/command-model.d.ts.map +1 -1
package/dist/command-model.js +106 -35
package/dist/dev-open/client.js +109 -109
package/dist/dev-open-server.d.ts +2 -37
package/dist/dev-open-server.d.ts.map +1 -1
package/dist/dev-open-server.js +39 -322
package/dist/index.d.ts.map +1 -1
package/dist/index.js +364 -263
package/dist/local-archive.d.ts +4 -4
package/dist/local-archive.d.ts.map +1 -1
package/dist/local-archive.js +4 -90
package/dist/local-inspection.d.ts +9 -0
package/dist/local-inspection.d.ts.map +1 -0
package/dist/local-inspection.js +317 -0
package/dist/project-source.d.ts +6 -6
package/dist/project-source.js +6 -6
package/package.json +9 -4

package/dist/benchmark-fingerprint.js CHANGED Viewed

@@ -1,9 +1,9 @@
 import { engineResolveBindingForSpec, workbenchBenchmarkContentFingerprint, workbenchCandidateContentFingerprint, } from "@workbench-ai/workbench-core";
-import { hostedEngineResolveFiles, } from "./project-source.js";
+import { remoteEngineResolveFiles, } from "./project-source.js";
 export function localBenchmarkFingerprint(project) {
     return workbenchBenchmarkContentFingerprint({
         sourceYaml: project.specSource,
-        engineResolveFiles: hostedEngineResolveFiles(project).map(toSurfaceFile),
+        engineResolveFiles: remoteEngineResolveFiles(project).map(toSurfaceFile),
         engineResolveBinding: engineResolveBindingForSpec(project.spec),
         adapterFiles: project.adapterFiles.map(toSurfaceFile),
         adapterManifests: project.adapters.map((adapter) => adapter.manifest),

package/dist/command-model.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 export declare const LOCAL_DEV_OPEN_LIFECYCLE_NOTE = "Keep this command running while using the local web view; Ctrl-C stops the server and the page will stop working.";
-export declare const HOSTED_WATCH_LIFECYCLE_NOTE: string;
+export declare const REMOTE_WATCH_LIFECYCLE_NOTE: string;
 export declare const rootUsage: string;
 export declare function commandUsage(commandPath: string): string | null;
 //# sourceMappingURL=command-model.d.ts.map

package/dist/command-model.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"command-model.d.ts","sourceRoot":"","sources":["../src/command-model.ts"],"names":[],"mappings":"AAOA,eAAO,MAAM,6BAA6B,sHAC2E,CAAC;AAOtH,eAAO,MAAM,2BAA2B,QAA0C,CAAC;~~AA4EnF~~,eAAO,MAAM,SAAS,QAAuB,CAAC;~~AAoY9C~~,wBAAgB,YAAY,CAAC,WAAW,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAE/D"}
1	+ {"version":3,"file":"command-model.d.ts","sourceRoot":"","sources":["../src/command-model.ts"],"names":[],"mappings":"AAOA,eAAO,MAAM,6BAA6B,sHAC2E,CAAC;AAOtH,eAAO,MAAM,2BAA2B,QAA0C,CAAC;AAiFnF,eAAO,MAAM,SAAS,QAAuB,CAAC;AAsc9C,wBAAgB,YAAY,CAAC,WAAW,EAAE,MAAM,GAAG,MAAM,GAAG,IAAI,CAE/D"}

package/dist/command-model.js CHANGED Viewed

@@ -5,19 +5,19 @@ const sourceDirectoryHelp = [
     "  Pass --dir DIR or pass benchmark.yaml, candidates/<name>, or candidates/<name>/candidate.yaml as SOURCE.",
 ];
 export const LOCAL_DEV_OPEN_LIFECYCLE_NOTE = "Keep this command running while using the local web view; Ctrl-C stops the server and the page will stop working.";
-const hostedWatchLifecycleNoteLines = [
+const remoteWatchLifecycleNoteLines = [
     "Watching is client-side polling only.",
-    "Stopping this command does not cancel the hosted run.",
+    "Stopping this command does not cancel the remote run.",
 ];
-export const HOSTED_WATCH_LIFECYCLE_NOTE = hostedWatchLifecycleNoteLines.join(" ");
+export const REMOTE_WATCH_LIFECYCLE_NOTE = remoteWatchLifecycleNoteLines.join(" ");
 const localOpenLifecycleHelp = [
     "Lifecycle:",
     "  workbench open starts a long-running local web server.",
     `  ${LOCAL_DEV_OPEN_LIFECYCLE_NOTE}`,
 ];
-const hostedWatchLifecycleHelp = [
+const remoteWatchLifecycleHelp = [
     "Lifecycle:",
-    ...hostedWatchLifecycleNoteLines.map((line) => `  ${line}`),
+    ...remoteWatchLifecycleNoteLines.map((line) => `  ${line}`),
 ];
 const rootLines = [
     "Usage:",
@@ -36,20 +36,25 @@ const rootLines = [
     "",
     "Runs:",
     "  workbench eval [SOURCE] [--dir DIR] [--candidate CANDIDATE_ID] [--runs RUNS|all] [--samples N] [--rerun] [--json]",
-    "  workbench eval --hosted [SOURCE] [--dir DIR] [--benchmark OWNER/BENCHMARK] [--candidate CANDIDATE_ID] [--runs RUNS|all] [--samples N] [--rerun] [--watch] [--dry-run] [--json]",
+    "  workbench eval --remote [SOURCE] [--dir DIR] [--benchmark OWNER/BENCHMARK] [--candidate CANDIDATE_ID] [--runs RUNS|all] [--samples N] [--rerun] [--watch] [--dry-run] [--json]",
     "  workbench improve [SOURCE] [--dir DIR] [--from CANDIDATE_ID] [--runs RUN] [--budget N] [--samples N] [--rerun] [--json]",
-    "  workbench improve --hosted [SOURCE] [--dir DIR] [--benchmark OWNER/BENCHMARK] [--base CANDIDATE_ID] [--runs RUN] [--budget N] [--samples N] [--rerun] [--watch] [--dry-run] [--json]",
-    "  workbench retry TARGET_ID [--dir DIR] [--hosted] [--benchmark OWNER/BENCHMARK] [--watch] [--interval-ms N] [--timeout-ms N] [--json]",
-    "  workbench open [SOURCE|OWNER/BENCHMARK|RUN_ID|CANDIDATE_ID] [--dir DIR] [--hosted] [--benchmark OWNER/BENCHMARK] [--run RUN_ID] [--host HOST] [--port N] [--no-open] [--json]",
+    "  workbench improve --remote [SOURCE] [--dir DIR] [--benchmark OWNER/BENCHMARK] [--base CANDIDATE_ID] [--runs RUN] [--budget N] [--samples N] [--rerun] [--watch] [--dry-run] [--json]",
+    "  workbench retry TARGET_ID [--dir DIR] [--remote] [--benchmark OWNER/BENCHMARK] [--watch] [--interval-ms N] [--timeout-ms N] [--json]",
+    "  workbench open [SOURCE|OWNER/BENCHMARK|RUN_ID|CANDIDATE_ID] [--dir DIR] [--remote] [--benchmark OWNER/BENCHMARK] [--run RUN_ID] [--host HOST] [--port N] [--no-open] [--json]",
     "  workbench restore [--dir DIR] [--candidate CANDIDATE_ID] [--dry-run] [--yes] [--json]",
     "",
     "Local inspection:",
     "  workbench runs list [--dir DIR] [--json]",
-    "  workbench runs show RUN_ID [--dir DIR] [--json]",
+    "  workbench runs show RUN_ID [--dir DIR] [--jobs] [--failures] [--json]",
+    "  workbench evaluations list [--dir DIR] [--json]",
+    "  workbench evaluations show EVALUATION_ID [--dir DIR] [--json]",
+    "  workbench executions trace --run RUN_ID --job JOB_ID [--dir DIR] [--json]",
+    "  workbench diagnose [RUN_OR_EVALUATION_ID] [--dir DIR] [--json]",
     "  workbench candidates list [--dir DIR] [--json]",
     "  workbench candidates show CANDIDATE_ID [--dir DIR] [--json]",
     "  workbench candidates files [--dir DIR] [--candidate CANDIDATE_ID] [--json]",
     "  workbench candidates preview --path PATH [--dir DIR] [--candidate CANDIDATE_ID] [--output PATH|-] [--json]",
+    "  # Installed-agent traces, not Workbench run execution traces:",
     "  workbench traces collect [--providers codex,claude] [--since 30d] [--workspace DIR] [--limit N] [--json]",
     "  workbench traces list [--providers codex,claude] [--since 30d] [--workspace DIR] [--limit N] [--json]",
     "  workbench traces show TRACE_ID [--providers codex,claude] [--since 30d] [--workspace DIR] [--json]",
@@ -73,10 +78,10 @@ const rootLines = [
     "  workbench retry eval_local_123 --json",
     "  workbench open --no-open --json",
     "  workbench push",
-    "  workbench eval --hosted candidates/current --benchmark openbench/invoice-review --watch",
+    "  workbench eval --remote candidates/current --benchmark openbench/invoice-review --watch",
     "",
     "Environment:",
-    "  WORKBENCH_API_URL sets the hosted Workbench API base URL.",
+    "  WORKBENCH_API_URL sets the remote Workbench API base URL.",
     "",
     "Default API URL:",
     "  https://v2.workbench.ai",
@@ -126,49 +131,49 @@ const commandHelp = Object.fromEntries(Object.entries({
     eval: withSourceDirectoryHelp(withLifecycleHelp([
         "Usage:",
         "  workbench eval [SOURCE] [--dir DIR] [--candidate CANDIDATE_ID] [--runs RUNS|all] [--samples N] [--rerun] [--json]",
-        "  workbench eval --hosted [SOURCE] [--dir DIR] [--benchmark OWNER/BENCHMARK] [--candidate CANDIDATE_ID] [--runs RUNS|all] [--samples N] [--rerun] [--watch] [--dry-run] [--json]",
+        "  workbench eval --remote [SOURCE] [--dir DIR] [--benchmark OWNER/BENCHMARK] [--candidate CANDIDATE_ID] [--runs RUNS|all] [--samples N] [--rerun] [--watch] [--dry-run] [--json]",
         "",
-        "Ensure the selected candidate run has an evaluation for the current benchmark. Without --hosted, execution writes local records. With --hosted, Workbench starts or reuses a hosted run against the configured remote or --benchmark target. Completed work is reused only when candidate, run configuration, source, adapters, benchmark, and samples match; use --rerun to intentionally spend again.",
+        "Ensure the selected candidate run has an evaluation for the current benchmark. Without --remote, execution writes local records. With --remote, Workbench starts or reuses a remote run against the configured remote or --benchmark target. Completed work is reused only when candidate, run configuration, source, adapters, benchmark, and samples match; use --rerun to intentionally spend again.",
         "",
         "Examples:",
         "  workbench eval --samples 1",
-        "  workbench eval --hosted candidates/current --samples 1 --watch --json",
+        "  workbench eval --remote candidates/current --samples 1 --watch --json",
         "  workbench eval candidates/current --samples 2 --json",
-    ], hostedWatchLifecycleHelp)),
+    ], remoteWatchLifecycleHelp)),
     improve: withSourceDirectoryHelp(withLifecycleHelp([
         "Usage:",
         "  workbench improve [SOURCE] [--dir DIR] [--from CANDIDATE_ID] [--runs RUN] [--budget N] [--samples N] [--rerun] [--json]",
-        "  workbench improve --hosted [SOURCE] [--dir DIR] [--benchmark OWNER/BENCHMARK] [--base CANDIDATE_ID] [--runs RUN] [--budget N] [--samples N] [--rerun] [--watch] [--dry-run] [--json]",
+        "  workbench improve --remote [SOURCE] [--dir DIR] [--benchmark OWNER/BENCHMARK] [--base CANDIDATE_ID] [--runs RUN] [--budget N] [--samples N] [--rerun] [--watch] [--dry-run] [--json]",
         "",
-        "Ensure a candidate improvement exists for the selected base, run, budget, and samples. Improve defaults to the evaluated active candidate when it belongs to the current benchmark fingerprint; otherwise it evaluates and uses the authored current candidate. Without --hosted, execution writes local records. With --hosted, Workbench starts or reuses hosted work against the configured remote or --benchmark target. Completed work is reused only when base, run configuration, source, adapters, benchmark, budget, and samples match; use --rerun to intentionally spend again.",
+        "Ensure a candidate improvement exists for the selected base, run, budget, and samples. Improve defaults to the evaluated active candidate when it belongs to the current benchmark fingerprint; otherwise it evaluates and uses the authored current candidate. Without --remote, execution writes local records. With --remote, Workbench starts or reuses remote work against the configured remote or --benchmark target. Completed work is reused only when base, run configuration, source, adapters, benchmark, budget, and samples match; use --rerun to intentionally spend again.",
         "",
         "Examples:",
         "  workbench improve --budget 1 --samples 1",
-        "  workbench improve --hosted candidates/current --budget 1 --samples 1 --watch --json",
+        "  workbench improve --remote candidates/current --budget 1 --samples 1 --watch --json",
         "  workbench improve candidates/current --from candidate_123 --json",
-    ], hostedWatchLifecycleHelp)),
+    ], remoteWatchLifecycleHelp)),
     retry: withSourceDirectoryHelp(withLifecycleHelp([
         "Usage:",
         "  workbench retry TARGET_ID [--dir DIR] [--json]",
-        "  workbench retry --hosted TARGET_ID [--dir DIR] [--benchmark OWNER/BENCHMARK] [--watch] [--interval-ms N] [--timeout-ms N] [--json]",
+        "  workbench retry --remote TARGET_ID [--dir DIR] [--benchmark OWNER/BENCHMARK] [--watch] [--interval-ms N] [--timeout-ms N] [--json]",
         "",
-        "Retry a failed run or evaluation by replaying its recorded candidate, configuration, sample count, and improve budget. Use --hosted for hosted records.",
+        "Retry a failed run or evaluation by replaying its recorded candidate, configuration, sample count, and improve budget. Use --remote for remote records.",
         "",
         "Examples:",
         "  workbench retry eval_local_123 --json",
-        "  workbench retry --hosted run_123 --watch --json",
+        "  workbench retry --remote run_123 --watch --json",
         "  workbench retry run_local_123 --dir ./my-benchmark",
-    ], hostedWatchLifecycleHelp)),
+    ], remoteWatchLifecycleHelp)),
     open: withSourceDirectoryHelp(withLifecycleHelp([
         "Usage:",
         "  workbench open [SOURCE] [--dir DIR] [--run RUN_ID] [--host HOST] [--port N] [--no-open] [--json]",
-        "  workbench open --hosted [OWNER/BENCHMARK|RUN_ID|CANDIDATE_ID] [--dir DIR] [--benchmark OWNER/BENCHMARK] [--no-open] [--json]",
+        "  workbench open --remote [OWNER/BENCHMARK|RUN_ID|CANDIDATE_ID] [--dir DIR] [--benchmark OWNER/BENCHMARK] [--no-open] [--json]",
         "",
-        "Start the local Workbench web view for the project and keep serving it until stopped. With --hosted, print and optionally open the hosted project URL instead.",
+        "Start the local Workbench web view for the project and keep serving it until stopped. With --remote, print and optionally open the remote project URL instead.",
         "",
         "Examples:",
         "  workbench open",
-        "  workbench open --hosted --no-open --json",
+        "  workbench open --remote --no-open --json",
         "  workbench open --run eval_local_123 --port 4317 --no-open --json",
     ], localOpenLifecycleHelp)),
     restore: withSourceDirectoryHelp([
@@ -189,7 +194,7 @@ const commandHelp = Object.fromEntries(Object.entries({
         "",
         "Commands:",
         "  workbench runs list [--dir DIR] [--json]",
-        "  workbench runs show RUN_ID [--dir DIR] [--json]",
+        "  workbench runs show RUN_ID [--dir DIR] [--jobs] [--failures] [--json]",
         "",
         "Examples:",
         "  workbench runs list --json",
@@ -207,13 +212,79 @@ const commandHelp = Object.fromEntries(Object.entries({
     ]),
     "runs show": withSourceDirectoryHelp([
         "Usage:",
-        "  workbench runs show RUN_ID [--dir DIR] [--json]",
+        "  workbench runs show RUN_ID [--dir DIR] [--jobs] [--failures] [--json]",
         "",
-        "Show one local run record.",
+        "Show one local run record. Use --jobs for execution jobs and --failures for the generic failure diagnosis.",
         "",
         "Examples:",
         "  workbench runs show eval_local_123",
-        "  workbench runs show eval_local_123 --json",
+        "  workbench runs show eval_local_123 --failures --json",
+    ]),
+    evaluations: [
+        "Usage:",
+        "  workbench evaluations <command> [options]",
+        "",
+        "Inspect local evaluation comparisons through the same read model used by the browser.",
+        "",
+        "Commands:",
+        "  workbench evaluations list [--dir DIR] [--json]",
+        "  workbench evaluations show EVALUATION_ID [--dir DIR] [--json]",
+        "",
+        "Examples:",
+        "  workbench evaluations list",
+        "  workbench evaluations show eval_local_123 --json",
+    ],
+    "evaluations list": withSourceDirectoryHelp([
+        "Usage:",
+        "  workbench evaluations list [--dir DIR] [--json]",
+        "",
+        "List local evaluations with candidate, configuration, score, and run id.",
+        "",
+        "Examples:",
+        "  workbench evaluations list",
+        "  workbench evaluations list --json",
+    ]),
+    "evaluations show": withSourceDirectoryHelp([
+        "Usage:",
+        "  workbench evaluations show EVALUATION_ID [--dir DIR] [--json]",
+        "",
+        "Show one local evaluation scorecard and its case summaries.",
+        "",
+        "Examples:",
+        "  workbench evaluations show eval_local_123",
+        "  workbench evaluations show eval_local_123 --json",
+    ]),
+    executions: [
+        "Usage:",
+        "  workbench executions <command> [options]",
+        "",
+        "Inspect Workbench run execution artifacts. These are runtime traces tied to a Workbench run and job.",
+        "",
+        "Commands:",
+        "  workbench executions trace --run RUN_ID --job JOB_ID [--dir DIR] [--json]",
+        "",
+        "Examples:",
+        "  workbench executions trace --run run_local_123 --job job_123 --json",
+    ],
+    "executions trace": withSourceDirectoryHelp([
+        "Usage:",
+        "  workbench executions trace --run RUN_ID --job JOB_ID [--dir DIR] [--json]",
+        "",
+        "Show the Workbench execution trace for a run job.",
+        "",
+        "Examples:",
+        "  workbench executions trace --run run_local_123 --job job_123",
+        "  workbench executions trace --run run_local_123 --job job_123 --json",
+    ]),
+    diagnose: withSourceDirectoryHelp([
+        "Usage:",
+        "  workbench diagnose [RUN_OR_EVALUATION_ID] [--dir DIR] [--json]",
+        "",
+        "Summarize generic Workbench failure modes from runs, evaluations, samples, cases, and jobs.",
+        "",
+        "Examples:",
+        "  workbench diagnose",
+        "  workbench diagnose eval_local_123 --json",
     ]),
     candidates: [
         "Usage:",
@@ -275,7 +346,7 @@ const commandHelp = Object.fromEntries(Object.entries({
         "Usage:",
         "  workbench clone OWNER/BENCHMARK [DIR] [--dry-run] [--json]",
         "",
-        "Clone hosted source and runtime history into a local Workbench project and remember it as the remote.",
+        "Clone remote source and runtime history into a local Workbench project and remember it as the remote.",
         "",
         "Examples:",
         "  workbench clone openbench/invoice-review",
@@ -390,7 +461,7 @@ const commandHelp = Object.fromEntries(Object.entries({
         "  workbench traces list [--providers codex,claude] [--since 30d] [--workspace DIR] [--limit N] [--json]",
         "  workbench traces show TRACE_ID [--providers codex,claude] [--since 30d] [--workspace DIR] [--json]",
         "",
-        "Inspect local installed-agent traces without calling an LLM.",
+        "Inspect local installed-agent traces without calling an LLM. These are not Workbench run execution traces; use workbench executions trace for runtime traces.",
         "",
         "Examples:",
         "  workbench traces list --limit 10 --json",
@@ -430,7 +501,7 @@ const commandHelp = Object.fromEntries(Object.entries({
         "Usage:",
         "  workbench auth <command> [options]",
         "",
-        "Connect adapter auth for local and hosted runs.",
+        "Connect adapter auth for local and remote runs.",
         "",
         "Commands:",
         "  workbench auth connect ADAPTER[/SLOT] [--dir DIR] [--method METHOD] [--profile PROFILE] [--profile-root DIR] [--local-only] [--json]",
@@ -454,7 +525,7 @@ const commandHelp = Object.fromEntries(Object.entries({
         "Usage:",
         "  workbench auth disconnect ADAPTER[/SLOT] [--profile PROFILE] [--local-only] [--json]",
         "",
-        "Disconnect adapter auth locally and, when logged in, in hosted Workbench.",
+        "Disconnect adapter auth locally and, when logged in, in remote Workbench.",
         "",
         "Examples:",
         "  workbench auth disconnect codex --local-only",