@workbench-ai/workbench 0.0.64 → 0.0.66

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5,17 +5,18 @@ import { createRequire } from "node:module";
5
5
  import os from "node:os";
6
6
  import path from "node:path";
7
7
  import { Writable } from "node:stream";
8
- import { createCandidateFilePreview, createBaselineCandidateJob as createRuntimeBaselineCandidateJob, evaluationScorecardId, evaluationMeanMetrics, executeWorkbenchExecutionJob, engineResolveBindingForSpec, filterOptimizerTraceJobsForCaseIds, filterCandidateSourceFiles, formatWorkbenchCaseSelector, formatWorkbenchSelectionPolicy, workbenchCaseSelectorUsesAllCases, workbenchExecutionPurpose, workbenchRunExecutionFingerprint, createWorkbenchAdapterAuthBundle, createOptimizerTraceInputFiles, DOCKER_SANDBOX_BACKEND, localWorkbenchAdapterAuthStore, materializeWorkbenchRunResult, normalizeSurfaceFiles, planWorkbenchExecutionJobsForPurpose, runWorkbenchExecutionDag, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, runtimeResources, summarizeCandidateFiles, validateWorkbenchRunEnvelope, parseWorkbenchAdapterAuthTarget, workbenchEngineCaseIdsForImproveEvaluation, workbenchEngineCaseIdsForSelector, workbenchImproveOptimizeSelector, workbenchImproveSelectionPolicy, workbenchProjectSourceFingerprint, workbenchRuntimeBundleFingerprint, workbenchRuntimeExplicitActiveId, } from "@workbench-ai/workbench-core";
8
+ import { createBaselineCandidateJob as createRuntimeBaselineCandidateJob, evaluationScorecardId, evaluationMeanMetrics, executeWorkbenchExecutionJob, engineResolveBindingForSpec, filterOptimizerTraceJobsForCaseIds, filterCandidateSourceFiles, formatWorkbenchCaseSelector, formatWorkbenchSelectionPolicy, workbenchCaseSelectorUsesAllCases, workbenchExecutionPurpose, workbenchRunExecutionFingerprint, createWorkbenchAdapterAuthBundle, createOptimizerTraceInputFiles, DOCKER_SANDBOX_BACKEND, localWorkbenchAdapterAuthStore, materializeWorkbenchRunResult, normalizeSurfaceFiles, isSurfaceSnapshotFile, jsonRecord, planWorkbenchExecutionJobsForPurpose, runWorkbenchExecutionDag, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, runtimeResources, validateWorkbenchRunEnvelope, parseWorkbenchAdapterAuthTarget, workbenchEngineCaseIdsForImproveEvaluation, workbenchEngineCaseIdsForSelector, workbenchImproveOptimizeSelector, workbenchImproveSelectionPolicy, workbenchProjectSourceFingerprint, workbenchRuntimeBundleFingerprint, workbenchRuntimeExplicitActiveId, } from "@workbench-ai/workbench-core";
9
9
  import { assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, normalizeWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationCommand, workbenchAdapterOperationResultPath, withDefaultWorkbenchAdapterAuthProfiles as applyDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
10
10
  import { builtinLocalTraceAdapter, builtinLocalTraceAdapters, sortLocalTraceRefs, } from "@workbench-ai/workbench-built-in-adapters/local-traces";
11
- import { commandUsage, HOSTED_WATCH_LIFECYCLE_NOTE, LOCAL_DEV_OPEN_LIFECYCLE_NOTE, rootUsage, } from "./command-model.js";
11
+ import { commandUsage, REMOTE_WATCH_LIFECYCLE_NOTE, LOCAL_DEV_OPEN_LIFECYCLE_NOTE, rootUsage, } from "./command-model.js";
12
12
  import { startLocalWorkbenchDevServer } from "./dev-open-server.js";
13
+ import { createLocalWorkbenchInspection } from "./local-inspection.js";
13
14
  import { createWorkbenchInitScaffold, } from "./init-scaffold.js";
14
15
  import { defaultAdapterManifests, composeRuntimeDockerfileWithAdapters, resolveDefaultWorkbenchAdapter, resolveProjectAdapterSource, resolveWorkbenchAdaptersForProject, WORKBENCH_ADAPTER_MANIFEST_FILE, } from "./adapter-project.js";
15
16
  import { createAdapterCommandEnv } from "./adapter-command-env.js";
16
17
  import { loadLocalArchive, loadLocalArchiveIndex, exportLocalRuntimeBundle, importLocalRuntimeBundle, runtimeBundleStats, materializeCandidateRoot, readLocalCandidate, readLocalCandidateFiles, readLocalJobs, saveLocalArchive, saveLocalJobs, setLocalActive, upsertLocalRun, upsertLocalCandidate, upsertLocalEvaluation, } from "./local-archive.js";
17
18
  import { WorkspaceSnapshotError, } from "./workspace-snapshot.js";
18
- import { hostedEngineResolveFiles, readLocalProjectSource, WORKBENCH_BENCHMARK_FILE, } from "./project-source.js";
19
+ import { remoteEngineResolveFiles, readLocalProjectSource, WORKBENCH_BENCHMARK_FILE, } from "./project-source.js";
19
20
  import { localBenchmarkFingerprint, localCandidateFingerprint, projectStateBenchmarkFingerprint, } from "./benchmark-fingerprint.js";
20
21
  const require = createRequire(import.meta.url);
21
22
  function getCliVersion() {
@@ -34,6 +35,30 @@ class WorkbenchApiRequestError extends Error {
34
35
  }
35
36
  const API_REQUEST_MAX_ATTEMPTS = 3;
36
37
  const DEFAULT_BASE_URL = "https://v2.workbench.ai";
38
+ const AUTH_COMMAND_HANDLERS = {
39
+ connect: authConnect,
40
+ disconnect: authDisconnect,
41
+ };
42
+ const ADAPTERS_COMMAND_HANDLERS = {
43
+ create: adaptersCreate,
44
+ inspect: adaptersInspect,
45
+ list: adaptersList,
46
+ test: adaptersTest,
47
+ };
48
+ const TRACES_COMMAND_HANDLERS = {
49
+ collect: localTraceCollect,
50
+ list: localTraceList,
51
+ show: localTraceShow,
52
+ };
53
+ const TWO_SEGMENT_HELP_COMMANDS = {
54
+ adapters: ["create", "list", "inspect", "test"],
55
+ auth: [],
56
+ candidates: ["list", "show", "files", "preview"],
57
+ evaluations: ["list", "show"],
58
+ executions: ["trace"],
59
+ runs: ["list", "show"],
60
+ traces: ["collect", "list", "show"],
61
+ };
37
62
  export async function runCli(argv, io = {
38
63
  stdin: process.stdin,
39
64
  stdout: process.stdout,
@@ -82,31 +107,31 @@ export async function runCli(argv, io = {
82
107
  return await pushBenchmark(argv.slice(1), io);
83
108
  }
84
109
  if (argv[0] === "eval") {
85
- const hosted = extractHostedFlag(argv.slice(1));
86
- return hosted.enabled
87
- ? await startHostedWorkflow("eval", hosted.argv, io)
88
- : await localEvaluateCandidate(hosted.argv, io, runtimeOptions);
110
+ const remote = extractRemoteFlag(argv.slice(1));
111
+ return remote.enabled
112
+ ? await startRemoteWorkflow("eval", remote.argv, io)
113
+ : await localEvaluateCandidate(remote.argv, io, runtimeOptions);
89
114
  }
90
115
  if (argv[0] === "retry") {
91
- const hosted = extractHostedFlag(argv.slice(1));
92
- return hosted.enabled
93
- ? await retryHostedWorkflow(hosted.argv, io)
94
- : await localRetry(hosted.argv, io, runtimeOptions);
116
+ const remote = extractRemoteFlag(argv.slice(1));
117
+ return remote.enabled
118
+ ? await retryRemoteWorkflow(remote.argv, io)
119
+ : await localRetry(remote.argv, io, runtimeOptions);
95
120
  }
96
121
  if (argv[0] === "improve") {
97
- const hosted = extractHostedFlag(argv.slice(1));
98
- return hosted.enabled
99
- ? await startHostedWorkflow("improve", hosted.argv, io)
100
- : await localRun(hosted.argv, io, runtimeOptions);
122
+ const remote = extractRemoteFlag(argv.slice(1));
123
+ return remote.enabled
124
+ ? await startRemoteWorkflow("improve", remote.argv, io)
125
+ : await localRun(remote.argv, io, runtimeOptions);
101
126
  }
102
127
  if (argv[0] === "restore") {
103
128
  return await localRestore(argv.slice(1), io);
104
129
  }
105
130
  if (argv[0] === "open") {
106
- const hosted = extractHostedFlag(argv.slice(1));
107
- return hosted.enabled
108
- ? await openWorkbench(hosted.argv, io)
109
- : await localDevOpen(hosted.argv, io);
131
+ const remote = extractRemoteFlag(argv.slice(1));
132
+ return remote.enabled
133
+ ? await openWorkbench(remote.argv, io)
134
+ : await localDevOpen(remote.argv, io);
110
135
  }
111
136
  if (argv[0] === "auth") {
112
137
  return await runAuthCommand(argv.slice(1), io);
@@ -117,6 +142,9 @@ export async function runCli(argv, io = {
117
142
  if (argv[0] === "traces") {
118
143
  return await runTracesCommand(argv.slice(1), io);
119
144
  }
145
+ if (argv[0] === "diagnose") {
146
+ return await localDiagnose(argv.slice(1), io);
147
+ }
120
148
  const commandPath = argv.slice(0, 2).join(" ");
121
149
  const rest = argv.slice(2);
122
150
  switch (commandPath) {
@@ -124,6 +152,12 @@ export async function runCli(argv, io = {
124
152
  return await localRunList(rest, io);
125
153
  case "runs show":
126
154
  return await localRunShow(rest, io);
155
+ case "evaluations list":
156
+ return await localEvaluationList(rest, io);
157
+ case "evaluations show":
158
+ return await localEvaluationShow(rest, io);
159
+ case "executions trace":
160
+ return await localExecutionTrace(rest, io);
127
161
  case "candidates list":
128
162
  return await localCandidateList(rest, io);
129
163
  case "candidates show":
@@ -152,32 +186,18 @@ export async function runCli(argv, io = {
152
186
  }
153
187
  function commandPathForHelp(argv) {
154
188
  const positionals = argv.filter((arg) => arg !== "--help" && arg !== "-h" && !arg.startsWith("--"));
155
- if (positionals[0] === "adapters" &&
156
- ["create", "list", "inspect", "test"].includes(positionals[1] ?? "")) {
157
- return positionals.slice(0, 2).join(" ");
158
- }
159
- if (positionals[0] === "traces" &&
160
- ["collect", "list", "show"].includes(positionals[1] ?? "")) {
161
- return positionals.slice(0, 2).join(" ");
162
- }
163
- if (positionals[0] === "auth") {
164
- return positionals.slice(0, 2).join(" ");
165
- }
166
- if (positionals[0] === "runs" &&
167
- ["list", "show"].includes(positionals[1] ?? "")) {
168
- return positionals.slice(0, 2).join(" ");
169
- }
170
- if (positionals[0] === "candidates" &&
171
- ["list", "show", "files", "preview"].includes(positionals[1] ?? "")) {
189
+ const command = positionals[0] ?? "";
190
+ const subcommands = TWO_SEGMENT_HELP_COMMANDS[command];
191
+ if (subcommands && (subcommands.length === 0 || subcommands.includes(positionals[1] ?? ""))) {
172
192
  return positionals.slice(0, 2).join(" ");
173
193
  }
174
- return positionals[0] ?? "";
194
+ return command;
175
195
  }
176
- function extractHostedFlag(argv) {
196
+ function extractRemoteFlag(argv) {
177
197
  let enabled = false;
178
198
  const next = [];
179
199
  for (const arg of argv) {
180
- if (arg === "--hosted") {
200
+ if (arg === "--remote") {
181
201
  enabled = true;
182
202
  }
183
203
  else {
@@ -1498,6 +1518,9 @@ function latestCompletedAttemptJobsByPair(jobs, desiredKeys) {
1498
1518
  return byPair;
1499
1519
  }
1500
1520
  function caseSamplePairFromJob(job) {
1521
+ if (job.caseId && Number.isSafeInteger(job.sampleIndex) && job.sampleIndex >= 0) {
1522
+ return { caseId: job.caseId, sampleIndex: job.sampleIndex };
1523
+ }
1501
1524
  const input = readRecord(job.input);
1502
1525
  const execution = readRecord(input?.execution);
1503
1526
  const metadata = readRecord(execution?.metadata);
@@ -1593,7 +1616,7 @@ function resolveProjectPath(root, filePath) {
1593
1616
  }
1594
1617
  async function executeLocalDevelopmentJob(args) {
1595
1618
  return await executeWorkbenchExecutionJob(args, {
1596
- sandboxProvider: DOCKER_SANDBOX_BACKEND,
1619
+ sandboxBackend: DOCKER_SANDBOX_BACKEND,
1597
1620
  loadLocalAdapterAuthProfiles: true,
1598
1621
  });
1599
1622
  }
@@ -1604,7 +1627,7 @@ async function executeLocalDevelopmentDag(args) {
1604
1627
  const result = await runWorkbenchExecutionDag({
1605
1628
  jobs: args.jobs,
1606
1629
  capacity: args.capacity,
1607
- sandboxProvider: DOCKER_SANDBOX_BACKEND,
1630
+ sandboxBackend: DOCKER_SANDBOX_BACKEND,
1608
1631
  executeJob: async (job) => {
1609
1632
  return await executeLocalDevelopmentJob({
1610
1633
  job,
@@ -1802,11 +1825,16 @@ async function localRestore(argv, io) {
1802
1825
  writeOutput({ ok: true, activeCandidateId: candidateId, changedPaths }, parsed, io, () => `Restored ${candidateId} to ${candidateRoot}.`);
1803
1826
  return 0;
1804
1827
  }
1828
+ function localInspectionFromParsed(parsed) {
1829
+ return createLocalWorkbenchInspection({ workspace: resolveDir(parsed) });
1830
+ }
1805
1831
  async function localCandidateList(argv, io) {
1806
1832
  const parsed = parseArgs(argv);
1807
1833
  rejectUnknownFlags(parsed, new Set(["dir", "json"]));
1808
- const snapshot = await loadLocalArchive(resolveDir(parsed));
1809
- writeOutput(snapshot.candidates, parsed, io, (candidates) => candidates
1834
+ const inspection = localInspectionFromParsed(parsed);
1835
+ const snapshot = await inspection.snapshot();
1836
+ const candidates = await Promise.all(snapshot.summaries.map((candidate) => inspection.candidate({ id: candidate.id })));
1837
+ writeOutput(candidates, parsed, io, (candidates) => candidates
1810
1838
  .map((candidate) => `${candidate.id}\t${candidate.status}\tevaluation ${formatCandidateEvaluationScore(candidate)}${snapshot.activeId === candidate.id ? "\tactive" : ""}`)
1811
1839
  .join("\n") || "No candidates.");
1812
1840
  return 0;
@@ -1814,13 +1842,14 @@ async function localCandidateList(argv, io) {
1814
1842
  async function localCandidateShow(argv, io) {
1815
1843
  const parsed = parseArgs(argv);
1816
1844
  rejectUnknownFlags(parsed, new Set(["dir", "candidate", "json"]));
1817
- const snapshot = await loadLocalArchive(resolveDir(parsed));
1845
+ const inspection = localInspectionFromParsed(parsed);
1846
+ const snapshot = await inspection.snapshot();
1818
1847
  const candidateId = readCandidateIdFlag(parsed, snapshot);
1819
- const candidate = readLocalCandidate(snapshot, candidateId);
1848
+ const candidate = await inspection.candidate({ id: candidateId });
1820
1849
  writeOutput(candidate, parsed, io, (record) => [
1821
1850
  `${record.id}\t${record.status}`,
1822
1851
  `benchmark\t${record.benchmarkFingerprint}`,
1823
- `candidate\t${record.candidateFingerprint ?? record.candidateFingerprint}`,
1852
+ `candidate\t${record.candidateFingerprint}`,
1824
1853
  `evaluation\t${formatCandidateEvaluationSummary(record)}`,
1825
1854
  ...(record.baseId ? [`base\t${record.baseId}`] : []),
1826
1855
  ].join("\n"));
@@ -1829,10 +1858,10 @@ async function localCandidateShow(argv, io) {
1829
1858
  async function localCandidateFiles(argv, io) {
1830
1859
  const parsed = parseArgs(argv);
1831
1860
  rejectUnknownFlags(parsed, new Set(["dir", "candidate", "json"]));
1832
- const snapshot = await loadLocalArchive(resolveDir(parsed));
1861
+ const inspection = localInspectionFromParsed(parsed);
1862
+ const snapshot = await inspection.snapshot();
1833
1863
  const candidateId = readCandidateIdFlag(parsed, snapshot);
1834
- const candidate = readLocalCandidate(snapshot, candidateId);
1835
- const files = summarizeCandidateFiles(readLocalCandidateFiles(snapshot, candidateId), candidate.fileChanges);
1864
+ const files = await inspection.candidateFiles({ id: candidateId });
1836
1865
  writeOutput(files, parsed, io, (records) => records
1837
1866
  .map((file) => `${file.path}\t${file.status}\t${file.preview_kind}`)
1838
1867
  .join("\n") || "No files.");
@@ -1841,10 +1870,11 @@ async function localCandidateFiles(argv, io) {
1841
1870
  async function localCandidatePreview(argv, io) {
1842
1871
  const parsed = parseArgs(argv);
1843
1872
  rejectUnknownFlags(parsed, new Set(["dir", "candidate", "path", "output", "view", "json"]));
1844
- const snapshot = await loadLocalArchive(resolveDir(parsed));
1873
+ const inspection = localInspectionFromParsed(parsed);
1874
+ const snapshot = await inspection.snapshot();
1845
1875
  const candidateId = readCandidateIdFlag(parsed, snapshot);
1846
- const preview = createCandidateFilePreview({
1847
- files: readLocalCandidateFiles(snapshot, candidateId),
1876
+ const preview = await inspection.candidatePreview({
1877
+ id: candidateId,
1848
1878
  path: requireFlag(parsed, "path"),
1849
1879
  view: readPreviewMode(parsed),
1850
1880
  });
@@ -1865,7 +1895,7 @@ async function localCandidatePreview(argv, io) {
1865
1895
  async function localRunList(argv, io) {
1866
1896
  const parsed = parseArgs(argv);
1867
1897
  rejectUnknownFlags(parsed, new Set(["dir", "json"]));
1868
- const snapshot = await loadLocalArchive(resolveDir(parsed));
1898
+ const snapshot = await localInspectionFromParsed(parsed).snapshot();
1869
1899
  writeOutput(snapshot.runs, parsed, io, (runs) => runs
1870
1900
  .map((run) => `${run.id}\t${run.workflow}\t${run.status}\t${run.outcome ?? "pending"}\t${run.attemptsExecuted ?? 0}/${run.attemptsRequested ?? 0}`)
1871
1901
  .join("\n") || "No runs.");
@@ -1873,67 +1903,130 @@ async function localRunList(argv, io) {
1873
1903
  }
1874
1904
  async function localRunShow(argv, io) {
1875
1905
  const parsed = parseArgs(argv);
1876
- rejectUnknownFlags(parsed, new Set(["dir", "json"]));
1906
+ rejectUnknownFlags(parsed, new Set(["dir", "jobs", "failures", "json"]));
1877
1907
  const runId = parsed.positionals[0];
1878
1908
  if (!runId) {
1879
1909
  throw new UsageError("workbench runs show requires RUN_ID.");
1880
1910
  }
1881
- const snapshot = await loadLocalArchive(resolveDir(parsed));
1882
- const run = snapshot.runs.find((entry) => entry.id === runId);
1883
- if (!run) {
1884
- throw new UsageError(`Run not found: ${runId}`);
1885
- }
1886
- writeOutput(run, parsed, io, (record) => [
1887
- `${record.id}\t${record.workflow}\t${record.status}`,
1888
- `outcome\t${record.outcome ?? "pending"}`,
1889
- `started\t${record.startedAt}`,
1890
- ...(record.finishedAt ? [`finished\t${record.finishedAt}`] : []),
1891
- `attempts\t${record.attemptsExecuted ?? 0}/${record.attemptsRequested ?? 0}`,
1892
- `samples\t${record.samples ?? 0}`,
1911
+ const inspection = localInspectionFromParsed(parsed);
1912
+ const detail = await inspection.run({
1913
+ id: runId,
1914
+ includeJobs: parsed.flags.jobs === true || parsed.flags.failures === true,
1915
+ });
1916
+ const diagnosis = parsed.flags.failures === true
1917
+ ? await inspection.diagnose({ targetId: runId })
1918
+ : null;
1919
+ writeOutput(parsed.flags.failures === true
1920
+ ? { ...detail, diagnosis }
1921
+ : detail, parsed, io, (record) => {
1922
+ const run = record.run;
1923
+ const jobs = "jobs" in record && Array.isArray(record.jobs)
1924
+ ? record.jobs
1925
+ : [];
1926
+ const failures = "diagnosis" in record && record.diagnosis
1927
+ ? record.diagnosis.failures
1928
+ : [];
1929
+ return [
1930
+ `${run.id}\t${run.workflow}\t${run.status}`,
1931
+ `outcome\t${run.outcome ?? "pending"}`,
1932
+ `started\t${run.startedAt}`,
1933
+ ...(run.finishedAt ? [`finished\t${run.finishedAt}`] : []),
1934
+ `attempts\t${run.attemptsExecuted ?? 0}/${run.attemptsRequested ?? 0}`,
1935
+ `samples\t${run.samples ?? 0}`,
1936
+ ...(jobs.length > 0
1937
+ ? [
1938
+ "jobs",
1939
+ ...jobs.map((job) => `${job.id}\t${job.kind}\t${job.status}${job.error ? `\t${job.error}` : ""}`),
1940
+ ]
1941
+ : []),
1942
+ ...(failures.length > 0
1943
+ ? [
1944
+ "failures",
1945
+ ...failures.map(formatFailureLine),
1946
+ ]
1947
+ : []),
1948
+ ].join("\n");
1949
+ });
1950
+ return 0;
1951
+ }
1952
+ async function localEvaluationList(argv, io) {
1953
+ const parsed = parseArgs(argv);
1954
+ rejectUnknownFlags(parsed, new Set(["dir", "json"]));
1955
+ const comparison = await localInspectionFromParsed(parsed).evaluations();
1956
+ writeOutput(comparison, parsed, io, (record) => record.rows
1957
+ .map((row) => `${row.evaluationId}\t${row.status}\t${formatNullableMetric(row.score)}\t${row.candidateLabel}\t${row.configurationLabel}\t${row.runId}`)
1958
+ .join("\n") || "No evaluations.");
1959
+ return 0;
1960
+ }
1961
+ async function localEvaluationShow(argv, io) {
1962
+ const parsed = parseArgs(argv);
1963
+ rejectUnknownFlags(parsed, new Set(["dir", "json"]));
1964
+ const evaluationId = parsed.positionals[0];
1965
+ if (!evaluationId) {
1966
+ throw new UsageError("workbench evaluations show requires EVALUATION_ID.");
1967
+ }
1968
+ const evaluation = await localInspectionFromParsed(parsed).evaluation({ id: evaluationId });
1969
+ writeOutput(evaluation, parsed, io, (record) => [
1970
+ `${record.id}\t${record.status}`,
1971
+ `candidate\t${record.candidateName ?? record.candidateId}`,
1972
+ `run\t${record.runId}`,
1973
+ `samples\t${record.completedSampleCount}/${record.sampleCount}`,
1974
+ `errors\t${record.errorSampleCount}`,
1975
+ `score\t${formatNullableMetric(record.metrics?.score?.mean ?? null)}`,
1976
+ ...(record.error ? [`error\t${record.error}`] : []),
1977
+ ...(record.evaluation.cases?.length
1978
+ ? [
1979
+ "cases",
1980
+ ...record.evaluation.cases.map((entry) => `${entry.id}\t${entry.status ?? "unknown"}\t${formatNullableMetric(entry.metrics?.score?.mean ?? null)}`),
1981
+ ]
1982
+ : []),
1893
1983
  ].join("\n"));
1894
1984
  return 0;
1895
1985
  }
1986
+ async function localExecutionTrace(argv, io) {
1987
+ const parsed = parseArgs(argv);
1988
+ rejectUnknownFlags(parsed, new Set(["dir", "run", "job", "json"]));
1989
+ const runId = requireFlag(parsed, "run");
1990
+ const jobId = requireFlag(parsed, "job");
1991
+ const detail = await localInspectionFromParsed(parsed).executionTrace({ runId, jobId });
1992
+ writeOutput(detail, parsed, io, (record) => record.executions
1993
+ .map((execution) => [
1994
+ `${execution.id}\t${execution.kind}\t${execution.status}`,
1995
+ `jobs\t${execution.jobIds.join(",")}`,
1996
+ `sessions\t${execution.sessions.length}`,
1997
+ `spans\t${execution.trace.spans.length}`,
1998
+ `events\t${execution.trace.events.length}`,
1999
+ `summaries\t${execution.trace.summaries.length}`,
2000
+ ].join("\n"))
2001
+ .join("\n\n") || "No execution trace.");
2002
+ return 0;
2003
+ }
2004
+ async function localDiagnose(argv, io) {
2005
+ const parsed = parseArgs(argv);
2006
+ rejectUnknownFlags(parsed, new Set(["dir", "json"]));
2007
+ rejectUnexpectedPositionals(parsed, "workbench diagnose", 1);
2008
+ const diagnosis = await localInspectionFromParsed(parsed).diagnose({ targetId: parsed.positionals[0] ?? null });
2009
+ writeOutput(diagnosis, parsed, io, (record) => record.failures.length > 0
2010
+ ? record.failures.map(formatFailureLine).join("\n")
2011
+ : "No failures.");
2012
+ return 0;
2013
+ }
1896
2014
  async function runAuthCommand(argv, io) {
1897
- const command = argv[0];
1898
- const rest = argv.slice(1);
1899
- switch (command) {
1900
- case "connect":
1901
- return await authConnect(rest, io);
1902
- case "disconnect":
1903
- return await authDisconnect(rest, io);
1904
- default:
1905
- throw new UsageError(`Unknown command: auth ${argv.join(" ")}`);
1906
- }
2015
+ return await runSubCommand("auth", AUTH_COMMAND_HANDLERS, argv, io);
1907
2016
  }
1908
2017
  async function runAdaptersCommand(argv, io) {
1909
- const command = argv[0];
1910
- const rest = argv.slice(1);
1911
- switch (command) {
1912
- case "create":
1913
- return await adaptersCreate(rest, io);
1914
- case "list":
1915
- return await adaptersList(rest, io);
1916
- case "inspect":
1917
- return await adaptersInspect(rest, io);
1918
- case "test":
1919
- return await adaptersTest(rest, io);
1920
- default:
1921
- throw new UsageError(`Unknown command: adapters ${argv.join(" ")}`);
1922
- }
2018
+ return await runSubCommand("adapters", ADAPTERS_COMMAND_HANDLERS, argv, io);
1923
2019
  }
1924
2020
  async function runTracesCommand(argv, io) {
1925
- const command = argv[0];
1926
- const rest = argv.slice(1);
1927
- switch (command) {
1928
- case "collect":
1929
- return await localTraceCollect(rest, io);
1930
- case "list":
1931
- return await localTraceList(rest, io);
1932
- case "show":
1933
- return await localTraceShow(rest, io);
1934
- default:
1935
- throw new UsageError(`Unknown command: traces ${argv.join(" ")}`);
2021
+ return await runSubCommand("traces", TRACES_COMMAND_HANDLERS, argv, io);
2022
+ }
2023
+ async function runSubCommand(group, handlers, argv, io) {
2024
+ const command = argv[0] ?? "";
2025
+ const handler = handlers[command];
2026
+ if (!handler) {
2027
+ throw new UsageError(`Unknown command: ${group} ${argv.join(" ")}`);
1936
2028
  }
2029
+ return await handler(argv.slice(1), io);
1937
2030
  }
1938
2031
  const DEFAULT_LOCAL_TRACE_LIMIT = 3;
1939
2032
  const LOCAL_TRACE_WINDOW_FLAGS = new Set(["providers", "since", "workspace", "limit", "json"]);
@@ -2446,7 +2539,11 @@ fs.writeFileSync(resultPath, JSON.stringify({
2446
2539
  async function login(argv, io) {
2447
2540
  const parsed = parseArgs(argv);
2448
2541
  rejectUnknownFlags(parsed, new Set(["base-url", "no-open", "json"]));
2449
- const baseUrl = asOptionalString(parsed.flags["base-url"]) ?? DEFAULT_BASE_URL;
2542
+ const config = await loadConfig();
2543
+ const baseUrl = selectWorkbenchBaseUrl({
2544
+ explicitBaseUrl: asOptionalString(parsed.flags["base-url"]),
2545
+ configBaseUrl: config.baseUrl,
2546
+ });
2450
2547
  const authorization = await requestDeviceAuthorization(baseUrl);
2451
2548
  if (parsed.flags.json === true) {
2452
2549
  writeJson({ ok: true, status: "authorization_pending", ...authorization }, io);
@@ -2472,7 +2569,7 @@ async function logout(argv, io) {
2472
2569
  const parsed = parseArgs(argv);
2473
2570
  rejectUnknownFlags(parsed, new Set(["json"]));
2474
2571
  const config = await loadConfig();
2475
- const baseUrl = normalizeBaseUrl(process.env.WORKBENCH_API_URL ?? config.baseUrl ?? DEFAULT_BASE_URL);
2572
+ const baseUrl = selectWorkbenchBaseUrl({ configBaseUrl: config.baseUrl });
2476
2573
  if (config.accessToken) {
2477
2574
  await fetch(`${baseUrl}/api/oauth/revoke`, {
2478
2575
  method: "POST",
@@ -2493,8 +2590,8 @@ async function authStatus(argv, io) {
2493
2590
  const baseUrl = await effectiveBaseUrl();
2494
2591
  const profileStatus = await readWorkbenchProfileStatus(config);
2495
2592
  const adapterStatuses = await localWorkbenchAdapterAuthStore().listStatus();
2496
- const hostedAuth = profileStatus.authenticated
2497
- ? await readHostedAdapterAuthStatuses().catch((error) => ({
2593
+ const remoteAuth = profileStatus.authenticated
2594
+ ? await readRemoteAdapterAuthStatuses().catch((error) => ({
2498
2595
  adapters: [],
2499
2596
  error: error instanceof Error ? error.message : String(error),
2500
2597
  }))
@@ -2503,7 +2600,7 @@ async function authStatus(argv, io) {
2503
2600
  error: "not_authenticated",
2504
2601
  };
2505
2602
  const dir = resolveDir(parsed);
2506
- const adapterAuth = await projectAdapterAuthStatus(dir, adapterStatuses, hostedAuth.adapters).catch(() => []);
2603
+ const adapterAuth = await projectAdapterAuthStatus(dir, adapterStatuses, remoteAuth.adapters).catch(() => []);
2507
2604
  const result = {
2508
2605
  ok: true,
2509
2606
  workbench: {
@@ -2512,7 +2609,7 @@ async function authStatus(argv, io) {
2512
2609
  username: profileStatus.profile?.username ?? null,
2513
2610
  },
2514
2611
  adapterStatuses,
2515
- hostedAuth,
2612
+ remoteAuth,
2516
2613
  adapterAuth,
2517
2614
  };
2518
2615
  writeOutput(result, parsed, io, (record) => {
@@ -2525,28 +2622,28 @@ async function authStatus(argv, io) {
2525
2622
  ? [
2526
2623
  "",
2527
2624
  "Required adapter auth:",
2528
- ...value.adapterAuth.map((adapter) => `${adapter.adapter}${adapter.profile !== "default" ? ` profile ${adapter.profile}` : ""}: local ${adapter.local.status}${adapter.local.method ? ` (${adapter.local.method})` : ""}${adapter.local.reason ? ` (${adapter.local.reason})` : ""}, hosted ${adapter.hosted.status}${adapter.hosted.method ? ` (${adapter.hosted.method})` : ""}${adapter.hosted.reason ? ` (${adapter.hosted.reason})` : ""}`),
2625
+ ...value.adapterAuth.map((adapter) => `${adapter.adapter}${adapter.profile !== "default" ? ` profile ${adapter.profile}` : ""}: local ${adapter.local.status}${adapter.local.method ? ` (${adapter.local.method})` : ""}${adapter.local.reason ? ` (${adapter.local.reason})` : ""}, remote ${adapter.remote.status}${adapter.remote.method ? ` (${adapter.remote.method})` : ""}${adapter.remote.reason ? ` (${adapter.remote.reason})` : ""}`),
2529
2626
  ]
2530
2627
  : []),
2531
2628
  ].join("\n");
2532
2629
  });
2533
2630
  return 0;
2534
2631
  }
2535
- async function projectAdapterAuthStatus(dir, adapterStatuses, hostedAdapters) {
2632
+ async function projectAdapterAuthStatus(dir, adapterStatuses, remoteAdapters) {
2536
2633
  const spec = (await readLocalProjectSource(dir)).spec;
2537
2634
  const adapters = await resolveWorkbenchAdaptersForProject(dir, spec);
2538
2635
  const adapterStatusMap = new Map(adapterStatuses.map((status) => [
2539
2636
  adapterAuthStatusKey(status.adapterId, status.slot, status.profile),
2540
2637
  status,
2541
2638
  ]));
2542
- const hostedAdapterStatusMap = new Map(hostedAdapters.map((status) => [
2639
+ const remoteAdapterStatusMap = new Map(remoteAdapters.map((status) => [
2543
2640
  adapterAuthStatusKey(status.adapterId, status.slot, status.profile),
2544
2641
  status,
2545
2642
  ]));
2546
2643
  const adapterById = new Map(adapters.map((adapter) => [adapter.manifest.id, adapter]));
2547
2644
  return requiredAuthTargetsForSpec(spec, adapterById).map((target) => {
2548
2645
  const adapterStatus = adapterStatusMap.get(adapterAuthStatusKey(target.adapter, target.slot, target.profile));
2549
- const hostedAdapterStatus = hostedAdapterStatusMap.get(adapterAuthStatusKey(target.adapter, target.slot, target.profile));
2646
+ const remoteAdapterStatus = remoteAdapterStatusMap.get(adapterAuthStatusKey(target.adapter, target.slot, target.profile));
2550
2647
  return {
2551
2648
  ...target,
2552
2649
  local: adapterStatus
@@ -2556,17 +2653,17 @@ async function projectAdapterAuthStatus(dir, adapterStatuses, hostedAdapters) {
2556
2653
  ...(adapterStatus.reason ? { reason: adapterStatus.reason } : {}),
2557
2654
  }
2558
2655
  : { status: "disconnected" },
2559
- hosted: hostedAdapterStatus
2656
+ remote: remoteAdapterStatus
2560
2657
  ? {
2561
- status: hostedAdapterStatus.status,
2562
- ...(hostedAdapterStatus.method ? { method: hostedAdapterStatus.method } : {}),
2563
- ...(hostedAdapterStatus.reason ? { reason: hostedAdapterStatus.reason } : {}),
2658
+ status: remoteAdapterStatus.status,
2659
+ ...(remoteAdapterStatus.method ? { method: remoteAdapterStatus.method } : {}),
2660
+ ...(remoteAdapterStatus.reason ? { reason: remoteAdapterStatus.reason } : {}),
2564
2661
  }
2565
2662
  : { status: "disconnected" },
2566
2663
  };
2567
2664
  });
2568
2665
  }
2569
- async function readHostedAdapterAuthStatuses() {
2666
+ async function readRemoteAdapterAuthStatuses() {
2570
2667
  const adapterResponse = await apiRequest("/api/workbench/auth/adapters");
2571
2668
  return {
2572
2669
  adapters: adapterResponse.adapters ?? [],
@@ -2941,7 +3038,7 @@ async function pushBenchmark(argv, io) {
2941
3038
  const dir = resolveSourceDir(parsed);
2942
3039
  const source = await readLocalProjectSource(dir);
2943
3040
  const origin = await readWorkbenchOrigin(dir);
2944
- const baseUrl = await effectiveBaseUrl(origin?.baseUrl);
3041
+ const baseUrl = await effectiveOriginBaseUrl(origin?.baseUrl);
2945
3042
  const visibility = readOptionalBenchmarkVisibility(parsed.flags.visibility);
2946
3043
  const createVisibility = visibility ?? "public";
2947
3044
  const dryRun = parsed.flags["dry-run"] === true;
@@ -2972,7 +3069,7 @@ async function pushBenchmark(argv, io) {
2972
3069
  }, parsed, io, () => `Would push benchmark ${source.spec.name}.`);
2973
3070
  return 0;
2974
3071
  }
2975
- const { project, origin: nextOrigin, result } = await createHostedBenchmarkFromState({
3072
+ const { project, origin: nextOrigin, result } = await createRemoteBenchmarkFromState({
2976
3073
  baseUrl,
2977
3074
  dir,
2978
3075
  state,
@@ -3001,7 +3098,7 @@ async function pushBenchmark(argv, io) {
3001
3098
  }
3002
3099
  const projectId = origin.projectId;
3003
3100
  if (!projectId) {
3004
- throw new UsageError("Missing hosted benchmark. Run workbench push from a source directory.");
3101
+ throw new UsageError("Missing remote benchmark. Run workbench push from a source directory.");
3005
3102
  }
3006
3103
  if (dryRun) {
3007
3104
  const remoteProject = await verifyLinkedPushDryRunTarget({
@@ -3017,7 +3114,7 @@ async function pushBenchmark(argv, io) {
3017
3114
  baseUrl,
3018
3115
  benchmarkId: projectId,
3019
3116
  remote: origin.remote,
3020
- benchmark: hostedProjectSummaryForOutput(remoteProject),
3117
+ benchmark: remoteProjectSummaryForOutput(remoteProject),
3021
3118
  benchmarkName: source.spec.name,
3022
3119
  visibility: visibility ?? "unchanged",
3023
3120
  sourceFileCount: sourceFileCount(source),
@@ -3031,7 +3128,7 @@ async function pushBenchmark(argv, io) {
3031
3128
  method: "PUT",
3032
3129
  body: state,
3033
3130
  }, baseUrl);
3034
- const responseProject = hostedProjectSummaryFromState(response.state);
3131
+ const responseProject = remoteProjectSummaryFromState(response.state);
3035
3132
  const publishedProject = await applyRequestedProjectVisibility({
3036
3133
  baseUrl,
3037
3134
  projectId: responseProject.id,
@@ -3079,7 +3176,7 @@ async function verifyLinkedPushDryRunTarget(args) {
3079
3176
  }
3080
3177
  return response.benchmark;
3081
3178
  }
3082
- function hostedProjectSummaryForOutput(project) {
3179
+ function remoteProjectSummaryForOutput(project) {
3083
3180
  return {
3084
3181
  ...(project.id ? { id: project.id } : {}),
3085
3182
  ...(project.ownerUsername ? { ownerUsername: project.ownerUsername } : {}),
@@ -3089,12 +3186,12 @@ function hostedProjectSummaryForOutput(project) {
3089
3186
  ...(typeof project.starCount === "number" ? { starCount: project.starCount } : {}),
3090
3187
  };
3091
3188
  }
3092
- async function createHostedBenchmarkFromState(args) {
3189
+ async function createRemoteBenchmarkFromState(args) {
3093
3190
  const result = await apiRequest("/api/workbench/benchmarks/state", {
3094
3191
  method: "POST",
3095
3192
  body: args.state,
3096
3193
  }, args.baseUrl);
3097
- const project = hostedProjectSummaryFromState(result.state);
3194
+ const project = remoteProjectSummaryFromState(result.state);
3098
3195
  const applied = await acceptPushedProjectStateToLocal({
3099
3196
  dir: args.dir,
3100
3197
  baseUrl: args.baseUrl,
@@ -3168,7 +3265,7 @@ async function pullProject(argv, io) {
3168
3265
  }
3169
3266
  const dir = resolveDir(parsed);
3170
3267
  const origin = await requireWorkbenchOrigin(dir);
3171
- const baseUrl = await effectiveBaseUrl(origin.baseUrl);
3268
+ const baseUrl = await effectiveOriginBaseUrl(origin.baseUrl);
3172
3269
  const remoteRef = parseOriginRemote(origin);
3173
3270
  const state = await apiRequest(publicProjectStateApiPath(remoteRef), {}, baseUrl);
3174
3271
  if (parsed.flags["dry-run"] === true) {
@@ -3228,7 +3325,7 @@ async function acceptPushedProjectStateToLocal(args) {
3228
3325
  });
3229
3326
  return { origin, runtime: runtime.stats };
3230
3327
  }
3231
- async function retryHostedWorkflow(argv, io) {
3328
+ async function retryRemoteWorkflow(argv, io) {
3232
3329
  const parsed = parseArgs(argv);
3233
3330
  rejectUnknownFlags(parsed, new Set([
3234
3331
  "dir",
@@ -3238,7 +3335,7 @@ async function retryHostedWorkflow(argv, io) {
3238
3335
  "timeout-ms",
3239
3336
  "json",
3240
3337
  ]));
3241
- rejectUnexpectedPositionals(parsed, "workbench retry --hosted", 1);
3338
+ rejectUnexpectedPositionals(parsed, "workbench retry --remote", 1);
3242
3339
  const targetId = parsed.positionals[0];
3243
3340
  if (!targetId) {
3244
3341
  throw new UsageError("Missing required TARGET_ID.");
@@ -3247,8 +3344,8 @@ async function retryHostedWorkflow(argv, io) {
3247
3344
  parsed.flags["timeout-ms"] !== undefined)) {
3248
3345
  throw new UsageError("--interval-ms and --timeout-ms require --watch.");
3249
3346
  }
3250
- const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
3251
- const retryTarget = await resolveHostedRetryTarget(target, targetId);
3347
+ const target = await resolveRemoteTarget(parsed, { requireProjectIdentity: true });
3348
+ const retryTarget = await resolveRemoteRetryTarget(target, targetId);
3252
3349
  const watchIntervalMs = parsed.flags.watch === true
3253
3350
  ? parsePositiveInt(parsed.flags["interval-ms"], 1000, "interval-ms")
3254
3351
  : undefined;
@@ -3259,23 +3356,23 @@ async function retryHostedWorkflow(argv, io) {
3259
3356
  method: "POST",
3260
3357
  body: retryTarget.request,
3261
3358
  }, target.baseUrl);
3262
- const runTarget = hostedTargetForRunStartResponse(target, response);
3359
+ const runTarget = remoteTargetForRunStartResponse(target, response);
3263
3360
  const startedRun = withRunUrls(runTarget, response.run);
3264
3361
  if (parsed.flags.watch === true) {
3265
3362
  if (parsed.flags.json !== true) {
3266
- io.stdout.write(`${formatHostedRunStarted(startedRun, retryTarget.workflow).trimEnd()}\n${HOSTED_WATCH_LIFECYCLE_NOTE}\n`);
3363
+ io.stdout.write(`${formatRemoteRunStarted(startedRun, retryTarget.workflow).trimEnd()}\n${REMOTE_WATCH_LIFECYCLE_NOTE}\n`);
3267
3364
  }
3268
- const watched = await watchHostedRun({
3365
+ const watched = await watchRemoteRun({
3269
3366
  parsed,
3270
3367
  target: runTarget,
3271
3368
  runId: response.run.id,
3272
3369
  intervalMs: watchIntervalMs ?? 1000,
3273
3370
  timeoutMs: watchTimeoutMs,
3274
3371
  });
3275
- const outputRun = withRunUrls(runTarget, await withHostedRunFailureSummary(runTarget, watched));
3276
- await tryImportTerminalHostedProjectState({ target: runTarget, io });
3372
+ const outputRun = withRunUrls(runTarget, await withRemoteRunFailureSummary(runTarget, watched));
3373
+ await tryImportTerminalRemoteProjectState({ target: runTarget, io });
3277
3374
  const result = {
3278
- ok: hostedRunSucceeded(watched),
3375
+ ok: remoteRunSucceeded(watched),
3279
3376
  retried: {
3280
3377
  id: retryTarget.sourceId,
3281
3378
  kind: retryTarget.sourceKind,
@@ -3290,7 +3387,7 @@ async function retryHostedWorkflow(argv, io) {
3290
3387
  ...(outputRun.error ? { error: outputRun.error } : {}),
3291
3388
  };
3292
3389
  writeOutput(result, parsed, io, formatRetryCommandResult);
3293
- return hostedRunSucceeded(watched) ? 0 : 1;
3390
+ return remoteRunSucceeded(watched) ? 0 : 1;
3294
3391
  }
3295
3392
  const result = {
3296
3393
  ok: true,
@@ -3308,20 +3405,20 @@ async function retryHostedWorkflow(argv, io) {
3308
3405
  writeOutput(result, parsed, io, formatRetryCommandResult);
3309
3406
  return 0;
3310
3407
  }
3311
- async function resolveHostedRetryTarget(target, targetId) {
3408
+ async function resolveRemoteRetryTarget(target, targetId) {
3312
3409
  if (targetId.startsWith("eval_")) {
3313
- return await resolveHostedEvaluationRetryTarget(target, targetId);
3410
+ return await resolveRemoteEvaluationRetryTarget(target, targetId);
3314
3411
  }
3315
- const detail = await readHostedRunDetail(target, targetId);
3412
+ const detail = await readRemoteRunDetail(target, targetId);
3316
3413
  const run = detail.run;
3317
3414
  if (run.status !== "finished") {
3318
3415
  throw new UsageError(`Run ${run.id} is ${run.status}; wait for it to finish before retrying.`);
3319
3416
  }
3320
- if (!hostedRunRecordFailed(run)) {
3321
- throw new UsageError(`Run ${run.id} did not fail; use workbench ${run.workflow ?? "eval"} --hosted to intentionally run it again.`);
3417
+ if (!remoteRunRecordFailed(run)) {
3418
+ throw new UsageError(`Run ${run.id} did not fail; use workbench ${run.workflow ?? "eval"} --remote to intentionally run it again.`);
3322
3419
  }
3323
3420
  if (run.workflow === "eval") {
3324
- const candidateId = hostedRunEvaluationCandidateId(run, detail.jobs);
3421
+ const candidateId = remoteRunEvaluationCandidateId(run, detail.jobs);
3325
3422
  if (!candidateId) {
3326
3423
  throw new UsageError(`Run ${run.id} has no candidate id to retry.`);
3327
3424
  }
@@ -3330,17 +3427,18 @@ async function resolveHostedRetryTarget(target, targetId) {
3330
3427
  sourceKind: "run",
3331
3428
  workflow: "eval",
3332
3429
  request: {
3430
+ schema: "workbench.remote.run.request.v1",
3333
3431
  workflow: "eval",
3334
3432
  samples: run.samples ?? 1,
3335
3433
  candidateId,
3336
- sourceYaml: hostedRetrySourceYaml(run, run.id),
3434
+ sourceYaml: remoteRetrySourceYaml(run, run.id),
3337
3435
  preserveActive: true,
3338
3436
  ...retrySampleSelectionFromJobs(detail.jobs),
3339
3437
  },
3340
3438
  };
3341
3439
  }
3342
3440
  if (run.workflow === "improve") {
3343
- const baseCandidateId = stringValue(readRecord(run.input)?.baseCandidateId);
3441
+ const baseCandidateId = stringValue(readRecord(run.retry)?.baseCandidateId);
3344
3442
  if (!baseCandidateId) {
3345
3443
  throw new UsageError(`Run ${run.id} is missing its base candidate id.`);
3346
3444
  }
@@ -3349,41 +3447,43 @@ async function resolveHostedRetryTarget(target, targetId) {
3349
3447
  sourceKind: "run",
3350
3448
  workflow: "improve",
3351
3449
  request: {
3450
+ schema: "workbench.remote.run.request.v1",
3352
3451
  workflow: "improve",
3353
3452
  samples: run.samples ?? 1,
3354
3453
  budget: run.budget ?? run.attemptsRequested ?? 1,
3355
3454
  candidateId: baseCandidateId,
3356
- sourceYaml: hostedRetrySourceYaml(run, run.id),
3455
+ sourceYaml: remoteRetrySourceYaml(run, run.id),
3357
3456
  preserveActive: true,
3358
3457
  },
3359
3458
  };
3360
3459
  }
3361
3460
  throw new UsageError(`Run ${run.id} has no retryable workflow.`);
3362
3461
  }
3363
- async function resolveHostedEvaluationRetryTarget(target, evaluationId) {
3462
+ async function resolveRemoteEvaluationRetryTarget(target, evaluationId) {
3364
3463
  const snapshot = await apiRequest(projectApiPath(target.projectId, "/workbench/snapshot"), {}, target.baseUrl);
3365
3464
  const evaluation = snapshot.evaluations.find((entry) => entry.id === evaluationId);
3366
3465
  if (!evaluation) {
3367
- throw new UsageError(`Hosted evaluation not found: ${evaluationId}`);
3466
+ throw new UsageError(`Remote evaluation not found: ${evaluationId}`);
3368
3467
  }
3369
3468
  const run = snapshot.runs.find((entry) => entry.id === evaluation.runId) ?? null;
3370
3469
  if (!evaluationScorecardFailed(evaluation, run)) {
3371
- throw new UsageError(`Evaluation ${evaluation.id} did not fail; use workbench eval --hosted to intentionally run it again.`);
3470
+ throw new UsageError(`Evaluation ${evaluation.id} did not fail; use workbench eval --remote to intentionally run it again.`);
3372
3471
  }
3373
3472
  if (!run) {
3374
3473
  throw new UsageError(`Evaluation ${evaluation.id} is missing its run record.`);
3375
3474
  }
3376
- const detail = await readHostedRunDetail(target, run.id);
3475
+ const detail = await readRemoteRunDetail(target, run.id);
3377
3476
  const detailedRun = detail.run;
3378
3477
  return {
3379
3478
  sourceId: evaluationId,
3380
3479
  sourceKind: "evaluation",
3381
3480
  workflow: "eval",
3382
3481
  request: {
3482
+ schema: "workbench.remote.run.request.v1",
3383
3483
  workflow: "eval",
3384
3484
  samples: evaluation.sampleCount || detailedRun.samples || 1,
3385
3485
  candidateId: evaluation.candidateId,
3386
- sourceYaml: hostedRetrySourceYaml(detailedRun, detailedRun.id),
3486
+ sourceYaml: remoteRetrySourceYaml(detailedRun, detailedRun.id),
3387
3487
  preserveActive: true,
3388
3488
  ...retrySampleSelectionFromJobs(detail.jobs),
3389
3489
  },
@@ -3392,7 +3492,7 @@ async function resolveHostedEvaluationRetryTarget(target, evaluationId) {
3392
3492
  function retrySampleSelectionFromJobs(jobs) {
3393
3493
  const selectedSamples = uniqueCaseSamplePairs(jobs
3394
3494
  .filter((job) => job.status !== "succeeded" &&
3395
- executionPurposeFromJobInput(job.input) === "attempt")
3495
+ readRunJobPurpose(job) === "attempt")
3396
3496
  .map(caseSamplePairFromJob)
3397
3497
  .filter((pair) => pair !== null));
3398
3498
  return selectedSamples.length > 0
@@ -3407,10 +3507,10 @@ function uniqueCaseSamplePairs(pairs) {
3407
3507
  return [...byKey.values()].sort((left, right) => left.caseId.localeCompare(right.caseId) ||
3408
3508
  left.sampleIndex - right.sampleIndex);
3409
3509
  }
3410
- async function readHostedRunDetail(target, runId) {
3510
+ async function readRemoteRunDetail(target, runId) {
3411
3511
  return await apiRequest(projectApiPath(target.projectId, `/runs/${encodeURIComponent(runId)}`), {}, target.baseUrl);
3412
3512
  }
3413
- async function tryImportTerminalHostedProjectState(args) {
3513
+ async function tryImportTerminalRemoteProjectState(args) {
3414
3514
  const origin = args.target.origin;
3415
3515
  if (!origin || origin.projectId !== args.target.projectId) {
3416
3516
  return;
@@ -3426,23 +3526,23 @@ async function tryImportTerminalHostedProjectState(args) {
3426
3526
  });
3427
3527
  }
3428
3528
  catch (error) {
3429
- args.io.stderr.write(`Hosted run finished, but local project state was not updated: ${errorMessage(error)}\n`);
3529
+ args.io.stderr.write(`Remote run finished, but local project state was not updated: ${errorMessage(error)}\n`);
3430
3530
  }
3431
3531
  }
3432
- function hostedRetrySourceYaml(run, runId) {
3433
- const sourceYaml = stringValue(readRecord(run.input)?.sourceYaml);
3532
+ function remoteRetrySourceYaml(run, runId) {
3533
+ const sourceYaml = stringValue(readRecord(run.retry)?.sourceYaml);
3434
3534
  if (!sourceYaml) {
3435
3535
  throw new UsageError(`Run ${runId} is missing its recorded source configuration.`);
3436
3536
  }
3437
3537
  return sourceYaml;
3438
3538
  }
3439
- function hostedRunRecordFailed(run) {
3539
+ function remoteRunRecordFailed(run) {
3440
3540
  return run.outcome === "error" ||
3441
3541
  run.outcome === "cancelled" ||
3442
3542
  (run.failedJobCount ?? 0) > 0 ||
3443
3543
  Boolean(run.error);
3444
3544
  }
3445
- async function startHostedWorkflow(workflow, argv, io) {
3545
+ async function startRemoteWorkflow(workflow, argv, io) {
3446
3546
  const parsed = parseArgs(argv);
3447
3547
  const allowedFlags = new Set([
3448
3548
  "dir",
@@ -3465,7 +3565,7 @@ async function startHostedWorkflow(workflow, argv, io) {
3465
3565
  }
3466
3566
  rejectUnknownFlags(parsed, allowedFlags);
3467
3567
  if (parsed.positionals.length > 1) {
3468
- throw new UsageError(`workbench ${workflow} --hosted accepts at most one source file or directory argument.`);
3568
+ throw new UsageError(`workbench ${workflow} --remote accepts at most one source file or directory argument.`);
3469
3569
  }
3470
3570
  const sourceArg = resolveSourceDir(parsed);
3471
3571
  const samples = parsePositiveInt(parsed.flags.samples, 1, "samples");
@@ -3480,13 +3580,13 @@ async function startHostedWorkflow(workflow, argv, io) {
3480
3580
  const defaultProjectSource = await readLocalProjectSource(path.resolve(sourceArg));
3481
3581
  const selectedRunIds = workflow === "eval"
3482
3582
  ? resolveCandidateRunSelection(defaultProjectSource, runsFlag)
3483
- : [singleRequestedRunId(runsFlag, `workbench ${workflow} --hosted`) ?? defaultProjectSource.candidateRunId];
3583
+ : [singleRequestedRunId(runsFlag, `workbench ${workflow} --remote`) ?? defaultProjectSource.candidateRunId];
3484
3584
  if (workflow === "eval" && selectedRunIds.length > 1) {
3485
3585
  let failed = 0;
3486
3586
  const results = [];
3487
3587
  for (const runId of selectedRunIds) {
3488
3588
  const captured = createCapturingIo(io);
3489
- const code = await startHostedWorkflow(workflow, hostedWorkflowArgsForRun({
3589
+ const code = await startRemoteWorkflow(workflow, remoteWorkflowArgsForRun({
3490
3590
  parsed,
3491
3591
  sourceDir: defaultProjectSource.dir,
3492
3592
  runId,
@@ -3501,7 +3601,7 @@ async function startHostedWorkflow(workflow, argv, io) {
3501
3601
  candidateRunIds: selectedRunIds,
3502
3602
  failedRunCount: failed,
3503
3603
  results,
3504
- }, parsed, io, () => `Processed ${selectedRunIds.length} hosted candidate run(s); ${failed} failed.`);
3604
+ }, parsed, io, () => `Processed ${selectedRunIds.length} remote candidate run(s); ${failed} failed.`);
3505
3605
  return failed === 0 ? 0 : 1;
3506
3606
  }
3507
3607
  const selectedCandidateId = workflow === "eval"
@@ -3509,12 +3609,14 @@ async function startHostedWorkflow(workflow, argv, io) {
3509
3609
  : asOptionalString(parsed.flags.base);
3510
3610
  const request = workflow === "improve"
3511
3611
  ? {
3612
+ schema: "workbench.remote.run.request.v1",
3512
3613
  workflow,
3513
3614
  budget,
3514
3615
  samples,
3515
3616
  ...(selectedCandidateId ? { candidateId: selectedCandidateId } : {}),
3516
3617
  }
3517
3618
  : {
3619
+ schema: "workbench.remote.run.request.v1",
3518
3620
  workflow,
3519
3621
  samples,
3520
3622
  ...(selectedCandidateId ? { candidateId: selectedCandidateId } : {}),
@@ -3538,7 +3640,7 @@ async function startHostedWorkflow(workflow, argv, io) {
3538
3640
  : undefined;
3539
3641
  const dryRun = parsed.flags["dry-run"] === true;
3540
3642
  if (dryRun) {
3541
- const target = await resolveHostedDryRunTarget(parsed, { sourceDir: projectSource.dir });
3643
+ const target = await resolveRemoteDryRunTarget(parsed, { sourceDir: projectSource.dir });
3542
3644
  writeOutput({
3543
3645
  ok: true,
3544
3646
  dryRun: true,
@@ -3547,20 +3649,21 @@ async function startHostedWorkflow(workflow, argv, io) {
3547
3649
  dir: target.dir,
3548
3650
  baseUrl: target.baseUrl,
3549
3651
  request,
3550
- }, parsed, io, () => `Would start hosted ${workflow} for ${target.projectRef}.`);
3652
+ }, parsed, io, () => `Would start remote ${workflow} for ${target.projectRef}.`);
3551
3653
  return 0;
3552
3654
  }
3553
- const target = await resolveHostedTarget(parsed, {
3655
+ const target = await resolveRemoteTarget(parsed, {
3554
3656
  requireProjectIdentity: true,
3555
3657
  sourceDir: projectSource.dir,
3556
3658
  });
3557
3659
  if (workflow === "improve") {
3558
- request.candidateId = await ensureHostedImproveBaseCandidate({
3660
+ request.candidateId = await ensureRemoteImproveBaseCandidate({
3559
3661
  parsed,
3560
3662
  target,
3561
3663
  samples: request.samples,
3562
3664
  candidateId: selectedCandidateId,
3563
3665
  sourceYaml: projectSource.specSource,
3666
+ candidateFiles: projectSource.candidateFiles,
3564
3667
  adapterFiles: projectSource.adapterFiles,
3565
3668
  intervalMs: watchIntervalMs ?? 1000,
3566
3669
  timeoutMs: watchTimeoutMs,
@@ -3571,53 +3674,53 @@ async function startHostedWorkflow(workflow, argv, io) {
3571
3674
  method: "POST",
3572
3675
  body: request,
3573
3676
  }, target.baseUrl);
3574
- const runTarget = hostedTargetForRunStartResponse(target, response);
3677
+ const runTarget = remoteTargetForRunStartResponse(target, response);
3575
3678
  const startedRun = withRunUrls(runTarget, response.run);
3576
3679
  const startedRunOutput = response.reused === true
3577
3680
  ? { ...startedRun, reused: true }
3578
3681
  : startedRun;
3579
3682
  if (response.reused === true && response.run.status === "finished") {
3580
- await tryImportTerminalHostedProjectState({ target: runTarget, io });
3683
+ await tryImportTerminalRemoteProjectState({ target: runTarget, io });
3581
3684
  writeOutput({
3582
- ok: hostedRunSucceeded(response.run),
3685
+ ok: remoteRunSucceeded(response.run),
3583
3686
  reused: true,
3584
3687
  workflow,
3585
3688
  runId: startedRun.id,
3586
3689
  ...startedRun,
3587
- }, parsed, io, () => `Reused hosted ${workflow} ${startedRun.id}. Use --rerun to intentionally run it again.`);
3588
- return hostedRunSucceeded(response.run) ? 0 : 1;
3690
+ }, parsed, io, () => `Reused remote ${workflow} ${startedRun.id}. Use --rerun to intentionally run it again.`);
3691
+ return remoteRunSucceeded(response.run) ? 0 : 1;
3589
3692
  }
3590
3693
  if (parsed.flags.watch === true) {
3591
3694
  if (parsed.flags.json !== true) {
3592
- io.stdout.write(`${formatHostedRunStarted(startedRun, workflow).trimEnd()}\n${HOSTED_WATCH_LIFECYCLE_NOTE}\n`);
3695
+ io.stdout.write(`${formatRemoteRunStarted(startedRun, workflow).trimEnd()}\n${REMOTE_WATCH_LIFECYCLE_NOTE}\n`);
3593
3696
  }
3594
- const watched = await watchHostedRun({
3697
+ const watched = await watchRemoteRun({
3595
3698
  parsed,
3596
3699
  target: runTarget,
3597
3700
  runId: response.run.id,
3598
3701
  intervalMs: watchIntervalMs ?? 1000,
3599
3702
  timeoutMs: watchTimeoutMs,
3600
3703
  });
3601
- const outputRun = await withHostedRunFailureSummary(runTarget, watched);
3602
- await tryImportTerminalHostedProjectState({ target: runTarget, io });
3603
- writeOutput(withRunUrls(runTarget, outputRun), parsed, io, formatHostedRunResult);
3604
- return hostedRunSucceeded(watched) ? 0 : 1;
3704
+ const outputRun = await withRemoteRunFailureSummary(runTarget, watched);
3705
+ await tryImportTerminalRemoteProjectState({ target: runTarget, io });
3706
+ writeOutput(withRunUrls(runTarget, outputRun), parsed, io, formatRemoteRunResult);
3707
+ return remoteRunSucceeded(watched) ? 0 : 1;
3605
3708
  }
3606
- writeOutput(startedRunOutput, parsed, io, (run) => formatHostedRunStarted(run, workflow).trimEnd());
3709
+ writeOutput(startedRunOutput, parsed, io, (run) => formatRemoteRunStarted(run, workflow).trimEnd());
3607
3710
  return 0;
3608
3711
  }
3609
- async function ensureHostedImproveBaseCandidate(args) {
3712
+ async function ensureRemoteImproveBaseCandidate(args) {
3610
3713
  if (args.candidateId) {
3611
- const candidate = await readHostedCandidateSummary(args.target, args.candidateId);
3714
+ const candidate = await readRemoteCandidateSummary(args.target, args.candidateId);
3612
3715
  if (!candidate) {
3613
3716
  throw new UsageError(`Base candidate ${args.candidateId} was not found for the current benchmark.`);
3614
3717
  }
3615
- if (hostedCandidateIsEvaluated(candidate)) {
3718
+ if (remoteCandidateIsEvaluated(candidate)) {
3616
3719
  return args.candidateId;
3617
3720
  }
3618
3721
  }
3619
3722
  else {
3620
- const activeCandidate = await readEvaluatedActiveHostedCandidate(args.target);
3723
+ const activeCandidate = await readEvaluatedActiveRemoteCandidate(args.target);
3621
3724
  if (activeCandidate) {
3622
3725
  return activeCandidate.id;
3623
3726
  }
@@ -3625,31 +3728,33 @@ async function ensureHostedImproveBaseCandidate(args) {
3625
3728
  const response = await apiRequest(projectApiPath(args.target.projectId, "/runs"), {
3626
3729
  method: "POST",
3627
3730
  body: {
3731
+ schema: "workbench.remote.run.request.v1",
3628
3732
  workflow: "eval",
3629
3733
  samples: args.samples,
3630
3734
  ...(args.candidateId ? { candidateId: args.candidateId } : {}),
3631
3735
  sourceYaml: args.sourceYaml,
3736
+ ...(args.candidateId ? {} : { candidateFiles: args.candidateFiles }),
3632
3737
  ...(args.adapterFiles.length > 0 ? { adapterFiles: args.adapterFiles } : {}),
3633
3738
  },
3634
3739
  }, args.target.baseUrl);
3635
- const runTarget = hostedTargetForRunStartResponse(args.target, response);
3636
- const watched = await watchHostedRun({
3740
+ const runTarget = remoteTargetForRunStartResponse(args.target, response);
3741
+ const watched = await watchRemoteRun({
3637
3742
  parsed: args.parsed,
3638
3743
  target: runTarget,
3639
3744
  runId: response.run.id,
3640
3745
  intervalMs: args.intervalMs,
3641
3746
  timeoutMs: args.timeoutMs,
3642
3747
  });
3643
- if (!hostedRunSucceeded(watched)) {
3748
+ if (!remoteRunSucceeded(watched)) {
3644
3749
  throw new UsageError(`Parent candidate eval ${watched.id} failed; improve was not started.`);
3645
3750
  }
3646
3751
  if (!watched.candidateId) {
3647
3752
  throw new UsageError(`Parent candidate eval ${watched.id} did not produce a candidate.`);
3648
3753
  }
3649
- await tryImportTerminalHostedProjectState({ target: runTarget, io: args.io });
3754
+ await tryImportTerminalRemoteProjectState({ target: runTarget, io: args.io });
3650
3755
  return watched.candidateId;
3651
3756
  }
3652
- function hostedWorkflowArgsForRun(args) {
3757
+ function remoteWorkflowArgsForRun(args) {
3653
3758
  const next = ["--dir", args.sourceDir, "--runs", args.runId, "--json"];
3654
3759
  appendStringFlag(next, "benchmark", asOptionalString(args.parsed.flags.benchmark));
3655
3760
  appendStringFlag(next, "candidate", asOptionalString(args.parsed.flags.candidate));
@@ -3672,27 +3777,27 @@ function appendStringFlag(args, name, value) {
3672
3777
  args.push(`--${name}`, value);
3673
3778
  }
3674
3779
  }
3675
- async function readHostedCandidateSummary(target, candidateId) {
3780
+ async function readRemoteCandidateSummary(target, candidateId) {
3676
3781
  const response = await apiRequest(projectApiPath(target.projectId, "/candidates"), {}, target.baseUrl);
3677
3782
  return response.candidates.find((entry) => entry.id === candidateId) ?? null;
3678
3783
  }
3679
- async function readEvaluatedActiveHostedCandidate(target) {
3784
+ async function readEvaluatedActiveRemoteCandidate(target) {
3680
3785
  const response = await apiRequest(projectApiPath(target.projectId), {}, target.baseUrl);
3681
3786
  const activeCandidateId = response.benchmark.activeCandidateId;
3682
3787
  if (!activeCandidateId) {
3683
3788
  return null;
3684
3789
  }
3685
- const candidate = await readHostedCandidateSummary(target, activeCandidateId);
3686
- return candidate && hostedCandidateIsEvaluated(candidate) ? candidate : null;
3790
+ const candidate = await readRemoteCandidateSummary(target, activeCandidateId);
3791
+ return candidate && remoteCandidateIsEvaluated(candidate) ? candidate : null;
3687
3792
  }
3688
- function hostedCandidateIsEvaluated(candidate) {
3793
+ function remoteCandidateIsEvaluated(candidate) {
3689
3794
  return candidate.status === "evaluated" || candidate.eval != null;
3690
3795
  }
3691
3796
  async function openWorkbench(argv, io) {
3692
3797
  const parsed = parseArgs(argv);
3693
3798
  rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "no-open", "json"]));
3694
3799
  if (parsed.positionals.length > 1) {
3695
- throw new UsageError(`Unexpected argument for workbench open --hosted: ${parsed.positionals.slice(1).join(" ")}`);
3800
+ throw new UsageError(`Unexpected argument for workbench open --remote: ${parsed.positionals.slice(1).join(" ")}`);
3696
3801
  }
3697
3802
  const target = await resolveOpenTarget(parsed);
3698
3803
  const ref = target.openRef;
@@ -3722,7 +3827,7 @@ function buildWorkbenchWebUrl(target, ref) {
3722
3827
  }
3723
3828
  return buildWorkbenchResourceUrls(target, { candidateId: ref }).candidateEvaluation;
3724
3829
  }
3725
- async function resolveHostedTarget(parsed, options = {}) {
3830
+ async function resolveRemoteTarget(parsed, options = {}) {
3726
3831
  if (options.sourceArg !== undefined && parsed.flags.dir !== undefined) {
3727
3832
  throw new UsageError("Use either --dir or SOURCE, not both.");
3728
3833
  }
@@ -3731,7 +3836,7 @@ async function resolveHostedTarget(parsed, options = {}) {
3731
3836
  : resolveDir(parsed, options.sourceArg);
3732
3837
  const origin = await readWorkbenchOrigin(dir);
3733
3838
  const explicitProject = asOptionalString(parsed.flags.benchmark);
3734
- const baseUrl = await effectiveBaseUrl(origin?.baseUrl);
3839
+ const baseUrl = await effectiveOriginBaseUrl(origin?.baseUrl);
3735
3840
  if (explicitProject && (!isRemoteProjectId(explicitProject) || options.requireProjectIdentity === true)) {
3736
3841
  const project = await resolveRemoteProject(explicitProject, baseUrl);
3737
3842
  return {
@@ -3745,7 +3850,7 @@ async function resolveHostedTarget(parsed, options = {}) {
3745
3850
  }
3746
3851
  const projectId = explicitProject ?? origin?.projectId;
3747
3852
  if (!projectId) {
3748
- throw new UsageError("Missing hosted benchmark. Run workbench push, workbench clone, or pass --benchmark OWNER/BENCHMARK.");
3853
+ throw new UsageError("Missing remote benchmark. Run workbench push, workbench clone, or pass --benchmark OWNER/BENCHMARK.");
3749
3854
  }
3750
3855
  const originRemote = origin ? parseOriginRemote(origin) : null;
3751
3856
  return {
@@ -3759,7 +3864,7 @@ async function resolveHostedTarget(parsed, options = {}) {
3759
3864
  origin,
3760
3865
  };
3761
3866
  }
3762
- async function resolveHostedDryRunTarget(parsed, options = {}) {
3867
+ async function resolveRemoteDryRunTarget(parsed, options = {}) {
3763
3868
  if (options.sourceArg !== undefined && parsed.flags.dir !== undefined) {
3764
3869
  throw new UsageError("Use either --dir or SOURCE, not both.");
3765
3870
  }
@@ -3768,7 +3873,7 @@ async function resolveHostedDryRunTarget(parsed, options = {}) {
3768
3873
  : resolveDir(parsed, options.sourceArg);
3769
3874
  const origin = await readWorkbenchOrigin(dir);
3770
3875
  const explicitProject = asOptionalString(parsed.flags.benchmark);
3771
- const baseUrl = await effectiveBaseUrl(origin?.baseUrl);
3876
+ const baseUrl = await effectiveOriginBaseUrl(origin?.baseUrl);
3772
3877
  if (explicitProject) {
3773
3878
  if (isRemoteProjectId(explicitProject)) {
3774
3879
  return {
@@ -3801,7 +3906,7 @@ async function resolveHostedDryRunTarget(parsed, options = {}) {
3801
3906
  origin,
3802
3907
  };
3803
3908
  }
3804
- throw new UsageError("Missing hosted benchmark. Run workbench push, workbench clone, or pass --benchmark OWNER/BENCHMARK.");
3909
+ throw new UsageError("Missing remote benchmark. Run workbench push, workbench clone, or pass --benchmark OWNER/BENCHMARK.");
3805
3910
  }
3806
3911
  async function resolveOpenTarget(parsed) {
3807
3912
  const ref = parsed.positionals[0];
@@ -3830,7 +3935,7 @@ async function resolveOpenTarget(parsed) {
3830
3935
  };
3831
3936
  }
3832
3937
  return {
3833
- ...(await resolveHostedTarget(parsed, { requireProjectIdentity: true })),
3938
+ ...(await resolveRemoteTarget(parsed, { requireProjectIdentity: true })),
3834
3939
  ...(ref ? { openRef: ref } : {}),
3835
3940
  };
3836
3941
  }
@@ -3901,7 +4006,7 @@ function withRunUrls(target, run) {
3901
4006
  }),
3902
4007
  };
3903
4008
  }
3904
- function hostedTargetForRunStartResponse(target, response) {
4009
+ function remoteTargetForRunStartResponse(target, response) {
3905
4010
  const projectId = response.benchmark?.id ?? response.run.projectId ?? target.projectId;
3906
4011
  if (projectId === target.projectId && !response.benchmark) {
3907
4012
  return target;
@@ -3926,7 +4031,7 @@ function hostedTargetForRunStartResponse(target, response) {
3926
4031
  }
3927
4032
  return next;
3928
4033
  }
3929
- function hostedRunEvaluationCandidateId(run, jobs = []) {
4034
+ function remoteRunEvaluationCandidateId(run, jobs = []) {
3930
4035
  if (run.outputCandidateId) {
3931
4036
  return run.outputCandidateId;
3932
4037
  }
@@ -3986,7 +4091,7 @@ function runtimeBundleForProjectVisibility(runtime, visibility) {
3986
4091
  };
3987
4092
  }
3988
4093
  function localProjectStateSource(source) {
3989
- const request = hostedProjectSourceRequest(source);
4094
+ const request = remoteProjectSourceRequest(source);
3990
4095
  const stateSource = {
3991
4096
  source: request.source,
3992
4097
  files: source.sourceFiles.map((file) => ({ ...file })),
@@ -4014,7 +4119,7 @@ function toSurfaceSnapshotFile(file) {
4014
4119
  executable: file.executable === true,
4015
4120
  };
4016
4121
  }
4017
- function hostedProjectSummaryFromState(state) {
4122
+ function remoteProjectSummaryFromState(state) {
4018
4123
  return {
4019
4124
  id: state.project.id,
4020
4125
  ownerUsername: state.project.ownerUsername,
@@ -4025,12 +4130,12 @@ function hostedProjectSummaryFromState(state) {
4025
4130
  function sourceFileCount(source) {
4026
4131
  return source.sourceFiles.length;
4027
4132
  }
4028
- function hostedProjectSourceRequest(source) {
4029
- const { network, resources } = hostedEnvironmentOptions(source);
4133
+ function remoteProjectSourceRequest(source) {
4134
+ const { network, resources } = remoteEnvironmentOptions(source);
4030
4135
  return {
4031
4136
  source: source.specSource,
4032
4137
  candidateFiles: source.candidateFiles,
4033
- engineResolveFiles: hostedEngineResolveFiles(source),
4138
+ engineResolveFiles: remoteEngineResolveFiles(source),
4034
4139
  engineResolveBinding: engineResolveBindingForSpec(source.spec),
4035
4140
  adapterFiles: source.adapterFiles,
4036
4141
  dockerfile: source.dockerfile,
@@ -4043,7 +4148,7 @@ function hostedProjectSourceRequest(source) {
4043
4148
  function isRemoteProjectId(value) {
4044
4149
  return /^wb_[a-f0-9]{12}$/u.test(value);
4045
4150
  }
4046
- function hostedEnvironmentOptions(source) {
4151
+ function remoteEnvironmentOptions(source) {
4047
4152
  return {
4048
4153
  network: source.spec.environment.network?.egress === "open"
4049
4154
  ? "on"
@@ -4051,7 +4156,7 @@ function hostedEnvironmentOptions(source) {
4051
4156
  resources: runtimeResources(source.spec.environment),
4052
4157
  };
4053
4158
  }
4054
- async function watchHostedRun(args) {
4159
+ async function watchRemoteRun(args) {
4055
4160
  const deadline = args.timeoutMs === undefined ? undefined : Date.now() + args.timeoutMs;
4056
4161
  let lastRun = null;
4057
4162
  while (true) {
@@ -4079,7 +4184,7 @@ async function watchHostedRun(args) {
4079
4184
  await sleep(args.intervalMs);
4080
4185
  }
4081
4186
  }
4082
- function formatHostedRunResult(run) {
4187
+ function formatRemoteRunResult(run) {
4083
4188
  const candidateId = run.outputCandidateId ?? run.candidateId;
4084
4189
  const activeDetail = run.activeCandidateId && candidateId && run.activeCandidateId !== candidateId
4085
4190
  ? `; active ${run.activeCandidateId}`
@@ -4097,7 +4202,7 @@ function formatRetryCommandResult(result) {
4097
4202
  const runId = run?.id ?? result.runId ?? "unknown";
4098
4203
  const scope = `${result.retried.kind} ${result.retried.id}`;
4099
4204
  const verb = run
4100
- ? run.status === "finished" ? "finished as hosted run" : "started as hosted run"
4205
+ ? run.status === "finished" ? "finished as remote run" : "started as remote run"
4101
4206
  : "finished as local run";
4102
4207
  return [
4103
4208
  `Retry of ${scope} ${verb} ${runId}.`,
@@ -4113,7 +4218,7 @@ function formatRetryCommandResult(result) {
4113
4218
  : result.urls?.benchmark ? [`Open benchmark: ${result.urls.benchmark}`] : []),
4114
4219
  ].join("\n");
4115
4220
  }
4116
- function formatHostedRunStarted(run, fallbackWorkflow) {
4221
+ function formatRemoteRunStarted(run, fallbackWorkflow) {
4117
4222
  const candidateId = run.outputCandidateId ?? run.candidateId;
4118
4223
  return [
4119
4224
  `Started ${run.workflow ?? fallbackWorkflow} run ${run.id}; ${candidateId ? `candidate ${candidateId}` : `${run.jobCount ?? 0} jobs queued`}.`,
@@ -4124,10 +4229,7 @@ function formatHostedRunStarted(run, fallbackWorkflow) {
4124
4229
  ].join("\n");
4125
4230
  }
4126
4231
  function readRunJobPurpose(job) {
4127
- const input = readRecord(job.input);
4128
- const execution = readRecord(input?.execution);
4129
- const purpose = execution?.purpose;
4130
- return typeof purpose === "string" && purpose ? purpose : null;
4232
+ return job.purpose && job.purpose.trim() ? job.purpose : null;
4131
4233
  }
4132
4234
  function readRecord(value) {
4133
4235
  return value && typeof value === "object" && !Array.isArray(value)
@@ -4146,24 +4248,24 @@ function integerValue(value) {
4146
4248
  function readFiniteNumber(value) {
4147
4249
  return typeof value === "number" && Number.isFinite(value) ? value : null;
4148
4250
  }
4149
/**
 * Attaches a failure summary to a run that failed without an error message.
 *
 * The run is returned untouched when `remoteRunSucceeded(run)` is truthy, when
 * it already has an `error`, or when its `failedJobCount` (treated as 0 if
 * nullish) is not above zero. Otherwise the summary from
 * `readRemoteRunFailureSummary` is looked up.
 *
 * @param {object} target - Remote target forwarded to `readRemoteRunFailureSummary`.
 * @param {object} run - Run record; `id`, `error` and `failedJobCount` are read.
 * @returns {Promise<object>} A shallow copy of `run` with `error` set to the summary
 *   when one was found; otherwise the original `run` object.
 */
async function withRemoteRunFailureSummary(target, run) {
  if (remoteRunSucceeded(run)) {
    return run;
  }
  if (run.error) {
    return run;
  }
  if ((run.failedJobCount ?? 0) <= 0) {
    return run;
  }
  const summary = await readRemoteRunFailureSummary(target, run.id);
  if (!summary) {
    return run;
  }
  return { ...run, error: summary };
}
4156
/**
 * Builds a one-line description of the first failed job of a run.
 *
 * The run detail is loaded through `readRemoteRunDetail`, and the first job
 * whose `status` is `"failed"` and whose `error` is truthy is reported.
 *
 * @param {object} target - Remote target forwarded to `readRemoteRunDetail`.
 * @param {string} runId - Identifier of the run to inspect.
 * @returns {Promise<string | null>} `First failed job <id>: <error>`, or `null`
 *   when no such job exists or when anything inside the lookup throws.
 */
async function readRemoteRunFailureSummary(target, runId) {
  try {
    const { jobs } = await readRemoteRunDetail(target, runId);
    const firstFailure = jobs.find(({ status, error }) => status === "failed" && error);
    if (!firstFailure?.error) {
      return null;
    }
    return `First failed job ${firstFailure.id}: ${firstFailure.error}`;
  }
  catch {
    // Any failure while loading or scanning the detail yields "no summary".
    return null;
  }
}
4166
- function hostedRunSucceeded(run) {
4268
+ function remoteRunSucceeded(run) {
4167
4269
  if (run.status !== "finished") {
4168
4270
  return false;
4169
4271
  }
@@ -4179,16 +4281,6 @@ async function readWorkbenchOrigin(dir) {
4179
4281
  throw new UsageError(`Workbench origin is malformed: ${workbenchOriginPath(dir)}`);
4180
4282
  }
4181
4283
  const originRecord = parsed;
4182
- const keys = Object.keys(originRecord).sort();
4183
- const expectedKeys = [
4184
- "baseUrl",
4185
- "linkedAt",
4186
- "projectId",
4187
- "remote",
4188
- "runtimeFingerprint",
4189
- "sourceFingerprint",
4190
- "sourceRevisionId",
4191
- ];
4192
4284
  if (typeof originRecord.projectId !== "string" ||
4193
4285
  typeof originRecord.baseUrl !== "string" ||
4194
4286
  typeof originRecord.remote !== "string" ||
@@ -4202,9 +4294,6 @@ async function readWorkbenchOrigin(dir) {
4202
4294
  originRecord.runtimeFingerprint.length === 0) {
4203
4295
  throw new UsageError(`Workbench origin is malformed: ${workbenchOriginPath(dir)}`);
4204
4296
  }
4205
- if (JSON.stringify(keys) !== JSON.stringify(expectedKeys)) {
4206
- throw new UsageError(`Workbench origin is malformed: ${workbenchOriginPath(dir)}`);
4207
- }
4208
4297
  return {
4209
4298
  baseUrl: normalizeBaseUrl(originRecord.baseUrl),
4210
4299
  remote: normalizeOriginRemote(originRecord.remote),
@@ -4254,7 +4343,7 @@ async function writeWorkbenchOriginFromState(dir, args) {
4254
4343
  const runtimeFingerprint = args.state.base.runtimeFingerprint ??
4255
4344
  workbenchRuntimeBundleFingerprint(args.state.runtime);
4256
4345
  if (!sourceRevisionId || !sourceFingerprint || !runtimeFingerprint) {
4257
- throw new UsageError("Hosted project state is missing required origin metadata.");
4346
+ throw new UsageError("Remote project state is missing required origin metadata.");
4258
4347
  }
4259
4348
  return await writeWorkbenchOrigin(dir, {
4260
4349
  baseUrl: args.baseUrl,
@@ -4290,18 +4379,29 @@ function originRemoteUrlParts(origin) {
4290
4379
  function workbenchOriginPath(dir) {
4291
4380
  return path.join(dir, ".workbench", "origin.json");
4292
4381
  }
4293
/**
 * Resolves the base URL to use when no origin-specific URL is involved.
 *
 * Loads the stored configuration and hands its `baseUrl` to
 * `selectWorkbenchBaseUrl` as the `configBaseUrl` candidate.
 *
 * @returns {Promise<string>} The value returned by `selectWorkbenchBaseUrl`.
 */
async function effectiveBaseUrl() {
  const { baseUrl: configBaseUrl } = await loadConfig();
  return selectWorkbenchBaseUrl({ configBaseUrl });
}
4386
/**
 * Resolves the base URL for a project that has a recorded origin.
 *
 * Loads the stored configuration and passes both the origin's URL and the
 * configured `baseUrl` to `selectWorkbenchBaseUrl`.
 *
 * @param {string | undefined} originBaseUrl - Base URL taken from the project's origin.
 * @returns {Promise<string>} The value returned by `selectWorkbenchBaseUrl`.
 */
async function effectiveOriginBaseUrl(originBaseUrl) {
  const { baseUrl: configBaseUrl } = await loadConfig();
  return selectWorkbenchBaseUrl({ originBaseUrl, configBaseUrl });
}
4393
/**
 * Picks the base URL from the available candidates and normalizes it.
 *
 * The first candidate that is neither `null` nor `undefined` wins, in this
 * order: `input.explicitBaseUrl`, `input.originBaseUrl`, the
 * `WORKBENCH_API_URL` environment variable, `input.configBaseUrl`, and
 * finally `DEFAULT_BASE_URL`.
 *
 * @param {{ explicitBaseUrl?: string, originBaseUrl?: string, configBaseUrl?: string }} [input]
 *   Candidate URLs; every field is optional.
 * @returns {string} The chosen URL after `normalizeBaseUrl`.
 */
function selectWorkbenchBaseUrl(input = {}) {
  const candidates = [
    input.explicitBaseUrl,
    input.originBaseUrl,
    process.env.WORKBENCH_API_URL,
    input.configBaseUrl,
  ];
  const firstDefined = candidates.find((candidate) => candidate !== undefined && candidate !== null);
  return normalizeBaseUrl(firstDefined ?? DEFAULT_BASE_URL);
}
4300
4400
  async function readWorkbenchProfileStatus(config) {
4301
4401
  if (!config.accessToken) {
4302
4402
  return { authenticated: false, profile: null };
4303
4403
  }
4304
- const baseUrl = await effectiveBaseUrl(config.baseUrl);
4404
+ const baseUrl = selectWorkbenchBaseUrl({ configBaseUrl: config.baseUrl });
4305
4405
  try {
4306
4406
  const response = await fetch(`${baseUrl}/api/workbench/profile`, {
4307
4407
  headers: {
@@ -4327,10 +4427,9 @@ async function readWorkbenchProfileStatus(config) {
4327
4427
  }
4328
4428
  async function apiRequest(apiPath, options = {}, baseUrlOverride) {
4329
4429
  const config = await loadConfig();
4330
- const baseUrl = normalizeBaseUrl(baseUrlOverride ??
4331
- process.env.WORKBENCH_API_URL ??
4332
- config.baseUrl ??
4333
- DEFAULT_BASE_URL);
4430
+ const baseUrl = baseUrlOverride !== undefined
4431
+ ? normalizeBaseUrl(baseUrlOverride)
4432
+ : selectWorkbenchBaseUrl({ configBaseUrl: config.baseUrl });
4334
4433
  const method = options.method ?? "GET";
4335
4434
  const canRetry = method === "GET";
4336
4435
  let lastError = null;
@@ -4868,6 +4967,24 @@ function formatMetricValue(value) {
4868
4967
  }
4869
4968
  return value.toFixed(2);
4870
4969
  }
4970
/**
 * Formats a metric that may be missing.
 *
 * @param {unknown} value - Metric value, possibly absent or non-numeric.
 * @returns {string} The result of `formatMetricValue(value)` when `value` is a
 *   finite number; the literal `"n/a"` otherwise.
 */
function formatNullableMetric(value) {
  if (typeof value !== "number" || !Number.isFinite(value)) {
    return "n/a";
  }
  return formatMetricValue(value);
}
4975
/**
 * Renders a failure record as a single tab-separated line.
 *
 * Fields appear in this order: kind, id, status (defaulting to `"failed"` when
 * nullish), then `run=`, `candidate=`, `job=` and `case=` for whichever of those
 * ids are truthy, `sample=` when `sampleIndex` is a number, and finally the
 * error text. Falsy fields are dropped before joining.
 *
 * @param {object} failure - Failure record to render.
 * @returns {string} The remaining fields joined with tab characters.
 */
function formatFailureLine(failure) {
  const fields = [failure.kind, failure.id, failure.status ?? "failed"];
  const labeledIds = [
    ["run", failure.runId],
    ["candidate", failure.candidateId],
    ["job", failure.jobId],
    ["case", failure.caseId],
  ];
  for (const [label, id] of labeledIds) {
    fields.push(id ? `${label}=${id}` : null);
  }
  // A sample index of 0 is meaningful, so test the type rather than truthiness.
  if (typeof failure.sampleIndex === "number") {
    fields.push(`sample=${failure.sampleIndex}`);
  }
  fields.push(failure.error ?? null);
  return fields.filter(Boolean).join("\t");
}
4871
4988
  function resolveDir(parsed, positionalDir) {
4872
4989
  const resolved = path.resolve(asOptionalString(parsed.flags.dir) ?? positionalDir ?? process.cwd());
4873
4990
  return isWorkbenchSourceYamlPath(resolved) ? path.dirname(resolved) : resolved;
@@ -4928,28 +5045,12 @@ async function resolveLocalProjectForExecution(workspace, source) {
4928
5045
  };
4929
5046
  }
4930
5047
  function completedJobOutputFiles(job) {
4931
- const output = asJsonRecord(job.output);
5048
+ const output = jsonRecord(job.output);
4932
5049
  const files = Array.isArray(output.files)
4933
5050
  ? output.files.filter(isSurfaceSnapshotFile)
4934
5051
  : [];
4935
5052
  return normalizeSurfaceFiles(files);
4936
5053
  }
4937
- function asJsonRecord(value) {
4938
- return value && typeof value === "object" && !Array.isArray(value)
4939
- ? value
4940
- : {};
4941
- }
4942
- function isSurfaceSnapshotFile(value) {
4943
- const record = asJsonRecord(value);
4944
- return (typeof record.path === "string" &&
4945
- typeof record.content === "string" &&
4946
- (record.kind === undefined ||
4947
- record.kind === "text" ||
4948
- record.kind === "binary") &&
4949
- (record.encoding === undefined ||
4950
- record.encoding === "utf8" ||
4951
- record.encoding === "base64"));
4952
- }
4953
5054
  function createLocalEvent(type, at, event) {
4954
5055
  return {
4955
5056
  id: `evt_${Math.random().toString(36).slice(2, 10)}_${Date.now().toString(36)}`,