@workbench-ai/workbench 0.0.50 → 0.0.51

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5,7 +5,7 @@ import { createRequire } from "node:module";
5
5
  import os from "node:os";
6
6
  import path from "node:path";
7
7
  import { Writable } from "node:stream";
8
- import { createCandidateFilePreview, createBaselineCandidateJob as createRuntimeBaselineCandidateJob, evaluationScorecardId, evaluationMeanMetrics, executeWorkbenchExecutionJob, engineResolveBindingForSpec, filterCandidateSourceFiles, workbenchExecutionPurpose, workbenchRunExecutionFingerprint, createWorkbenchAdapterAuthBundle, createOptimizerTraceInputFiles, DOCKER_SANDBOX_BACKEND, localWorkbenchAdapterAuthStore, materializeWorkbenchRunResult, normalizeSurfaceFiles, planWorkbenchExecutionJobsForPurpose, runWorkbenchExecutionDag, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, summarizeCandidateFiles, validateWorkbenchRunEnvelope, parseWorkbenchAdapterAuthTarget, } from "@workbench-ai/workbench-core";
8
+ import { createCandidateFilePreview, createBaselineCandidateJob as createRuntimeBaselineCandidateJob, evaluationScorecardId, evaluationMeanMetrics, executeWorkbenchExecutionJob, engineResolveBindingForSpec, filterOptimizerTraceJobsForCaseIds, filterCandidateSourceFiles, formatWorkbenchCaseSelector, formatWorkbenchSelectionPolicy, workbenchCaseSelectorUsesAllCases, workbenchExecutionPurpose, workbenchRunExecutionFingerprint, createWorkbenchAdapterAuthBundle, createOptimizerTraceInputFiles, DOCKER_SANDBOX_BACKEND, localWorkbenchAdapterAuthStore, materializeWorkbenchRunResult, normalizeSurfaceFiles, planWorkbenchExecutionJobsForPurpose, runWorkbenchExecutionDag, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, summarizeCandidateFiles, validateWorkbenchRunEnvelope, parseWorkbenchAdapterAuthTarget, workbenchEngineCaseIdsForImproveEvaluation, workbenchEngineCaseIdsForSelector, workbenchImproveOptimizeSelector, workbenchImproveSelectionPolicy, workbenchProjectSourceFingerprint, workbenchRuntimeBundleFingerprint, } from "@workbench-ai/workbench-core";
9
9
  import { assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, WORKBENCH_ADAPTER_RESULT_FILE, WORKBENCH_ADAPTER_RESULT_PROTOCOL, normalizeWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationCommand, workbenchAdapterOperationResultPath, withDefaultWorkbenchAdapterAuthProfiles as applyDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
10
10
  import { builtinLocalTraceAdapter, builtinLocalTraceAdapters, sortLocalTraceRefs, } from "@workbench-ai/workbench-built-in-adapters/local-traces";
11
11
  import { commandUsage, HOSTED_WATCH_LIFECYCLE_NOTE, LOCAL_DEV_OPEN_LIFECYCLE_NOTE, rootUsage, } from "./command-model.js";
@@ -13,7 +13,7 @@ import { startLocalWorkbenchDevServer } from "./dev-open-server.js";
13
13
  import { createWorkbenchInitScaffold, } from "./init-scaffold.js";
14
14
  import { defaultAdapterManifests, composeRuntimeDockerfileWithAdapters, resolveDefaultWorkbenchAdapter, resolveProjectAdapterSource, resolveWorkbenchAdaptersForProject, WORKBENCH_ADAPTER_MANIFEST_FILE, } from "./adapter-project.js";
15
15
  import { createAdapterCommandEnv } from "./adapter-command-env.js";
16
- import { loadLocalArchive, loadLocalArchiveIndex, materializeCandidateRoot, readLocalCandidate, readLocalCandidateFiles, readLocalJobs, saveLocalArchive, saveLocalJobs, setLocalActive, upsertLocalRun, upsertLocalCandidate, upsertLocalEvaluation, } from "./local-archive.js";
16
+ import { loadLocalArchive, loadLocalArchiveIndex, exportLocalRuntimeBundle, importLocalRuntimeBundle, runtimeBundleStats, materializeCandidateRoot, readLocalCandidate, readLocalCandidateFiles, readLocalJobs, saveLocalArchive, saveLocalJobs, setLocalActive, upsertLocalRun, upsertLocalCandidate, upsertLocalEvaluation, } from "./local-archive.js";
17
17
  import { WorkspaceSnapshotError, } from "./workspace-snapshot.js";
18
18
  import { readLocalProjectSource, WORKBENCH_BENCHMARK_FILE, } from "./project-source.js";
19
19
  import { localBenchmarkFingerprint, localCandidateFingerprint, } from "./benchmark-fingerprint.js";
@@ -74,32 +74,38 @@ export async function runCli(argv, io = {
74
74
  if (argv[0] === "clone") {
75
75
  return await cloneProject(argv.slice(1), io);
76
76
  }
77
- if (argv[0] === "fetch") {
78
- return await fetchProject(argv.slice(1), io);
79
- }
80
77
  if (argv[0] === "pull") {
81
78
  return await pullProject(argv.slice(1), io);
82
79
  }
83
80
  if (argv[0] === "push") {
84
81
  return await pushBenchmark(argv.slice(1), io);
85
82
  }
86
- if (argv[0] === "remote") {
87
- return await runRemoteCommand(argv.slice(1), io);
88
- }
89
83
  if (argv[0] === "eval") {
90
- return await localEvaluateCandidate(argv.slice(1), io, runtimeOptions);
84
+ const hosted = extractHostedFlag(argv.slice(1));
85
+ return hosted.enabled
86
+ ? await startHostedWorkflow("eval", hosted.argv, io)
87
+ : await localEvaluateCandidate(hosted.argv, io, runtimeOptions);
91
88
  }
92
89
  if (argv[0] === "retry") {
93
- return await localRetry(argv.slice(1), io, runtimeOptions);
90
+ const hosted = extractHostedFlag(argv.slice(1));
91
+ return hosted.enabled
92
+ ? await retryHostedWorkflow(hosted.argv, io)
93
+ : await localRetry(hosted.argv, io, runtimeOptions);
94
94
  }
95
95
  if (argv[0] === "improve") {
96
- return await localRun(argv.slice(1), io, runtimeOptions);
96
+ const hosted = extractHostedFlag(argv.slice(1));
97
+ return hosted.enabled
98
+ ? await startHostedWorkflow("improve", hosted.argv, io)
99
+ : await localRun(hosted.argv, io, runtimeOptions);
97
100
  }
98
101
  if (argv[0] === "restore") {
99
102
  return await localRestore(argv.slice(1), io);
100
103
  }
101
104
  if (argv[0] === "open") {
102
- return await localDevOpen(argv.slice(1), io);
105
+ const hosted = extractHostedFlag(argv.slice(1));
106
+ return hosted.enabled
107
+ ? await openWorkbench(hosted.argv, io)
108
+ : await localDevOpen(hosted.argv, io);
103
109
  }
104
110
  if (argv[0] === "auth") {
105
111
  return await runAuthCommand(argv.slice(1), io);
@@ -110,9 +116,6 @@ export async function runCli(argv, io = {
110
116
  if (argv[0] === "traces") {
111
117
  return await runTracesCommand(argv.slice(1), io);
112
118
  }
113
- if (argv[0] === "cloud") {
114
- return await runCloudCommand(argv.slice(1), io);
115
- }
116
119
  const commandPath = argv.slice(0, 2).join(" ");
117
120
  const rest = argv.slice(2);
118
121
  switch (commandPath) {
@@ -148,9 +151,6 @@ export async function runCli(argv, io = {
148
151
  }
149
152
  function commandPathForHelp(argv) {
150
153
  const positionals = argv.filter((arg) => arg !== "--help" && arg !== "-h" && !arg.startsWith("--"));
151
- if (positionals[0] === "cloud") {
152
- return positionals.slice(0, 3).join(" ");
153
- }
154
154
  if (positionals[0] === "adapters" &&
155
155
  ["create", "list", "inspect", "test"].includes(positionals[1] ?? "")) {
156
156
  return positionals.slice(0, 2).join(" ");
@@ -159,7 +159,7 @@ function commandPathForHelp(argv) {
159
159
  ["collect", "list", "show"].includes(positionals[1] ?? "")) {
160
160
  return positionals.slice(0, 2).join(" ");
161
161
  }
162
- if (positionals[0] === "auth" || positionals[0] === "remote") {
162
+ if (positionals[0] === "auth") {
163
163
  return positionals.slice(0, 2).join(" ");
164
164
  }
165
165
  if (positionals[0] === "runs" &&
@@ -172,65 +172,18 @@ function commandPathForHelp(argv) {
172
172
  }
173
173
  return positionals[0] ?? "";
174
174
  }
175
- async function runCloudCommand(argv, io) {
176
- const command = argv[0];
177
- const rest = argv.slice(1);
178
- switch (command) {
179
- case "eval":
180
- return await startHostedWorkflow("eval", rest, io);
181
- case "retry":
182
- return await retryHostedWorkflow(rest, io);
183
- case "improve":
184
- return await startHostedWorkflow("improve", rest, io);
185
- case "open":
186
- return await openWorkbench(rest, io);
187
- case "watch":
188
- return await runWatch(rest, io);
189
- case "logs":
190
- return await runLogs(rest, io);
191
- case "star":
192
- return await starProject(rest, io, true);
193
- case "unstar":
194
- return await starProject(rest, io, false);
195
- default:
196
- break;
197
- }
198
- const commandPath = argv.slice(0, 2).join(" ");
199
- const subRest = argv.slice(2);
200
- switch (commandPath) {
201
- case "benchmarks list":
202
- return await benchmarkList(subRest, io);
203
- case "benchmarks show":
204
- return await benchmarkShow(subRest, io);
205
- case "benchmarks versions":
206
- return await benchmarkVersions(subRest, io);
207
- case "benchmarks starred":
208
- return await benchmarkStarred(subRest, io);
209
- case "benchmarks delete":
210
- return await benchmarkDelete(subRest, io);
211
- case "runs list":
212
- return await runList(subRest, io);
213
- case "runs show":
214
- return await runShow(subRest, io);
215
- case "runs cancel":
216
- return await runCancel(subRest, io);
217
- case "candidates list":
218
- return await candidateList(subRest, io);
219
- case "candidates show":
220
- return await candidateShow(subRest, io);
221
- case "candidates files":
222
- return await candidateFiles(subRest, io);
223
- case "candidates preview":
224
- return await candidatePreview(subRest, io);
225
- case "candidates pull":
226
- return await candidateExport(subRest, io);
227
- case "candidates publish":
228
- return await candidateVisibility(subRest, io, "public");
229
- case "candidates unpublish":
230
- return await candidateVisibility(subRest, io, "private");
231
- default:
232
- throw new UsageError(`Unknown command: cloud ${argv.join(" ")}`);
175
+ function extractHostedFlag(argv) {
176
+ let enabled = false;
177
+ const next = [];
178
+ for (const arg of argv) {
179
+ if (arg === "--hosted") {
180
+ enabled = true;
181
+ }
182
+ else {
183
+ next.push(arg);
184
+ }
233
185
  }
186
+ return { enabled, argv: next };
234
187
  }
235
188
  async function localDevOpen(argv, io) {
236
189
  const parsed = parseArgs(argv);
@@ -708,12 +661,28 @@ async function localRun(argv, io, runtimeOptions) {
708
661
  if (caseIds.length === 0) {
709
662
  throw new UsageError("Engine resolver must emit at least one case.");
710
663
  }
664
+ const optimizeSelector = workbenchImproveOptimizeSelector(spec);
665
+ const selectionPolicy = workbenchImproveSelectionPolicy(spec);
666
+ const optimizeCaseIds = workbenchEngineCaseIdsForSelector(engineCases, optimizeSelector);
667
+ if (optimizeCaseIds.length === 0) {
668
+ throw new UsageError(`Improve optimizeOn selector matched no cases: ${formatWorkbenchCaseSelector(optimizeSelector)}.`);
669
+ }
670
+ const selectionCaseIds = workbenchEngineCaseIdsForSelector(engineCases, selectionPolicy.selector);
671
+ if (selectionCaseIds.length === 0) {
672
+ throw new UsageError(`Improve selectBy selector matched no cases: ${formatWorkbenchCaseSelector(selectionPolicy.selector)}.`);
673
+ }
674
+ const selectionScoreCaseIds = workbenchCaseSelectorUsesAllCases(selectionPolicy.selector)
675
+ ? undefined
676
+ : selectionCaseIds;
677
+ const evaluationCaseIds = workbenchEngineCaseIdsForImproveEvaluation({ spec, engineCases });
711
678
  requireValidRunEnvelope({
712
679
  workflow: "improve",
713
680
  budget,
714
681
  samples,
715
- caseCount: caseIds.length,
682
+ caseCount: evaluationCaseIds.length,
716
683
  });
684
+ const optimizeOnLabel = formatWorkbenchCaseSelector(optimizeSelector);
685
+ const selectByLabel = formatWorkbenchSelectionPolicy(selectionPolicy);
717
686
  const environmentRefs = await ensureLocalDockerfileEnvironments(workspace, spec, engineCases);
718
687
  const benchmarkFingerprint = await readLocalBenchmarkFingerprint(workspace);
719
688
  const executionFingerprint = localRunExecutionFingerprint(projectSource);
@@ -771,7 +740,7 @@ async function localRun(argv, io, runtimeOptions) {
771
740
  const events = [
772
741
  createLocalEvent("run_started", startedAt, {
773
742
  runId,
774
- detail: { budget, samples, strategy: "greedy" },
743
+ detail: { budget, samples, strategy: "greedy", optimizeOn: optimizeOnLabel, selectBy: selectByLabel },
775
744
  }),
776
745
  ];
777
746
  const runningRun = {
@@ -786,6 +755,8 @@ async function localRun(argv, io, runtimeOptions) {
786
755
  improver: formatSpecImprover(spec),
787
756
  engineRun: spec.engineRun.use,
788
757
  strategy: "greedy",
758
+ optimizeOn: optimizeOnLabel,
759
+ selectBy: selectByLabel,
789
760
  budget,
790
761
  repairBudget: 0,
791
762
  attemptsRequested: budget,
@@ -815,7 +786,7 @@ async function localRun(argv, io, runtimeOptions) {
815
786
  throw new UsageError("Candidate snapshot must include at least one file.");
816
787
  }
817
788
  const candidateRevisionTraceFiles = createOptimizerTraceInputFiles({
818
- jobs: [...baselineTraceJobs, ...runTraceJobs],
789
+ jobs: filterOptimizerTraceJobsForCaseIds([...baselineTraceJobs, ...runTraceJobs], optimizeCaseIds),
819
790
  });
820
791
  const candidateId = `candidate_${runId.replace(/^run_/u, "")}_${String(attemptIndex + 1).padStart(3, "0")}`;
821
792
  const plannedCandidateRevision = planWorkbenchExecutionJobsForPurpose({
@@ -825,7 +796,7 @@ async function localRun(argv, io, runtimeOptions) {
825
796
  candidateId,
826
797
  attemptIndex,
827
798
  samples,
828
- caseIds,
799
+ caseIds: optimizeCaseIds,
829
800
  engineCases,
830
801
  spec,
831
802
  workflow: "improve",
@@ -861,7 +832,7 @@ async function localRun(argv, io, runtimeOptions) {
861
832
  attemptIndex,
862
833
  samples,
863
834
  now: new Date().toISOString(),
864
- caseIds,
835
+ caseIds: evaluationCaseIds,
865
836
  engineCases,
866
837
  spec,
867
838
  environmentRefsByCase: environmentRefs.byCase,
@@ -891,6 +862,11 @@ async function localRun(argv, io, runtimeOptions) {
891
862
  jobs: completedJobs,
892
863
  previousCandidate: activeCandidate,
893
864
  existingCandidateCount: snapshot.candidates.length,
865
+ selection: {
866
+ metric: selectionPolicy.metric,
867
+ ...(selectionScoreCaseIds ? { caseIds: selectionScoreCaseIds } : {}),
868
+ label: selectByLabel,
869
+ },
894
870
  });
895
871
  for (const candidate of materialized.candidates) {
896
872
  outputCandidateId = candidate.id;
@@ -944,6 +920,8 @@ async function localRun(argv, io, runtimeOptions) {
944
920
  improver: formatSpecImprover(spec),
945
921
  engineRun: spec.engineRun.use,
946
922
  strategy: "greedy",
923
+ optimizeOn: optimizeOnLabel,
924
+ selectBy: selectByLabel,
947
925
  budget,
948
926
  repairBudget: 0,
949
927
  attemptsRequested: budget,
@@ -2932,13 +2910,21 @@ function adapterAuthRecord(value) {
2932
2910
  }
2933
2911
  async function pushBenchmark(argv, io) {
2934
2912
  const parsed = parseArgs(argv);
2935
- rejectUnknownFlags(parsed, new Set(["dir", "tag", "visibility", "dry-run", "json"]));
2913
+ rejectUnknownFlags(parsed, new Set(["dir", "visibility", "dry-run", "json"]));
2936
2914
  const dir = resolveSourceDir(parsed);
2937
2915
  const source = await readLocalProjectSource(dir);
2938
2916
  const origin = await readWorkbenchOrigin(dir);
2939
2917
  const baseUrl = await effectiveBaseUrl(origin?.baseUrl);
2940
- const visibility = readBenchmarkVisibility(parsed.flags.visibility);
2918
+ const visibility = readOptionalBenchmarkVisibility(parsed.flags.visibility);
2919
+ const createVisibility = visibility ?? "public";
2941
2920
  const dryRun = parsed.flags["dry-run"] === true;
2921
+ const runtime = await exportLocalRuntimeBundle(dir);
2922
+ const state = localProjectState({
2923
+ source,
2924
+ runtime,
2925
+ origin,
2926
+ visibility: createVisibility,
2927
+ });
2942
2928
  if (!origin) {
2943
2929
  if (dryRun) {
2944
2930
  writeOutput({
@@ -2948,35 +2934,36 @@ async function pushBenchmark(argv, io) {
2948
2934
  dir,
2949
2935
  baseUrl,
2950
2936
  benchmarkName: source.spec.name,
2951
- tag: asOptionalString(parsed.flags.tag) ?? null,
2952
- visibility,
2937
+ visibility: createVisibility,
2953
2938
  sourceFileCount: sourceFileCount(source),
2939
+ runtime: runtimeBundleStats(runtime),
2940
+ sourceFingerprint: state.source.fingerprint,
2941
+ runtimeFingerprint: state.base.runtimeFingerprint,
2954
2942
  }, parsed, io, () => `Would push benchmark ${source.spec.name}.`);
2955
2943
  return 0;
2956
2944
  }
2957
- const { project, publishedProject, origin: nextOrigin } = await createHostedBenchmarkFromSource({
2945
+ const { project, origin: nextOrigin, result } = await createHostedBenchmarkFromState({
2958
2946
  baseUrl,
2959
2947
  dir,
2960
- source,
2961
- visibility,
2948
+ state,
2962
2949
  });
2963
2950
  writeOutput({
2964
2951
  ok: true,
2965
2952
  action: "create",
2966
- benchmark: publishedProject,
2967
- tag: asOptionalString(parsed.flags.tag) ?? null,
2968
- visibility,
2953
+ benchmark: project,
2954
+ visibility: project.visibility ?? createVisibility,
2969
2955
  origin: nextOrigin,
2956
+ source: result.source,
2957
+ runtime: result.runtime.stats,
2970
2958
  urls: buildWorkbenchResourceUrls({
2971
2959
  baseUrl,
2972
- projectId: publishedProject.id ?? project.id,
2973
- owner: nextOrigin.owner,
2974
- projectName: nextOrigin.project,
2960
+ projectId: project.id,
2961
+ ...originRemoteUrlParts(nextOrigin),
2975
2962
  }),
2976
2963
  }, parsed, io, (record) => {
2977
2964
  const value = record;
2978
2965
  return [
2979
- `Pushed ${value.origin.owner}/${value.origin.project} (${value.origin.projectId}).`,
2966
+ `Pushed ${value.origin.remote} (${value.origin.projectId}).`,
2980
2967
  `Open benchmark: ${value.urls.benchmark}`,
2981
2968
  ].join("\n");
2982
2969
  });
@@ -2986,57 +2973,6 @@ async function pushBenchmark(argv, io) {
2986
2973
  if (!projectId) {
2987
2974
  throw new UsageError("Missing hosted benchmark. Run workbench push from a source directory.");
2988
2975
  }
2989
- if (!origin.writable) {
2990
- const signedInUsername = dryRun ? null : await readAuthenticatedWorkbenchUsername(baseUrl);
2991
- if (signedInUsername !== origin.owner) {
2992
- const upstream = upstreamFromOrigin(origin);
2993
- if (dryRun) {
2994
- writeOutput({
2995
- ok: true,
2996
- dryRun: true,
2997
- action: "create",
2998
- dir,
2999
- baseUrl,
3000
- benchmarkName: source.spec.name,
3001
- tag: asOptionalString(parsed.flags.tag) ?? null,
3002
- visibility,
3003
- sourceFileCount: sourceFileCount(source),
3004
- upstream: upstream ?? null,
3005
- }, parsed, io, () => `Would create a writable benchmark from read-only origin ${origin.owner}/${origin.project}.`);
3006
- return 0;
3007
- }
3008
- const { project, publishedProject, origin: nextOrigin } = await createHostedBenchmarkFromSource({
3009
- baseUrl,
3010
- dir,
3011
- source,
3012
- visibility,
3013
- upstream,
3014
- });
3015
- writeOutput({
3016
- ok: true,
3017
- action: "create",
3018
- benchmark: publishedProject,
3019
- tag: asOptionalString(parsed.flags.tag) ?? null,
3020
- visibility,
3021
- origin: nextOrigin,
3022
- upstream: upstream ?? null,
3023
- urls: buildWorkbenchResourceUrls({
3024
- baseUrl,
3025
- projectId: publishedProject.id ?? project.id,
3026
- owner: nextOrigin.owner,
3027
- projectName: nextOrigin.project,
3028
- }),
3029
- }, parsed, io, (record) => {
3030
- const value = record;
3031
- return [
3032
- `Pushed ${value.origin.owner}/${value.origin.project} (${value.origin.projectId}).`,
3033
- ...(value.upstream ? [`Upstream: ${value.upstream.owner}/${value.upstream.project}`] : []),
3034
- `Open benchmark: ${value.urls.benchmark}`,
3035
- ].join("\n");
3036
- });
3037
- return 0;
3038
- }
3039
- }
3040
2976
  if (dryRun) {
3041
2977
  writeOutput({
3042
2978
  ok: true,
@@ -3045,92 +2981,82 @@ async function pushBenchmark(argv, io) {
3045
2981
  dir,
3046
2982
  baseUrl,
3047
2983
  benchmarkId: projectId,
3048
- tag: asOptionalString(parsed.flags.tag) ?? null,
3049
- visibility,
2984
+ remote: origin.remote,
2985
+ benchmarkName: source.spec.name,
2986
+ visibility: visibility ?? "unchanged",
3050
2987
  sourceFileCount: sourceFileCount(source),
3051
- }, parsed, io, () => `Would push ${sourceFileCount(source)} source file(s) to ${projectId}.`);
2988
+ runtime: runtimeBundleStats(runtime),
2989
+ sourceFingerprint: state.source.fingerprint,
2990
+ runtimeFingerprint: state.base.runtimeFingerprint,
2991
+ }, parsed, io, () => `Would push ${sourceFileCount(source)} source file(s) and runtime history to ${origin.remote}.`);
3052
2992
  return 0;
3053
2993
  }
3054
- const response = await apiRequest(projectApiPath(projectId, "/source"), {
2994
+ const response = await apiRequest(projectApiPath(projectId, "/state"), {
3055
2995
  method: "PUT",
3056
- body: hostedProjectSourceRequest(source),
2996
+ body: state,
3057
2997
  }, baseUrl);
3058
- const publishedProject = visibility === "public"
3059
- ? (await apiRequest(projectApiPath(response.benchmark.id, "/publish"), { method: "PUT" }, baseUrl)).benchmark
3060
- : response.benchmark;
3061
- const nextOrigin = await writeWorkbenchOrigin(dir, {
2998
+ const responseProject = hostedProjectSummaryFromState(response.state);
2999
+ const publishedProject = await applyRequestedProjectVisibility({
3062
3000
  baseUrl,
3063
- owner: publishedProject.ownerUsername ?? response.benchmark.ownerUsername ?? origin.owner,
3064
- project: publishedProject.name ?? response.benchmark.name ?? origin.project ?? source.spec.name,
3065
- projectId: publishedProject.id ?? response.benchmark.id,
3066
- writable: true,
3067
- sourceRevisionId: publishedProject.currentSpecVersionId ?? response.benchmark.currentSpecVersionId,
3068
- sourceFingerprint: response.sourceFingerprint ?? publishedProject.sourceFingerprint ?? response.benchmark.sourceFingerprint,
3069
- upstream: origin.upstream,
3001
+ projectId: responseProject.id,
3002
+ responseProject,
3003
+ visibility,
3004
+ });
3005
+ const nextOrigin = await writeWorkbenchOriginFromState(dir, {
3006
+ baseUrl,
3007
+ state: response.state,
3008
+ project: publishedProject,
3009
+ sourceFingerprint: state.source.fingerprint,
3070
3010
  });
3071
3011
  writeOutput({
3072
3012
  ok: true,
3073
3013
  action: "update",
3074
3014
  changed: response.changed === true,
3075
3015
  benchmark: publishedProject,
3076
- tag: asOptionalString(parsed.flags.tag) ?? null,
3077
- visibility,
3016
+ visibility: visibility ?? "unchanged",
3078
3017
  origin: nextOrigin,
3018
+ source: response.source,
3019
+ runtime: response.runtime.stats,
3079
3020
  urls: buildWorkbenchResourceUrls({
3080
3021
  baseUrl,
3081
- projectId: publishedProject.id ?? response.benchmark.id,
3082
- owner: nextOrigin.owner,
3083
- projectName: nextOrigin.project,
3022
+ projectId: publishedProject.id ?? responseProject.id,
3023
+ ...originRemoteUrlParts(nextOrigin),
3084
3024
  }),
3085
3025
  }, parsed, io, (record) => {
3086
3026
  const value = record;
3087
3027
  return [
3088
- `${value.changed ? "Pushed" : "Already up to date"} ${value.origin.owner}/${value.origin.project} (${value.origin.projectId}).`,
3028
+ `${value.changed ? "Pushed" : "Already up to date"} ${value.origin.remote} (${value.origin.projectId}).`,
3089
3029
  `Open benchmark: ${value.urls.benchmark}`,
3090
3030
  ].join("\n");
3091
3031
  });
3092
3032
  return 0;
3093
3033
  }
3094
- async function createHostedBenchmarkFromSource(args) {
3095
- const response = await apiRequest("/api/workbench/benchmarks", {
3034
+ async function createHostedBenchmarkFromState(args) {
3035
+ const result = await apiRequest("/api/workbench/benchmarks/state", {
3096
3036
  method: "POST",
3097
- body: hostedProjectSourceRequest(args.source),
3037
+ body: args.state,
3098
3038
  }, args.baseUrl);
3099
- const project = response.benchmark;
3100
- const publishedProject = args.visibility === "public"
3101
- ? (await apiRequest(projectApiPath(project.id, "/publish"), { method: "PUT" }, args.baseUrl)).benchmark
3102
- : project;
3103
- const origin = await writeWorkbenchOrigin(args.dir, {
3039
+ const project = hostedProjectSummaryFromState(result.state);
3040
+ const origin = await writeWorkbenchOriginFromState(args.dir, {
3104
3041
  baseUrl: args.baseUrl,
3105
- owner: publishedProject.ownerUsername ?? project.ownerUsername ?? "",
3106
- project: publishedProject.name ?? project.name ?? args.source.spec.name,
3107
- projectId: publishedProject.id ?? project.id,
3108
- writable: true,
3109
- sourceRevisionId: publishedProject.currentSpecVersionId ?? project.currentSpecVersionId,
3110
- sourceFingerprint: publishedProject.sourceFingerprint ?? project.sourceFingerprint,
3111
- ...(args.upstream ? { upstream: args.upstream } : {}),
3042
+ state: result.state,
3043
+ project,
3044
+ sourceFingerprint: args.state.source.fingerprint,
3112
3045
  });
3113
- return { project, publishedProject, origin };
3114
- }
3115
- async function readAuthenticatedWorkbenchUsername(baseUrl) {
3116
- const config = await loadConfig();
3117
- const status = await readWorkbenchProfileStatus({ ...config, baseUrl });
3118
- return status.authenticated ? status.profile?.username ?? null : null;
3046
+ return { project, origin, result };
3119
3047
  }
3120
- function upstreamFromOrigin(origin) {
3121
- if (!origin.owner || !origin.project || !origin.projectId || !origin.sourceRevisionId) {
3122
- return undefined;
3048
+ async function applyRequestedProjectVisibility(args) {
3049
+ if (args.visibility === "public") {
3050
+ return (await apiRequest(projectApiPath(args.projectId, "/publish"), { method: "PUT" }, args.baseUrl)).benchmark;
3123
3051
  }
3124
- return {
3125
- owner: origin.owner,
3126
- project: origin.project,
3127
- projectId: origin.projectId,
3128
- sourceRevisionId: origin.sourceRevisionId,
3129
- };
3052
+ if (args.visibility === "private") {
3053
+ return (await apiRequest(projectApiPath(args.projectId, "/publish"), { method: "DELETE" }, args.baseUrl)).benchmark;
3054
+ }
3055
+ return args.responseProject;
3130
3056
  }
3131
- function readBenchmarkVisibility(value) {
3057
+ function readOptionalBenchmarkVisibility(value) {
3132
3058
  if (value === undefined) {
3133
- return "public";
3059
+ return undefined;
3134
3060
  }
3135
3061
  if (value === "private" || value === "public") {
3136
3062
  return value;
@@ -3143,41 +3069,37 @@ async function cloneProject(argv, io) {
3143
3069
  const ref = readRequiredBenchmarkRef(parsed);
3144
3070
  const outputDir = parsed.positionals[1] ?? ref.project;
3145
3071
  if (parsed.positionals.length > 2) {
3146
- throw new UsageError("workbench clone accepts OWNER/BENCHMARK[@REF] and an optional output directory.");
3072
+ throw new UsageError("workbench clone accepts OWNER/BENCHMARK and an optional output directory.");
3147
3073
  }
3148
3074
  const baseUrl = await effectiveBaseUrl();
3149
- const projectResponse = await apiRequest(publicProjectApiPath(ref), {}, baseUrl);
3150
- const filesResponse = await apiRequest(publicProjectSourceApiPath(ref), {}, baseUrl);
3075
+ const state = await apiRequest(publicProjectStateApiPath(ref), {}, baseUrl);
3151
3076
  if (parsed.flags["dry-run"] === true) {
3152
3077
  writeOutput({
3153
3078
  ok: true,
3154
3079
  dryRun: true,
3155
3080
  ref,
3156
3081
  outputDir,
3157
- fileCount: filesResponse.files.length,
3082
+ fileCount: state.source.files.length,
3083
+ runtime: runtimeBundleStats(state.runtime),
3084
+ sourceFingerprint: state.source.fingerprint ?? state.base.sourceFingerprint ?? null,
3085
+ runtimeFingerprint: state.base.runtimeFingerprint ?? null,
3158
3086
  }, parsed, io, () => `Would clone ${formatBenchmarkRef(ref)} to ${outputDir}.`);
3159
3087
  return 0;
3160
3088
  }
3161
- await syncSourceFiles(outputDir, filesResponse.files);
3162
- const project = projectResponse.benchmark;
3163
- const sourceProject = filesResponse.benchmark;
3164
- const origin = await writeWorkbenchOrigin(outputDir, {
3089
+ const applied = await applyProjectStateToLocal({
3090
+ dir: outputDir,
3165
3091
  baseUrl,
3166
- owner: sourceProject?.ownerUsername ?? project.ownerUsername,
3167
- project: sourceProject?.name ?? project.name,
3168
- projectId: sourceProject?.id ?? project.id,
3169
- writable: false,
3170
- sourceRevisionId: sourceProject?.currentSpecVersionId ?? project.currentSpecVersionId,
3171
- sourceFingerprint: sourceProject?.sourceFingerprint ?? project.sourceFingerprint,
3092
+ state,
3172
3093
  });
3173
3094
  writeOutput({
3174
3095
  ok: true,
3175
- origin,
3096
+ origin: applied.origin,
3176
3097
  outputDir,
3177
- files: filesResponse.files.length,
3098
+ files: applied.files,
3099
+ runtime: applied.runtime,
3178
3100
  }, parsed, io, (record) => {
3179
3101
  const value = record;
3180
- return `Cloned ${value.origin.owner}/${value.origin.project} to ${value.outputDir} (${value.files} file(s)).`;
3102
+ return `Cloned ${value.origin.remote} to ${value.outputDir} (${value.files} file(s)).`;
3181
3103
  });
3182
3104
  return 0;
3183
3105
  }
@@ -3185,167 +3107,60 @@ async function pullProject(argv, io) {
3185
3107
  const parsed = parseArgs(argv);
3186
3108
  rejectUnknownFlags(parsed, new Set(["dir", "dry-run", "json"]));
3187
3109
  if (parsed.positionals.length > 0) {
3188
- throw new UsageError("workbench pull updates the current origin; use workbench clone OWNER/BENCHMARK[@REF] DIR for a new directory.");
3110
+ throw new UsageError("workbench pull updates the current origin; use workbench clone OWNER/BENCHMARK DIR for a new directory.");
3189
3111
  }
3190
3112
  const dir = resolveDir(parsed);
3191
3113
  const origin = await requireWorkbenchOrigin(dir);
3192
- const filesResponse = origin.writable
3193
- ? await apiRequest(projectApiPath(origin.projectId, "/source"), {}, await effectiveBaseUrl(origin.baseUrl))
3194
- : await apiRequest(publicProjectSourceApiPath({ owner: origin.owner, project: origin.project }), {}, await effectiveBaseUrl(origin.baseUrl));
3114
+ const baseUrl = await effectiveBaseUrl(origin.baseUrl);
3115
+ const remoteRef = parseOriginRemote(origin);
3116
+ const state = await apiRequest(publicProjectStateApiPath(remoteRef), {}, baseUrl);
3195
3117
  if (parsed.flags["dry-run"] === true) {
3196
3118
  writeOutput({
3197
3119
  ok: true,
3198
3120
  dryRun: true,
3199
3121
  dir,
3200
- fileCount: filesResponse.files.length,
3201
- }, parsed, io, () => `Would pull ${filesResponse.files.length} source file(s) into ${dir}.`);
3122
+ fileCount: state.source.files.length,
3123
+ runtime: runtimeBundleStats(state.runtime),
3124
+ sourceFingerprint: state.source.fingerprint ?? state.base.sourceFingerprint ?? null,
3125
+ runtimeFingerprint: state.base.runtimeFingerprint ?? null,
3126
+ }, parsed, io, () => `Would pull ${state.source.files.length} source file(s) and runtime history into ${dir}.`);
3202
3127
  return 0;
3203
3128
  }
3204
- await syncSourceFiles(dir, filesResponse.files);
3205
- const sourceProject = filesResponse.benchmark;
3206
- const nextOrigin = await writeWorkbenchOrigin(dir, {
3207
- ...origin,
3208
- ...(sourceProject?.ownerUsername ? { owner: sourceProject.ownerUsername } : {}),
3209
- ...(sourceProject?.name ? { project: sourceProject.name } : {}),
3210
- ...(sourceProject?.id ? { projectId: sourceProject.id } : {}),
3211
- ...(sourceProject?.currentSpecVersionId ? { sourceRevisionId: sourceProject.currentSpecVersionId } : {}),
3212
- ...(sourceProject?.sourceFingerprint ? { sourceFingerprint: sourceProject.sourceFingerprint } : {}),
3213
- });
3214
- writeOutput({
3215
- ok: true,
3216
- origin: nextOrigin,
3129
+ const applied = await applyProjectStateToLocal({
3217
3130
  dir,
3218
- files: filesResponse.files.length,
3219
- }, parsed, io, (record) => {
3220
- const value = record;
3221
- return `Pulled ${value.files} source file(s) into ${value.dir}.`;
3222
- });
3223
- return 0;
3224
- }
3225
- async function fetchProject(argv, io) {
3226
- const parsed = parseArgs(argv);
3227
- rejectUnknownFlags(parsed, new Set(["dir", "json"]));
3228
- if (parsed.positionals.length > 0) {
3229
- throw new UsageError("workbench fetch updates the current remote cache; use workbench clone OWNER/BENCHMARK[@REF] DIR for a new directory.");
3230
- }
3231
- const dir = resolveDir(parsed);
3232
- const origin = await requireWorkbenchOrigin(dir);
3233
- const filesResponse = await readRemoteSourceFiles(origin);
3234
- const fetchRoot = path.join(dir, ".workbench", "fetch");
3235
- await fs.rm(fetchRoot, { force: true, recursive: true });
3236
- await fs.mkdir(fetchRoot, { recursive: true });
3237
- await writeFiles(path.join(fetchRoot, "source"), filesResponse.files);
3238
- const sourceProject = filesResponse.benchmark;
3239
- const nextOrigin = await writeWorkbenchOrigin(dir, {
3240
- ...origin,
3241
- ...(sourceProject?.ownerUsername ? { owner: sourceProject.ownerUsername } : {}),
3242
- ...(sourceProject?.name ? { project: sourceProject.name } : {}),
3243
- ...(sourceProject?.id ? { projectId: sourceProject.id } : {}),
3244
- ...(sourceProject?.currentSpecVersionId ? { sourceRevisionId: sourceProject.currentSpecVersionId } : {}),
3245
- ...(sourceProject?.sourceFingerprint ? { sourceFingerprint: sourceProject.sourceFingerprint } : {}),
3131
+ baseUrl,
3132
+ state,
3133
+ origin,
3134
+ requireCleanSource: true,
3246
3135
  });
3247
- await fs.writeFile(path.join(fetchRoot, "manifest.json"), `${JSON.stringify({
3248
- fetchedAt: new Date().toISOString(),
3249
- origin: nextOrigin,
3250
- files: filesResponse.files.map((file) => file.path),
3251
- }, null, 2)}\n`);
3252
3136
  writeOutput({
3253
3137
  ok: true,
3254
- origin: nextOrigin,
3138
+ origin: applied.origin,
3255
3139
  dir,
3256
- fetchRoot,
3257
- files: filesResponse.files.length,
3140
+ files: applied.files,
3141
+ runtime: applied.runtime,
3258
3142
  }, parsed, io, (record) => {
3259
3143
  const value = record;
3260
- return `Fetched ${value.files} source file(s) into ${value.fetchRoot}.`;
3261
- });
3262
- return 0;
3263
- }
3264
- async function readRemoteSourceFiles(origin) {
3265
- return origin.writable
3266
- ? await apiRequest(projectApiPath(origin.projectId, "/source"), {}, await effectiveBaseUrl(origin.baseUrl))
3267
- : await apiRequest(publicProjectSourceApiPath({ owner: origin.owner, project: origin.project }), {}, await effectiveBaseUrl(origin.baseUrl));
3268
- }
3269
- async function runRemoteCommand(argv, io) {
3270
- const command = argv[0] ?? "show";
3271
- switch (command) {
3272
- case "show":
3273
- return await remoteShow(argv.slice(1), io);
3274
- case "add":
3275
- return await remoteAdd(argv.slice(1), io, "add");
3276
- case "set-url":
3277
- return await remoteAdd(argv.slice(1), io, "set-url");
3278
- case "remove":
3279
- return await remoteRemove(argv.slice(1), io);
3280
- default:
3281
- throw new UsageError(`Unknown command: remote ${argv.join(" ")}`);
3282
- }
3283
- }
3284
- async function remoteShow(argv, io) {
3285
- const parsed = parseArgs(argv);
3286
- rejectUnknownFlags(parsed, new Set(["dir", "json"]));
3287
- const origin = await requireWorkbenchOrigin(resolveDir(parsed));
3288
- writeOutput({ ok: true, remote: "origin", origin }, parsed, io, (record) => {
3289
- const value = record;
3290
- return [
3291
- `origin\t${value.origin.owner}/${value.origin.project}`,
3292
- `url\t${value.origin.baseUrl}`,
3293
- `writable\t${value.origin.writable ? "yes" : "no"}`,
3294
- ...(value.origin.sourceFingerprint ? [`fingerprint\t${value.origin.sourceFingerprint}`] : []),
3295
- ].join("\n");
3296
- });
3297
- return 0;
3298
- }
3299
- async function remoteAdd(argv, io, command) {
3300
- const parsed = parseArgs(argv);
3301
- rejectUnknownFlags(parsed, new Set(["dir", "json"]));
3302
- const [name, refValue] = parsed.positionals;
3303
- if (name !== "origin" || !refValue || parsed.positionals.length !== 2) {
3304
- throw new UsageError(`workbench remote ${command} accepts: origin OWNER/BENCHMARK[@REF].`);
3305
- }
3306
- const ref = parseBenchmarkRef(refValue);
3307
- const baseUrl = await effectiveBaseUrl();
3308
- const project = await resolveRemoteProject(formatBenchmarkRef(ref), baseUrl);
3309
- const origin = await writeWorkbenchOrigin(resolveDir(parsed), {
3310
- baseUrl,
3311
- owner: project.ownerUsername ?? ref.owner,
3312
- project: project.name ?? ref.project,
3313
- projectId: project.id,
3314
- writable: false,
3315
- ...(project.currentSpecVersionId ? { sourceRevisionId: project.currentSpecVersionId } : {}),
3316
- ...(project.sourceFingerprint ? { sourceFingerprint: project.sourceFingerprint } : {}),
3144
+ return `Pulled ${value.files} source file(s) into ${value.dir}.`;
3317
3145
  });
3318
- writeOutput({ ok: true, remote: "origin", origin }, parsed, io, () => `Set origin to ${origin.owner}/${origin.project}.`);
3319
- return 0;
3320
- }
3321
- async function remoteRemove(argv, io) {
3322
- const parsed = parseArgs(argv);
3323
- rejectUnknownFlags(parsed, new Set(["dir", "json"]));
3324
- const [name] = parsed.positionals;
3325
- if (name !== "origin" || parsed.positionals.length !== 1) {
3326
- throw new UsageError("workbench remote remove accepts: origin.");
3327
- }
3328
- const originPath = workbenchOriginPath(resolveDir(parsed));
3329
- const existed = await fileIsReadable(originPath);
3330
- await fs.rm(originPath, { force: true });
3331
- writeOutput({ ok: true, remote: "origin", removed: existed, path: originPath }, parsed, io, () => existed
3332
- ? `Removed origin (${originPath}).`
3333
- : `No origin configured (${originPath}).`);
3334
3146
  return 0;
3335
3147
  }
3336
- async function starProject(argv, io, starred) {
3337
- const parsed = parseArgs(argv);
3338
- rejectUnknownFlags(parsed, new Set(["json"]));
3339
- const ref = readRequiredBenchmarkRef(parsed);
3340
- if (parsed.positionals.length > 1) {
3341
- throw new UsageError(`${starred ? "workbench cloud star" : "workbench cloud unstar"} accepts exactly one OWNER/BENCHMARK ref.`);
3148
+ async function applyProjectStateToLocal(args) {
3149
+ if (args.requireCleanSource === true && args.origin) {
3150
+ await assertLocalSourceMatchesOrigin(args.dir, args.origin);
3342
3151
  }
3343
- const response = await apiRequest(`${publicProjectApiPath(ref)}/star`, { method: starred ? "PUT" : "DELETE" }, await effectiveBaseUrl());
3344
- writeOutput({ ok: true, benchmark: response.benchmark }, parsed, io, (record) => {
3345
- const value = record;
3346
- return `${starred ? "Starred" : "Unstarred"} ${formatBenchmarkRef(ref)}; ${value.benchmark.starCount} star(s).`;
3152
+ await syncSourceFiles(args.dir, args.state.source.files);
3153
+ const runtimeImport = await importLocalRuntimeBundle(args.dir, args.state.runtime);
3154
+ const origin = await writeWorkbenchOriginFromState(args.dir, {
3155
+ baseUrl: args.baseUrl,
3156
+ state: args.state,
3157
+ sourceFingerprint: await localSourceFingerprint(args.dir),
3347
3158
  });
3348
- return 0;
3159
+ return {
3160
+ origin,
3161
+ files: args.state.source.files.length,
3162
+ runtime: runtimeImport.stats,
3163
+ };
3349
3164
  }
3350
3165
  async function retryHostedWorkflow(argv, io) {
3351
3166
  const parsed = parseArgs(argv);
@@ -3357,7 +3172,7 @@ async function retryHostedWorkflow(argv, io) {
3357
3172
  "timeout-ms",
3358
3173
  "json",
3359
3174
  ]));
3360
- rejectUnexpectedPositionals(parsed, "workbench cloud retry", 1);
3175
+ rejectUnexpectedPositionals(parsed, "workbench retry --hosted", 1);
3361
3176
  const targetId = parsed.positionals[0];
3362
3177
  if (!targetId) {
3363
3178
  throw new UsageError("Missing required TARGET_ID.");
@@ -3391,6 +3206,7 @@ async function retryHostedWorkflow(argv, io) {
3391
3206
  timeoutMs: watchTimeoutMs,
3392
3207
  });
3393
3208
  const outputRun = withRunUrls(target, await withHostedRunFailureSummary(target, watched));
3209
+ await tryImportTerminalHostedProjectState({ target, io });
3394
3210
  const result = {
3395
3211
  ok: hostedRunSucceeded(watched),
3396
3212
  retried: {
@@ -3435,7 +3251,7 @@ async function resolveHostedRetryTarget(target, targetId) {
3435
3251
  throw new UsageError(`Run ${run.id} is ${run.status}; wait for it to finish before retrying.`);
3436
3252
  }
3437
3253
  if (!hostedRunRecordFailed(run)) {
3438
- throw new UsageError(`Run ${run.id} did not fail; use workbench cloud ${run.workflow ?? "eval"} to intentionally run it again.`);
3254
+ throw new UsageError(`Run ${run.id} did not fail; use workbench ${run.workflow ?? "eval"} --hosted to intentionally run it again.`);
3439
3255
  }
3440
3256
  if (run.workflow === "eval") {
3441
3257
  const candidateId = hostedRunEvaluationCandidateId(run, detail.jobs);
@@ -3485,7 +3301,7 @@ async function resolveHostedEvaluationRetryTarget(target, evaluationId) {
3485
3301
  }
3486
3302
  const run = snapshot.runs.find((entry) => entry.id === evaluation.runId) ?? null;
3487
3303
  if (!evaluationScorecardFailed(evaluation, run)) {
3488
- throw new UsageError(`Evaluation ${evaluation.id} did not fail; use workbench cloud eval to intentionally run it again.`);
3304
+ throw new UsageError(`Evaluation ${evaluation.id} did not fail; use workbench eval --hosted to intentionally run it again.`);
3489
3305
  }
3490
3306
  if (!run) {
3491
3307
  throw new UsageError(`Evaluation ${evaluation.id} is missing its run record.`);
@@ -3527,6 +3343,25 @@ function uniqueCaseSamplePairs(pairs) {
3527
3343
  async function readHostedRunDetail(target, runId) {
3528
3344
  return await apiRequest(projectApiPath(target.projectId, `/runs/${encodeURIComponent(runId)}`), {}, target.baseUrl);
3529
3345
  }
3346
+ async function tryImportTerminalHostedProjectState(args) {
3347
+ const origin = args.target.origin;
3348
+ if (!origin || origin.projectId !== args.target.projectId) {
3349
+ return;
3350
+ }
3351
+ try {
3352
+ const state = await apiRequest(projectApiPath(args.target.projectId, "/state"), {}, args.target.baseUrl);
3353
+ await applyProjectStateToLocal({
3354
+ dir: args.target.dir,
3355
+ baseUrl: args.target.baseUrl,
3356
+ state,
3357
+ origin,
3358
+ requireCleanSource: true,
3359
+ });
3360
+ }
3361
+ catch (error) {
3362
+ args.io.stderr.write(`Hosted run finished, but local project state was not updated: ${errorMessage(error)}\n`);
3363
+ }
3364
+ }
3530
3365
  function hostedRetrySourceYaml(run, runId) {
3531
3366
  const sourceYaml = stringValue(readRecord(run.input)?.sourceYaml);
3532
3367
  if (!sourceYaml) {
@@ -3557,12 +3392,9 @@ async function startHostedWorkflow(workflow, argv, io) {
3557
3392
  "json",
3558
3393
  ]));
3559
3394
  if (parsed.positionals.length > 1) {
3560
- throw new UsageError(`workbench cloud ${workflow} accepts at most one source file or directory argument.`);
3561
- }
3562
- const sourceArg = parsed.positionals[0] ?? asOptionalString(parsed.flags.dir) ?? process.cwd();
3563
- if (parsed.positionals.length > 0 && parsed.flags.dir !== undefined) {
3564
- throw new UsageError("Use either --dir or SOURCE, not both.");
3395
+ throw new UsageError(`workbench ${workflow} --hosted accepts at most one source file or directory argument.`);
3565
3396
  }
3397
+ const sourceArg = resolveSourceDir(parsed);
3566
3398
  const samples = parsePositiveInt(parsed.flags.samples, 1, "samples");
3567
3399
  const budget = workflow === "improve"
3568
3400
  ? parsePositiveInt(parsed.flags.budget, 1, "budget")
@@ -3575,7 +3407,7 @@ async function startHostedWorkflow(workflow, argv, io) {
3575
3407
  const defaultProjectSource = await readLocalProjectSource(path.resolve(sourceArg));
3576
3408
  const selectedRunIds = workflow === "eval"
3577
3409
  ? resolveCandidateRunSelection(defaultProjectSource, runsFlag)
3578
- : [singleRequestedRunId(runsFlag, `workbench cloud ${workflow}`) ?? defaultProjectSource.candidateRunId];
3410
+ : [singleRequestedRunId(runsFlag, `workbench ${workflow} --hosted`) ?? defaultProjectSource.candidateRunId];
3579
3411
  if (workflow === "eval" && selectedRunIds.length > 1) {
3580
3412
  let failed = 0;
3581
3413
  const results = [];
@@ -3657,6 +3489,7 @@ async function startHostedWorkflow(workflow, argv, io) {
3657
3489
  adapterFiles: projectSource.adapterFiles,
3658
3490
  intervalMs: watchIntervalMs ?? 1000,
3659
3491
  timeoutMs: watchTimeoutMs,
3492
+ io,
3660
3493
  });
3661
3494
  }
3662
3495
  const response = await apiRequest(projectApiPath(target.projectId, "/runs"), {
@@ -3668,6 +3501,7 @@ async function startHostedWorkflow(workflow, argv, io) {
3668
3501
  ? { ...startedRun, reused: true }
3669
3502
  : startedRun;
3670
3503
  if (response.reused === true && response.run.status === "finished") {
3504
+ await tryImportTerminalHostedProjectState({ target, io });
3671
3505
  writeOutput({
3672
3506
  ok: hostedRunSucceeded(response.run),
3673
3507
  reused: true,
@@ -3689,6 +3523,7 @@ async function startHostedWorkflow(workflow, argv, io) {
3689
3523
  timeoutMs: watchTimeoutMs,
3690
3524
  });
3691
3525
  const outputRun = await withHostedRunFailureSummary(target, watched);
3526
+ await tryImportTerminalHostedProjectState({ target, io });
3692
3527
  writeOutput(withRunUrls(target, outputRun), parsed, io, formatHostedRunResult);
3693
3528
  return hostedRunSucceeded(watched) ? 0 : 1;
3694
3529
  }
@@ -3734,6 +3569,7 @@ async function ensureHostedImproveBaseCandidate(args) {
3734
3569
  if (!watched.candidateId) {
3735
3570
  throw new UsageError(`Parent candidate eval ${watched.id} did not produce a candidate.`);
3736
3571
  }
3572
+ await tryImportTerminalHostedProjectState({ target: args.target, io: args.io });
3737
3573
  return watched.candidateId;
3738
3574
  }
3739
3575
  function hostedWorkflowArgsForRun(args) {
@@ -3776,310 +3612,11 @@ async function readEvaluatedActiveHostedCandidate(target) {
3776
3612
  function hostedCandidateIsEvaluated(candidate) {
3777
3613
  return candidate.status === "evaluated" || candidate.eval != null;
3778
3614
  }
3779
- async function benchmarkList(argv, io) {
3780
- const parsed = parseArgs(argv);
3781
- rejectUnknownFlags(parsed, new Set(["json"]));
3782
- rejectUnexpectedPositionals(parsed, "workbench cloud benchmarks list", 0);
3783
- const response = await apiRequest("/api/workbench/public/benchmarks");
3784
- writeOutput(response.benchmarks, parsed, io, (projects) => {
3785
- if (projects.length === 0) {
3786
- return "No hosted Workbench benchmarks.";
3787
- }
3788
- return projects
3789
- .map((project) => `${project.id}\t${project.name}\t${project.runCount} runs\t${project.candidateCount} candidates`)
3790
- .join("\n");
3791
- });
3792
- return 0;
3793
- }
3794
- async function benchmarkShow(argv, io) {
3795
- const parsed = parseArgs(argv);
3796
- rejectUnknownFlags(parsed, new Set(["dir", "json"]));
3797
- rejectUnexpectedPositionals(parsed, "workbench cloud benchmarks show", 1);
3798
- const dir = resolveDir(parsed);
3799
- const origin = await readWorkbenchOrigin(dir);
3800
- const projectRef = parsed.positionals[0] ??
3801
- origin?.projectId;
3802
- if (!projectRef) {
3803
- throw new UsageError("Missing hosted benchmark. Pass OWNER/BENCHMARK, run workbench push, or run workbench clone.");
3804
- }
3805
- const response = await apiRequest(benchmarkApiPath(projectRef), {}, await effectiveBaseUrl(origin?.baseUrl));
3806
- writeOutput(response.benchmark, parsed, io, (project) => {
3807
- const record = project;
3808
- return `${record.name} (${record.id})\n${record.runs.length} runs\n${record.candidates.length} candidates`;
3809
- });
3810
- return 0;
3811
- }
3812
- async function benchmarkDelete(argv, io) {
3813
- const parsed = parseArgs(argv);
3814
- rejectUnknownFlags(parsed, new Set(["dir", "dry-run", "json"]));
3815
- if (parsed.positionals.length > 1) {
3816
- throw new UsageError(`Unexpected argument for workbench benchmarks delete: ${parsed.positionals.slice(1).join(" ")}`);
3817
- }
3818
- const dir = resolveDir(parsed);
3819
- const origin = await readWorkbenchOrigin(dir);
3820
- const projectRef = parsed.positionals[0] ??
3821
- origin?.projectId;
3822
- if (!projectRef) {
3823
- throw new UsageError("Missing hosted benchmark. Pass OWNER/BENCHMARK, run workbench push, or run workbench clone.");
3824
- }
3825
- const originPath = workbenchOriginPath(dir);
3826
- const baseUrl = await effectiveBaseUrl(origin?.baseUrl);
3827
- if (parsed.flags["dry-run"] === true) {
3828
- const originProjectDeleted = originMatchesProjectRef(origin, projectRef);
3829
- writeOutput({
3830
- ok: true,
3831
- dryRun: true,
3832
- projectRef,
3833
- ...(isRemoteProjectId(projectRef) ? { projectId: projectRef } : {}),
3834
- ...(originProjectDeleted && origin?.project ? { projectName: origin.project } : {}),
3835
- baseUrl,
3836
- ...(originProjectDeleted ? { originPath } : {}),
3837
- }, parsed, io, () => originProjectDeleted
3838
- ? `Would delete hosted benchmark ${projectRef} and remove local origin ${originPath}.`
3839
- : `Would delete hosted benchmark ${projectRef}.`);
3840
- return 0;
3841
- }
3842
- const project = await resolveRemoteProject(projectRef, baseUrl);
3843
- const projectId = project.id;
3844
- const projectName = project.name;
3845
- const originProjectDeleted = origin ? origin.projectId === projectId : false;
3846
- await apiRequest(projectApiPath(projectId), { method: "DELETE" }, baseUrl);
3847
- if (originProjectDeleted) {
3848
- await fs.rm(originPath, { force: true });
3849
- }
3850
- writeOutput({
3851
- ok: true,
3852
- deleted: true,
3853
- projectId,
3854
- ...(projectName ? { projectName } : {}),
3855
- originRemoved: originProjectDeleted,
3856
- ...(originProjectDeleted ? { originPath } : {}),
3857
- }, parsed, io, () => originProjectDeleted
3858
- ? `Deleted benchmark ${formatProjectRef(project)} and removed local origin ${originPath}.`
3859
- : `Deleted benchmark ${formatProjectRef(project)}.`);
3860
- return 0;
3861
- }
3862
- async function benchmarkVersions(argv, io) {
3863
- const parsed = parseArgs(argv);
3864
- rejectUnknownFlags(parsed, new Set(["dir", "json"]));
3865
- rejectUnexpectedPositionals(parsed, "workbench cloud benchmarks versions", 1);
3866
- const projectRef = parsed.positionals[0];
3867
- const origin = await readWorkbenchOrigin(resolveDir(parsed));
3868
- if (!projectRef && !origin) {
3869
- throw new UsageError("Missing benchmark ref. Pass OWNER/BENCHMARK or run from a benchmark clone.");
3870
- }
3871
- const response = await apiRequest(benchmarkApiPath(projectRef ?? origin.projectId), {}, await effectiveBaseUrl(origin?.baseUrl));
3872
- const version = response.benchmark.sourceFingerprint ?? response.benchmark.currentSpecVersionId ?? "current";
3873
- writeOutput({
3874
- ok: true,
3875
- benchmark: response.benchmark,
3876
- versions: [{ ref: "main", digest: version, current: true }],
3877
- }, parsed, io, () => `${response.benchmark.name ?? projectRef ?? origin.project}\tmain\t${shortDigest(version)}\tcurrent`);
3878
- return 0;
3879
- }
3880
- async function benchmarkStarred(argv, io) {
3881
- const parsed = parseArgs(argv);
3882
- rejectUnknownFlags(parsed, new Set(["json"]));
3883
- rejectUnexpectedPositionals(parsed, "workbench cloud benchmarks starred", 0);
3884
- const response = await apiRequest("/api/workbench/benchmarks");
3885
- const starred = response.benchmarks.filter((project) => project.viewerHasStarred === true);
3886
- writeOutput(starred, parsed, io, (benchmarks) => {
3887
- if (benchmarks.length === 0) {
3888
- return "No starred benchmarks.";
3889
- }
3890
- return benchmarks
3891
- .map((benchmark) => `${benchmark.ownerUsername ?? "-"} / ${benchmark.name ?? "-"}\t${benchmark.starCount ?? 0} stars`)
3892
- .join("\n");
3893
- });
3894
- return 0;
3895
- }
3896
- async function candidateList(argv, io) {
3897
- const parsed = parseArgs(argv);
3898
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3899
- rejectUnexpectedPositionals(parsed, "workbench cloud candidates list", 0);
3900
- const target = await resolveHostedTarget(parsed);
3901
- const response = await apiRequest(projectApiPath(target.projectId, "/candidates"), {}, target.baseUrl);
3902
- writeOutput(response.candidates, parsed, io, (candidates) => {
3903
- if (candidates.length === 0) {
3904
- return "No candidates yet.";
3905
- }
3906
- return candidates
3907
- .map((candidate) => `${candidate.id}\t${candidate.status}\t${candidate.fileChanges?.length ?? 0} files`)
3908
- .join("\n");
3909
- });
3910
- return 0;
3911
- }
3912
- async function candidateShow(argv, io) {
3913
- const parsed = parseArgs(argv);
3914
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3915
- rejectUnexpectedPositionals(parsed, "workbench cloud candidates show", 1);
3916
- const target = await resolveHostedTarget(parsed);
3917
- const candidateId = readRequiredCandidateId(parsed);
3918
- const params = new URLSearchParams({ id: candidateId });
3919
- const candidate = await apiRequest(projectApiPath(target.projectId, `/workbench/record?${params.toString()}`), {}, target.baseUrl);
3920
- writeOutput(candidate, parsed, io, (record) => {
3921
- const value = record;
3922
- return [
3923
- `${value.id ?? candidateId}\t${value.status ?? "unknown"}`,
3924
- ...(value.benchmarkFingerprint ? [`Benchmark version: ${shortDigest(value.benchmarkFingerprint)}`] : []),
3925
- ...(value.candidateFingerprint ? [`Candidate digest: ${shortDigest(value.candidateFingerprint)}`] : []),
3926
- ].join("\n");
3927
- });
3928
- return 0;
3929
- }
3930
- async function candidateFiles(argv, io) {
3931
- const parsed = parseArgs(argv);
3932
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3933
- rejectUnexpectedPositionals(parsed, "workbench cloud candidates files", 1);
3934
- const target = await resolveHostedTarget(parsed);
3935
- const candidateId = readRequiredCandidateId(parsed);
3936
- const response = await apiRequest(projectApiPath(target.projectId, `/candidates/${encodeURIComponent(candidateId)}/files`), {}, target.baseUrl);
3937
- writeOutput(response.files, parsed, io, (files) => files
3938
- .map((file) => `${file.path}\t${file.status}\t${file.preview_kind}`)
3939
- .join("\n") || "No files.");
3940
- return 0;
3941
- }
3942
- async function candidatePreview(argv, io) {
3943
- const parsed = parseArgs(argv);
3944
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "path", "output", "json"]));
3945
- rejectUnexpectedPositionals(parsed, "workbench cloud candidates preview", 1);
3946
- const target = await resolveHostedTarget(parsed);
3947
- const candidateId = readRequiredCandidateId(parsed);
3948
- const filePath = requireFlag(parsed, "path");
3949
- const params = new URLSearchParams({ path: filePath });
3950
- const response = await apiRequest(projectApiPath(target.projectId, `/candidates/${encodeURIComponent(candidateId)}/files?${params.toString()}`), {}, target.baseUrl);
3951
- const content = response.preview.source?.content ??
3952
- response.preview.rendered_html ??
3953
- response.preview.diff ??
3954
- "";
3955
- const outputPath = asOptionalString(parsed.flags.output);
3956
- if (outputPath && outputPath !== "-") {
3957
- await fs.writeFile(outputPath, content);
3958
- io.stdout.write(`Wrote preview to ${outputPath}\n`);
3959
- }
3960
- else if (parsed.flags.json === true) {
3961
- writeJson(response.preview, io);
3962
- }
3963
- else {
3964
- io.stdout.write(content);
3965
- }
3966
- return 0;
3967
- }
3968
- async function candidateExport(argv, io) {
3969
- const parsed = parseArgs(argv);
3970
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "out", "json"]));
3971
- rejectUnexpectedPositionals(parsed, "workbench cloud candidates pull", 1);
3972
- const target = await resolveHostedTarget(parsed);
3973
- const candidateId = readRequiredCandidateId(parsed);
3974
- const outputDir = requireOutDir(parsed);
3975
- const response = await apiRequest(projectApiPath(target.projectId, `/candidates/${encodeURIComponent(candidateId)}/export`), {}, target.baseUrl);
3976
- await writeFiles(outputDir, response.files);
3977
- writeOutput({ ok: true, outputDir, files: response.files.length }, parsed, io, (result) => {
3978
- const record = result;
3979
- return `Exported ${record.files} file(s) to ${record.outputDir}`;
3980
- });
3981
- return 0;
3982
- }
3983
- async function candidateVisibility(argv, io, visibility) {
3984
- const parsed = parseArgs(argv);
3985
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3986
- rejectUnexpectedPositionals(parsed, `workbench cloud candidates ${visibility === "public" ? "publish" : "unpublish"}`, 1);
3987
- const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
3988
- const candidateId = readRequiredCandidateId(parsed);
3989
- const response = await apiRequest(projectApiPath(target.projectId, `/candidates/${encodeURIComponent(candidateId)}/publish`), { method: visibility === "public" ? "PUT" : "DELETE" }, target.baseUrl);
3990
- writeOutput({ ok: true, visibility, candidate: response.candidate }, parsed, io, () => `${visibility === "public" ? "Published" : "Unpublished"} candidate ${candidateId}.`);
3991
- return 0;
3992
- }
3993
- async function runList(argv, io) {
3994
- const parsed = parseArgs(argv);
3995
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3996
- rejectUnexpectedPositionals(parsed, "workbench cloud runs list", 0);
3997
- const target = await resolveHostedTarget(parsed);
3998
- const response = await apiRequest(projectApiPath(target.projectId, "/runs"), {}, target.baseUrl);
3999
- writeOutput(response.runs, parsed, io, (runs) => runs
4000
- .map((run) => `${run.id}\t${run.status}\t${run.candidateId ?? "pending"}`)
4001
- .join("\n") || "No runs.");
4002
- return 0;
4003
- }
4004
- async function runShow(argv, io) {
4005
- const parsed = parseArgs(argv);
4006
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
4007
- rejectUnexpectedPositionals(parsed, "workbench cloud runs show", 1);
4008
- const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
4009
- const runId = readRequiredRunId(parsed);
4010
- const response = await apiRequest(projectApiPath(target.projectId, `/runs/${encodeURIComponent(runId)}`), {}, target.baseUrl);
4011
- const detail = withRunDetailUrls(target, response);
4012
- writeOutput(detail, parsed, io, formatRunDetail);
4013
- return 0;
4014
- }
4015
- async function runCancel(argv, io) {
4016
- const parsed = parseArgs(argv);
4017
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
4018
- rejectUnexpectedPositionals(parsed, "workbench cloud runs cancel", 1);
4019
- const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
4020
- const runId = readRequiredRunId(parsed);
4021
- const response = await apiRequest(projectApiPath(target.projectId, `/runs/${encodeURIComponent(runId)}`), { method: "DELETE" }, target.baseUrl);
4022
- const run = withRunUrls(target, response.run);
4023
- writeOutput(run, parsed, io, (record) => {
4024
- const value = record;
4025
- return [
4026
- `Cancelled run ${value.id}; status ${value.status}; outcome ${value.outcome ?? "cancelled"}.`,
4027
- `Open benchmark: ${value.urls?.benchmark ?? buildWorkbenchResourceUrls(target).benchmark}`,
4028
- ].join("\n");
4029
- });
4030
- return 0;
4031
- }
4032
- async function runWatch(argv, io) {
4033
- const parsed = parseArgs(argv);
4034
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "interval-ms", "timeout-ms", "json"]));
4035
- rejectUnexpectedPositionals(parsed, "workbench cloud watch", 1);
4036
- const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
4037
- const runId = readRequiredRunId(parsed);
4038
- if (parsed.flags.json !== true) {
4039
- io.stdout.write(`Watching run ${runId}.\n${HOSTED_WATCH_LIFECYCLE_NOTE}\n`);
4040
- }
4041
- const run = await watchHostedRun({
4042
- parsed,
4043
- target,
4044
- runId,
4045
- intervalMs: parsePositiveInt(parsed.flags["interval-ms"], 1000, "interval-ms"),
4046
- timeoutMs: parseOptionalPositiveInt(parsed.flags["timeout-ms"], "timeout-ms"),
4047
- });
4048
- const outputRun = await withHostedRunFailureSummary(target, run);
4049
- writeOutput(withRunUrls(target, outputRun), parsed, io, formatHostedRunResult);
4050
- return hostedRunSucceeded(run) ? 0 : 1;
4051
- }
4052
- async function runLogs(argv, io) {
4053
- const parsed = parseArgs(argv);
4054
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
4055
- rejectUnexpectedPositionals(parsed, "workbench cloud logs", 1);
4056
- const target = await resolveHostedTarget(parsed);
4057
- const requestedRunId = parsed.positionals[0];
4058
- if (requestedRunId) {
4059
- const response = await apiRequest(projectApiPath(target.projectId, `/runs/${encodeURIComponent(requestedRunId)}`), {}, target.baseUrl);
4060
- writeOutput({ runId: response.run.id, jobs: response.jobs }, parsed, io, formatRunLogs);
4061
- return 0;
4062
- }
4063
- const project = (await apiRequest(projectApiPath(target.projectId), {}, target.baseUrl)).project;
4064
- const runId = project.runs.at(-1)?.id;
4065
- if (!runId) {
4066
- throw new UsageError("Missing RUN_ID; the benchmark has no runs.");
4067
- }
4068
- const jobs = project.jobs.filter((job) => job.runId === runId);
4069
- writeOutput({ runId, jobs }, parsed, io, formatRunLogs);
4070
- return 0;
4071
- }
4072
- function formatRunLogs(record) {
4073
- const value = record;
4074
- return (value.jobs
4075
- .map((job) => `${job.id}\t${job.kind}\t${job.status}\t${job.candidateId ?? "-"}${job.error ? `\t${job.error}` : ""}`)
4076
- .join("\n") || `No jobs for ${value.runId}.`);
4077
- }
4078
3615
  async function openWorkbench(argv, io) {
4079
3616
  const parsed = parseArgs(argv);
4080
3617
  rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "no-open", "json"]));
4081
3618
  if (parsed.positionals.length > 1) {
4082
- throw new UsageError(`Unexpected argument for workbench open: ${parsed.positionals.slice(1).join(" ")}`);
3619
+ throw new UsageError(`Unexpected argument for workbench open --hosted: ${parsed.positionals.slice(1).join(" ")}`);
4083
3620
  }
4084
3621
  const target = await resolveOpenTarget(parsed);
4085
3622
  const ref = target.openRef;
@@ -4134,11 +3671,12 @@ async function resolveHostedTarget(parsed, options = {}) {
4134
3671
  if (!projectId) {
4135
3672
  throw new UsageError("Missing hosted benchmark. Run workbench push, workbench clone, or pass --benchmark OWNER/BENCHMARK.");
4136
3673
  }
3674
+ const originRemote = origin ? parseOriginRemote(origin) : null;
4137
3675
  return {
4138
3676
  projectId,
4139
- ...(!explicitProject && origin?.owner ? { owner: origin.owner } : {}),
4140
- ...(!explicitProject && origin?.project
4141
- ? { projectName: origin.project }
3677
+ ...(!explicitProject && originRemote ? { owner: originRemote.owner } : {}),
3678
+ ...(!explicitProject && originRemote
3679
+ ? { projectName: originRemote.project }
4142
3680
  : {}),
4143
3681
  dir,
4144
3682
  baseUrl,
@@ -4176,13 +3714,12 @@ async function resolveHostedDryRunTarget(parsed, options = {}) {
4176
3714
  };
4177
3715
  }
4178
3716
  if (origin?.projectId) {
3717
+ const originRemote = parseOriginRemote(origin);
4179
3718
  return {
4180
- projectRef: origin.owner && origin.project
4181
- ? `${origin.owner}/${origin.project}`
4182
- : origin.projectId,
3719
+ projectRef: origin.remote,
4183
3720
  projectId: origin.projectId,
4184
- ...(origin.owner ? { owner: origin.owner } : {}),
4185
- ...(origin.project ? { projectName: origin.project } : {}),
3721
+ owner: originRemote.owner,
3722
+ projectName: originRemote.project,
4186
3723
  dir,
4187
3724
  baseUrl,
4188
3725
  origin,
@@ -4241,38 +3778,31 @@ function buildWorkbenchResourceUrls(target, refs = {}) {
4241
3778
  function projectApiPath(projectRef, suffix = "") {
4242
3779
  return `/api/workbench/benchmarks/${encodeURIComponent(projectRef)}${suffix}`;
4243
3780
  }
4244
- function benchmarkApiPath(benchmarkRef) {
4245
- if (benchmarkRef.includes("/")) {
4246
- return publicProjectApiPath(parseBenchmarkRef(benchmarkRef));
4247
- }
4248
- return projectApiPath(benchmarkRef);
4249
- }
4250
3781
  function publicProjectApiPath(ref) {
4251
3782
  return `/api/workbench/public/benchmarks/${encodeURIComponent(ref.owner)}/${encodeURIComponent(ref.project)}`;
4252
3783
  }
4253
- function publicProjectSourceApiPath(ref) {
4254
- return `${publicProjectApiPath(ref)}/source`;
3784
/**
 * Builds the public API path of a benchmark's state resource.
 *
 * @param {{ owner: string, project: string }} ref - Benchmark reference.
 * @returns {string} The public benchmark path followed by `/state`.
 */
function publicProjectStateApiPath(ref) {
  const basePath = publicProjectApiPath(ref);
  return `${basePath}/state`;
}
4256
3787
  function readRequiredBenchmarkRef(parsed) {
4257
3788
  const ref = parsed.positionals[0];
4258
3789
  if (!ref) {
4259
- throw new UsageError("Missing required OWNER/BENCHMARK ref.");
3790
+ throw new UsageError("Missing required OWNER/BENCHMARK.");
4260
3791
  }
4261
3792
  return parseBenchmarkRef(ref);
4262
3793
  }
4263
3794
  function parseBenchmarkRef(value) {
4264
- const [namePart, versionRef, extraRef] = value.split("@");
4265
- if (extraRef !== undefined || !namePart) {
4266
- throw new UsageError("Benchmark refs must use OWNER/BENCHMARK[@REF].");
3795
+ if (value.includes("@")) {
3796
+ throw new UsageError("Benchmark refs must use OWNER/BENCHMARK.");
4267
3797
  }
4268
- const [owner, project, extra] = namePart.split("/");
3798
+ const [owner, project, extra] = value.split("/");
4269
3799
  if (!owner || !project || extra !== undefined) {
4270
- throw new UsageError("Benchmark refs must use OWNER/BENCHMARK[@REF].");
3800
+ throw new UsageError("Benchmark refs must use OWNER/BENCHMARK.");
4271
3801
  }
4272
- return { owner, project, ...(versionRef ? { ref: versionRef } : {}) };
3802
+ return { owner, project };
4273
3803
  }
4274
3804
  function formatBenchmarkRef(ref) {
4275
- return `${ref.owner}/${ref.project}${ref.ref ? `@${ref.ref}` : ""}`;
3805
+ return `${ref.owner}/${ref.project}`;
4276
3806
  }
4277
3807
  async function resolveRemoteProject(projectRef, baseUrl) {
4278
3808
  if (projectRef.includes("/")) {
@@ -4283,22 +3813,6 @@ async function resolveRemoteProject(projectRef, baseUrl) {
4283
3813
  const response = await apiRequest(projectApiPath(projectRef), {}, baseUrl);
4284
3814
  return response.benchmark;
4285
3815
  }
4286
- function formatProjectRef(project) {
4287
- return project.name ? `${project.name} (${project.id})` : project.id;
4288
- }
4289
- function originMatchesProjectRef(origin, projectRef) {
4290
- if (!origin) {
4291
- return false;
4292
- }
4293
- if (origin.projectId === projectRef) {
4294
- return true;
4295
- }
4296
- if (!projectRef.includes("/")) {
4297
- return false;
4298
- }
4299
- const ref = parseBenchmarkRef(projectRef);
4300
- return origin.owner === ref.owner && origin.project === ref.project;
4301
- }
4302
3816
  function withRunUrls(target, run) {
4303
3817
  return {
4304
3818
  ...run,
@@ -4308,18 +3822,6 @@ function withRunUrls(target, run) {
4308
3822
  }),
4309
3823
  };
4310
3824
  }
4311
- function withRunDetailUrls(target, detail) {
4312
- const candidateId = hostedRunEvaluationCandidateId(detail.run, detail.jobs);
4313
- const run = withRunUrls(target, {
4314
- ...detail.run,
4315
- outputCandidateId: detail.run.outputCandidateId ?? candidateId,
4316
- });
4317
- return {
4318
- run,
4319
- jobs: detail.jobs,
4320
- urls: run.urls ?? buildWorkbenchResourceUrls(target, { runId: run.id }),
4321
- };
4322
- }
4323
3825
  function hostedRunEvaluationCandidateId(run, jobs = []) {
4324
3826
  if (run.outputCandidateId) {
4325
3827
  return run.outputCandidateId;
@@ -4330,6 +3832,66 @@ function hostedRunEvaluationCandidateId(run, jobs = []) {
4330
3832
  .filter((candidateId) => Boolean(candidateId));
4331
3833
  return attemptCandidates.at(-1) ?? run.candidateId ?? null;
4332
3834
  }
3835
/**
 * Assembles a `workbench.project.state.v1` record from local inputs.
 *
 * When `args.origin` is present, the project identity and base metadata come
 * from it; otherwise the project is described as `local/<spec name>` with an
 * empty id and only a runtime fingerprint in `base`.
 *
 * @param {object} args
 * @param {object} args.source - Local project source (reads `spec.name`).
 * @param {object} args.runtime - Runtime bundle; fingerprinted and embedded.
 * @param {object} [args.origin] - Linked origin record, if any.
 * @param {*} args.visibility - Copied to `project.visibility`.
 * @returns {object} The project state record.
 */
function localProjectState(args) {
  const { origin, runtime, source, visibility } = args;
  const stateSource = localProjectStateSource(source);
  const runtimeFingerprint = workbenchRuntimeBundleFingerprint(runtime);
  // Parse the origin remote once; both owner and name are derived from it.
  const originRemote = origin ? parseOriginRemote(origin) : null;
  const base = origin
    ? {
        sourceRevisionId: origin.sourceRevisionId,
        sourceFingerprint: origin.sourceFingerprint,
        runtimeFingerprint: origin.runtimeFingerprint ?? runtimeFingerprint,
      }
    : { runtimeFingerprint };
  return {
    schema: "workbench.project.state.v1",
    project: {
      id: origin?.projectId ?? "",
      remote: origin?.remote ?? `local/${source.spec.name}`,
      ownerUsername: originRemote ? originRemote.owner : "local",
      name: originRemote ? originRemote.project : source.spec.name,
      visibility,
    },
    base,
    source: stateSource,
    runtime,
  };
}
3856
/**
 * Converts a local project source into the state `source` section, including
 * its fingerprint.
 *
 * @param {object} source - Local project source (reads `sourceFiles`).
 * @returns {object} Snapshot fields from `hostedProjectSourceRequest(source)`
 *   plus a `fingerprint` computed over those fields.
 */
function localProjectStateSource(source) {
  const request = hostedProjectSourceRequest(source);
  // Key order is kept as-is: the object is handed to the fingerprint helper.
  const unsignedSource = {
    source: request.source,
    files: source.sourceFiles.map((file) => ({ ...file })),
    candidateFiles: request.candidateFiles.map(toSurfaceSnapshotFile),
    engineResolveFiles: request.engineResolveFiles.map(toSurfaceSnapshotFile),
    engineResolveBinding: request.engineResolveBinding,
    adapterFiles: request.adapterFiles.map(toSurfaceSnapshotFile),
    dockerfile: request.dockerfile,
    runtimeDockerfile: request.runtimeDockerfile,
    runtimeFiles: request.runtimeFiles.map(toSurfaceSnapshotFile),
    network: request.network,
    resources: { ...request.resources },
  };
  const fingerprint = workbenchProjectSourceFingerprint(unsignedSource);
  return { ...unsignedSource, fingerprint };
}
3876
/**
 * Normalizes a surface file into snapshot form.
 *
 * @param {object} file - File record with `path` and `content`, and optional
 *   `kind`, `encoding` and `executable`.
 * @returns {{ path: *, kind: *, encoding: *, content: *, executable: boolean }}
 *   Snapshot record. `kind` defaults to `"binary"` for base64-encoded files and
 *   `"text"` otherwise; `encoding` defaults to `"utf8"`; `executable` is `true`
 *   only when the input flag is exactly `true`.
 */
function toSurfaceSnapshotFile(file) {
  // `??` rather than `"kind" in file`: a present-but-undefined/null kind must
  // still fall back to the encoding-derived kind instead of leaking through.
  const inferredKind = file.encoding === "base64" ? "binary" : "text";
  return {
    path: file.path,
    kind: file.kind ?? inferredKind,
    encoding: file.encoding ?? "utf8",
    content: file.content,
    executable: file.executable === true,
  };
}
3885
/**
 * Derives a hosted project summary from a project state record.
 *
 * @param {object} state - Project state with `project`, `source` and `base`.
 * @returns {object} Summary with id, owner, name, visibility, the current
 *   spec version id and the source fingerprint. The latter two prefer the
 *   `source` section and fall back to `base`.
 */
function hostedProjectSummaryFromState(state) {
  const { project, source, base } = state;
  const currentSpecVersionId = source.revisionId ?? base.sourceRevisionId;
  const sourceFingerprint = source.fingerprint ?? base.sourceFingerprint;
  return {
    id: project.id,
    ownerUsername: project.ownerUsername,
    name: project.name,
    visibility: project.visibility,
    currentSpecVersionId,
    sourceFingerprint,
  };
}
4333
3895
  function sourceFileCount(source) {
4334
3896
  return source.sourceFiles.length;
4335
3897
  }
@@ -4463,71 +4025,12 @@ function formatHostedRunStarted(run, fallbackWorkflow) {
4463
4025
  "",
4464
4026
  ].join("\n");
4465
4027
  }
4466
- function formatRunDetail(record) {
4467
- const detail = record;
4468
- const { run, jobs, urls } = detail;
4469
- const cost = sumJobCostUsd(jobs);
4470
- const firstFailedJob = jobs.find((job) => job.status === "failed" && job.error);
4471
- const candidateId = hostedRunEvaluationCandidateId(run, jobs);
4472
- return [
4473
- `Run ${run.id}: ${run.status}${run.outcome ? ` (${run.outcome})` : ""}`,
4474
- `Workflow: ${run.workflow ?? "improve"}`,
4475
- `Candidate: ${candidateId ?? "pending"}`,
4476
- ...(run.activeCandidateId && candidateId && run.activeCandidateId !== candidateId
4477
- ? [`Active candidate: ${run.activeCandidateId}`]
4478
- : []),
4479
- `Samples: ${run.samples ?? 0}`,
4480
- `Attempts: ${run.attemptsExecuted ?? 0}/${run.attemptsRequested ?? run.attemptsExecuted ?? 0}`,
4481
- `Jobs: ${run.completedJobCount ?? jobs.filter(isTerminalRunJob).length}/${run.jobCount ?? jobs.length} completed${run.failedJobCount ? `; ${run.failedJobCount} failed` : ""}`,
4482
- ...(typeof run.durationMs === "number"
4483
- ? [`Duration: ${formatDurationMs(run.durationMs)}`]
4484
- : []),
4485
- ...(cost > 0 ? [`Cost: ${formatUsd(cost)}`] : []),
4486
- ...(firstFailedJob?.error
4487
- ? [`First failed job ${firstFailedJob.id}: ${firstFailedJob.error}`]
4488
- : []),
4489
- ...(urls.candidateEvaluation
4490
- ? [`Open evaluation: ${urls.candidateEvaluation}`]
4491
- : [`Open benchmark: ${urls.benchmark}`]),
4492
- ...(jobs.length > 0 ? ["", "Jobs:", ...jobs.map(formatRunJobLine)] : []),
4493
- ].join("\n");
4494
- }
4495
- function formatRunJobLine(job) {
4496
- return [
4497
- job.id,
4498
- readRunJobPurpose(job) ?? job.kind ?? "job",
4499
- job.status,
4500
- job.candidateId ?? "-",
4501
- job.error ?? "",
4502
- ].filter((value, index) => index < 4 || value !== "").join("\t");
4503
- }
4504
- function isTerminalRunJob(job) {
4505
- return job.status === "succeeded" || job.status === "failed" || job.status === "cancelled";
4506
- }
4507
4028
  function readRunJobPurpose(job) {
4508
4029
  const input = readRecord(job.input);
4509
4030
  const execution = readRecord(input?.execution);
4510
4031
  const purpose = execution?.purpose;
4511
4032
  return typeof purpose === "string" && purpose ? purpose : null;
4512
4033
  }
4513
- function sumJobCostUsd(jobs) {
4514
- const sum = jobs.reduce((total, job) => total + costUsdFromUsage(readRecord(job.output)?.usage), 0);
4515
- return Number.isFinite(sum) ? Math.round(sum * 1_000_000) / 1_000_000 : 0;
4516
- }
4517
- function costUsdFromUsage(value) {
4518
- const usage = readRecord(value);
4519
- if (!usage) {
4520
- return 0;
4521
- }
4522
- const direct = readFiniteNumber(usage.costUsd);
4523
- if (direct !== null) {
4524
- return direct;
4525
- }
4526
- return ["total", "improver", "runner", "engine"].reduce((sum, key) => {
4527
- const nested = readRecord(usage[key]);
4528
- return sum + (readFiniteNumber(nested?.costUsd) ?? 0);
4529
- }, 0);
4530
- }
4531
4034
  function readRecord(value) {
4532
4035
  return value && typeof value === "object" && !Array.isArray(value)
4533
4036
  ? value
@@ -4545,24 +4048,6 @@ function integerValue(value) {
4545
4048
  function readFiniteNumber(value) {
4546
4049
  return typeof value === "number" && Number.isFinite(value) ? value : null;
4547
4050
  }
4548
- function formatDurationMs(durationMs) {
4549
- if (durationMs < 1000) {
4550
- return `${Math.max(0, Math.round(durationMs))}ms`;
4551
- }
4552
- const seconds = durationMs / 1000;
4553
- if (seconds < 60) {
4554
- return `${seconds.toFixed(seconds < 10 ? 1 : 0)}s`;
4555
- }
4556
- const minutes = Math.floor(seconds / 60);
4557
- const remainingSeconds = Math.round(seconds % 60);
4558
- return `${minutes}m ${remainingSeconds}s`;
4559
- }
4560
- function formatUsd(value) {
4561
- return `$${value.toFixed(value < 1 ? 4 : 2)}`;
4562
- }
4563
- function shortDigest(value) {
4564
- return value.length > 12 ? value.slice(0, 12) : value;
4565
- }
4566
4051
  async function withHostedRunFailureSummary(target, run) {
4567
4052
  if (hostedRunSucceeded(run) || run.error || (run.failedJobCount ?? 0) <= 0) {
4568
4053
  return run;
@@ -4592,23 +4077,44 @@ function hostedRunSucceeded(run) {
4592
4077
  async function readWorkbenchOrigin(dir) {
4593
4078
  try {
4594
4079
  const parsed = JSON.parse(await fs.readFile(workbenchOriginPath(dir), "utf8"));
4595
- if (!parsed.projectId ||
4596
- !parsed.baseUrl ||
4597
- !parsed.owner ||
4598
- !parsed.project ||
4599
- typeof parsed.writable !== "boolean") {
4080
+ if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
4081
+ throw new UsageError(`Workbench origin is malformed: ${workbenchOriginPath(dir)}`);
4082
+ }
4083
+ const originRecord = parsed;
4084
+ const keys = Object.keys(originRecord).sort();
4085
+ const expectedKeys = [
4086
+ "baseUrl",
4087
+ "linkedAt",
4088
+ "projectId",
4089
+ "remote",
4090
+ "runtimeFingerprint",
4091
+ "sourceFingerprint",
4092
+ "sourceRevisionId",
4093
+ ];
4094
+ if (typeof originRecord.projectId !== "string" ||
4095
+ typeof originRecord.baseUrl !== "string" ||
4096
+ typeof originRecord.remote !== "string" ||
4097
+ typeof originRecord.sourceRevisionId !== "string" ||
4098
+ typeof originRecord.sourceFingerprint !== "string" ||
4099
+ typeof originRecord.runtimeFingerprint !== "string" ||
4100
+ typeof originRecord.linkedAt !== "string" ||
4101
+ originRecord.projectId.length === 0 ||
4102
+ originRecord.sourceRevisionId.length === 0 ||
4103
+ originRecord.sourceFingerprint.length === 0 ||
4104
+ originRecord.runtimeFingerprint.length === 0) {
4105
+ throw new UsageError(`Workbench origin is malformed: ${workbenchOriginPath(dir)}`);
4106
+ }
4107
+ if (JSON.stringify(keys) !== JSON.stringify(expectedKeys)) {
4600
4108
  throw new UsageError(`Workbench origin is malformed: ${workbenchOriginPath(dir)}`);
4601
4109
  }
4602
4110
  return {
4603
- baseUrl: normalizeBaseUrl(parsed.baseUrl),
4604
- owner: parsed.owner,
4605
- project: parsed.project,
4606
- projectId: parsed.projectId,
4607
- writable: parsed.writable,
4608
- ...(parsed.sourceRevisionId ? { sourceRevisionId: parsed.sourceRevisionId } : {}),
4609
- ...(parsed.sourceFingerprint ? { sourceFingerprint: parsed.sourceFingerprint } : {}),
4610
- ...(parsed.upstream ? { upstream: parsed.upstream } : {}),
4611
- linkedAt: parsed.linkedAt ?? new Date(0).toISOString(),
4111
+ baseUrl: normalizeBaseUrl(originRecord.baseUrl),
4112
+ remote: normalizeOriginRemote(originRecord.remote),
4113
+ projectId: originRecord.projectId,
4114
+ sourceRevisionId: originRecord.sourceRevisionId,
4115
+ sourceFingerprint: originRecord.sourceFingerprint,
4116
+ runtimeFingerprint: originRecord.runtimeFingerprint,
4117
+ linkedAt: originRecord.linkedAt,
4612
4118
  };
4613
4119
  }
4614
4120
  catch (error) {
@@ -4627,8 +4133,12 @@ async function requireWorkbenchOrigin(dir) {
4627
4133
  }
4628
4134
  async function writeWorkbenchOrigin(dir, input) {
4629
4135
  const origin = {
4630
- ...input,
4631
4136
  baseUrl: normalizeBaseUrl(input.baseUrl),
4137
+ remote: normalizeOriginRemote(input.remote),
4138
+ projectId: input.projectId,
4139
+ sourceRevisionId: input.sourceRevisionId,
4140
+ sourceFingerprint: input.sourceFingerprint,
4141
+ runtimeFingerprint: input.runtimeFingerprint,
4632
4142
  linkedAt: input.linkedAt ?? new Date().toISOString(),
4633
4143
  };
4634
4144
  const filePath = workbenchOriginPath(dir);
@@ -4636,6 +4146,56 @@ async function writeWorkbenchOrigin(dir, input) {
4636
4146
  await fs.writeFile(filePath, `${JSON.stringify(origin, null, 2)}\n`);
4637
4147
  return origin;
4638
4148
  }
4149
/**
 * Writes the origin record for `dir` from a hosted project state.
 *
 * Values from `args.project` (a project summary, when supplied) take
 * precedence over the corresponding values in `args.state`.
 *
 * @param {string} dir - Project directory that receives the origin record.
 * @param {object} args
 * @param {string} args.baseUrl - Hosted service base URL.
 * @param {object} args.state - Project state (`project`, `source`, `base`, `runtime`).
 * @param {object} [args.project] - Optional project summary overriding the state.
 * @param {string} [args.sourceFingerprint] - Optional explicit source fingerprint.
 * @returns {Promise<object>} The origin record returned by `writeWorkbenchOrigin`.
 * @throws {UsageError} When the owner, name, project id, source revision id,
 *   source fingerprint or runtime fingerprint cannot be determined.
 */
async function writeWorkbenchOriginFromState(dir, args) {
  const { project, state } = args;
  const owner = project?.ownerUsername ?? state.project.ownerUsername;
  const name = project?.name ?? state.project.name;
  const projectId = project?.id ?? state.project.id;
  const sourceRevisionId = project?.currentSpecVersionId ??
    state.source.revisionId ??
    state.base.sourceRevisionId;
  const sourceFingerprint = args.sourceFingerprint ??
    project?.sourceFingerprint ??
    state.source.fingerprint ??
    state.base.sourceFingerprint;
  const runtimeFingerprint = state.base.runtimeFingerprint ??
    workbenchRuntimeBundleFingerprint(state.runtime);
  // Validate identity fields as well: a missing owner/name would otherwise be
  // interpolated as "undefined", and an empty project id would be written to
  // an origin file that later fails validation when read back.
  const requiredValues = [
    owner,
    name,
    projectId,
    sourceRevisionId,
    sourceFingerprint,
    runtimeFingerprint,
  ];
  if (requiredValues.some((value) => !value)) {
    throw new UsageError("Hosted project state is missing required origin metadata.");
  }
  return await writeWorkbenchOrigin(dir, {
    baseUrl: args.baseUrl,
    remote: `${owner}/${name}`,
    projectId,
    sourceRevisionId,
    sourceFingerprint,
    runtimeFingerprint,
  });
}
4173
/**
 * Computes the source fingerprint of the local project in `dir`.
 *
 * @param {string} dir - Project directory.
 * @returns {Promise<*>} The fingerprint carried by the local state source,
 *   recomputed from that source when it is nullish.
 */
async function localSourceFingerprint(dir) {
  const localSource = await readLocalProjectSource(dir);
  const stateSource = localProjectStateSource(localSource);
  return stateSource.fingerprint ?? workbenchProjectSourceFingerprint(stateSource);
}
4177
/**
 * Parses the `remote` field of an origin record.
 *
 * @param {{ remote: string }} origin - Origin record.
 * @returns {{ owner: string, project: string }} Parsed remote.
 * @throws {UsageError} When the remote is not `OWNER/BENCHMARK`.
 */
function parseOriginRemote(origin) {
  const { remote } = origin;
  return parseRemoteName(remote);
}
4180
/**
 * Parses a remote name, reporting any failure as an origin-remote error.
 *
 * @param {string} remote - Remote name expected to be `OWNER/BENCHMARK`.
 * @returns {{ owner: string, project: string }} Parsed remote.
 * @throws {UsageError} For any error raised while parsing; the message names
 *   the offending remote.
 */
function parseRemoteName(remote) {
  let parsedRemote;
  try {
    parsedRemote = parseBenchmarkRef(remote);
  }
  catch {
    // Every parse failure is rephrased in terms of the origin remote.
    throw new UsageError(`Workbench origin remote must use OWNER/BENCHMARK: ${remote}`);
  }
  return parsedRemote;
}
4188
/**
 * Trims and validates a remote name, returning it in canonical form.
 *
 * @param {string} remote - Remote name, possibly with surrounding whitespace.
 * @returns {string} `OWNER/BENCHMARK`.
 * @throws {UsageError} When the trimmed remote is not `OWNER/BENCHMARK`.
 */
function normalizeOriginRemote(remote) {
  const { owner, project } = parseRemoteName(remote.trim());
  return `${owner}/${project}`;
}
4192
/**
 * Splits an origin's remote into the owner and project name.
 *
 * @param {{ remote: string }} origin - Origin record.
 * @returns {{ owner: string, projectName: string }} Owner and project name.
 * @throws {UsageError} When the remote is not `OWNER/BENCHMARK`.
 */
function originRemoteUrlParts(origin) {
  const { owner, project: projectName } = parseOriginRemote(origin);
  return { owner, projectName };
}
4639
4199
  function workbenchOriginPath(dir) {
4640
4200
  return path.join(dir, ".workbench", "origin.json");
4641
4201
  }
@@ -4674,30 +4234,6 @@ async function readWorkbenchProfileStatus(config) {
4674
4234
  return { authenticated: true, profile: null };
4675
4235
  }
4676
4236
  }
4677
- function readOptionalCandidateId(parsed) {
4678
- return asOptionalString(parsed.flags.candidate) ?? parsed.positionals[0];
4679
- }
4680
- function readRequiredCandidateId(parsed) {
4681
- const candidateId = readOptionalCandidateId(parsed);
4682
- if (!candidateId) {
4683
- throw new UsageError("Missing required CANDIDATE_ID.");
4684
- }
4685
- return candidateId;
4686
- }
4687
- function readRequiredRunId(parsed) {
4688
- const runId = parsed.positionals[0];
4689
- if (!runId) {
4690
- throw new UsageError("Missing required RUN_ID.");
4691
- }
4692
- return runId;
4693
- }
4694
- function requireOutDir(parsed) {
4695
- const output = asOptionalString(parsed.flags.out);
4696
- if (!output) {
4697
- throw new UsageError("Missing required --out.");
4698
- }
4699
- return output;
4700
- }
4701
4237
  async function apiRequest(apiPath, options = {}, baseUrlOverride) {
4702
4238
  const config = await loadConfig();
4703
4239
  const baseUrl = normalizeBaseUrl(baseUrlOverride ??
@@ -5219,10 +4755,12 @@ function resolveSourceDir(parsed) {
5219
4755
  if (parsed.positionals.length > 1) {
5220
4756
  throw new UsageError("Expected at most one source file or directory argument.");
5221
4757
  }
5222
- if (parsed.positionals.length > 0 && parsed.flags.dir !== undefined) {
5223
- throw new UsageError("Use either --dir or SOURCE, not both.");
4758
+ const dir = asOptionalString(parsed.flags.dir);
4759
+ const source = parsed.positionals[0];
4760
+ if (dir && source) {
4761
+ return path.resolve(dir, source);
5224
4762
  }
5225
- return path.resolve(asOptionalString(parsed.flags.dir) ?? parsed.positionals[0] ?? process.cwd());
4763
+ return path.resolve(dir ?? source ?? process.cwd());
5226
4764
  }
5227
4765
  function isWorkbenchSourceYamlPath(filePath) {
5228
4766
  return path.basename(filePath) === WORKBENCH_BENCHMARK_FILE;
@@ -5368,6 +4906,14 @@ async function syncSourceFiles(outputDir, files) {
5368
4906
  }
5369
4907
  await writeFiles(outputDir, files);
5370
4908
  }
4909
/**
 * Verifies that the local source still matches the fingerprint stored in the
 * origin record.
 *
 * @param {string} dir - Project directory.
 * @param {{ sourceFingerprint: string }} origin - Origin record.
 * @returns {Promise<void>} Resolves when the fingerprints are identical.
 * @throws {UsageError} When the local source fingerprint differs.
 */
async function assertLocalSourceMatchesOrigin(dir, origin) {
  const localSource = await readLocalProjectSource(dir);
  const { fingerprint } = localProjectStateSource(localSource);
  if (fingerprint !== origin.sourceFingerprint) {
    throw new UsageError("Local source changed since the last pull or push. Run `workbench push` before pulling, or restore the local source changes and try again.");
  }
}
5371
4917
  async function readManagedSourceFilePaths(outputDir) {
5372
4918
  try {
5373
4919
  const source = await readLocalProjectSource(outputDir);