@workbench-ai/workbench 0.0.49 → 0.0.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5,7 +5,7 @@ import { createRequire } from "node:module";
5
5
  import os from "node:os";
6
6
  import path from "node:path";
7
7
  import { Writable } from "node:stream";
8
- import { createSubjectFilePreview, createBaselineSubjectJob as createRuntimeBaselineSubjectJob, evaluationScorecardId, executeWorkbenchExecutionJob, engineResolveBindingForSpec, filterSubjectSourceFiles, workbenchExecutionPurpose, createWorkbenchAdapterAuthBundle, createSubjectEvaluationTraceInputFiles, createSubjectRevisionTraceInputFiles, DOCKER_SANDBOX_BACKEND, localWorkbenchAdapterAuthStore, materializeWorkbenchRunResult, normalizeSurfaceFiles, planWorkbenchExecutionJobsForPurpose, runWorkbenchExecutionDag, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, summarizeSubjectFiles, validateWorkbenchRunEnvelope, parseWorkbenchAdapterAuthTarget, } from "@workbench-ai/workbench-core";
8
+ import { createCandidateFilePreview, createBaselineCandidateJob as createRuntimeBaselineCandidateJob, evaluationScorecardId, evaluationMeanMetrics, executeWorkbenchExecutionJob, engineResolveBindingForSpec, filterOptimizerTraceJobsForCaseIds, filterCandidateSourceFiles, formatWorkbenchCaseSelector, formatWorkbenchSelectionPolicy, workbenchCaseSelectorUsesAllCases, workbenchExecutionPurpose, workbenchRunExecutionFingerprint, createWorkbenchAdapterAuthBundle, createOptimizerTraceInputFiles, DOCKER_SANDBOX_BACKEND, localWorkbenchAdapterAuthStore, materializeWorkbenchRunResult, normalizeSurfaceFiles, planWorkbenchExecutionJobsForPurpose, runWorkbenchExecutionDag, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, summarizeCandidateFiles, validateWorkbenchRunEnvelope, parseWorkbenchAdapterAuthTarget, workbenchEngineCaseIdsForImproveEvaluation, workbenchEngineCaseIdsForSelector, workbenchImproveOptimizeSelector, workbenchImproveSelectionPolicy, workbenchProjectSourceFingerprint, workbenchRuntimeBundleFingerprint, } from "@workbench-ai/workbench-core";
9
9
  import { assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, WORKBENCH_ADAPTER_RESULT_FILE, WORKBENCH_ADAPTER_RESULT_PROTOCOL, normalizeWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationCommand, workbenchAdapterOperationResultPath, withDefaultWorkbenchAdapterAuthProfiles as applyDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
10
10
  import { builtinLocalTraceAdapter, builtinLocalTraceAdapters, sortLocalTraceRefs, } from "@workbench-ai/workbench-built-in-adapters/local-traces";
11
11
  import { commandUsage, HOSTED_WATCH_LIFECYCLE_NOTE, LOCAL_DEV_OPEN_LIFECYCLE_NOTE, rootUsage, } from "./command-model.js";
@@ -13,10 +13,10 @@ import { startLocalWorkbenchDevServer } from "./dev-open-server.js";
13
13
  import { createWorkbenchInitScaffold, } from "./init-scaffold.js";
14
14
  import { defaultAdapterManifests, composeRuntimeDockerfileWithAdapters, resolveDefaultWorkbenchAdapter, resolveProjectAdapterSource, resolveWorkbenchAdaptersForProject, WORKBENCH_ADAPTER_MANIFEST_FILE, } from "./adapter-project.js";
15
15
  import { createAdapterCommandEnv } from "./adapter-command-env.js";
16
- import { appendLocalRun, loadLocalArchive, loadLocalArchiveIndex, materializeSubjectRoot, readLocalSubject, readLocalSubjectFiles, saveLocalArchive, saveLocalJobs, setLocalActive, upsertLocalSubject, upsertLocalEvaluation, } from "./local-archive.js";
16
+ import { loadLocalArchive, loadLocalArchiveIndex, exportLocalRuntimeBundle, importLocalRuntimeBundle, runtimeBundleStats, materializeCandidateRoot, readLocalCandidate, readLocalCandidateFiles, readLocalJobs, saveLocalArchive, saveLocalJobs, setLocalActive, upsertLocalRun, upsertLocalCandidate, upsertLocalEvaluation, } from "./local-archive.js";
17
17
  import { WorkspaceSnapshotError, } from "./workspace-snapshot.js";
18
18
  import { readLocalProjectSource, WORKBENCH_BENCHMARK_FILE, } from "./project-source.js";
19
- import { localBenchmarkFingerprint, localSubjectFingerprint, } from "./benchmark-fingerprint.js";
19
+ import { localBenchmarkFingerprint, localCandidateFingerprint, } from "./benchmark-fingerprint.js";
20
20
  const require = createRequire(import.meta.url);
21
21
  function getCliVersion() {
22
22
  const manifest = require("../package.json");
@@ -74,29 +74,38 @@ export async function runCli(argv, io = {
74
74
  if (argv[0] === "clone") {
75
75
  return await cloneProject(argv.slice(1), io);
76
76
  }
77
- if (argv[0] === "fetch") {
78
- return await fetchProject(argv.slice(1), io);
79
- }
80
77
  if (argv[0] === "pull") {
81
78
  return await pullProject(argv.slice(1), io);
82
79
  }
83
80
  if (argv[0] === "push") {
84
81
  return await pushBenchmark(argv.slice(1), io);
85
82
  }
86
- if (argv[0] === "remote") {
87
- return await runRemoteCommand(argv.slice(1), io);
88
- }
89
83
  if (argv[0] === "eval") {
90
- return await localEvaluateSubject(argv.slice(1), io, runtimeOptions);
84
+ const hosted = extractHostedFlag(argv.slice(1));
85
+ return hosted.enabled
86
+ ? await startHostedWorkflow("eval", hosted.argv, io)
87
+ : await localEvaluateCandidate(hosted.argv, io, runtimeOptions);
88
+ }
89
+ if (argv[0] === "retry") {
90
+ const hosted = extractHostedFlag(argv.slice(1));
91
+ return hosted.enabled
92
+ ? await retryHostedWorkflow(hosted.argv, io)
93
+ : await localRetry(hosted.argv, io, runtimeOptions);
91
94
  }
92
95
  if (argv[0] === "improve") {
93
- return await localRun(argv.slice(1), io, runtimeOptions);
96
+ const hosted = extractHostedFlag(argv.slice(1));
97
+ return hosted.enabled
98
+ ? await startHostedWorkflow("improve", hosted.argv, io)
99
+ : await localRun(hosted.argv, io, runtimeOptions);
94
100
  }
95
101
  if (argv[0] === "restore") {
96
102
  return await localRestore(argv.slice(1), io);
97
103
  }
98
104
  if (argv[0] === "open") {
99
- return await localDevOpen(argv.slice(1), io);
105
+ const hosted = extractHostedFlag(argv.slice(1));
106
+ return hosted.enabled
107
+ ? await openWorkbench(hosted.argv, io)
108
+ : await localDevOpen(hosted.argv, io);
100
109
  }
101
110
  if (argv[0] === "auth") {
102
111
  return await runAuthCommand(argv.slice(1), io);
@@ -107,9 +116,6 @@ export async function runCli(argv, io = {
107
116
  if (argv[0] === "traces") {
108
117
  return await runTracesCommand(argv.slice(1), io);
109
118
  }
110
- if (argv[0] === "cloud") {
111
- return await runCloudCommand(argv.slice(1), io);
112
- }
113
119
  const commandPath = argv.slice(0, 2).join(" ");
114
120
  const rest = argv.slice(2);
115
121
  switch (commandPath) {
@@ -117,14 +123,14 @@ export async function runCli(argv, io = {
117
123
  return await localRunList(rest, io);
118
124
  case "runs show":
119
125
  return await localRunShow(rest, io);
120
- case "subjects list":
121
- return await localSubjectList(rest, io);
122
- case "subjects show":
123
- return await localSubjectShow(rest, io);
124
- case "subjects files":
125
- return await localSubjectFiles(rest, io);
126
- case "subjects preview":
127
- return await localSubjectPreview(rest, io);
126
+ case "candidates list":
127
+ return await localCandidateList(rest, io);
128
+ case "candidates show":
129
+ return await localCandidateShow(rest, io);
130
+ case "candidates files":
131
+ return await localCandidateFiles(rest, io);
132
+ case "candidates preview":
133
+ return await localCandidatePreview(rest, io);
128
134
  default:
129
135
  break;
130
136
  }
@@ -145,9 +151,6 @@ export async function runCli(argv, io = {
145
151
  }
146
152
  function commandPathForHelp(argv) {
147
153
  const positionals = argv.filter((arg) => arg !== "--help" && arg !== "-h" && !arg.startsWith("--"));
148
- if (positionals[0] === "cloud") {
149
- return positionals.slice(0, 3).join(" ");
150
- }
151
154
  if (positionals[0] === "adapters" &&
152
155
  ["create", "list", "inspect", "test"].includes(positionals[1] ?? "")) {
153
156
  return positionals.slice(0, 2).join(" ");
@@ -156,76 +159,31 @@ function commandPathForHelp(argv) {
156
159
  ["collect", "list", "show"].includes(positionals[1] ?? "")) {
157
160
  return positionals.slice(0, 2).join(" ");
158
161
  }
159
- if (positionals[0] === "auth" || positionals[0] === "remote") {
162
+ if (positionals[0] === "auth") {
160
163
  return positionals.slice(0, 2).join(" ");
161
164
  }
162
165
  if (positionals[0] === "runs" &&
163
166
  ["list", "show"].includes(positionals[1] ?? "")) {
164
167
  return positionals.slice(0, 2).join(" ");
165
168
  }
166
- if (positionals[0] === "subjects" &&
169
+ if (positionals[0] === "candidates" &&
167
170
  ["list", "show", "files", "preview"].includes(positionals[1] ?? "")) {
168
171
  return positionals.slice(0, 2).join(" ");
169
172
  }
170
173
  return positionals[0] ?? "";
171
174
  }
172
- async function runCloudCommand(argv, io) {
173
- const command = argv[0];
174
- const rest = argv.slice(1);
175
- switch (command) {
176
- case "eval":
177
- return await startHostedWorkflow("eval", rest, io);
178
- case "improve":
179
- return await startHostedWorkflow("improve", rest, io);
180
- case "open":
181
- return await openWorkbench(rest, io);
182
- case "watch":
183
- return await runWatch(rest, io);
184
- case "logs":
185
- return await runLogs(rest, io);
186
- case "star":
187
- return await starProject(rest, io, true);
188
- case "unstar":
189
- return await starProject(rest, io, false);
190
- default:
191
- break;
192
- }
193
- const commandPath = argv.slice(0, 2).join(" ");
194
- const subRest = argv.slice(2);
195
- switch (commandPath) {
196
- case "benchmarks list":
197
- return await benchmarkList(subRest, io);
198
- case "benchmarks show":
199
- return await benchmarkShow(subRest, io);
200
- case "benchmarks versions":
201
- return await benchmarkVersions(subRest, io);
202
- case "benchmarks starred":
203
- return await benchmarkStarred(subRest, io);
204
- case "benchmarks delete":
205
- return await benchmarkDelete(subRest, io);
206
- case "runs list":
207
- return await runList(subRest, io);
208
- case "runs show":
209
- return await runShow(subRest, io);
210
- case "runs cancel":
211
- return await runCancel(subRest, io);
212
- case "subjects list":
213
- return await subjectList(subRest, io);
214
- case "subjects show":
215
- return await subjectShow(subRest, io);
216
- case "subjects files":
217
- return await subjectFiles(subRest, io);
218
- case "subjects preview":
219
- return await subjectPreview(subRest, io);
220
- case "subjects pull":
221
- return await subjectExport(subRest, io);
222
- case "subjects publish":
223
- return await subjectVisibility(subRest, io, "public");
224
- case "subjects unpublish":
225
- return await subjectVisibility(subRest, io, "private");
226
- default:
227
- throw new UsageError(`Unknown command: cloud ${argv.join(" ")}`);
175
+ function extractHostedFlag(argv) {
176
+ let enabled = false;
177
+ const next = [];
178
+ for (const arg of argv) {
179
+ if (arg === "--hosted") {
180
+ enabled = true;
181
+ }
182
+ else {
183
+ next.push(arg);
184
+ }
228
185
  }
186
+ return { enabled, argv: next };
229
187
  }
230
188
  async function localDevOpen(argv, io) {
231
189
  const parsed = parseArgs(argv);
@@ -313,7 +271,7 @@ async function localInit(argv, io) {
313
271
  specPath,
314
272
  kind: scaffold.kind,
315
273
  name: scaffold.name,
316
- subjectRoot: scaffold.subjectRoot,
274
+ candidateRoot: scaffold.candidateRoot,
317
275
  }, parsed, io, () => `Initialized ${scaffold.kind} Workbench source directory at ${workspace}`);
318
276
  return 0;
319
277
  }
@@ -358,20 +316,20 @@ function buildWorkbenchCheckPlan(source) {
358
316
  files: sourceFileCount(source),
359
317
  yaml: [
360
318
  path.relative(source.dir, source.benchmarkPath) || "benchmark.yaml",
361
- path.relative(source.dir, source.subjectSpecPath) || "subject YAML",
362
- ...(source.optimizerSource !== undefined
363
- ? [path.relative(source.dir, source.optimizerPath ?? "") || "optimizer YAML"]
364
- : []),
319
+ path.relative(source.dir, source.candidateSpecPath) || "candidate YAML",
365
320
  ],
366
321
  dockerfile: source.dockerfilePath,
367
322
  },
368
- subject: {
369
- filesPath: source.spec.subject.files.path,
370
- files: source.subjectFiles.length,
323
+ candidate: {
324
+ name: source.spec.candidate.name,
325
+ selectedRunId: source.spec.candidate.selectedRunId,
326
+ runCount: Object.keys(source.spec.candidate.runs).length,
327
+ filesPath: source.spec.candidate.files.path,
328
+ files: source.candidateFiles.length,
371
329
  },
372
- optimizer: source.spec.optimizer
330
+ improve: source.spec.candidate.improve
373
331
  ? {
374
- edits: [...source.spec.optimizer.edits],
332
+ edits: [...source.spec.candidate.improve.edits],
375
333
  }
376
334
  : null,
377
335
  engine: {
@@ -394,8 +352,8 @@ function buildWorkbenchCheckPlan(source) {
394
352
  };
395
353
  }
396
354
  function formatWorkbenchCheckPlan(plan, warningSuffix) {
397
- const edits = plan.optimizer?.edits.length
398
- ? plan.optimizer.edits.join(", ")
355
+ const edits = plan.improve?.edits.length
356
+ ? plan.improve.edits.join(", ")
399
357
  : "-";
400
358
  const network = plan.environment.network.egress;
401
359
  const resources = plan.environment.resources;
@@ -404,11 +362,12 @@ function formatWorkbenchCheckPlan(plan, warningSuffix) {
404
362
  `Benchmark: ${plan.benchmarkName}`,
405
363
  `Description: ${plan.benchmarkDescription}`,
406
364
  `Source: ${plan.source.files} file(s) (${plan.source.yaml.join(", ")}, ${plan.source.dockerfile})`,
407
- `Subject files: ${plan.subject.filesPath} (${plan.subject.files} file(s))`,
408
- `Optimizer edits: ${edits}`,
365
+ `Candidate: ${plan.candidate.name} (${plan.candidate.runCount} run(s), selected ${plan.candidate.selectedRunId})`,
366
+ `Candidate files: ${plan.candidate.filesPath} (${plan.candidate.files} file(s))`,
367
+ `Improve edits: ${edits}`,
409
368
  `Engine cases: ${plan.engine.cases} case(s) from ${formatAdapterSummary(plan.engine.resolver)} at ${plan.engine.path} (${plan.engine.files} file(s))`,
410
369
  `Environment: ${plan.environment.dockerfile}, network ${network}, ${resources.cpu} CPU, ${resources.memoryGb}GB RAM, ${resources.timeoutMinutes}m timeout`,
411
- `Execution: improve ${plan.adapters.improve ? formatAdapterSummary(plan.adapters.improve) : "not configured"}, subject ${formatAdapterSummary(plan.adapters.run)}, engine ${formatAdapterSummary(plan.adapters.engine)}`,
370
+ `Execution: improve ${plan.adapters.improve ? formatAdapterSummary(plan.adapters.improve) : "not configured"}, candidate run ${formatAdapterSummary(plan.adapters.run)}, engine ${formatAdapterSummary(plan.adapters.engine)}`,
412
371
  ...adapterSourceLines(plan.adapters.sources),
413
372
  ].join("\n");
414
373
  }
@@ -493,18 +452,206 @@ function splitWorkspaceError(error) {
493
452
  const message = error instanceof Error ? error.message : String(error);
494
453
  return message.split(/\n+/u).map((entry) => entry.trim()).filter(Boolean);
495
454
  }
455
+ async function localRetry(argv, io, runtimeOptions) {
456
+ const parsed = parseArgs(argv);
457
+ rejectUnknownFlags(parsed, new Set(["dir", "json"]));
458
+ rejectUnexpectedPositionals(parsed, "workbench retry", 1);
459
+ const targetId = parsed.positionals[0];
460
+ if (!targetId) {
461
+ throw new UsageError("Missing required TARGET_ID.");
462
+ }
463
+ const workspace = resolveDir(parsed);
464
+ const target = await resolveLocalRetryTarget(workspace, targetId);
465
+ const captured = createCapturingIo(io);
466
+ const code = target.workflow === "eval"
467
+ ? await localEvaluateCandidate([
468
+ "--dir",
469
+ workspace,
470
+ "--candidate",
471
+ target.candidateId,
472
+ "--runs",
473
+ target.candidateRunId,
474
+ "--samples",
475
+ String(target.samples),
476
+ "--json",
477
+ ], captured.io, runtimeOptions)
478
+ : await localRun([
479
+ "--dir",
480
+ workspace,
481
+ "--from",
482
+ target.candidateId,
483
+ "--runs",
484
+ target.candidateRunId,
485
+ "--budget",
486
+ String(target.budget ?? 1),
487
+ "--samples",
488
+ String(target.samples),
489
+ "--json",
490
+ ], captured.io, runtimeOptions);
491
+ const commandOutput = parseCapturedJson(captured.stdoutText());
492
+ await preserveLocalActiveCandidate(workspace, target.preserveActiveId);
493
+ const outputRecord = readRecord(commandOutput) ?? {};
494
+ const result = {
495
+ ok: code === 0 && outputRecord.ok !== false,
496
+ retried: {
497
+ id: target.sourceId,
498
+ kind: target.sourceKind,
499
+ workflow: target.workflow,
500
+ },
501
+ };
502
+ assignRetryResultString(result, "runId", outputRecord.runId);
503
+ assignRetryResultString(result, "evaluationId", outputRecord.evaluationId);
504
+ assignRetryResultString(result, "candidateId", outputRecord.candidateId);
505
+ assignRetryResultString(result, "activeCandidateId", outputRecord.activeCandidateId);
506
+ const localView = localRetryViewHint(outputRecord.localView);
507
+ if (localView) {
508
+ result.localView = localView;
509
+ }
510
+ const failedJobCount = numberValue(outputRecord.failedJobCount);
511
+ if (failedJobCount !== null) {
512
+ result.failedJobCount = failedJobCount;
513
+ }
514
+ const error = stringValue(outputRecord.error);
515
+ if (error) {
516
+ result.error = error;
517
+ }
518
+ writeOutput(result, parsed, io, formatRetryCommandResult);
519
+ return code;
520
+ }
521
+ async function resolveLocalRetryTarget(workspace, targetId) {
522
+ const snapshot = await loadLocalArchive(workspace);
523
+ const evaluation = snapshot.evaluations.find((entry) => entry.id === targetId);
524
+ if (evaluation) {
525
+ const run = snapshot.runs.find((entry) => entry.id === evaluation.runId) ?? null;
526
+ return localEvaluationRetryTarget(snapshot, evaluation, run, "evaluation", targetId);
527
+ }
528
+ const run = snapshot.runs.find((entry) => entry.id === targetId);
529
+ if (!run) {
530
+ throw new UsageError(`Run or evaluation not found: ${targetId}`);
531
+ }
532
+ if (run.status !== "finished") {
533
+ throw new UsageError(`Run ${run.id} is ${run.status}; wait for it to finish before retrying.`);
534
+ }
535
+ if (!runSummaryFailed(run)) {
536
+ throw new UsageError(`Run ${run.id} did not fail; use workbench ${run.workflow} to intentionally run it again.`);
537
+ }
538
+ if (run.workflow === "eval") {
539
+ const evaluations = snapshot.evaluations.filter((entry) => entry.runId === run.id);
540
+ if (evaluations.length !== 1) {
541
+ throw new UsageError(evaluations.length === 0
542
+ ? `Run ${run.id} has no evaluation record to retry.`
543
+ : `Run ${run.id} has multiple evaluations; retry a specific evaluation id instead.`);
544
+ }
545
+ return localEvaluationRetryTarget(snapshot, evaluations[0], run, "run", targetId);
546
+ }
547
+ const candidateRunId = run.candidateRunId;
548
+ if (!run.candidateId || !candidateRunId) {
549
+ throw new UsageError(`Run ${run.id} is missing retry metadata; use workbench improve --from with an explicit candidate id.`);
550
+ }
551
+ return {
552
+ sourceId: targetId,
553
+ sourceKind: "run",
554
+ workflow: "improve",
555
+ candidateId: run.candidateId,
556
+ candidateRunId,
557
+ samples: run.samples,
558
+ budget: run.budget,
559
+ preserveActiveId: snapshot.activeId,
560
+ };
561
+ }
562
+ function localEvaluationRetryTarget(snapshot, evaluation, run, sourceKind, sourceId) {
563
+ if (!evaluationScorecardFailed(evaluation, run)) {
564
+ throw new UsageError(`Evaluation ${evaluation.id} did not fail; use workbench eval to intentionally run it again.`);
565
+ }
566
+ if (!snapshot.candidates.some((entry) => entry.id === evaluation.candidateId)) {
567
+ throw new UsageError(`Candidate not found for evaluation ${evaluation.id}: ${evaluation.candidateId}`);
568
+ }
569
+ const candidateRunId = evaluation.candidateRunId ?? run?.candidateRunId;
570
+ if (!candidateRunId) {
571
+ throw new UsageError(`Evaluation ${evaluation.id} is missing its candidate run configuration.`);
572
+ }
573
+ return {
574
+ sourceId,
575
+ sourceKind,
576
+ workflow: "eval",
577
+ candidateId: evaluation.candidateId,
578
+ candidateRunId,
579
+ samples: evaluation.sampleCount || run?.samples || 1,
580
+ preserveActiveId: snapshot.activeId,
581
+ };
582
+ }
583
+ async function preserveLocalActiveCandidate(workspace, activeId) {
584
+ let snapshot = await loadLocalArchive(workspace);
585
+ if (activeId && !snapshot.candidates.some((candidate) => candidate.id === activeId)) {
586
+ return;
587
+ }
588
+ if (snapshot.activeId === activeId) {
589
+ return;
590
+ }
591
+ snapshot = setLocalActive(snapshot, activeId);
592
+ await saveLocalArchive(workspace, snapshot);
593
+ }
594
+ function evaluationScorecardFailed(evaluation, run) {
595
+ return evaluation.errorSampleCount > 0 ||
596
+ evaluation.status !== "completed" ||
597
+ runSummaryFailed(run);
598
+ }
599
+ function runSummaryFailed(run) {
600
+ return run?.outcome === "error" || run?.outcome === "cancelled";
601
+ }
602
+ function createCapturingIo(io) {
603
+ const chunks = [];
604
+ const stdout = new class extends Writable {
605
+ _write(chunk, _encoding, callback) {
606
+ chunks.push(Buffer.isBuffer(chunk) ? chunk.toString("utf8") : String(chunk));
607
+ callback();
608
+ }
609
+ }();
610
+ return {
611
+ io: {
612
+ stdin: io.stdin,
613
+ stdout,
614
+ stderr: io.stderr,
615
+ },
616
+ stdoutText: () => chunks.join(""),
617
+ };
618
+ }
619
+ function parseCapturedJson(value) {
620
+ const trimmed = value.trim();
621
+ if (!trimmed) {
622
+ return {};
623
+ }
624
+ try {
625
+ return JSON.parse(trimmed);
626
+ }
627
+ catch {
628
+ return { output: trimmed };
629
+ }
630
+ }
631
+ function localRetryViewHint(value) {
632
+ const record = readRecord(value);
633
+ const command = stringValue(record?.command);
634
+ const note = stringValue(record?.note);
635
+ return command && note ? { command, note } : undefined;
636
+ }
637
+ function assignRetryResultString(result, key, value) {
638
+ const normalized = stringValue(value);
639
+ if (normalized) {
640
+ result[key] = normalized;
641
+ }
642
+ }
496
643
  async function localRun(argv, io, runtimeOptions) {
497
644
  const parsed = parseArgs(argv);
498
- rejectUnknownFlags(parsed, new Set(["dir", "optimizer", "from", "budget", "samples", "json"]));
645
+ rejectUnknownFlags(parsed, new Set(["dir", "runs", "from", "budget", "samples", "rerun", "json"]));
499
646
  const budget = parsePositiveInt(parsed.flags.budget, 1, "budget");
500
647
  const samples = parsePositiveInt(parsed.flags.samples, 1, "samples");
501
648
  const sourceArg = resolveSourceDir(parsed);
502
649
  const projectSource = await readLocalProjectSource(sourceArg, {
503
- optimizerPath: asOptionalString(parsed.flags.optimizer),
650
+ runId: singleRequestedRunId(asOptionalString(parsed.flags.runs), "workbench improve"),
504
651
  });
505
652
  const workspace = projectSource.dir;
506
- if (!projectSource.spec.optimizer) {
507
- throw new UsageError("Optimizer YAML is required for workbench improve.");
653
+ if (!projectSource.spec.improve || !projectSource.spec.candidate.improve) {
654
+ throw new UsageError("Candidate improve configuration is required for workbench improve.");
508
655
  }
509
656
  const executionProject = await resolveLocalProjectForExecution(workspace, projectSource.specSource);
510
657
  const { spec, adapterManifests } = executionProject;
@@ -514,18 +661,32 @@ async function localRun(argv, io, runtimeOptions) {
514
661
  if (caseIds.length === 0) {
515
662
  throw new UsageError("Engine resolver must emit at least one case.");
516
663
  }
664
+ const optimizeSelector = workbenchImproveOptimizeSelector(spec);
665
+ const selectionPolicy = workbenchImproveSelectionPolicy(spec);
666
+ const optimizeCaseIds = workbenchEngineCaseIdsForSelector(engineCases, optimizeSelector);
667
+ if (optimizeCaseIds.length === 0) {
668
+ throw new UsageError(`Improve optimizeOn selector matched no cases: ${formatWorkbenchCaseSelector(optimizeSelector)}.`);
669
+ }
670
+ const selectionCaseIds = workbenchEngineCaseIdsForSelector(engineCases, selectionPolicy.selector);
671
+ if (selectionCaseIds.length === 0) {
672
+ throw new UsageError(`Improve selectBy selector matched no cases: ${formatWorkbenchCaseSelector(selectionPolicy.selector)}.`);
673
+ }
674
+ const selectionScoreCaseIds = workbenchCaseSelectorUsesAllCases(selectionPolicy.selector)
675
+ ? undefined
676
+ : selectionCaseIds;
677
+ const evaluationCaseIds = workbenchEngineCaseIdsForImproveEvaluation({ spec, engineCases });
517
678
  requireValidRunEnvelope({
518
679
  workflow: "improve",
519
680
  budget,
520
681
  samples,
521
- caseCount: caseIds.length,
682
+ caseCount: evaluationCaseIds.length,
522
683
  });
684
+ const optimizeOnLabel = formatWorkbenchCaseSelector(optimizeSelector);
685
+ const selectByLabel = formatWorkbenchSelectionPolicy(selectionPolicy);
523
686
  const environmentRefs = await ensureLocalDockerfileEnvironments(workspace, spec, engineCases);
524
687
  const benchmarkFingerprint = await readLocalBenchmarkFingerprint(workspace);
525
- const runId = `run_local_${Date.now().toString(36)}`;
526
- const startedAt = new Date().toISOString();
527
- let snapshot = await loadLocalArchive(workspace);
528
- const baseSubject = await ensureLocalImproveBaseSubject({
688
+ const executionFingerprint = localRunExecutionFingerprint(projectSource);
689
+ const baseCandidate = await ensureLocalImproveBaseCandidate({
529
690
  parsed,
530
691
  sourceArg,
531
692
  workspace,
@@ -534,242 +695,370 @@ async function localRun(argv, io, runtimeOptions) {
534
695
  io,
535
696
  runtimeOptions,
536
697
  });
537
- let currentBaseId = baseSubject.id;
698
+ let snapshot = await loadLocalArchive(workspace);
699
+ if (parsed.flags.rerun !== true) {
700
+ const reusableRun = findReusableLocalImproveRun(snapshot.runs, {
701
+ benchmarkFingerprint,
702
+ candidateId: baseCandidate.id,
703
+ candidateRunId: projectSource.spec.candidate.selectedRunId,
704
+ executionFingerprint,
705
+ budget,
706
+ samples,
707
+ });
708
+ if (reusableRun) {
709
+ const evaluation = snapshot.evaluations.find((entry) => entry.runId === reusableRun.id) ?? null;
710
+ const outputCandidateId = reusableRun.outputCandidateId ?? reusableRun.candidateId ?? baseCandidate.id;
711
+ const outputCandidate = readLocalCandidate(snapshot, outputCandidateId);
712
+ const activeCandidate = snapshot.activeId
713
+ ? readLocalCandidate(snapshot, snapshot.activeId)
714
+ : null;
715
+ const result = {
716
+ ok: true,
717
+ reused: true,
718
+ runId: reusableRun.id,
719
+ evaluationId: evaluation?.id ?? null,
720
+ outputCandidateId,
721
+ outputCandidate,
722
+ activeCandidateId: snapshot.activeId,
723
+ activeCandidate,
724
+ completedJobCount: 0,
725
+ failedJobCount: 0,
726
+ localView: localDevViewHint(workspace, reusableRun.id),
727
+ };
728
+ writeOutput(result, parsed, io, () => `Reused improve run ${reusableRun.id}. Use --rerun to intentionally run it again.`);
729
+ return 0;
730
+ }
731
+ }
732
+ const runId = `run_local_${Date.now().toString(36)}`;
733
+ const startedAt = new Date().toISOString();
734
+ let currentBaseId = baseCandidate.id;
735
+ let outputCandidateId = null;
538
736
  let completedJobCount = 0;
539
737
  let failedJobCount = 0;
738
+ let attemptsExecuted = 0;
540
739
  const failedJobs = [];
541
740
  const events = [
542
741
  createLocalEvent("run_started", startedAt, {
543
742
  runId,
544
- detail: { budget, samples, strategy: "greedy" },
743
+ detail: { budget, samples, strategy: "greedy", optimizeOn: optimizeOnLabel, selectBy: selectByLabel },
545
744
  }),
546
745
  ];
547
- const devCapacity = await localDevelopmentCapacity(workspace);
548
- const runTraceJobs = [];
549
- const attempts = budget;
550
- for (let attemptIndex = 0; attemptIndex < attempts; attemptIndex += 1) {
551
- snapshot = await loadLocalArchive(workspace);
552
- const activeSubject = readLocalSubject(snapshot, currentBaseId);
553
- const baseFiles = filterSubjectSourceFiles(readLocalSubjectFiles(snapshot, activeSubject.id));
554
- if (baseFiles.length === 0) {
555
- throw new UsageError("Subject snapshot must include at least one file.");
556
- }
557
- const subjectRevisionTraceFiles = [
558
- ...createSubjectEvaluationTraceInputFiles({ subject: activeSubject }),
559
- ...createSubjectRevisionTraceInputFiles({
560
- runId,
561
- jobs: runTraceJobs,
562
- events,
563
- }),
564
- ];
565
- const subjectId = `subject_${runId.replace(/^run_/u, "")}_${String(attemptIndex + 1).padStart(3, "0")}`;
566
- const plannedSubjectRevision = planWorkbenchExecutionJobsForPurpose({
567
- ownerUserId: "local",
568
- projectId: "local",
569
- runId,
570
- subjectId,
571
- attemptIndex,
572
- samples,
573
- caseIds,
574
- engineCases,
575
- spec,
576
- workflow: "improve",
577
- purpose: "improve",
578
- now: new Date().toISOString(),
579
- baseFiles,
580
- traceFiles: subjectRevisionTraceFiles,
581
- ...(environmentRefs.defaultRef ? { environmentRef: environmentRefs.defaultRef } : {}),
582
- baseId: activeSubject.id,
583
- })[0];
584
- const subjectRevisionJobs = await executeLocalDevelopmentDag({
585
- jobs: [plannedSubjectRevision],
586
- spec,
587
- adapterManifests,
588
- adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
589
- baseFiles,
590
- engineResolveFiles,
591
- engineCases,
592
- traceFiles: subjectRevisionTraceFiles,
593
- capacity: devCapacity,
746
+ const runningRun = {
747
+ id: runId,
748
+ workflow: "improve",
749
+ benchmarkFingerprint,
750
+ status: "running",
751
+ candidateId: baseCandidate.id,
752
+ candidateRunId: projectSource.spec.candidate.selectedRunId,
753
+ candidateRunName: projectSource.spec.candidate.selectedRunName,
754
+ startedAt,
755
+ improver: formatSpecImprover(spec),
756
+ engineRun: spec.engineRun.use,
757
+ strategy: "greedy",
758
+ optimizeOn: optimizeOnLabel,
759
+ selectBy: selectByLabel,
760
+ budget,
761
+ repairBudget: 0,
762
+ attemptsRequested: budget,
763
+ attemptsExecuted: 0,
764
+ samples,
765
+ executionFingerprint,
766
+ activeCandidateId: snapshot.activeId,
767
+ outputCandidateId: null,
768
+ };
769
+ snapshot = upsertLocalRun(snapshot, runningRun, events);
770
+ await saveLocalArchive(workspace, snapshot);
771
+ try {
772
+ const devCapacity = await localDevelopmentCapacity(workspace);
773
+ const baselineTraceJobs = selectLocalOptimizerBaselineTraceJobs(snapshot, await readLocalJobs(workspace), {
774
+ benchmarkFingerprint,
775
+ candidateId: baseCandidate.id,
776
+ candidateRunId: projectSource.spec.candidate.selectedRunId,
777
+ executionFingerprint,
594
778
  });
595
- const subjectRevision = subjectRevisionJobs[0];
596
- const completedJobs = [subjectRevision];
597
- if (subjectRevision.status === "succeeded") {
598
- const subjectRevisionFiles = completedJobOutputFiles(subjectRevision).length > 0
599
- ? normalizeSurfaceFiles(completedJobOutputFiles(subjectRevision).filter((file) => !file.path.startsWith(".workbench/")))
600
- : baseFiles;
601
- const attemptJobs = planWorkbenchExecutionJobsForPurpose({
779
+ const runTraceJobs = [];
780
+ const attempts = budget;
781
+ for (let attemptIndex = 0; attemptIndex < attempts; attemptIndex += 1) {
782
+ snapshot = await loadLocalArchive(workspace);
783
+ const activeCandidate = readLocalCandidate(snapshot, currentBaseId);
784
+ const baseFiles = filterCandidateSourceFiles(readLocalCandidateFiles(snapshot, activeCandidate.id));
785
+ if (baseFiles.length === 0) {
786
+ throw new UsageError("Candidate snapshot must include at least one file.");
787
+ }
788
+ const candidateRevisionTraceFiles = createOptimizerTraceInputFiles({
789
+ jobs: filterOptimizerTraceJobsForCaseIds([...baselineTraceJobs, ...runTraceJobs], optimizeCaseIds),
790
+ });
791
+ const candidateId = `candidate_${runId.replace(/^run_/u, "")}_${String(attemptIndex + 1).padStart(3, "0")}`;
792
+ const plannedCandidateRevision = planWorkbenchExecutionJobsForPurpose({
602
793
  ownerUserId: "local",
603
794
  projectId: "local",
604
795
  runId,
605
- subjectId,
796
+ candidateId,
606
797
  attemptIndex,
607
798
  samples,
608
- now: new Date().toISOString(),
609
- caseIds,
799
+ caseIds: optimizeCaseIds,
610
800
  engineCases,
611
801
  spec,
612
- environmentRefsByCase: environmentRefs.byCase,
613
802
  workflow: "improve",
614
- purpose: "attempt",
615
- });
616
- const dagJobs = await executeLocalDevelopmentDag({
617
- jobs: [subjectRevision, ...attemptJobs],
803
+ purpose: "improve",
804
+ now: new Date().toISOString(),
805
+ baseFiles,
806
+ traceFiles: candidateRevisionTraceFiles,
807
+ ...(environmentRefs.defaultRef ? { environmentRef: environmentRefs.defaultRef } : {}),
808
+ baseId: activeCandidate.id,
809
+ })[0];
810
+ const candidateRevisionJobs = await executeLocalDevelopmentDag({
811
+ jobs: [plannedCandidateRevision],
618
812
  spec,
619
813
  adapterManifests,
620
814
  adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
621
- baseFiles: subjectRevisionFiles,
815
+ baseFiles,
622
816
  engineResolveFiles,
623
817
  engineCases,
818
+ traceFiles: candidateRevisionTraceFiles,
624
819
  capacity: devCapacity,
625
820
  });
626
- completedJobs.splice(0, completedJobs.length, ...dagJobs);
627
- }
628
- runTraceJobs.push(...completedJobs);
629
- const materialized = materializeWorkbenchRunResult({
630
- runId,
631
- benchmarkFingerprint,
632
- sourceYaml: projectSource.specSource,
633
- benchmarkSourceFiles: authoredBenchmarkSourceFiles(projectSource),
634
- startedAt,
635
- spec,
636
- jobs: completedJobs,
637
- previousSubject: activeSubject,
638
- existingSubjectCount: snapshot.subjects.length,
639
- });
640
- for (const subject of materialized.subjects) {
641
- snapshot = upsertLocalSubject(snapshot, subject, materialized.subjectFiles[subject.id] ?? []);
642
- events.push(createLocalEvent("subject_created", subject.createdAt, {
821
+ const candidateRevision = candidateRevisionJobs[0];
822
+ const completedJobs = [candidateRevision];
823
+ if (candidateRevision.status === "succeeded") {
824
+ const candidateRevisionFiles = completedJobOutputFiles(candidateRevision).length > 0
825
+ ? normalizeSurfaceFiles(completedJobOutputFiles(candidateRevision).filter((file) => !file.path.startsWith(".workbench/")))
826
+ : baseFiles;
827
+ const attemptJobs = planWorkbenchExecutionJobsForPurpose({
828
+ ownerUserId: "local",
829
+ projectId: "local",
830
+ runId,
831
+ candidateId,
832
+ attemptIndex,
833
+ samples,
834
+ now: new Date().toISOString(),
835
+ caseIds: evaluationCaseIds,
836
+ engineCases,
837
+ spec,
838
+ environmentRefsByCase: environmentRefs.byCase,
839
+ workflow: "improve",
840
+ purpose: "attempt",
841
+ });
842
+ const dagJobs = await executeLocalDevelopmentDag({
843
+ jobs: [candidateRevision, ...attemptJobs],
844
+ spec,
845
+ adapterManifests,
846
+ adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
847
+ baseFiles: candidateRevisionFiles,
848
+ engineResolveFiles,
849
+ engineCases,
850
+ capacity: devCapacity,
851
+ });
852
+ completedJobs.splice(0, completedJobs.length, ...dagJobs);
853
+ }
854
+ runTraceJobs.push(...completedJobs);
855
+ const materialized = materializeWorkbenchRunResult({
643
856
  runId,
644
- subjectId: subject.id,
645
- baseId: subject.baseId,
646
- status: subject.status,
647
- metrics: subject.metrics,
857
+ benchmarkFingerprint,
858
+ sourceYaml: projectSource.specSource,
859
+ benchmarkSourceFiles: authoredBenchmarkSourceFiles(projectSource),
860
+ startedAt,
861
+ spec,
862
+ jobs: completedJobs,
863
+ previousCandidate: activeCandidate,
864
+ existingCandidateCount: snapshot.candidates.length,
865
+ selection: {
866
+ metric: selectionPolicy.metric,
867
+ ...(selectionScoreCaseIds ? { caseIds: selectionScoreCaseIds } : {}),
868
+ label: selectByLabel,
869
+ },
870
+ });
871
+ for (const candidate of materialized.candidates) {
872
+ outputCandidateId = candidate.id;
873
+ snapshot = upsertLocalCandidate(snapshot, candidate, materialized.candidateFiles[candidate.id] ?? []);
874
+ events.push(createLocalEvent("candidate_created", candidate.createdAt, {
875
+ runId,
876
+ candidateId: candidate.id,
877
+ baseId: candidate.baseId,
878
+ status: candidate.status,
879
+ metrics: evaluationMeanMetrics(candidate.eval),
880
+ }));
881
+ }
882
+ for (const evaluation of materialized.evaluations) {
883
+ snapshot = upsertLocalEvaluation(snapshot, evaluation);
884
+ }
885
+ snapshot = setLocalActive(snapshot, materialized.activeCandidateId);
886
+ currentBaseId = materialized.activeCandidateId ?? currentBaseId;
887
+ completedJobCount += materialized.completedJobCount;
888
+ failedJobCount += materialized.failedJobCount;
889
+ failedJobs.push(...completedJobs
890
+ .filter((job) => job.status === "failed")
891
+ .map((job) => ({
892
+ id: job.id,
893
+ purpose: workbenchExecutionPurpose(job),
894
+ error: job.error ?? "Job failed without an error message.",
895
+ })));
896
+ events.push(createLocalEvent("active_changed", new Date().toISOString(), {
897
+ runId,
898
+ candidateId: materialized.activeCandidateId ?? undefined,
899
+ activeId: materialized.activeCandidateId ?? undefined,
900
+ status: materialized.selectedCandidate?.status,
901
+ metrics: evaluationMeanMetrics(materialized.selectedCandidate?.eval),
648
902
  }));
903
+ await saveLocalJobs(workspace, completedJobs);
904
+ await saveLocalArchive(workspace, snapshot);
905
+ attemptsExecuted += 1;
649
906
  }
650
- for (const evaluation of materialized.evaluations) {
651
- snapshot = upsertLocalEvaluation(snapshot, evaluation);
652
- }
653
- snapshot = setLocalActive(snapshot, materialized.activeSubjectId);
654
- currentBaseId = materialized.activeSubjectId ?? currentBaseId;
655
- completedJobCount += materialized.completedJobCount;
656
- failedJobCount += materialized.failedJobCount;
657
- failedJobs.push(...completedJobs
658
- .filter((job) => job.status === "failed")
659
- .map((job) => ({
660
- id: job.id,
661
- purpose: workbenchExecutionPurpose(job),
662
- error: job.error ?? "Job failed without an error message.",
663
- })));
664
- events.push(createLocalEvent("active_changed", new Date().toISOString(), {
907
+ snapshot = await loadLocalArchive(workspace);
908
+ const finishedAt = new Date().toISOString();
909
+ const run = {
910
+ id: runId,
911
+ workflow: "improve",
912
+ benchmarkFingerprint,
913
+ status: "finished",
914
+ candidateId: baseCandidate.id,
915
+ candidateRunId: projectSource.spec.candidate.selectedRunId,
916
+ candidateRunName: projectSource.spec.candidate.selectedRunName,
917
+ startedAt,
918
+ finishedAt,
919
+ durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
920
+ improver: formatSpecImprover(spec),
921
+ engineRun: spec.engineRun.use,
922
+ strategy: "greedy",
923
+ optimizeOn: optimizeOnLabel,
924
+ selectBy: selectByLabel,
925
+ budget,
926
+ repairBudget: 0,
927
+ attemptsRequested: budget,
928
+ attemptsExecuted,
929
+ samples,
930
+ executionFingerprint,
931
+ stoppedReason: "budget_exhausted",
932
+ outcome: failedJobCount > 0 ? "error" : "ok",
933
+ activeCandidateId: snapshot.activeId,
934
+ outputCandidateId: outputCandidateId ?? snapshot.activeId,
935
+ };
936
+ events.push(createLocalEvent("run_finished", finishedAt, {
665
937
  runId,
666
- subjectId: materialized.activeSubjectId ?? undefined,
667
- activeId: materialized.activeSubjectId ?? undefined,
668
- status: materialized.selectedSubject?.status,
669
- metrics: materialized.selectedSubject?.metrics,
938
+ detail: {
939
+ outcome: run.outcome ?? null,
940
+ attemptsExecuted: run.attemptsExecuted,
941
+ durationMs: run.durationMs ?? null,
942
+ },
670
943
  }));
671
- await saveLocalJobs(workspace, completedJobs);
944
+ snapshot = upsertLocalRun(snapshot, run, events.slice(1));
672
945
  await saveLocalArchive(workspace, snapshot);
946
+ const outputCandidate = run.outputCandidateId
947
+ ? readLocalCandidate(snapshot, run.outputCandidateId)
948
+ : null;
949
+ const activeCandidate = snapshot.activeId
950
+ ? readLocalCandidate(snapshot, snapshot.activeId)
951
+ : null;
952
+ const result = {
953
+ ok: failedJobCount === 0,
954
+ runId,
955
+ outputCandidateId: run.outputCandidateId,
956
+ outputCandidate,
957
+ activeCandidateId: snapshot.activeId,
958
+ activeCandidate,
959
+ completedJobCount,
960
+ failedJobCount,
961
+ failedJobs,
962
+ localView: localDevViewHint(workspace, runId),
963
+ };
964
+ writeOutput(result, parsed, io, () => {
965
+ const outputMetricValue = outputCandidate ? formatCandidateEvaluationScore(outputCandidate) : "n/a";
966
+ const activeMetricValue = activeCandidate ? formatCandidateEvaluationScore(activeCandidate) : "n/a";
967
+ const firstFailure = result.failedJobs[0];
968
+ const failureDetail = firstFailure
969
+ ? `\nFirst failed job ${firstFailure.id}${firstFailure.purpose ? ` (${firstFailure.purpose})` : ""}: ${firstFailure.error}`
970
+ : "";
971
+ const viewDetail = failedJobCount === 0
972
+ ? `\nOpen local view: ${result.localView.command}\n${result.localView.note}`
973
+ : "";
974
+ return `Run ${runId} finished. Output candidate: ${formatLocalCandidateLabel(outputCandidate)} (score: ${outputMetricValue}). Active candidate: ${formatLocalCandidateLabel(activeCandidate)} (score: ${activeMetricValue}).${failureDetail}${viewDetail}`;
975
+ });
976
+ return failedJobCount === 0 ? 0 : 1;
977
+ }
978
+ catch (error) {
979
+ await markLocalRunFailed({
980
+ workspace,
981
+ run: {
982
+ ...runningRun,
983
+ attemptsExecuted,
984
+ outputCandidateId,
985
+ },
986
+ startedAt,
987
+ error,
988
+ }).catch(() => undefined);
989
+ throw error;
673
990
  }
674
- snapshot = await loadLocalArchive(workspace);
675
- const finishedAt = new Date().toISOString();
676
- const run = {
677
- id: runId,
678
- workflow: "improve",
679
- benchmarkFingerprint,
680
- status: "finished",
681
- startedAt,
682
- finishedAt,
683
- durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
684
- optimizer: formatSpecOptimizer(spec),
685
- engineRun: spec.engineRun.use,
686
- strategy: "greedy",
687
- budget,
688
- repairBudget: 0,
689
- attemptsRequested: budget,
690
- attemptsExecuted: budget,
691
- samples,
692
- stoppedReason: "budget_exhausted",
693
- outcome: failedJobCount > 0 ? "error" : "ok",
694
- };
695
- events.push(createLocalEvent("run_finished", finishedAt, {
696
- runId,
697
- detail: {
698
- outcome: run.outcome ?? null,
699
- attemptsExecuted: run.attemptsExecuted,
700
- durationMs: run.durationMs ?? null,
701
- },
702
- }));
703
- snapshot = appendLocalRun(snapshot, run, events);
704
- await saveLocalArchive(workspace, snapshot);
705
- const selected = snapshot.activeId
706
- ? readLocalSubject(snapshot, snapshot.activeId)
707
- : null;
708
- const result = {
709
- ok: failedJobCount === 0,
710
- runId,
711
- activeSubjectId: snapshot.activeId,
712
- selectedSubject: selected,
713
- completedJobCount,
714
- failedJobCount,
715
- failedJobs,
716
- localView: localDevViewHint(workspace, runId),
717
- };
718
- writeOutput(result, parsed, io, () => {
719
- const metricValue = selected?.metrics?.score ?? "n/a";
720
- const firstFailure = result.failedJobs[0];
721
- const failureDetail = firstFailure
722
- ? `\nFirst failed job ${firstFailure.id}${firstFailure.purpose ? ` (${firstFailure.purpose})` : ""}: ${firstFailure.error}`
723
- : "";
724
- const viewDetail = failedJobCount === 0
725
- ? `\nOpen local view: ${result.localView.command}\n${result.localView.note}`
726
- : "";
727
- return `Run ${runId} finished. Active subject: ${snapshot.activeId ?? "none"} (score: ${metricValue}).${failureDetail}${viewDetail}`;
728
- });
729
- return failedJobCount === 0 ? 0 : 1;
730
991
  }
731
/**
 * Resolves the candidate an improve run starts from, evaluating it first when
 * the local archive holds no evaluated record for it.
 *
 * Two paths:
 *  - `--from <id>` given: that candidate is read from the archive, checked
 *    against the current benchmark fingerprint, and evaluated on demand.
 *  - otherwise: the candidate is located by the fingerprint of the project
 *    source; if no evaluated record exists, the source is evaluated and the
 *    archive is searched again.
 *
 * @param {object} args
 * @param {string} args.workspace - workspace directory holding the local archive.
 * @param {object} args.parsed - parsed CLI arguments (`flags`, `positionals`).
 * @param {object} args.projectSource - loaded project source for the selected run.
 * @param {string} args.sourceArg - source argument forwarded when positionals were given.
 * @param {number} args.samples - sample count forwarded to the evaluation.
 * @param {object} args.io - IO handle; evaluation runs through a silent wrapper of it.
 * @param {object} args.runtimeOptions - forwarded unchanged to the evaluation.
 * @returns {Promise<object>} the evaluated base candidate record.
 * @throws {UsageError} on benchmark mismatch, missing fingerprint, or failed evaluation.
 */
async function ensureLocalImproveBaseCandidate(args) {
    const { workspace, parsed, projectSource, samples } = args;
    let snapshot = await loadLocalArchive(workspace);
    const explicitBase = asOptionalString(parsed.flags.from);
    const benchmarkFingerprint = await readLocalBenchmarkFingerprint(workspace);

    // A candidate counts as evaluated when its status says so or an eval record is attached.
    const hasEvaluation = (record) => record.status === "evaluated" || Boolean(record.eval);
    // Flags shared by every evaluation invocation issued from this function.
    const sharedEvalFlags = () => [
        "--runs",
        projectSource.spec.candidate.selectedRunId,
        "--samples",
        String(samples),
        ...(parsed.flags.rerun === true ? ["--rerun"] : []),
        "--json",
    ];
    const evaluateQuietly = (argv) => localEvaluateCandidate(argv, createSilentIo(args.io), args.runtimeOptions);

    if (explicitBase) {
        let candidate = readLocalCandidate(snapshot, explicitBase);
        if (candidate.benchmarkFingerprint !== benchmarkFingerprint) {
            throw new UsageError(`Base candidate ${explicitBase} belongs to benchmark ${candidate.benchmarkFingerprint}, not ${benchmarkFingerprint}.`);
        }
        if (!candidate.candidateFingerprint) {
            throw new UsageError(`Base candidate ${explicitBase} is missing a candidate fingerprint.`);
        }
        if (hasEvaluation(candidate)) {
            return candidate;
        }
        const exitCode = await evaluateQuietly([
            "--dir",
            workspace,
            "--candidate",
            explicitBase,
            ...sharedEvalFlags(),
        ]);
        if (exitCode !== 0) {
            throw new UsageError(`Base candidate ${explicitBase} eval failed; improve was not started.`);
        }
        // Re-read the archive so the returned record reflects the evaluation just written.
        snapshot = await loadLocalArchive(workspace);
        candidate = readLocalCandidate(snapshot, explicitBase);
        return candidate;
    }

    const candidateFingerprint = localCandidateFingerprint(projectSource);
    const isEvaluatedSourceCandidate = (record) => record.benchmarkFingerprint === benchmarkFingerprint &&
        record.candidateFingerprint === candidateFingerprint &&
        hasEvaluation(record);

    const existing = snapshot.candidates.find(isEvaluatedSourceCandidate);
    if (existing) {
        return existing;
    }

    // Forward the caller's positional source when one was given; otherwise point at the workspace.
    const sourceSelector = parsed.positionals.length > 0
        ? [args.sourceArg]
        : ["--dir", workspace];
    const exitCode = await evaluateQuietly([...sourceSelector, ...sharedEvalFlags()]);
    if (exitCode !== 0) {
        throw new UsageError("Parent candidate eval failed; improve was not started.");
    }

    snapshot = await loadLocalArchive(workspace);
    const evaluated = snapshot.candidates.find(isEvaluatedSourceCandidate);
    if (!evaluated) {
        throw new UsageError("Parent candidate eval did not produce an evaluated candidate.");
    }
    return evaluated;
}
@@ -785,13 +1074,62 @@ function createSilentIo(io) {
785
1074
  stderr: io.stderr,
786
1075
  };
787
1076
  }
788
- async function localEvaluateSubject(argv, io, runtimeOptions) {
1077
/**
 * Returns the jobs belonging to the most recent evaluation of the target
 * candidate, for use as baseline trace input to the optimizer.
 *
 * An evaluation matches when its benchmark fingerprint, candidate id and
 * candidate run id equal the target's, and the run it belongs to carries the
 * target's execution fingerprint. Among matches, the one with the greatest
 * `updatedAt` wins; ties fall back to the greatest `runId`, then to the
 * earliest position in `snapshot.evaluations`.
 *
 * @param {{ runs: Array<object>, evaluations: Array<object> }} snapshot - local archive snapshot.
 * @param {Array<{ runId: string }>} jobs - all locally stored jobs.
 * @param {{ benchmarkFingerprint: string, candidateId: string, candidateRunId: string, executionFingerprint: string }} target
 * @returns {Array<object>} jobs whose `runId` equals the chosen evaluation's run, or `[]` when nothing matches.
 */
function selectLocalOptimizerBaselineTraceJobs(snapshot, jobs, target) {
    const runById = new Map(snapshot.runs.map((run) => [run.id, run]));
    // Missing timestamps/run ids compare as "" so an incomplete record sorts last
    // instead of throwing once a second match forces a comparison.
    const isMoreRecent = (entry, best) => {
        const byTime = (entry.updatedAt ?? "").localeCompare(best.updatedAt ?? "");
        if (byTime !== 0) {
            return byTime > 0;
        }
        return (entry.runId ?? "").localeCompare(best.runId ?? "") > 0;
    };
    // Single pass: only the newest match is needed, so no copy + sort.
    let latest = null;
    for (const entry of snapshot.evaluations) {
        const matchesTarget = entry.benchmarkFingerprint === target.benchmarkFingerprint &&
            entry.candidateId === target.candidateId &&
            entry.candidateRunId === target.candidateRunId &&
            runById.get(entry.runId)?.executionFingerprint === target.executionFingerprint;
        if (!matchesTarget) {
            continue;
        }
        // Strictly-greater keeps the earliest entry on a full tie.
        if (latest === null || isMoreRecent(entry, latest)) {
            latest = entry;
        }
    }
    if (latest === null) {
        return [];
    }
    return jobs.filter((job) => job.runId === latest.runId);
}
1094
+ async function localEvaluateCandidate(argv, io, runtimeOptions) {
789
1095
  void runtimeOptions;
790
1096
  const parsed = parseArgs(argv);
791
- rejectUnknownFlags(parsed, new Set(["dir", "subject", "samples", "json"]));
1097
+ rejectUnknownFlags(parsed, new Set(["dir", "candidate", "runs", "samples", "rerun", "json"]));
792
1098
  const samples = parsePositiveInt(parsed.flags.samples, 1, "samples");
793
1099
  const sourceArg = resolveSourceDir(parsed);
794
- const projectSource = await readLocalProjectSource(sourceArg);
1100
+ const runsFlag = asOptionalString(parsed.flags.runs);
1101
+ const defaultProjectSource = await readLocalProjectSource(sourceArg);
1102
+ const selectedRunIds = resolveCandidateRunSelection(defaultProjectSource, runsFlag);
1103
+ if (selectedRunIds.length > 1) {
1104
+ let failed = 0;
1105
+ for (const runId of selectedRunIds) {
1106
+ const args = [
1107
+ "--dir",
1108
+ defaultProjectSource.dir,
1109
+ "--runs",
1110
+ runId,
1111
+ "--samples",
1112
+ String(samples),
1113
+ ...(readOptionalCandidateFlag(parsed) ? ["--candidate", readOptionalCandidateFlag(parsed)] : []),
1114
+ ...(parsed.flags.rerun === true ? ["--rerun"] : []),
1115
+ "--json",
1116
+ ];
1117
+ const code = await localEvaluateCandidate(args, createSilentIo(io), runtimeOptions);
1118
+ if (code !== 0) {
1119
+ failed += 1;
1120
+ }
1121
+ }
1122
+ writeOutput({
1123
+ ok: failed === 0,
1124
+ candidateId: defaultProjectSource.candidateName,
1125
+ candidateRunIds: selectedRunIds,
1126
+ failedRunCount: failed,
1127
+ }, parsed, io, () => `Evaluated ${selectedRunIds.length} candidate run(s); ${failed} failed.`);
1128
+ return failed === 0 ? 0 : 1;
1129
+ }
1130
+ const projectSource = selectedRunIds[0] === defaultProjectSource.candidateRunId
1131
+ ? defaultProjectSource
1132
+ : await readLocalProjectSource(sourceArg, { runId: selectedRunIds[0] });
795
1133
  const workspace = projectSource.dir;
796
1134
  const executionProject = await resolveLocalProjectForExecution(workspace, projectSource.specSource);
797
1135
  const { spec, adapterManifests } = executionProject;
@@ -810,114 +1148,367 @@ async function localEvaluateSubject(argv, io, runtimeOptions) {
810
1148
  const environmentRefs = await ensureLocalDockerfileEnvironments(workspace, spec, engineCases);
811
1149
  let snapshot = await loadLocalArchive(workspace);
812
1150
  const benchmarkFingerprint = await readLocalBenchmarkFingerprint(workspace);
813
- const sourceSubjectFingerprint = localSubjectFingerprint(projectSource);
814
- const explicitSubjectId = asOptionalString(parsed.flags.subject);
815
- const existingSourceSubject = snapshot.subjects.find((subject) => subject.benchmarkFingerprint === benchmarkFingerprint &&
816
- subject.subjectFingerprint === sourceSubjectFingerprint);
817
- const subjectId = explicitSubjectId ?? existingSourceSubject?.id ?? `subject_${sourceSubjectFingerprint.slice(0, 12)}`;
818
- const existingSubject = snapshot.subjects.find((subject) => subject.id === subjectId);
819
- const files = filterSubjectSourceFiles(existingSubject
820
- ? readLocalSubjectFiles(snapshot, subjectId)
821
- : normalizeSurfaceFiles(projectSource.subjectFiles));
1151
+ const executionFingerprint = localRunExecutionFingerprint(projectSource);
1152
+ const sourceCandidateFingerprint = localCandidateFingerprint(projectSource);
1153
+ const explicitCandidateId = readOptionalCandidateFlag(parsed);
1154
+ const existingSourceCandidate = snapshot.candidates.find((candidate) => candidate.benchmarkFingerprint === benchmarkFingerprint &&
1155
+ candidate.candidateFingerprint === sourceCandidateFingerprint);
1156
+ const candidateId = explicitCandidateId ?? existingSourceCandidate?.id ?? `candidate_${sourceCandidateFingerprint.slice(0, 12)}`;
1157
+ const existingCandidate = snapshot.candidates.find((candidate) => candidate.id === candidateId);
1158
+ const activeCandidateIdBeforeEval = snapshot.activeId;
1159
+ const selectedCandidateRunId = projectSource.spec.candidate.selectedRunId;
1160
+ const files = filterCandidateSourceFiles(existingCandidate
1161
+ ? readLocalCandidateFiles(snapshot, candidateId)
1162
+ : normalizeSurfaceFiles(projectSource.candidateFiles));
1163
+ const evaluationWork = parsed.flags.rerun !== true
1164
+ ? await resolveLocalEvaluationWork(workspace, snapshot, {
1165
+ benchmarkFingerprint,
1166
+ candidateId,
1167
+ candidateFingerprint: existingCandidate?.candidateFingerprint ?? sourceCandidateFingerprint,
1168
+ candidateRunId: selectedCandidateRunId,
1169
+ executionFingerprint,
1170
+ samples,
1171
+ caseIds,
1172
+ })
1173
+ : null;
1174
+ const reusableEvaluation = evaluationWork?.reusableEvaluation ?? null;
1175
+ if (reusableEvaluation) {
1176
+ const result = {
1177
+ ok: true,
1178
+ reused: true,
1179
+ runId: reusableEvaluation.runId,
1180
+ evaluation: reusableEvaluation,
1181
+ evaluationId: reusableEvaluation.id,
1182
+ candidateId,
1183
+ completedJobCount: 0,
1184
+ failedJobCount: 0,
1185
+ localView: localDevViewHint(workspace, reusableEvaluation.runId),
1186
+ };
1187
+ writeOutput(result, parsed, io, () => `Reused evaluation ${reusableEvaluation.id}. Use --rerun to intentionally run it again.`);
1188
+ return 0;
1189
+ }
1190
+ const selectedPairs = evaluationWork?.missingPairs.length
1191
+ ? evaluationWork.missingPairs
1192
+ : allCaseSamplePairs(caseIds, samples);
822
1193
  const runId = `eval_local_${Date.now().toString(36)}`;
823
- const evaluatedSubjectId = subjectId;
1194
+ const evaluatedCandidateId = candidateId;
824
1195
  const startedAt = new Date().toISOString();
825
- const baseline = createRuntimeBaselineSubjectJob({
826
- ownerUserId: "local",
827
- projectId: "local",
828
- runId,
829
- subjectId: evaluatedSubjectId,
830
- attemptIndex: 0,
831
- files,
832
- now: startedAt,
833
- baseId: null,
834
- });
835
- const completedJobs = [baseline];
836
- const attemptJobs = planWorkbenchExecutionJobsForPurpose({
837
- ownerUserId: "local",
838
- projectId: "local",
839
- runId,
840
- subjectId: evaluatedSubjectId,
841
- attemptIndex: 0,
842
- samples,
843
- now: startedAt,
844
- caseIds,
845
- engineCases,
846
- spec,
847
- environmentRefsByCase: environmentRefs.byCase,
848
- workflow: "eval",
849
- purpose: "attempt",
850
- });
851
- const dagJobs = await executeLocalDevelopmentDag({
852
- jobs: [baseline, ...attemptJobs],
853
- spec,
854
- adapterManifests,
855
- adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
856
- baseFiles: files,
857
- engineResolveFiles,
858
- engineCases,
859
- capacity: await localDevelopmentCapacity(workspace),
860
- });
861
- completedJobs.splice(0, completedJobs.length, ...dagJobs);
862
- const materialized = materializeWorkbenchRunResult({
1196
+ const runStartedEvent = createLocalEvent("run_started", startedAt, {
863
1197
  runId,
864
- benchmarkFingerprint,
865
- sourceYaml: projectSource.specSource,
866
- benchmarkSourceFiles: authoredBenchmarkSourceFiles(projectSource),
867
- subjectFingerprint: existingSubject?.subjectFingerprint ?? sourceSubjectFingerprint,
868
- ...(!existingSubject || existingSubject.subjectFingerprint === sourceSubjectFingerprint
869
- ? { subjectSourceFiles: authoredSubjectSourceFiles(projectSource) }
870
- : {}),
871
- startedAt,
872
- spec,
873
- jobs: completedJobs,
874
- previousSubject: null,
875
- existingSubjectCount: snapshot.subjects.length,
1198
+ candidateId: evaluatedCandidateId,
1199
+ detail: { samples, strategy: "direct" },
876
1200
  });
877
- for (const subjectRecord of materialized.subjects) {
878
- snapshot = upsertLocalSubject(snapshot, subjectRecord, materialized.subjectFiles[subjectRecord.id] ?? []);
879
- }
880
- if (materialized.activeSubjectId) {
881
- snapshot = setLocalActive(snapshot, materialized.activeSubjectId);
882
- }
883
- for (const evaluation of materialized.evaluations) {
884
- snapshot = upsertLocalEvaluation(snapshot, evaluation);
885
- }
886
- const finishedAt = new Date().toISOString();
887
- snapshot = appendLocalRun(snapshot, {
1201
+ const runningRun = {
888
1202
  id: runId,
889
1203
  workflow: "eval",
890
1204
  benchmarkFingerprint,
891
- status: "finished",
1205
+ status: "running",
1206
+ candidateId: evaluatedCandidateId,
1207
+ candidateRunId: projectSource.spec.candidate.selectedRunId,
1208
+ candidateRunName: projectSource.spec.candidate.selectedRunName,
892
1209
  startedAt,
893
- finishedAt,
894
- durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
895
- optimizer: "none",
1210
+ improver: "none",
896
1211
  engineRun: spec.engineRun.use,
897
1212
  strategy: "direct",
898
1213
  budget: 1,
899
1214
  repairBudget: 0,
900
1215
  attemptsRequested: 1,
901
- attemptsExecuted: 1,
1216
+ attemptsExecuted: 0,
902
1217
  samples,
903
- stoppedReason: "completed",
904
- outcome: materialized.failedJobCount > 0 ? "error" : "ok",
905
- }, []);
906
- await saveLocalJobs(workspace, completedJobs);
1218
+ executionFingerprint,
1219
+ activeCandidateId: activeCandidateIdBeforeEval,
1220
+ outputCandidateId: evaluatedCandidateId,
1221
+ };
1222
+ snapshot = upsertLocalRun(snapshot, runningRun, [runStartedEvent]);
907
1223
  await saveLocalArchive(workspace, snapshot);
908
- const evaluation = materialized.evaluations[0] ?? null;
909
- const result = {
910
- ok: materialized.failedJobCount === 0,
911
- runId,
912
- evaluation,
913
- evaluationId: evaluation?.id ?? null,
914
- subjectId: evaluatedSubjectId,
915
- completedJobCount: materialized.completedJobCount,
916
- failedJobCount: materialized.failedJobCount,
917
- localView: localDevViewHint(workspace, runId),
1224
+ try {
1225
+ const baseline = createRuntimeBaselineCandidateJob({
1226
+ ownerUserId: "local",
1227
+ projectId: "local",
1228
+ runId,
1229
+ candidateId: evaluatedCandidateId,
1230
+ attemptIndex: 0,
1231
+ files,
1232
+ now: startedAt,
1233
+ baseId: null,
1234
+ });
1235
+ const attemptJobs = planWorkbenchExecutionJobsForPurpose({
1236
+ ownerUserId: "local",
1237
+ projectId: "local",
1238
+ runId,
1239
+ candidateId: evaluatedCandidateId,
1240
+ attemptIndex: 0,
1241
+ samples,
1242
+ now: startedAt,
1243
+ caseIds: orderedCaseIdsForPairs(caseIds, selectedPairs),
1244
+ sampleIndexesByCase: sampleIndexesByCase(selectedPairs),
1245
+ engineCases,
1246
+ spec,
1247
+ environmentRefsByCase: environmentRefs.byCase,
1248
+ workflow: "eval",
1249
+ purpose: "attempt",
1250
+ });
1251
+ const dagJobs = await executeLocalDevelopmentDag({
1252
+ jobs: [baseline, ...attemptJobs],
1253
+ spec,
1254
+ adapterManifests,
1255
+ adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
1256
+ baseFiles: files,
1257
+ engineResolveFiles,
1258
+ engineCases,
1259
+ capacity: await localDevelopmentCapacity(workspace),
1260
+ });
1261
+ const materializationJobs = [
1262
+ ...(evaluationWork?.priorAttemptJobs ?? []),
1263
+ ...dagJobs,
1264
+ ];
1265
+ const currentRunJobs = dagJobs.filter((job) => job.runId === runId);
1266
+ const currentRunCompletedJobCount = currentRunJobs.filter((job) => job.status === "succeeded").length;
1267
+ const currentRunFailedJobCount = currentRunJobs.filter((job) => job.status === "failed").length;
1268
+ const materialized = materializeWorkbenchRunResult({
1269
+ runId,
1270
+ benchmarkFingerprint,
1271
+ sourceYaml: projectSource.specSource,
1272
+ benchmarkSourceFiles: authoredBenchmarkSourceFiles(projectSource),
1273
+ candidateFingerprint: existingCandidate?.candidateFingerprint ?? sourceCandidateFingerprint,
1274
+ ...(!existingCandidate || existingCandidate.candidateFingerprint === sourceCandidateFingerprint
1275
+ ? { candidateSourceFiles: authoredCandidateSourceFiles(projectSource) }
1276
+ : {}),
1277
+ startedAt,
1278
+ spec,
1279
+ jobs: materializationJobs,
1280
+ previousCandidate: existingCandidate ?? null,
1281
+ existingCandidateCount: snapshot.candidates.length,
1282
+ });
1283
+ for (const candidateRecord of materialized.candidates) {
1284
+ snapshot = upsertLocalCandidate(snapshot, candidateRecord, materialized.candidateFiles[candidateRecord.id] ?? []);
1285
+ }
1286
+ if (materialized.activeCandidateId) {
1287
+ snapshot = setLocalActive(snapshot, materialized.activeCandidateId);
1288
+ }
1289
+ for (const evaluation of materialized.evaluations) {
1290
+ snapshot = upsertLocalEvaluation(snapshot, evaluation);
1291
+ }
1292
+ const activeCandidateId = activeCandidateIdBeforeEval ?? materialized.activeCandidateId ?? null;
1293
+ const finishedAt = new Date().toISOString();
1294
+ if (activeCandidateId) {
1295
+ snapshot = setLocalActive(snapshot, activeCandidateId);
1296
+ }
1297
+ const runFinishedEvent = createLocalEvent("run_finished", finishedAt, {
1298
+ runId,
1299
+ candidateId: evaluatedCandidateId,
1300
+ detail: {
1301
+ outcome: currentRunFailedJobCount > 0 ? "error" : "ok",
1302
+ attemptsExecuted: 1,
1303
+ durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
1304
+ },
1305
+ });
1306
+ snapshot = upsertLocalRun(snapshot, {
1307
+ id: runId,
1308
+ workflow: "eval",
1309
+ benchmarkFingerprint,
1310
+ status: "finished",
1311
+ candidateId: evaluatedCandidateId,
1312
+ candidateRunId: projectSource.spec.candidate.selectedRunId,
1313
+ candidateRunName: projectSource.spec.candidate.selectedRunName,
1314
+ startedAt,
1315
+ finishedAt,
1316
+ durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
1317
+ improver: "none",
1318
+ engineRun: spec.engineRun.use,
1319
+ strategy: "direct",
1320
+ budget: 1,
1321
+ repairBudget: 0,
1322
+ attemptsRequested: 1,
1323
+ attemptsExecuted: 1,
1324
+ samples,
1325
+ executionFingerprint,
1326
+ stoppedReason: "completed",
1327
+ outcome: currentRunFailedJobCount > 0 ? "error" : "ok",
1328
+ activeCandidateId,
1329
+ outputCandidateId: evaluatedCandidateId,
1330
+ }, [runFinishedEvent]);
1331
+ await saveLocalJobs(workspace, currentRunJobs);
1332
+ await saveLocalArchive(workspace, snapshot);
1333
+ const evaluation = materialized.evaluations[0] ?? null;
1334
+ const result = {
1335
+ ok: currentRunFailedJobCount === 0,
1336
+ runId,
1337
+ evaluation,
1338
+ evaluationId: evaluation?.id ?? null,
1339
+ candidateId: evaluatedCandidateId,
1340
+ activeCandidateId,
1341
+ completedJobCount: currentRunCompletedJobCount,
1342
+ failedJobCount: currentRunFailedJobCount,
1343
+ localView: localDevViewHint(workspace, runId),
1344
+ };
1345
+ writeOutput(result, parsed, io, ({ evaluationId, candidateId }) => `Evaluation ${evaluationId ?? runId} finished for candidate ${candidateId}.\nOpen local view: ${result.localView.command}\n${result.localView.note}`);
1346
+ return currentRunFailedJobCount === 0 ? 0 : 1;
1347
+ }
1348
+ catch (error) {
1349
+ await markLocalRunFailed({
1350
+ workspace,
1351
+ run: runningRun,
1352
+ startedAt,
1353
+ error,
1354
+ }).catch(() => undefined);
1355
+ throw error;
1356
+ }
1357
+ }
1358
/**
 * Decide how much evaluation work for `target` can be reused from the local archive.
 *
 * @param workspace Workspace handed to `readLocalJobs` when prior jobs must be inspected.
 * @param snapshot Archive snapshot; its `runs` and `evaluations` arrays are read.
 * @param target Describes the wanted evaluation: benchmarkFingerprint, candidateId,
 *   candidateFingerprint, candidateRunId, executionFingerprint, samples and caseIds.
 * @returns `null` when nothing can be reused; otherwise an object with
 *   `reusableEvaluation` (a finished evaluation to reuse as-is, or `null`),
 *   `missingPairs` (case/sample pairs that still have to run) and
 *   `priorAttemptJobs` (earlier jobs covering the remaining pairs).
 */
async function resolveLocalEvaluationWork(workspace, snapshot, target) {
    const runsById = new Map();
    for (const run of snapshot.runs) {
        runsById.set(run.id, run);
    }
    // An evaluation matches only when every identity field agrees and the run that
    // produced it was executed with the same execution fingerprint.
    const matchesTarget = (evaluation) => {
        const owningRun = runsById.get(evaluation.runId);
        if (evaluation.benchmarkFingerprint !== target.benchmarkFingerprint) {
            return false;
        }
        if (evaluation.candidateId !== target.candidateId) {
            return false;
        }
        if (evaluation.candidateFingerprint !== target.candidateFingerprint) {
            return false;
        }
        if (evaluation.candidateRunId !== target.candidateRunId) {
            return false;
        }
        return owningRun?.executionFingerprint === target.executionFingerprint;
    };
    const isFullyCompleted = (evaluation) => evaluation.status === "completed" &&
        evaluation.errorSampleCount === 0 &&
        evaluation.completedSampleCount >= target.samples;
    // Most recently updated first; ids break ties (also descending).
    const newestFirst = (left, right) => {
        const byUpdate = right.updatedAt.localeCompare(left.updatedAt);
        return byUpdate || right.id.localeCompare(left.id);
    };
    const matchingEvaluations = snapshot.evaluations.filter(matchesTarget);
    const rankedReusable = matchingEvaluations.filter(isFullyCompleted).sort(newestFirst);
    const reusableEvaluation = rankedReusable[0] ?? null;
    if (reusableEvaluation) {
        return { reusableEvaluation, missingPairs: [], priorAttemptJobs: [] };
    }
    const matchingRunIds = new Set();
    for (const evaluation of matchingEvaluations) {
        matchingRunIds.add(evaluation.runId);
    }
    if (matchingRunIds.size === 0) {
        return null;
    }
    const allPairs = allCaseSamplePairs(target.caseIds, target.samples);
    const desiredKeys = new Set(allPairs.map((pair) => caseSamplePairKey(pair)));
    const previousJobs = await readLocalJobs(workspace);
    const candidateJobs = previousJobs.filter((job) => matchingRunIds.has(job.runId) &&
        job.candidateId === target.candidateId);
    const priorAttemptJobsByPair = latestCompletedAttemptJobsByPair(candidateJobs, desiredKeys);
    const missingPairs = allPairs.filter((pair) => !priorAttemptJobsByPair.has(caseSamplePairKey(pair)));
    // Every pair is still missing, so the earlier runs contribute nothing.
    if (missingPairs.length === allPairs.length) {
        return null;
    }
    return {
        reusableEvaluation: null,
        missingPairs,
        priorAttemptJobs: [...priorAttemptJobsByPair.values()],
    };
}
1400
/**
 * Persist a run as finished with an "error" outcome.
 *
 * Reloads the archive first and does nothing when the stored run is already
 * marked "finished", so an existing final record is left as it is.
 *
 * @param args Object with `workspace`, `run` (the run record to close),
 *   `startedAt` (timestamp string used for the duration) and `error`.
 */
async function markLocalRunFailed(args) {
    const latest = await loadLocalArchive(args.workspace);
    const alreadyFinished = latest.runs.some((run) => run.id === args.run.id) &&
        latest.runs.find((run) => run.id === args.run.id)?.status === "finished";
    if (alreadyFinished) {
        return;
    }
    const finishedAt = new Date().toISOString();
    const message = errorMessage(args.error);
    const elapsedMs = Date.parse(finishedAt) - Date.parse(args.startedAt);
    const failedRun = {
        ...args.run,
        status: "finished",
        finishedAt,
        durationMs: Math.max(0, elapsedMs),
        outcome: "error",
        error: message,
    };
    const finishedEvent = createLocalEvent("run_finished", finishedAt, {
        runId: args.run.id,
        candidateId: args.run.candidateId ?? undefined,
        detail: {
            outcome: "error",
            error: message,
            attemptsExecuted: failedRun.attemptsExecuted,
            durationMs: failedRun.durationMs ?? null,
        },
    });
    const nextArchive = upsertLocalRun(latest, failedRun, [finishedEvent]);
    await saveLocalArchive(args.workspace, nextArchive);
}
1429
/**
 * Turn any thrown value into display text.
 *
 * @param error The caught value.
 * @returns `error.message` for Error instances, otherwise `String(error)`.
 */
function errorMessage(error) {
    if (error instanceof Error) {
        return error.message;
    }
    return String(error);
}
1432
/**
 * Build every (caseId, sampleIndex) combination, grouped by case in input order.
 *
 * @param caseIds Case identifiers.
 * @param samples Number of samples per case; indexes start at 0.
 * @returns Array of `{ caseId, sampleIndex }` objects.
 */
function allCaseSamplePairs(caseIds, samples) {
    const pairs = [];
    for (const caseId of caseIds) {
        // Array.from turns `samples` into the list of indexes for this case.
        const sampleIndexes = Array.from({ length: samples }).keys();
        for (const sampleIndex of sampleIndexes) {
            pairs.push({ caseId, sampleIndex });
        }
    }
    return pairs;
}
1438
/**
 * Keep the entries of `caseIds` that appear in at least one pair.
 *
 * @param caseIds Case identifiers in their canonical order.
 * @param pairs Objects carrying a `caseId` property.
 * @returns The matching case ids, in the order they have in `caseIds`.
 */
function orderedCaseIdsForPairs(caseIds, pairs) {
    const referenced = new Set();
    for (const { caseId } of pairs) {
        referenced.add(caseId);
    }
    const isReferenced = (caseId) => referenced.has(caseId);
    return caseIds.filter(isReferenced);
}
1442
/**
 * Group sample indexes by case id.
 *
 * Indexes are collected into one Set per case, so duplicates are dropped as
 * they arrive and no intermediate array is copied per pair.
 *
 * @param pairs Objects with `caseId` and `sampleIndex` properties.
 * @returns A Map from case id to its distinct sample indexes in ascending
 *   numeric order. Keys keep the order in which each case id first appears.
 */
function sampleIndexesByCase(pairs) {
    const seenByCase = new Map();
    for (const { caseId, sampleIndex } of pairs) {
        let seen = seenByCase.get(caseId);
        if (seen === undefined) {
            seen = new Set();
            seenByCase.set(caseId, seen);
        }
        seen.add(sampleIndex);
    }
    const byCase = new Map();
    for (const [caseId, seen] of seenByCase) {
        // Numeric comparator: the default sort would order 10 before 2.
        byCase.set(caseId, [...seen].sort((left, right) => left - right));
    }
    return byCase;
}
1452
/**
 * Pick, for each wanted case/sample pair, the most recent succeeded "attempt" job.
 *
 * @param jobs Job records to scan.
 * @param desiredKeys Set of pair keys (as produced by `caseSamplePairKey`) to keep.
 * @returns A Map from pair key to the newest qualifying job, where "newest" is
 *   decided by `compareJobRecency`.
 */
function latestCompletedAttemptJobsByPair(jobs, desiredKeys) {
    const newestByKey = new Map();
    for (const job of jobs) {
        const isSucceededAttempt = job.status === "succeeded" &&
            executionPurposeFromJobInput(job.input) === "attempt";
        if (!isSucceededAttempt) {
            continue;
        }
        const pair = caseSamplePairFromJob(job);
        const key = pair ? caseSamplePairKey(pair) : null;
        if (key === null || !desiredKeys.has(key)) {
            continue;
        }
        const incumbent = newestByKey.get(key);
        const replacesIncumbent = !incumbent || compareJobRecency(job, incumbent) > 0;
        if (replacesIncumbent) {
            newestByKey.set(key, job);
        }
    }
    return newestByKey;
}
1473
/**
 * Read the case/sample pair a job was run for.
 *
 * Values on the job input itself take precedence; `input.execution.metadata`
 * is consulted only when the input does not supply them.
 *
 * @param job Job record whose `input` is inspected.
 * @returns `{ caseId, sampleIndex }`, or `null` when the case id is missing or
 *   the sample index resolves to `null`.
 */
function caseSamplePairFromJob(job) {
    const input = readRecord(job.input);
    const metadata = readRecord(readRecord(input?.execution)?.metadata);
    const caseId = stringValue(input?.caseId) ?? stringValue(metadata?.caseId);
    const sampleIndex = integerValue(input?.sampleIndex) ?? integerValue(metadata?.sampleIndex);
    if (!caseId || sampleIndex === null) {
        return null;
    }
    return { caseId, sampleIndex };
}
1483
/**
 * Extract `execution.purpose` from a job input.
 *
 * @param inputValue Raw job input; normalised through `readRecord` first.
 * @returns Whatever `stringValue` yields for the purpose field.
 */
function executionPurposeFromJobInput(inputValue) {
    const execution = readRecord(readRecord(inputValue)?.execution);
    const purpose = execution?.purpose;
    return stringValue(purpose);
}
1488
/**
 * Build the string key for a case/sample pair.
 *
 * @param pair Object with `caseId` and `sampleIndex`.
 * @returns The case id and sample index joined by a NUL character.
 */
function caseSamplePairKey(pair) {
    const { caseId, sampleIndex } = pair;
    const separator = "\0";
    return `${caseId}${separator}${sampleIndex}`;
}
1491
/**
 * Comparator ordering jobs from older to newer.
 *
 * @param left First job.
 * @param right Second job.
 * @returns The `localeCompare` result of the two recency timestamps, or of
 *   the two ids when the timestamps compare equal.
 */
function compareJobRecency(left, right) {
    const byTimestamp = jobRecencyTimestamp(left).localeCompare(jobRecencyTimestamp(right));
    if (byTimestamp !== 0) {
        return byTimestamp;
    }
    return left.id.localeCompare(right.id);
}
1495
/**
 * Choose the timestamp used to rank a job by recency.
 *
 * @param job Job record.
 * @returns The first of `finishedAt`, `updatedAt`, `startedAt`, `createdAt`
 *   that is neither null nor undefined, or "" when none is set.
 */
function jobRecencyTimestamp(job) {
    const fieldsByPriority = ["finishedAt", "updatedAt", "startedAt", "createdAt"];
    for (const field of fieldsByPriority) {
        const value = job[field];
        if (value !== null && value !== undefined) {
            return value;
        }
    }
    return "";
}
1498
/**
 * Find the newest successful "improve" run that matches `target`.
 *
 * A run qualifies when its workflow is "improve", it is finished with outcome
 * "ok", it has an output candidate, and its benchmark fingerprint, candidate
 * id, candidate run id, execution fingerprint, budget and samples all equal
 * the target's.
 *
 * @param runs Run records to search.
 * @param target Object with the fields listed above.
 * @returns The qualifying run with the latest `finishedAt` (falling back to
 *   `startedAt`), ties broken by descending id; `null` when none qualifies.
 */
function findReusableLocalImproveRun(runs, target) {
    const comparedFields = [
        "benchmarkFingerprint",
        "candidateId",
        "candidateRunId",
        "executionFingerprint",
        "budget",
        "samples",
    ];
    const qualifies = (run) => {
        if (run.workflow !== "improve") {
            return false;
        }
        if (!comparedFields.every((field) => run[field] === target[field])) {
            return false;
        }
        return run.status === "finished" && run.outcome === "ok" && Boolean(run.outputCandidateId);
    };
    const completionTime = (run) => run.finishedAt ?? run.startedAt;
    const newestFirst = (left, right) => {
        const byTime = completionTime(right).localeCompare(completionTime(left));
        return byTime || right.id.localeCompare(left.id);
    };
    const ranked = runs.filter(qualifies).sort(newestFirst);
    return ranked[0] ?? null;
}
922
1513
  function localDevViewHint(workspace, runId) {
923
1514
  const runFlag = runId ? ` --run ${shellQuote(runId)}` : "";
@@ -935,20 +1526,26 @@ function localDevOpenUrl(baseUrl, snapshot, runId) {
935
1526
  .reverse()
936
1527
  .find((entry) => entry.runId === runId);
937
1528
  if (!evaluation) {
938
- return new URL("subjects", baseUrl).toString();
1529
+ return new URL("candidates", baseUrl).toString();
939
1530
  }
940
1531
  const params = new URLSearchParams({ evaluation: evaluation.id });
941
- return new URL(`subjects/${encodeURIComponent(evaluation.subjectId)}?${params.toString()}`, baseUrl).toString();
1532
+ return new URL(`candidates/${encodeURIComponent(evaluation.candidateId)}?${params.toString()}`, baseUrl).toString();
942
1533
  }
943
1534
  async function readLocalBenchmarkFingerprint(workspace) {
944
1535
  return localBenchmarkFingerprint(await readLocalProjectSource(workspace));
945
1536
  }
946
- function authoredSubjectSourceFiles(projectSource) {
1537
/**
 * Compute the run execution fingerprint for a local project.
 *
 * @param projectSource Project source; `specSource` and `adapterFiles` are read.
 * @returns The result of `workbenchRunExecutionFingerprint` for the spec YAML
 *   and the normalised adapter files.
 */
function localRunExecutionFingerprint(projectSource) {
    const sourceYaml = projectSource.specSource;
    const adapterFiles = normalizeSurfaceFiles(projectSource.adapterFiles);
    return workbenchRunExecutionFingerprint({ sourceYaml, adapterFiles });
}
1543
/**
 * Describe the authored candidate spec as a single text source file.
 *
 * @param projectSource Project source; `dir`, `candidateSpecPath` and
 *   `candidateSource` are read.
 * @returns A one-element array whose entry holds the spec path relative to
 *   the project directory (with "/" separators) and the spec text.
 */
function authoredCandidateSourceFiles(projectSource) {
    const relativeSpecPath = path.relative(projectSource.dir, projectSource.candidateSpecPath);
    // Replace the platform separator so the stored path always uses "/".
    const portableSpecPath = relativeSpecPath.split(path.sep).join("/");
    const specFile = {
        path: portableSpecPath,
        kind: "text",
        encoding: "utf8",
        content: projectSource.candidateSource,
        executable: false,
    };
    return [specFile];
}
@@ -1155,72 +1752,72 @@ function requireValidRunEnvelope(args) {
1155
1752
  }
1156
1753
  async function localRestore(argv, io) {
1157
1754
  const parsed = parseArgs(argv);
1158
- rejectUnknownFlags(parsed, new Set(["dir", "subject", "dry-run", "yes", "json"]));
1755
+ rejectUnknownFlags(parsed, new Set(["dir", "candidate", "dry-run", "yes", "json"]));
1159
1756
  const workspace = resolveDir(parsed);
1160
1757
  const spec = await readLocalSpecIfValid(workspace);
1161
1758
  if (!spec) {
1162
1759
  throw new UsageError("restore requires a valid Workbench project.");
1163
1760
  }
1164
- const subjectRoot = spec.subject.files.path;
1761
+ const candidateRoot = spec.candidate.files.path;
1165
1762
  const snapshot = await loadLocalArchive(workspace);
1166
- const subjectId = readSubjectIdFlag(parsed, snapshot);
1167
- const files = readLocalSubjectFiles(snapshot, subjectId);
1763
+ const candidateId = readCandidateIdFlag(parsed, snapshot);
1764
+ const files = readLocalCandidateFiles(snapshot, candidateId);
1168
1765
  if (parsed.flags["dry-run"] === true) {
1169
- writeOutput({ ok: true, subjectId, fileCount: files.length }, parsed, io, () => `Restore would write ${files.length} file(s) from ${subjectId}.`);
1766
+ writeOutput({ ok: true, candidateId: candidateId, fileCount: files.length }, parsed, io, () => `Restore would write ${files.length} file(s) from ${candidateId}.`);
1170
1767
  return 0;
1171
1768
  }
1172
1769
  if (parsed.flags.yes !== true) {
1173
1770
  throw new UsageError("restore requires --dry-run to preview or --yes to apply source directory changes.");
1174
1771
  }
1175
- const changedPaths = await materializeSubjectRoot(workspace, subjectRoot, files);
1176
- const next = setLocalActive(snapshot, subjectId);
1772
+ const changedPaths = await materializeCandidateRoot(workspace, candidateRoot, files);
1773
+ const next = setLocalActive(snapshot, candidateId);
1177
1774
  await saveLocalArchive(workspace, next);
1178
- writeOutput({ ok: true, activeAfter: subjectId, changedPaths }, parsed, io, () => `Restored ${subjectId} to ${subjectRoot}.`);
1775
+ writeOutput({ ok: true, activeCandidateId: candidateId, changedPaths }, parsed, io, () => `Restored ${candidateId} to ${candidateRoot}.`);
1179
1776
  return 0;
1180
1777
  }
1181
- async function localSubjectList(argv, io) {
1778
/**
 * CLI handler that lists every candidate in the local archive.
 *
 * The text form prints one tab-separated line per candidate (id, status,
 * evaluation score) and tags the active candidate with "active".
 *
 * @param argv Command-line arguments for the sub-command.
 * @param io Output sink forwarded to `writeOutput`.
 * @returns Exit code 0.
 */
async function localCandidateList(argv, io) {
    const parsed = parseArgs(argv);
    rejectUnknownFlags(parsed, new Set(["dir", "json"]));
    const workspace = resolveDir(parsed);
    const snapshot = await loadLocalArchive(workspace);
    const describeCandidate = (candidate) => {
        const score = formatCandidateEvaluationScore(candidate);
        const activeMarker = snapshot.activeId === candidate.id ? "\tactive" : "";
        return `${candidate.id}\t${candidate.status}\tevaluation ${score}${activeMarker}`;
    };
    const renderText = (candidates) => {
        const lines = candidates.map((candidate) => describeCandidate(candidate));
        return lines.join("\n") || "No candidates.";
    };
    writeOutput(snapshot.candidates, parsed, io, renderText);
    return 0;
}
1190
- async function localSubjectShow(argv, io) {
1787
/**
 * CLI handler that prints one candidate record from the local archive.
 *
 * The text form shows id and status, the benchmark and candidate
 * fingerprints, an evaluation summary and, when present, the base candidate id.
 *
 * @param argv Command-line arguments for the sub-command.
 * @param io Output sink forwarded to `writeOutput`.
 * @returns Exit code 0.
 */
async function localCandidateShow(argv, io) {
    const parsed = parseArgs(argv);
    rejectUnknownFlags(parsed, new Set(["dir", "candidate", "json"]));
    const snapshot = await loadLocalArchive(resolveDir(parsed));
    const candidateId = readCandidateIdFlag(parsed, snapshot);
    const candidate = readLocalCandidate(snapshot, candidateId);
    writeOutput(candidate, parsed, io, (record) => [
        `${record.id}\t${record.status}`,
        `benchmark\t${record.benchmarkFingerprint}`,
        // The fingerprint used to fall back to itself (`x ?? x`), which never
        // changes the value; it is read once here.
        `candidate\t${record.candidateFingerprint}`,
        `evaluation\t${formatCandidateEvaluationSummary(record)}`,
        ...(record.baseId ? [`base\t${record.baseId}`] : []),
    ].join("\n"));
    return 0;
}
1205
- async function localSubjectFiles(argv, io) {
1802
/**
 * CLI handler that lists the files stored for one candidate.
 *
 * The text form prints one tab-separated line per file: path, status and
 * preview kind.
 *
 * @param argv Command-line arguments for the sub-command.
 * @param io Output sink forwarded to `writeOutput`.
 * @returns Exit code 0.
 */
async function localCandidateFiles(argv, io) {
    const parsed = parseArgs(argv);
    rejectUnknownFlags(parsed, new Set(["dir", "candidate", "json"]));
    const workspace = resolveDir(parsed);
    const snapshot = await loadLocalArchive(workspace);
    const candidateId = readCandidateIdFlag(parsed, snapshot);
    const candidate = readLocalCandidate(snapshot, candidateId);
    const storedFiles = readLocalCandidateFiles(snapshot, candidateId);
    const files = summarizeCandidateFiles(storedFiles, candidate.fileChanges);
    const renderText = (records) => {
        const lines = records.map((file) => `${file.path}\t${file.status}\t${file.preview_kind}`);
        return lines.join("\n") || "No files.";
    };
    writeOutput(files, parsed, io, renderText);
    return 0;
}
1217
- async function localSubjectPreview(argv, io) {
1814
+ async function localCandidatePreview(argv, io) {
1218
1815
  const parsed = parseArgs(argv);
1219
- rejectUnknownFlags(parsed, new Set(["dir", "subject", "path", "output", "view", "json"]));
1816
+ rejectUnknownFlags(parsed, new Set(["dir", "candidate", "path", "output", "view", "json"]));
1220
1817
  const snapshot = await loadLocalArchive(resolveDir(parsed));
1221
- const subjectId = readSubjectIdFlag(parsed, snapshot);
1222
- const preview = createSubjectFilePreview({
1223
- files: readLocalSubjectFiles(snapshot, subjectId),
1818
+ const candidateId = readCandidateIdFlag(parsed, snapshot);
1819
+ const preview = createCandidateFilePreview({
1820
+ files: readLocalCandidateFiles(snapshot, candidateId),
1224
1821
  path: requireFlag(parsed, "path"),
1225
1822
  view: readPreviewMode(parsed),
1226
1823
  });
@@ -1755,7 +2352,7 @@ function createAdapterScaffoldFiles(id) {
1755
2352
  "setup:",
1756
2353
  " - npm install --global .",
1757
2354
  "operations:",
1758
- " subject.run: {}",
2355
+ " candidate.run: {}",
1759
2356
  "",
1760
2357
  ].join("\n");
1761
2358
  const packageJson = `${JSON.stringify({
@@ -1777,11 +2374,11 @@ const request = requestPath && fs.existsSync(requestPath)
1777
2374
  ? JSON.parse(fs.readFileSync(requestPath, "utf8"))
1778
2375
  : {};
1779
2376
  fs.mkdirSync(outputRoot, { recursive: true });
1780
- const operation = request.operation || "subject.run";
2377
+ const operation = request.operation || "candidate.run";
1781
2378
  const resultPath = process.env.WORKBENCH_RESULT || request.paths?.result || path.join(outputRoot, "workbench-result.json");
1782
2379
 
1783
2380
  let value;
1784
- if (operation === "subject.run") {
2381
+ if (operation === "candidate.run") {
1785
2382
  const task = request.context?.case?.prompt || "No case prompt was provided.";
1786
2383
  fs.writeFileSync(path.join(outputRoot, "adapter-output.txt"), [
1787
2384
  "adapter: ${id}",
@@ -1790,7 +2387,7 @@ if (operation === "subject.run") {
1790
2387
  "",
1791
2388
  ].join("\\n"));
1792
2389
  } else {
1793
- console.error("${id} only implements subject.run.");
2390
+ console.error("${id} only implements candidate.run.");
1794
2391
  process.exit(2);
1795
2392
  }
1796
2393
 
@@ -2065,7 +2662,7 @@ async function resolveAdapterForAuthTarget(dir, targetRaw) {
2065
2662
  const adapters = await resolveWorkbenchAdaptersForProject(dir, spec);
2066
2663
  const adapter = adapters.find((entry) => entry.manifest.id === target.adapterId);
2067
2664
  if (!adapter) {
2068
- throw new UsageError(`Adapter ${target.adapterId} is not used by this benchmark source. Add it to the benchmark, subject, or optimizer YAML before connecting auth.`);
2665
+ throw new UsageError(`Adapter ${target.adapterId} is not used by this benchmark source. Add it to the benchmark or candidate YAML before connecting auth.`);
2069
2666
  }
2070
2667
  if (!adapter.manifest.auth) {
2071
2668
  throw new UsageError(`Adapter ${target.adapterId} does not declare auth.`);
@@ -2313,13 +2910,21 @@ function adapterAuthRecord(value) {
2313
2910
  }
2314
2911
  async function pushBenchmark(argv, io) {
2315
2912
  const parsed = parseArgs(argv);
2316
- rejectUnknownFlags(parsed, new Set(["dir", "tag", "visibility", "dry-run", "json"]));
2913
+ rejectUnknownFlags(parsed, new Set(["dir", "visibility", "dry-run", "json"]));
2317
2914
  const dir = resolveSourceDir(parsed);
2318
2915
  const source = await readLocalProjectSource(dir);
2319
2916
  const origin = await readWorkbenchOrigin(dir);
2320
2917
  const baseUrl = await effectiveBaseUrl(origin?.baseUrl);
2321
- const visibility = readBenchmarkVisibility(parsed.flags.visibility);
2918
+ const visibility = readOptionalBenchmarkVisibility(parsed.flags.visibility);
2919
+ const createVisibility = visibility ?? "public";
2322
2920
  const dryRun = parsed.flags["dry-run"] === true;
2921
+ const runtime = await exportLocalRuntimeBundle(dir);
2922
+ const state = localProjectState({
2923
+ source,
2924
+ runtime,
2925
+ origin,
2926
+ visibility: createVisibility,
2927
+ });
2323
2928
  if (!origin) {
2324
2929
  if (dryRun) {
2325
2930
  writeOutput({
@@ -2329,35 +2934,36 @@ async function pushBenchmark(argv, io) {
2329
2934
  dir,
2330
2935
  baseUrl,
2331
2936
  benchmarkName: source.spec.name,
2332
- tag: asOptionalString(parsed.flags.tag) ?? null,
2333
- visibility,
2937
+ visibility: createVisibility,
2334
2938
  sourceFileCount: sourceFileCount(source),
2939
+ runtime: runtimeBundleStats(runtime),
2940
+ sourceFingerprint: state.source.fingerprint,
2941
+ runtimeFingerprint: state.base.runtimeFingerprint,
2335
2942
  }, parsed, io, () => `Would push benchmark ${source.spec.name}.`);
2336
2943
  return 0;
2337
2944
  }
2338
- const { project, publishedProject, origin: nextOrigin } = await createHostedBenchmarkFromSource({
2945
+ const { project, origin: nextOrigin, result } = await createHostedBenchmarkFromState({
2339
2946
  baseUrl,
2340
2947
  dir,
2341
- source,
2342
- visibility,
2948
+ state,
2343
2949
  });
2344
2950
  writeOutput({
2345
2951
  ok: true,
2346
2952
  action: "create",
2347
- benchmark: publishedProject,
2348
- tag: asOptionalString(parsed.flags.tag) ?? null,
2349
- visibility,
2953
+ benchmark: project,
2954
+ visibility: project.visibility ?? createVisibility,
2350
2955
  origin: nextOrigin,
2956
+ source: result.source,
2957
+ runtime: result.runtime.stats,
2351
2958
  urls: buildWorkbenchResourceUrls({
2352
2959
  baseUrl,
2353
- projectId: publishedProject.id ?? project.id,
2354
- owner: nextOrigin.owner,
2355
- projectName: nextOrigin.project,
2960
+ projectId: project.id,
2961
+ ...originRemoteUrlParts(nextOrigin),
2356
2962
  }),
2357
2963
  }, parsed, io, (record) => {
2358
2964
  const value = record;
2359
2965
  return [
2360
- `Pushed ${value.origin.owner}/${value.origin.project} (${value.origin.projectId}).`,
2966
+ `Pushed ${value.origin.remote} (${value.origin.projectId}).`,
2361
2967
  `Open benchmark: ${value.urls.benchmark}`,
2362
2968
  ].join("\n");
2363
2969
  });
@@ -2367,57 +2973,6 @@ async function pushBenchmark(argv, io) {
2367
2973
  if (!projectId) {
2368
2974
  throw new UsageError("Missing hosted benchmark. Run workbench push from a source directory.");
2369
2975
  }
2370
- if (!origin.writable) {
2371
- const signedInUsername = dryRun ? null : await readAuthenticatedWorkbenchUsername(baseUrl);
2372
- if (signedInUsername !== origin.owner) {
2373
- const upstream = upstreamFromOrigin(origin);
2374
- if (dryRun) {
2375
- writeOutput({
2376
- ok: true,
2377
- dryRun: true,
2378
- action: "create",
2379
- dir,
2380
- baseUrl,
2381
- benchmarkName: source.spec.name,
2382
- tag: asOptionalString(parsed.flags.tag) ?? null,
2383
- visibility,
2384
- sourceFileCount: sourceFileCount(source),
2385
- upstream: upstream ?? null,
2386
- }, parsed, io, () => `Would create a writable benchmark from read-only origin ${origin.owner}/${origin.project}.`);
2387
- return 0;
2388
- }
2389
- const { project, publishedProject, origin: nextOrigin } = await createHostedBenchmarkFromSource({
2390
- baseUrl,
2391
- dir,
2392
- source,
2393
- visibility,
2394
- upstream,
2395
- });
2396
- writeOutput({
2397
- ok: true,
2398
- action: "create",
2399
- benchmark: publishedProject,
2400
- tag: asOptionalString(parsed.flags.tag) ?? null,
2401
- visibility,
2402
- origin: nextOrigin,
2403
- upstream: upstream ?? null,
2404
- urls: buildWorkbenchResourceUrls({
2405
- baseUrl,
2406
- projectId: publishedProject.id ?? project.id,
2407
- owner: nextOrigin.owner,
2408
- projectName: nextOrigin.project,
2409
- }),
2410
- }, parsed, io, (record) => {
2411
- const value = record;
2412
- return [
2413
- `Pushed ${value.origin.owner}/${value.origin.project} (${value.origin.projectId}).`,
2414
- ...(value.upstream ? [`Upstream: ${value.upstream.owner}/${value.upstream.project}`] : []),
2415
- `Open benchmark: ${value.urls.benchmark}`,
2416
- ].join("\n");
2417
- });
2418
- return 0;
2419
- }
2420
- }
2421
2976
  if (dryRun) {
2422
2977
  writeOutput({
2423
2978
  ok: true,
@@ -2426,92 +2981,82 @@ async function pushBenchmark(argv, io) {
2426
2981
  dir,
2427
2982
  baseUrl,
2428
2983
  benchmarkId: projectId,
2429
- tag: asOptionalString(parsed.flags.tag) ?? null,
2430
- visibility,
2984
+ remote: origin.remote,
2985
+ benchmarkName: source.spec.name,
2986
+ visibility: visibility ?? "unchanged",
2431
2987
  sourceFileCount: sourceFileCount(source),
2432
- }, parsed, io, () => `Would push ${sourceFileCount(source)} source file(s) to ${projectId}.`);
2988
+ runtime: runtimeBundleStats(runtime),
2989
+ sourceFingerprint: state.source.fingerprint,
2990
+ runtimeFingerprint: state.base.runtimeFingerprint,
2991
+ }, parsed, io, () => `Would push ${sourceFileCount(source)} source file(s) and runtime history to ${origin.remote}.`);
2433
2992
  return 0;
2434
2993
  }
2435
- const response = await apiRequest(projectApiPath(projectId, "/source"), {
2994
+ const response = await apiRequest(projectApiPath(projectId, "/state"), {
2436
2995
  method: "PUT",
2437
- body: hostedProjectSourceRequest(source),
2996
+ body: state,
2438
2997
  }, baseUrl);
2439
- const publishedProject = visibility === "public"
2440
- ? (await apiRequest(projectApiPath(response.benchmark.id, "/publish"), { method: "PUT" }, baseUrl)).benchmark
2441
- : response.benchmark;
2442
- const nextOrigin = await writeWorkbenchOrigin(dir, {
2998
+ const responseProject = hostedProjectSummaryFromState(response.state);
2999
+ const publishedProject = await applyRequestedProjectVisibility({
2443
3000
  baseUrl,
2444
- owner: publishedProject.ownerUsername ?? response.benchmark.ownerUsername ?? origin.owner,
2445
- project: publishedProject.name ?? response.benchmark.name ?? origin.project ?? source.spec.name,
2446
- projectId: publishedProject.id ?? response.benchmark.id,
2447
- writable: true,
2448
- sourceRevisionId: publishedProject.currentSpecVersionId ?? response.benchmark.currentSpecVersionId,
2449
- sourceFingerprint: response.sourceFingerprint ?? publishedProject.sourceFingerprint ?? response.benchmark.sourceFingerprint,
2450
- upstream: origin.upstream,
3001
+ projectId: responseProject.id,
3002
+ responseProject,
3003
+ visibility,
3004
+ });
3005
+ const nextOrigin = await writeWorkbenchOriginFromState(dir, {
3006
+ baseUrl,
3007
+ state: response.state,
3008
+ project: publishedProject,
3009
+ sourceFingerprint: state.source.fingerprint,
2451
3010
  });
2452
3011
  writeOutput({
2453
3012
  ok: true,
2454
3013
  action: "update",
2455
3014
  changed: response.changed === true,
2456
3015
  benchmark: publishedProject,
2457
- tag: asOptionalString(parsed.flags.tag) ?? null,
2458
- visibility,
3016
+ visibility: visibility ?? "unchanged",
2459
3017
  origin: nextOrigin,
3018
+ source: response.source,
3019
+ runtime: response.runtime.stats,
2460
3020
  urls: buildWorkbenchResourceUrls({
2461
3021
  baseUrl,
2462
- projectId: publishedProject.id ?? response.benchmark.id,
2463
- owner: nextOrigin.owner,
2464
- projectName: nextOrigin.project,
3022
+ projectId: publishedProject.id ?? responseProject.id,
3023
+ ...originRemoteUrlParts(nextOrigin),
2465
3024
  }),
2466
3025
  }, parsed, io, (record) => {
2467
3026
  const value = record;
2468
3027
  return [
2469
- `${value.changed ? "Pushed" : "Already up to date"} ${value.origin.owner}/${value.origin.project} (${value.origin.projectId}).`,
3028
+ `${value.changed ? "Pushed" : "Already up to date"} ${value.origin.remote} (${value.origin.projectId}).`,
2470
3029
  `Open benchmark: ${value.urls.benchmark}`,
2471
3030
  ].join("\n");
2472
3031
  });
2473
3032
  return 0;
2474
3033
  }
2475
- async function createHostedBenchmarkFromSource(args) {
2476
- const response = await apiRequest("/api/workbench/benchmarks", {
3034
/**
 * Create a hosted benchmark by POSTing the local project state, then record
 * the resulting origin for the local directory.
 *
 * @param args Object with `baseUrl`, `dir` and `state` (the request body;
 *   `state.source.fingerprint` is stored as the origin's source fingerprint).
 * @returns `{ project, origin, result }`: the project summary derived from the
 *   response state, the origin returned by `writeWorkbenchOriginFromState`,
 *   and the raw API response.
 */
async function createHostedBenchmarkFromState(args) {
    const request = {
        method: "POST",
        body: args.state,
    };
    const result = await apiRequest("/api/workbench/benchmarks/state", request, args.baseUrl);
    const project = hostedProjectSummaryFromState(result.state);
    const originInput = {
        baseUrl: args.baseUrl,
        state: result.state,
        project,
        sourceFingerprint: args.state.source.fingerprint,
    };
    const origin = await writeWorkbenchOriginFromState(args.dir, originInput);
    return { project, origin, result };
}
2496
- async function readAuthenticatedWorkbenchUsername(baseUrl) {
2497
- const config = await loadConfig();
2498
- const status = await readWorkbenchProfileStatus({ ...config, baseUrl });
2499
- return status.authenticated ? status.profile?.username ?? null : null;
2500
- }
2501
- function upstreamFromOrigin(origin) {
2502
- if (!origin.owner || !origin.project || !origin.projectId || !origin.sourceRevisionId) {
2503
- return undefined;
3048
/**
 * Apply an explicitly requested visibility to a hosted project.
 *
 * "public" sends PUT and "private" sends DELETE to the project's `/publish`
 * endpoint. Any other value makes no request.
 *
 * @param args Object with `visibility`, `projectId`, `baseUrl` and
 *   `responseProject`.
 * @returns The `benchmark` field of the API response when a request was made,
 *   otherwise `args.responseProject` unchanged.
 */
async function applyRequestedProjectVisibility(args) {
    const publishMethods = new Map([
        ["public", "PUT"],
        ["private", "DELETE"],
    ]);
    const method = publishMethods.get(args.visibility);
    if (method === undefined) {
        return args.responseProject;
    }
    const response = await apiRequest(projectApiPath(args.projectId, "/publish"), { method }, args.baseUrl);
    return response.benchmark;
}
2512
- function readBenchmarkVisibility(value) {
3057
+ function readOptionalBenchmarkVisibility(value) {
2513
3058
  if (value === undefined) {
2514
- return "public";
3059
+ return undefined;
2515
3060
  }
2516
3061
  if (value === "private" || value === "public") {
2517
3062
  return value;
@@ -2524,41 +3069,37 @@ async function cloneProject(argv, io) {
2524
3069
  const ref = readRequiredBenchmarkRef(parsed);
2525
3070
  const outputDir = parsed.positionals[1] ?? ref.project;
2526
3071
  if (parsed.positionals.length > 2) {
2527
- throw new UsageError("workbench clone accepts OWNER/BENCHMARK[@REF] and an optional output directory.");
3072
+ throw new UsageError("workbench clone accepts OWNER/BENCHMARK and an optional output directory.");
2528
3073
  }
2529
3074
  const baseUrl = await effectiveBaseUrl();
2530
- const projectResponse = await apiRequest(publicProjectApiPath(ref), {}, baseUrl);
2531
- const filesResponse = await apiRequest(publicProjectSourceApiPath(ref), {}, baseUrl);
3075
+ const state = await apiRequest(publicProjectStateApiPath(ref), {}, baseUrl);
2532
3076
  if (parsed.flags["dry-run"] === true) {
2533
3077
  writeOutput({
2534
3078
  ok: true,
2535
3079
  dryRun: true,
2536
3080
  ref,
2537
3081
  outputDir,
2538
- fileCount: filesResponse.files.length,
3082
+ fileCount: state.source.files.length,
3083
+ runtime: runtimeBundleStats(state.runtime),
3084
+ sourceFingerprint: state.source.fingerprint ?? state.base.sourceFingerprint ?? null,
3085
+ runtimeFingerprint: state.base.runtimeFingerprint ?? null,
2539
3086
  }, parsed, io, () => `Would clone ${formatBenchmarkRef(ref)} to ${outputDir}.`);
2540
3087
  return 0;
2541
3088
  }
2542
- await syncSourceFiles(outputDir, filesResponse.files);
2543
- const project = projectResponse.benchmark;
2544
- const sourceProject = filesResponse.benchmark;
2545
- const origin = await writeWorkbenchOrigin(outputDir, {
3089
+ const applied = await applyProjectStateToLocal({
3090
+ dir: outputDir,
2546
3091
  baseUrl,
2547
- owner: sourceProject?.ownerUsername ?? project.ownerUsername,
2548
- project: sourceProject?.name ?? project.name,
2549
- projectId: sourceProject?.id ?? project.id,
2550
- writable: false,
2551
- sourceRevisionId: sourceProject?.currentSpecVersionId ?? project.currentSpecVersionId,
2552
- sourceFingerprint: sourceProject?.sourceFingerprint ?? project.sourceFingerprint,
3092
+ state,
2553
3093
  });
2554
3094
  writeOutput({
2555
3095
  ok: true,
2556
- origin,
3096
+ origin: applied.origin,
2557
3097
  outputDir,
2558
- files: filesResponse.files.length,
3098
+ files: applied.files,
3099
+ runtime: applied.runtime,
2559
3100
  }, parsed, io, (record) => {
2560
3101
  const value = record;
2561
- return `Cloned ${value.origin.owner}/${value.origin.project} to ${value.outputDir} (${value.files} file(s)).`;
3102
+ return `Cloned ${value.origin.remote} to ${value.outputDir} (${value.files} file(s)).`;
2562
3103
  });
2563
3104
  return 0;
2564
3105
  }
@@ -2566,167 +3107,273 @@ async function pullProject(argv, io) {
2566
3107
  const parsed = parseArgs(argv);
2567
3108
  rejectUnknownFlags(parsed, new Set(["dir", "dry-run", "json"]));
2568
3109
  if (parsed.positionals.length > 0) {
2569
- throw new UsageError("workbench pull updates the current origin; use workbench clone OWNER/BENCHMARK[@REF] DIR for a new directory.");
3110
+ throw new UsageError("workbench pull updates the current origin; use workbench clone OWNER/BENCHMARK DIR for a new directory.");
2570
3111
  }
2571
3112
  const dir = resolveDir(parsed);
2572
3113
  const origin = await requireWorkbenchOrigin(dir);
2573
- const filesResponse = origin.writable
2574
- ? await apiRequest(projectApiPath(origin.projectId, "/source"), {}, await effectiveBaseUrl(origin.baseUrl))
2575
- : await apiRequest(publicProjectSourceApiPath({ owner: origin.owner, project: origin.project }), {}, await effectiveBaseUrl(origin.baseUrl));
3114
+ const baseUrl = await effectiveBaseUrl(origin.baseUrl);
3115
+ const remoteRef = parseOriginRemote(origin);
3116
+ const state = await apiRequest(publicProjectStateApiPath(remoteRef), {}, baseUrl);
2576
3117
  if (parsed.flags["dry-run"] === true) {
2577
3118
  writeOutput({
2578
3119
  ok: true,
2579
3120
  dryRun: true,
2580
3121
  dir,
2581
- fileCount: filesResponse.files.length,
2582
- }, parsed, io, () => `Would pull ${filesResponse.files.length} source file(s) into ${dir}.`);
3122
+ fileCount: state.source.files.length,
3123
+ runtime: runtimeBundleStats(state.runtime),
3124
+ sourceFingerprint: state.source.fingerprint ?? state.base.sourceFingerprint ?? null,
3125
+ runtimeFingerprint: state.base.runtimeFingerprint ?? null,
3126
+ }, parsed, io, () => `Would pull ${state.source.files.length} source file(s) and runtime history into ${dir}.`);
2583
3127
  return 0;
2584
3128
  }
2585
- await syncSourceFiles(dir, filesResponse.files);
2586
- const sourceProject = filesResponse.benchmark;
2587
- const nextOrigin = await writeWorkbenchOrigin(dir, {
2588
- ...origin,
2589
- ...(sourceProject?.ownerUsername ? { owner: sourceProject.ownerUsername } : {}),
2590
- ...(sourceProject?.name ? { project: sourceProject.name } : {}),
2591
- ...(sourceProject?.id ? { projectId: sourceProject.id } : {}),
2592
- ...(sourceProject?.currentSpecVersionId ? { sourceRevisionId: sourceProject.currentSpecVersionId } : {}),
2593
- ...(sourceProject?.sourceFingerprint ? { sourceFingerprint: sourceProject.sourceFingerprint } : {}),
3129
+ const applied = await applyProjectStateToLocal({
3130
+ dir,
3131
+ baseUrl,
3132
+ state,
3133
+ origin,
3134
+ requireCleanSource: true,
2594
3135
  });
2595
3136
  writeOutput({
2596
3137
  ok: true,
2597
- origin: nextOrigin,
3138
+ origin: applied.origin,
2598
3139
  dir,
2599
- files: filesResponse.files.length,
3140
+ files: applied.files,
3141
+ runtime: applied.runtime,
2600
3142
  }, parsed, io, (record) => {
2601
3143
  const value = record;
2602
3144
  return `Pulled ${value.files} source file(s) into ${value.dir}.`;
2603
3145
  });
2604
3146
  return 0;
2605
3147
  }
2606
- async function fetchProject(argv, io) {
2607
- const parsed = parseArgs(argv);
2608
- rejectUnknownFlags(parsed, new Set(["dir", "json"]));
2609
- if (parsed.positionals.length > 0) {
2610
- throw new UsageError("workbench fetch updates the current remote cache; use workbench clone OWNER/BENCHMARK[@REF] DIR for a new directory.");
3148
+ async function applyProjectStateToLocal(args) {
3149
+ if (args.requireCleanSource === true && args.origin) {
3150
+ await assertLocalSourceMatchesOrigin(args.dir, args.origin);
2611
3151
  }
2612
- const dir = resolveDir(parsed);
2613
- const origin = await requireWorkbenchOrigin(dir);
2614
- const filesResponse = await readRemoteSourceFiles(origin);
2615
- const fetchRoot = path.join(dir, ".workbench", "fetch");
2616
- await fs.rm(fetchRoot, { force: true, recursive: true });
2617
- await fs.mkdir(fetchRoot, { recursive: true });
2618
- await writeFiles(path.join(fetchRoot, "source"), filesResponse.files);
2619
- const sourceProject = filesResponse.benchmark;
2620
- const nextOrigin = await writeWorkbenchOrigin(dir, {
2621
- ...origin,
2622
- ...(sourceProject?.ownerUsername ? { owner: sourceProject.ownerUsername } : {}),
2623
- ...(sourceProject?.name ? { project: sourceProject.name } : {}),
2624
- ...(sourceProject?.id ? { projectId: sourceProject.id } : {}),
2625
- ...(sourceProject?.currentSpecVersionId ? { sourceRevisionId: sourceProject.currentSpecVersionId } : {}),
2626
- ...(sourceProject?.sourceFingerprint ? { sourceFingerprint: sourceProject.sourceFingerprint } : {}),
3152
+ await syncSourceFiles(args.dir, args.state.source.files);
3153
+ const runtimeImport = await importLocalRuntimeBundle(args.dir, args.state.runtime);
3154
+ const origin = await writeWorkbenchOriginFromState(args.dir, {
3155
+ baseUrl: args.baseUrl,
3156
+ state: args.state,
3157
+ sourceFingerprint: await localSourceFingerprint(args.dir),
2627
3158
  });
2628
- await fs.writeFile(path.join(fetchRoot, "manifest.json"), `${JSON.stringify({
2629
- fetchedAt: new Date().toISOString(),
2630
- origin: nextOrigin,
2631
- files: filesResponse.files.map((file) => file.path),
2632
- }, null, 2)}\n`);
2633
- writeOutput({
3159
+ return {
3160
+ origin,
3161
+ files: args.state.source.files.length,
3162
+ runtime: runtimeImport.stats,
3163
+ };
3164
+ }
3165
+ async function retryHostedWorkflow(argv, io) {
3166
+ const parsed = parseArgs(argv);
3167
+ rejectUnknownFlags(parsed, new Set([
3168
+ "dir",
3169
+ "benchmark",
3170
+ "watch",
3171
+ "interval-ms",
3172
+ "timeout-ms",
3173
+ "json",
3174
+ ]));
3175
+ rejectUnexpectedPositionals(parsed, "workbench retry --hosted", 1);
3176
+ const targetId = parsed.positionals[0];
3177
+ if (!targetId) {
3178
+ throw new UsageError("Missing required TARGET_ID.");
3179
+ }
3180
+ if (parsed.flags.watch !== true && (parsed.flags["interval-ms"] !== undefined ||
3181
+ parsed.flags["timeout-ms"] !== undefined)) {
3182
+ throw new UsageError("--interval-ms and --timeout-ms require --watch.");
3183
+ }
3184
+ const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
3185
+ const retryTarget = await resolveHostedRetryTarget(target, targetId);
3186
+ const watchIntervalMs = parsed.flags.watch === true
3187
+ ? parsePositiveInt(parsed.flags["interval-ms"], 1000, "interval-ms")
3188
+ : undefined;
3189
+ const watchTimeoutMs = parsed.flags.watch === true
3190
+ ? parseOptionalPositiveInt(parsed.flags["timeout-ms"], "timeout-ms")
3191
+ : undefined;
3192
+ const response = await apiRequest(projectApiPath(target.projectId, "/runs"), {
3193
+ method: "POST",
3194
+ body: retryTarget.request,
3195
+ }, target.baseUrl);
3196
+ const startedRun = withRunUrls(target, response.run);
3197
+ if (parsed.flags.watch === true) {
3198
+ if (parsed.flags.json !== true) {
3199
+ io.stdout.write(`${formatHostedRunStarted(startedRun, retryTarget.workflow).trimEnd()}\n${HOSTED_WATCH_LIFECYCLE_NOTE}\n`);
3200
+ }
3201
+ const watched = await watchHostedRun({
3202
+ parsed,
3203
+ target,
3204
+ runId: response.run.id,
3205
+ intervalMs: watchIntervalMs ?? 1000,
3206
+ timeoutMs: watchTimeoutMs,
3207
+ });
3208
+ const outputRun = withRunUrls(target, await withHostedRunFailureSummary(target, watched));
3209
+ await tryImportTerminalHostedProjectState({ target, io });
3210
+ const result = {
3211
+ ok: hostedRunSucceeded(watched),
3212
+ retried: {
3213
+ id: retryTarget.sourceId,
3214
+ kind: retryTarget.sourceKind,
3215
+ workflow: retryTarget.workflow,
3216
+ },
3217
+ runId: outputRun.id,
3218
+ candidateId: outputRun.outputCandidateId ?? outputRun.candidateId,
3219
+ activeCandidateId: outputRun.activeCandidateId ?? null,
3220
+ run: outputRun,
3221
+ ...(outputRun.urls ? { urls: outputRun.urls } : {}),
3222
+ ...(outputRun.failedJobCount !== undefined ? { failedJobCount: outputRun.failedJobCount } : {}),
3223
+ ...(outputRun.error ? { error: outputRun.error } : {}),
3224
+ };
3225
+ writeOutput(result, parsed, io, formatRetryCommandResult);
3226
+ return hostedRunSucceeded(watched) ? 0 : 1;
3227
+ }
3228
+ const result = {
2634
3229
  ok: true,
2635
- origin: nextOrigin,
2636
- dir,
2637
- fetchRoot,
2638
- files: filesResponse.files.length,
2639
- }, parsed, io, (record) => {
2640
- const value = record;
2641
- return `Fetched ${value.files} source file(s) into ${value.fetchRoot}.`;
2642
- });
3230
+ retried: {
3231
+ id: retryTarget.sourceId,
3232
+ kind: retryTarget.sourceKind,
3233
+ workflow: retryTarget.workflow,
3234
+ },
3235
+ runId: startedRun.id,
3236
+ candidateId: startedRun.outputCandidateId ?? startedRun.candidateId,
3237
+ activeCandidateId: startedRun.activeCandidateId ?? null,
3238
+ run: startedRun,
3239
+ ...(startedRun.urls ? { urls: startedRun.urls } : {}),
3240
+ };
3241
+ writeOutput(result, parsed, io, formatRetryCommandResult);
2643
3242
  return 0;
2644
3243
  }
2645
- async function readRemoteSourceFiles(origin) {
2646
- return origin.writable
2647
- ? await apiRequest(projectApiPath(origin.projectId, "/source"), {}, await effectiveBaseUrl(origin.baseUrl))
2648
- : await apiRequest(publicProjectSourceApiPath({ owner: origin.owner, project: origin.project }), {}, await effectiveBaseUrl(origin.baseUrl));
3244
+ async function resolveHostedRetryTarget(target, targetId) {
3245
+ if (targetId.startsWith("eval_")) {
3246
+ return await resolveHostedEvaluationRetryTarget(target, targetId);
3247
+ }
3248
+ const detail = await readHostedRunDetail(target, targetId);
3249
+ const run = detail.run;
3250
+ if (run.status !== "finished") {
3251
+ throw new UsageError(`Run ${run.id} is ${run.status}; wait for it to finish before retrying.`);
3252
+ }
3253
+ if (!hostedRunRecordFailed(run)) {
3254
+ throw new UsageError(`Run ${run.id} did not fail; use workbench ${run.workflow ?? "eval"} --hosted to intentionally run it again.`);
3255
+ }
3256
+ if (run.workflow === "eval") {
3257
+ const candidateId = hostedRunEvaluationCandidateId(run, detail.jobs);
3258
+ if (!candidateId) {
3259
+ throw new UsageError(`Run ${run.id} has no candidate id to retry.`);
3260
+ }
3261
+ return {
3262
+ sourceId: targetId,
3263
+ sourceKind: "run",
3264
+ workflow: "eval",
3265
+ request: {
3266
+ workflow: "eval",
3267
+ samples: run.samples ?? 1,
3268
+ candidateId,
3269
+ sourceYaml: hostedRetrySourceYaml(run, run.id),
3270
+ preserveActive: true,
3271
+ ...retrySampleSelectionFromJobs(detail.jobs),
3272
+ },
3273
+ };
3274
+ }
3275
+ if (run.workflow === "improve") {
3276
+ const baseCandidateId = stringValue(readRecord(run.input)?.baseCandidateId);
3277
+ if (!baseCandidateId) {
3278
+ throw new UsageError(`Run ${run.id} is missing its base candidate id.`);
3279
+ }
3280
+ return {
3281
+ sourceId: targetId,
3282
+ sourceKind: "run",
3283
+ workflow: "improve",
3284
+ request: {
3285
+ workflow: "improve",
3286
+ samples: run.samples ?? 1,
3287
+ budget: run.budget ?? run.attemptsRequested ?? 1,
3288
+ candidateId: baseCandidateId,
3289
+ sourceYaml: hostedRetrySourceYaml(run, run.id),
3290
+ preserveActive: true,
3291
+ },
3292
+ };
3293
+ }
3294
+ throw new UsageError(`Run ${run.id} has no retryable workflow.`);
2649
3295
  }
2650
- async function runRemoteCommand(argv, io) {
2651
- const command = argv[0] ?? "show";
2652
- switch (command) {
2653
- case "show":
2654
- return await remoteShow(argv.slice(1), io);
2655
- case "add":
2656
- return await remoteAdd(argv.slice(1), io, "add");
2657
- case "set-url":
2658
- return await remoteAdd(argv.slice(1), io, "set-url");
2659
- case "remove":
2660
- return await remoteRemove(argv.slice(1), io);
2661
- default:
2662
- throw new UsageError(`Unknown command: remote ${argv.join(" ")}`);
3296
+ async function resolveHostedEvaluationRetryTarget(target, evaluationId) {
3297
+ const snapshot = await apiRequest(projectApiPath(target.projectId, "/workbench/snapshot"), {}, target.baseUrl);
3298
+ const evaluation = snapshot.evaluations.find((entry) => entry.id === evaluationId);
3299
+ if (!evaluation) {
3300
+ throw new UsageError(`Hosted evaluation not found: ${evaluationId}`);
2663
3301
  }
3302
+ const run = snapshot.runs.find((entry) => entry.id === evaluation.runId) ?? null;
3303
+ if (!evaluationScorecardFailed(evaluation, run)) {
3304
+ throw new UsageError(`Evaluation ${evaluation.id} did not fail; use workbench eval --hosted to intentionally run it again.`);
3305
+ }
3306
+ if (!run) {
3307
+ throw new UsageError(`Evaluation ${evaluation.id} is missing its run record.`);
3308
+ }
3309
+ const detail = await readHostedRunDetail(target, run.id);
3310
+ const detailedRun = detail.run;
3311
+ return {
3312
+ sourceId: evaluationId,
3313
+ sourceKind: "evaluation",
3314
+ workflow: "eval",
3315
+ request: {
3316
+ workflow: "eval",
3317
+ samples: evaluation.sampleCount || detailedRun.samples || 1,
3318
+ candidateId: evaluation.candidateId,
3319
+ sourceYaml: hostedRetrySourceYaml(detailedRun, detailedRun.id),
3320
+ preserveActive: true,
3321
+ ...retrySampleSelectionFromJobs(detail.jobs),
3322
+ },
3323
+ };
2664
3324
  }
2665
- async function remoteShow(argv, io) {
2666
- const parsed = parseArgs(argv);
2667
- rejectUnknownFlags(parsed, new Set(["dir", "json"]));
2668
- const origin = await requireWorkbenchOrigin(resolveDir(parsed));
2669
- writeOutput({ ok: true, remote: "origin", origin }, parsed, io, (record) => {
2670
- const value = record;
2671
- return [
2672
- `origin\t${value.origin.owner}/${value.origin.project}`,
2673
- `url\t${value.origin.baseUrl}`,
2674
- `writable\t${value.origin.writable ? "yes" : "no"}`,
2675
- ...(value.origin.sourceFingerprint ? [`fingerprint\t${value.origin.sourceFingerprint}`] : []),
2676
- ].join("\n");
2677
- });
2678
- return 0;
3325
+ function retrySampleSelectionFromJobs(jobs) {
3326
+ const selectedSamples = uniqueCaseSamplePairs(jobs
3327
+ .filter((job) => job.status !== "succeeded" &&
3328
+ executionPurposeFromJobInput(job.input) === "attempt")
3329
+ .map(caseSamplePairFromJob)
3330
+ .filter((pair) => pair !== null));
3331
+ return selectedSamples.length > 0
3332
+ ? { selectedSamples }
3333
+ : {};
2679
3334
  }
2680
- async function remoteAdd(argv, io, command) {
2681
- const parsed = parseArgs(argv);
2682
- rejectUnknownFlags(parsed, new Set(["dir", "json"]));
2683
- const [name, refValue] = parsed.positionals;
2684
- if (name !== "origin" || !refValue || parsed.positionals.length !== 2) {
2685
- throw new UsageError(`workbench remote ${command} accepts: origin OWNER/BENCHMARK[@REF].`);
3335
+ function uniqueCaseSamplePairs(pairs) {
3336
+ const byKey = new Map();
3337
+ for (const pair of pairs) {
3338
+ byKey.set(caseSamplePairKey(pair), pair);
3339
+ }
3340
+ return [...byKey.values()].sort((left, right) => left.caseId.localeCompare(right.caseId) ||
3341
+ left.sampleIndex - right.sampleIndex);
3342
+ }
3343
+ async function readHostedRunDetail(target, runId) {
3344
+ return await apiRequest(projectApiPath(target.projectId, `/runs/${encodeURIComponent(runId)}`), {}, target.baseUrl);
3345
+ }
3346
+ async function tryImportTerminalHostedProjectState(args) {
3347
+ const origin = args.target.origin;
3348
+ if (!origin || origin.projectId !== args.target.projectId) {
3349
+ return;
3350
+ }
3351
+ try {
3352
+ const state = await apiRequest(projectApiPath(args.target.projectId, "/state"), {}, args.target.baseUrl);
3353
+ await applyProjectStateToLocal({
3354
+ dir: args.target.dir,
3355
+ baseUrl: args.target.baseUrl,
3356
+ state,
3357
+ origin,
3358
+ requireCleanSource: true,
3359
+ });
3360
+ }
3361
+ catch (error) {
3362
+ args.io.stderr.write(`Hosted run finished, but local project state was not updated: ${errorMessage(error)}\n`);
2686
3363
  }
2687
- const ref = parseBenchmarkRef(refValue);
2688
- const baseUrl = await effectiveBaseUrl();
2689
- const project = await resolveRemoteProject(formatBenchmarkRef(ref), baseUrl);
2690
- const origin = await writeWorkbenchOrigin(resolveDir(parsed), {
2691
- baseUrl,
2692
- owner: project.ownerUsername ?? ref.owner,
2693
- project: project.name ?? ref.project,
2694
- projectId: project.id,
2695
- writable: false,
2696
- ...(project.currentSpecVersionId ? { sourceRevisionId: project.currentSpecVersionId } : {}),
2697
- ...(project.sourceFingerprint ? { sourceFingerprint: project.sourceFingerprint } : {}),
2698
- });
2699
- writeOutput({ ok: true, remote: "origin", origin }, parsed, io, () => `Set origin to ${origin.owner}/${origin.project}.`);
2700
- return 0;
2701
- }
2702
- async function remoteRemove(argv, io) {
2703
- const parsed = parseArgs(argv);
2704
- rejectUnknownFlags(parsed, new Set(["dir", "json"]));
2705
- const [name] = parsed.positionals;
2706
- if (name !== "origin" || parsed.positionals.length !== 1) {
2707
- throw new UsageError("workbench remote remove accepts: origin.");
2708
- }
2709
- const originPath = workbenchOriginPath(resolveDir(parsed));
2710
- const existed = await fileIsReadable(originPath);
2711
- await fs.rm(originPath, { force: true });
2712
- writeOutput({ ok: true, remote: "origin", removed: existed, path: originPath }, parsed, io, () => existed
2713
- ? `Removed origin (${originPath}).`
2714
- : `No origin configured (${originPath}).`);
2715
- return 0;
2716
3364
  }
2717
- async function starProject(argv, io, starred) {
2718
- const parsed = parseArgs(argv);
2719
- rejectUnknownFlags(parsed, new Set(["json"]));
2720
- const ref = readRequiredBenchmarkRef(parsed);
2721
- if (parsed.positionals.length > 1) {
2722
- throw new UsageError(`${starred ? "workbench cloud star" : "workbench cloud unstar"} accepts exactly one OWNER/BENCHMARK ref.`);
3365
+ function hostedRetrySourceYaml(run, runId) {
3366
+ const sourceYaml = stringValue(readRecord(run.input)?.sourceYaml);
3367
+ if (!sourceYaml) {
3368
+ throw new UsageError(`Run ${runId} is missing its recorded source configuration.`);
2723
3369
  }
2724
- const response = await apiRequest(`${publicProjectApiPath(ref)}/star`, { method: starred ? "PUT" : "DELETE" }, await effectiveBaseUrl());
2725
- writeOutput({ ok: true, benchmark: response.benchmark }, parsed, io, (record) => {
2726
- const value = record;
2727
- return `${starred ? "Starred" : "Unstarred"} ${formatBenchmarkRef(ref)}; ${value.benchmark.starCount} star(s).`;
2728
- });
2729
- return 0;
3370
+ return sourceYaml;
3371
+ }
3372
+ function hostedRunRecordFailed(run) {
3373
+ return run.outcome === "error" ||
3374
+ run.outcome === "cancelled" ||
3375
+ (run.failedJobCount ?? 0) > 0 ||
3376
+ Boolean(run.error);
2730
3377
  }
2731
3378
  async function startHostedWorkflow(workflow, argv, io) {
2732
3379
  const parsed = parseArgs(argv);
@@ -2734,9 +3381,10 @@ async function startHostedWorkflow(workflow, argv, io) {
2734
3381
  "dir",
2735
3382
  "benchmark",
2736
3383
  "base",
2737
- "optimizer",
3384
+ "runs",
2738
3385
  "budget",
2739
3386
  "samples",
3387
+ "rerun",
2740
3388
  "watch",
2741
3389
  "dry-run",
2742
3390
  "interval-ms",
@@ -2744,44 +3392,68 @@ async function startHostedWorkflow(workflow, argv, io) {
2744
3392
  "json",
2745
3393
  ]));
2746
3394
  if (parsed.positionals.length > 1) {
2747
- throw new UsageError(`workbench cloud ${workflow} accepts at most one source file or directory argument.`);
3395
+ throw new UsageError(`workbench ${workflow} --hosted accepts at most one source file or directory argument.`);
2748
3396
  }
2749
- const optimizerPath = asOptionalString(parsed.flags.optimizer);
2750
- const sourceArg = parsed.positionals[0] ?? asOptionalString(parsed.flags.dir) ?? process.cwd();
2751
- if (parsed.positionals.length > 0 && parsed.flags.dir !== undefined) {
2752
- throw new UsageError("Use either --dir or SOURCE, not both.");
3397
+ const sourceArg = resolveSourceDir(parsed);
3398
+ const samples = parsePositiveInt(parsed.flags.samples, 1, "samples");
3399
+ const budget = workflow === "improve"
3400
+ ? parsePositiveInt(parsed.flags.budget, 1, "budget")
3401
+ : undefined;
3402
+ if (parsed.flags.watch !== true && (parsed.flags["interval-ms"] !== undefined ||
3403
+ parsed.flags["timeout-ms"] !== undefined)) {
3404
+ throw new UsageError("--interval-ms and --timeout-ms require --watch.");
2753
3405
  }
2754
- const baseSubjectId = asOptionalString(parsed.flags.base);
3406
+ const runsFlag = asOptionalString(parsed.flags.runs);
3407
+ const defaultProjectSource = await readLocalProjectSource(path.resolve(sourceArg));
3408
+ const selectedRunIds = workflow === "eval"
3409
+ ? resolveCandidateRunSelection(defaultProjectSource, runsFlag)
3410
+ : [singleRequestedRunId(runsFlag, `workbench ${workflow} --hosted`) ?? defaultProjectSource.candidateRunId];
3411
+ if (workflow === "eval" && selectedRunIds.length > 1) {
3412
+ let failed = 0;
3413
+ const results = [];
3414
+ for (const runId of selectedRunIds) {
3415
+ const captured = createCapturingIo(io);
3416
+ const code = await startHostedWorkflow(workflow, hostedWorkflowArgsForRun({
3417
+ parsed,
3418
+ sourceDir: defaultProjectSource.dir,
3419
+ runId,
3420
+ }), captured.io);
3421
+ if (code !== 0) {
3422
+ failed += 1;
3423
+ }
3424
+ results.push(parseCapturedJson(captured.stdoutText()));
3425
+ }
3426
+ writeOutput({
3427
+ ok: failed === 0,
3428
+ candidateRunIds: selectedRunIds,
3429
+ failedRunCount: failed,
3430
+ results,
3431
+ }, parsed, io, () => `Processed ${selectedRunIds.length} hosted candidate run(s); ${failed} failed.`);
3432
+ return failed === 0 ? 0 : 1;
3433
+ }
3434
+ const baseCandidateId = asOptionalString(parsed.flags.base);
2755
3435
  const request = workflow === "improve"
2756
3436
  ? {
2757
3437
  workflow,
2758
- budget: parsePositiveInt(parsed.flags.budget, 1, "budget"),
2759
- samples: parsePositiveInt(parsed.flags.samples, 1, "samples"),
2760
- ...(baseSubjectId ? { subjectId: baseSubjectId } : {}),
3438
+ budget,
3439
+ samples,
3440
+ ...(baseCandidateId ? { candidateId: baseCandidateId } : {}),
2761
3441
  }
2762
3442
  : {
2763
3443
  workflow,
2764
- samples: parsePositiveInt(parsed.flags.samples, 1, "samples"),
2765
- ...(baseSubjectId ? { subjectId: baseSubjectId } : {}),
3444
+ samples,
3445
+ ...(baseCandidateId ? { candidateId: baseCandidateId } : {}),
2766
3446
  };
2767
- if (workflow === "improve" && !optimizerPath) {
2768
- throw new UsageError("workbench cloud improve requires --optimizer OPTIMIZER_YAML.");
3447
+ const projectSource = selectedRunIds[0] === defaultProjectSource.candidateRunId
3448
+ ? defaultProjectSource
3449
+ : await readLocalProjectSource(path.resolve(sourceArg), { runId: selectedRunIds[0] });
3450
+ request.sourceYaml = projectSource.specSource;
3451
+ request.adapterFiles = projectSource.adapterFiles;
3452
+ if (workflow === "eval" && !baseCandidateId) {
3453
+ request.candidateFiles = projectSource.candidateFiles;
2769
3454
  }
2770
- if (parsed.flags.watch !== true && (parsed.flags["interval-ms"] !== undefined ||
2771
- parsed.flags["timeout-ms"] !== undefined)) {
2772
- throw new UsageError("--interval-ms and --timeout-ms require --watch.");
2773
- }
2774
- const projectSource = await readLocalProjectSource(path.resolve(sourceArg), {
2775
- optimizerPath,
2776
- });
2777
- if (workflow === "eval") {
2778
- request.subjectSource = projectSource.subjectSource;
2779
- request.subjectFiles = projectSource.subjectFiles;
2780
- request.adapterFiles = projectSource.adapterFiles;
2781
- }
2782
- if (workflow === "improve" && projectSource.optimizerSource) {
2783
- request.optimizerSource = projectSource.optimizerSource;
2784
- request.adapterFiles = projectSource.adapterFiles;
3455
+ if (parsed.flags.rerun === true) {
3456
+ request.rerun = true;
2785
3457
  }
2786
3458
  const watchIntervalMs = parsed.flags.watch === true
2787
3459
  ? parsePositiveInt(parsed.flags["interval-ms"], 1000, "interval-ms")
@@ -2808,13 +3480,16 @@ async function startHostedWorkflow(workflow, argv, io) {
2808
3480
  sourceDir: projectSource.dir,
2809
3481
  });
2810
3482
  if (workflow === "improve") {
2811
- request.subjectId = await ensureHostedImproveBaseSubject({
3483
+ request.candidateId = await ensureHostedImproveBaseCandidate({
2812
3484
  parsed,
2813
3485
  target,
2814
3486
  samples: request.samples,
2815
- subjectId: baseSubjectId,
3487
+ candidateId: baseCandidateId,
3488
+ sourceYaml: projectSource.specSource,
3489
+ adapterFiles: projectSource.adapterFiles,
2816
3490
  intervalMs: watchIntervalMs ?? 1000,
2817
3491
  timeoutMs: watchTimeoutMs,
3492
+ io,
2818
3493
  });
2819
3494
  }
2820
3495
  const response = await apiRequest(projectApiPath(target.projectId, "/runs"), {
@@ -2822,6 +3497,20 @@ async function startHostedWorkflow(workflow, argv, io) {
2822
3497
  body: request,
2823
3498
  }, target.baseUrl);
2824
3499
  const startedRun = withRunUrls(target, response.run);
3500
+ const startedRunOutput = response.reused === true
3501
+ ? { ...startedRun, reused: true }
3502
+ : startedRun;
3503
+ if (response.reused === true && response.run.status === "finished") {
3504
+ await tryImportTerminalHostedProjectState({ target, io });
3505
+ writeOutput({
3506
+ ok: hostedRunSucceeded(response.run),
3507
+ reused: true,
3508
+ workflow,
3509
+ runId: startedRun.id,
3510
+ ...startedRun,
3511
+ }, parsed, io, () => `Reused hosted ${workflow} ${startedRun.id}. Use --rerun to intentionally run it again.`);
3512
+ return hostedRunSucceeded(response.run) ? 0 : 1;
3513
+ }
2825
3514
  if (parsed.flags.watch === true) {
2826
3515
  if (parsed.flags.json !== true) {
2827
3516
  io.stdout.write(`${formatHostedRunStarted(startedRun, workflow).trimEnd()}\n${HOSTED_WATCH_LIFECYCLE_NOTE}\n`);
@@ -2834,26 +3523,27 @@ async function startHostedWorkflow(workflow, argv, io) {
2834
3523
  timeoutMs: watchTimeoutMs,
2835
3524
  });
2836
3525
  const outputRun = await withHostedRunFailureSummary(target, watched);
3526
+ await tryImportTerminalHostedProjectState({ target, io });
2837
3527
  writeOutput(withRunUrls(target, outputRun), parsed, io, formatHostedRunResult);
2838
3528
  return hostedRunSucceeded(watched) ? 0 : 1;
2839
3529
  }
2840
- writeOutput(startedRun, parsed, io, (run) => formatHostedRunStarted(run, workflow).trimEnd());
3530
+ writeOutput(startedRunOutput, parsed, io, (run) => formatHostedRunStarted(run, workflow).trimEnd());
2841
3531
  return 0;
2842
3532
  }
2843
- async function ensureHostedImproveBaseSubject(args) {
2844
- if (args.subjectId) {
2845
- const subject = await readHostedSubjectSummary(args.target, args.subjectId);
2846
- if (!subject) {
2847
- throw new UsageError(`Base subject ${args.subjectId} was not found for the current benchmark.`);
3533
+ async function ensureHostedImproveBaseCandidate(args) {
3534
+ if (args.candidateId) {
3535
+ const candidate = await readHostedCandidateSummary(args.target, args.candidateId);
3536
+ if (!candidate) {
3537
+ throw new UsageError(`Base candidate ${args.candidateId} was not found for the current benchmark.`);
2848
3538
  }
2849
- if (hostedSubjectIsEvaluated(subject)) {
2850
- return args.subjectId;
3539
+ if (hostedCandidateIsEvaluated(candidate)) {
3540
+ return args.candidateId;
2851
3541
  }
2852
3542
  }
2853
3543
  else {
2854
- const activeSubject = await readEvaluatedActiveHostedSubject(args.target);
2855
- if (activeSubject) {
2856
- return activeSubject.id;
3544
+ const activeCandidate = await readEvaluatedActiveHostedCandidate(args.target);
3545
+ if (activeCandidate) {
3546
+ return activeCandidate.id;
2857
3547
  }
2858
3548
  }
2859
3549
  const response = await apiRequest(projectApiPath(args.target.projectId, "/runs"), {
@@ -2861,7 +3551,9 @@ async function ensureHostedImproveBaseSubject(args) {
2861
3551
  body: {
2862
3552
  workflow: "eval",
2863
3553
  samples: args.samples,
2864
- ...(args.subjectId ? { subjectId: args.subjectId } : {}),
3554
+ ...(args.candidateId ? { candidateId: args.candidateId } : {}),
3555
+ sourceYaml: args.sourceYaml,
3556
+ ...(args.adapterFiles.length > 0 ? { adapterFiles: args.adapterFiles } : {}),
2865
3557
  },
2866
3558
  }, args.target.baseUrl);
2867
3559
  const watched = await watchHostedRun({
@@ -2872,333 +3564,59 @@ async function ensureHostedImproveBaseSubject(args) {
2872
3564
  timeoutMs: args.timeoutMs,
2873
3565
  });
2874
3566
  if (!hostedRunSucceeded(watched)) {
2875
- throw new UsageError(`Parent subject eval ${watched.id} failed; improve was not started.`);
3567
+ throw new UsageError(`Parent candidate eval ${watched.id} failed; improve was not started.`);
2876
3568
  }
2877
- if (!watched.subjectId) {
2878
- throw new UsageError(`Parent subject eval ${watched.id} did not produce a subject.`);
2879
- }
2880
- return watched.subjectId;
2881
- }
2882
- async function readHostedSubjectSummary(target, subjectId) {
2883
- const response = await apiRequest(projectApiPath(target.projectId, "/subjects"), {}, target.baseUrl);
2884
- return response.subjects.find((entry) => entry.id === subjectId) ?? null;
2885
- }
2886
- async function readEvaluatedActiveHostedSubject(target) {
2887
- const response = await apiRequest(projectApiPath(target.projectId), {}, target.baseUrl);
2888
- const activeSubjectId = response.benchmark.activeSubjectId;
2889
- if (!activeSubjectId) {
2890
- return null;
3569
+ if (!watched.candidateId) {
3570
+ throw new UsageError(`Parent candidate eval ${watched.id} did not produce a candidate.`);
2891
3571
  }
2892
- const subject = await readHostedSubjectSummary(target, activeSubjectId);
2893
- return subject && hostedSubjectIsEvaluated(subject) ? subject : null;
2894
- }
2895
- function hostedSubjectIsEvaluated(subject) {
2896
- return subject.status === "evaluated" || subject.eval != null;
3572
+ await tryImportTerminalHostedProjectState({ target: args.target, io: args.io });
3573
+ return watched.candidateId;
2897
3574
  }
2898
- async function benchmarkList(argv, io) {
2899
- const parsed = parseArgs(argv);
2900
- rejectUnknownFlags(parsed, new Set(["json"]));
2901
- rejectUnexpectedPositionals(parsed, "workbench cloud benchmarks list", 0);
2902
- const response = await apiRequest("/api/workbench/public/benchmarks");
2903
- writeOutput(response.benchmarks, parsed, io, (projects) => {
2904
- if (projects.length === 0) {
2905
- return "No hosted Workbench benchmarks.";
2906
- }
2907
- return projects
2908
- .map((project) => `${project.id}\t${project.name}\t${project.runCount} runs\t${project.subjectCount} subjects`)
2909
- .join("\n");
2910
- });
2911
- return 0;
2912
- }
2913
- async function benchmarkShow(argv, io) {
2914
- const parsed = parseArgs(argv);
2915
- rejectUnknownFlags(parsed, new Set(["dir", "json"]));
2916
- rejectUnexpectedPositionals(parsed, "workbench cloud benchmarks show", 1);
2917
- const dir = resolveDir(parsed);
2918
- const origin = await readWorkbenchOrigin(dir);
2919
- const projectRef = parsed.positionals[0] ??
2920
- origin?.projectId;
2921
- if (!projectRef) {
2922
- throw new UsageError("Missing hosted benchmark. Pass OWNER/BENCHMARK, run workbench push, or run workbench clone.");
2923
- }
2924
- const response = await apiRequest(benchmarkApiPath(projectRef), {}, await effectiveBaseUrl(origin?.baseUrl));
2925
- writeOutput(response.benchmark, parsed, io, (project) => {
2926
- const record = project;
2927
- return `${record.name} (${record.id})\n${record.runs.length} runs\n${record.subjects.length} subjects`;
2928
- });
2929
- return 0;
2930
- }
2931
- async function benchmarkDelete(argv, io) {
2932
- const parsed = parseArgs(argv);
2933
- rejectUnknownFlags(parsed, new Set(["dir", "dry-run", "json"]));
2934
- if (parsed.positionals.length > 1) {
2935
- throw new UsageError(`Unexpected argument for workbench benchmarks delete: ${parsed.positionals.slice(1).join(" ")}`);
3575
+ function hostedWorkflowArgsForRun(args) {
3576
+ const next = ["--dir", args.sourceDir, "--runs", args.runId, "--json"];
3577
+ appendStringFlag(next, "benchmark", asOptionalString(args.parsed.flags.benchmark));
3578
+ appendStringFlag(next, "base", asOptionalString(args.parsed.flags.base));
3579
+ appendStringFlag(next, "samples", asOptionalString(args.parsed.flags.samples));
3580
+ appendStringFlag(next, "budget", asOptionalString(args.parsed.flags.budget));
3581
+ appendStringFlag(next, "interval-ms", asOptionalString(args.parsed.flags["interval-ms"]));
3582
+ appendStringFlag(next, "timeout-ms", asOptionalString(args.parsed.flags["timeout-ms"]));
3583
+ if (args.parsed.flags.watch === true) {
3584
+ next.push("--watch");
2936
3585
  }
2937
- const dir = resolveDir(parsed);
2938
- const origin = await readWorkbenchOrigin(dir);
2939
- const projectRef = parsed.positionals[0] ??
2940
- origin?.projectId;
2941
- if (!projectRef) {
2942
- throw new UsageError("Missing hosted benchmark. Pass OWNER/BENCHMARK, run workbench push, or run workbench clone.");
2943
- }
2944
- const originPath = workbenchOriginPath(dir);
2945
- const baseUrl = await effectiveBaseUrl(origin?.baseUrl);
2946
- if (parsed.flags["dry-run"] === true) {
2947
- const originProjectDeleted = originMatchesProjectRef(origin, projectRef);
2948
- writeOutput({
2949
- ok: true,
2950
- dryRun: true,
2951
- projectRef,
2952
- ...(isRemoteProjectId(projectRef) ? { projectId: projectRef } : {}),
2953
- ...(originProjectDeleted && origin?.project ? { projectName: origin.project } : {}),
2954
- baseUrl,
2955
- ...(originProjectDeleted ? { originPath } : {}),
2956
- }, parsed, io, () => originProjectDeleted
2957
- ? `Would delete hosted benchmark ${projectRef} and remove local origin ${originPath}.`
2958
- : `Would delete hosted benchmark ${projectRef}.`);
2959
- return 0;
3586
+ if (args.parsed.flags["dry-run"] === true) {
3587
+ next.push("--dry-run");
2960
3588
  }
2961
- const project = await resolveRemoteProject(projectRef, baseUrl);
2962
- const projectId = project.id;
2963
- const projectName = project.name;
2964
- const originProjectDeleted = origin ? origin.projectId === projectId : false;
2965
- await apiRequest(projectApiPath(projectId), { method: "DELETE" }, baseUrl);
2966
- if (originProjectDeleted) {
2967
- await fs.rm(originPath, { force: true });
3589
+ if (args.parsed.flags.rerun === true) {
3590
+ next.push("--rerun");
2968
3591
  }
2969
- writeOutput({
2970
- ok: true,
2971
- deleted: true,
2972
- projectId,
2973
- ...(projectName ? { projectName } : {}),
2974
- originRemoved: originProjectDeleted,
2975
- ...(originProjectDeleted ? { originPath } : {}),
2976
- }, parsed, io, () => originProjectDeleted
2977
- ? `Deleted benchmark ${formatProjectRef(project)} and removed local origin ${originPath}.`
2978
- : `Deleted benchmark ${formatProjectRef(project)}.`);
2979
- return 0;
2980
- }
2981
- async function benchmarkVersions(argv, io) {
2982
- const parsed = parseArgs(argv);
2983
- rejectUnknownFlags(parsed, new Set(["dir", "json"]));
2984
- rejectUnexpectedPositionals(parsed, "workbench cloud benchmarks versions", 1);
2985
- const projectRef = parsed.positionals[0];
2986
- const origin = await readWorkbenchOrigin(resolveDir(parsed));
2987
- if (!projectRef && !origin) {
2988
- throw new UsageError("Missing benchmark ref. Pass OWNER/BENCHMARK or run from a benchmark clone.");
2989
- }
2990
- const response = await apiRequest(benchmarkApiPath(projectRef ?? origin.projectId), {}, await effectiveBaseUrl(origin?.baseUrl));
2991
- const version = response.benchmark.sourceFingerprint ?? response.benchmark.currentSpecVersionId ?? "current";
2992
- writeOutput({
2993
- ok: true,
2994
- benchmark: response.benchmark,
2995
- versions: [{ ref: "main", digest: version, current: true }],
2996
- }, parsed, io, () => `${response.benchmark.name ?? projectRef ?? origin.project}\tmain\t${shortDigest(version)}\tcurrent`);
2997
- return 0;
2998
- }
2999
- async function benchmarkStarred(argv, io) {
3000
- const parsed = parseArgs(argv);
3001
- rejectUnknownFlags(parsed, new Set(["json"]));
3002
- rejectUnexpectedPositionals(parsed, "workbench cloud benchmarks starred", 0);
3003
- const response = await apiRequest("/api/workbench/benchmarks");
3004
- const starred = response.benchmarks.filter((project) => project.viewerHasStarred === true);
3005
- writeOutput(starred, parsed, io, (benchmarks) => {
3006
- if (benchmarks.length === 0) {
3007
- return "No starred benchmarks.";
3008
- }
3009
- return benchmarks
3010
- .map((benchmark) => `${benchmark.ownerUsername ?? "-"} / ${benchmark.name ?? "-"}\t${benchmark.starCount ?? 0} stars`)
3011
- .join("\n");
3012
- });
3013
- return 0;
3014
- }
3015
- async function subjectList(argv, io) {
3016
- const parsed = parseArgs(argv);
3017
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3018
- rejectUnexpectedPositionals(parsed, "workbench cloud subjects list", 0);
3019
- const target = await resolveHostedTarget(parsed);
3020
- const response = await apiRequest(projectApiPath(target.projectId, "/subjects"), {}, target.baseUrl);
3021
- writeOutput(response.subjects, parsed, io, (subjects) => {
3022
- if (subjects.length === 0) {
3023
- return "No subjects yet.";
3024
- }
3025
- return subjects
3026
- .map((subject) => `${subject.id}\t${subject.status}\tmetrics ${formatMetricSummary(subject.metrics)}\t${subject.fileChanges?.length ?? 0} files`)
3027
- .join("\n");
3028
- });
3029
- return 0;
3030
- }
3031
- async function subjectShow(argv, io) {
3032
- const parsed = parseArgs(argv);
3033
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3034
- rejectUnexpectedPositionals(parsed, "workbench cloud subjects show", 1);
3035
- const target = await resolveHostedTarget(parsed);
3036
- const subjectId = readRequiredSubjectId(parsed);
3037
- const params = new URLSearchParams({ id: subjectId });
3038
- const subject = await apiRequest(projectApiPath(target.projectId, `/workbench/record?${params.toString()}`), {}, target.baseUrl);
3039
- writeOutput(subject, parsed, io, (record) => {
3040
- const value = record;
3041
- return [
3042
- `${value.id ?? subjectId}\t${value.status ?? "unknown"}`,
3043
- ...(value.benchmarkFingerprint ? [`Benchmark version: ${shortDigest(value.benchmarkFingerprint)}`] : []),
3044
- ...(value.subjectFingerprint ? [`Subject digest: ${shortDigest(value.subjectFingerprint)}`] : []),
3045
- ].join("\n");
3046
- });
3047
- return 0;
3048
- }
3049
- async function subjectFiles(argv, io) {
3050
- const parsed = parseArgs(argv);
3051
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3052
- rejectUnexpectedPositionals(parsed, "workbench cloud subjects files", 1);
3053
- const target = await resolveHostedTarget(parsed);
3054
- const subjectId = readRequiredSubjectId(parsed);
3055
- const response = await apiRequest(projectApiPath(target.projectId, `/subjects/${encodeURIComponent(subjectId)}/files`), {}, target.baseUrl);
3056
- writeOutput(response.files, parsed, io, (files) => files
3057
- .map((file) => `${file.path}\t${file.status}\t${file.preview_kind}`)
3058
- .join("\n") || "No files.");
3059
- return 0;
3592
+ return next;
3060
3593
  }
3061
- async function subjectPreview(argv, io) {
3062
- const parsed = parseArgs(argv);
3063
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "path", "output", "json"]));
3064
- rejectUnexpectedPositionals(parsed, "workbench cloud subjects preview", 1);
3065
- const target = await resolveHostedTarget(parsed);
3066
- const subjectId = readRequiredSubjectId(parsed);
3067
- const filePath = requireFlag(parsed, "path");
3068
- const params = new URLSearchParams({ path: filePath });
3069
- const response = await apiRequest(projectApiPath(target.projectId, `/subjects/${encodeURIComponent(subjectId)}/files?${params.toString()}`), {}, target.baseUrl);
3070
- const content = response.preview.source?.content ??
3071
- response.preview.rendered_html ??
3072
- response.preview.diff ??
3073
- "";
3074
- const outputPath = asOptionalString(parsed.flags.output);
3075
- if (outputPath && outputPath !== "-") {
3076
- await fs.writeFile(outputPath, content);
3077
- io.stdout.write(`Wrote preview to ${outputPath}\n`);
3594
+ function appendStringFlag(args, name, value) {
3595
+ if (value !== undefined) {
3596
+ args.push(`--${name}`, value);
3078
3597
  }
3079
- else if (parsed.flags.json === true) {
3080
- writeJson(response.preview, io);
3081
- }
3082
- else {
3083
- io.stdout.write(content);
3084
- }
3085
- return 0;
3086
- }
3087
- async function subjectExport(argv, io) {
3088
- const parsed = parseArgs(argv);
3089
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "out", "json"]));
3090
- rejectUnexpectedPositionals(parsed, "workbench cloud subjects pull", 1);
3091
- const target = await resolveHostedTarget(parsed);
3092
- const subjectId = readRequiredSubjectId(parsed);
3093
- const outputDir = requireOutDir(parsed);
3094
- const response = await apiRequest(projectApiPath(target.projectId, `/subjects/${encodeURIComponent(subjectId)}/export`), {}, target.baseUrl);
3095
- await writeFiles(outputDir, response.files);
3096
- writeOutput({ ok: true, outputDir, files: response.files.length }, parsed, io, (result) => {
3097
- const record = result;
3098
- return `Exported ${record.files} file(s) to ${record.outputDir}`;
3099
- });
3100
- return 0;
3101
- }
3102
- async function subjectVisibility(argv, io, visibility) {
3103
- const parsed = parseArgs(argv);
3104
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3105
- rejectUnexpectedPositionals(parsed, `workbench cloud subjects ${visibility === "public" ? "publish" : "unpublish"}`, 1);
3106
- const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
3107
- const subjectId = readRequiredSubjectId(parsed);
3108
- const response = await apiRequest(projectApiPath(target.projectId, `/subjects/${encodeURIComponent(subjectId)}/publish`), { method: visibility === "public" ? "PUT" : "DELETE" }, target.baseUrl);
3109
- writeOutput({ ok: true, visibility, subject: response.subject }, parsed, io, () => `${visibility === "public" ? "Published" : "Unpublished"} subject ${subjectId}.`);
3110
- return 0;
3111
- }
3112
- async function runList(argv, io) {
3113
- const parsed = parseArgs(argv);
3114
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3115
- rejectUnexpectedPositionals(parsed, "workbench cloud runs list", 0);
3116
- const target = await resolveHostedTarget(parsed);
3117
- const response = await apiRequest(projectApiPath(target.projectId, "/runs"), {}, target.baseUrl);
3118
- writeOutput(response.runs, parsed, io, (runs) => runs
3119
- .map((run) => `${run.id}\t${run.status}\t${run.subjectId ?? "pending"}`)
3120
- .join("\n") || "No runs.");
3121
- return 0;
3122
- }
3123
- async function runShow(argv, io) {
3124
- const parsed = parseArgs(argv);
3125
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3126
- rejectUnexpectedPositionals(parsed, "workbench cloud runs show", 1);
3127
- const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
3128
- const runId = readRequiredRunId(parsed);
3129
- const response = await apiRequest(projectApiPath(target.projectId, `/runs/${encodeURIComponent(runId)}`), {}, target.baseUrl);
3130
- const detail = withRunDetailUrls(target, response);
3131
- writeOutput(detail, parsed, io, formatRunDetail);
3132
- return 0;
3133
- }
3134
- async function runCancel(argv, io) {
3135
- const parsed = parseArgs(argv);
3136
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3137
- rejectUnexpectedPositionals(parsed, "workbench cloud runs cancel", 1);
3138
- const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
3139
- const runId = readRequiredRunId(parsed);
3140
- const response = await apiRequest(projectApiPath(target.projectId, `/runs/${encodeURIComponent(runId)}`), { method: "DELETE" }, target.baseUrl);
3141
- const run = withRunUrls(target, response.run);
3142
- writeOutput(run, parsed, io, (record) => {
3143
- const value = record;
3144
- return [
3145
- `Cancelled run ${value.id}; status ${value.status}; outcome ${value.outcome ?? "cancelled"}.`,
3146
- `Open benchmark: ${value.urls?.benchmark ?? buildWorkbenchResourceUrls(target).benchmark}`,
3147
- ].join("\n");
3148
- });
3149
- return 0;
3150
3598
  }
3151
- async function runWatch(argv, io) {
3152
- const parsed = parseArgs(argv);
3153
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "interval-ms", "timeout-ms", "json"]));
3154
- rejectUnexpectedPositionals(parsed, "workbench cloud watch", 1);
3155
- const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
3156
- const runId = readRequiredRunId(parsed);
3157
- if (parsed.flags.json !== true) {
3158
- io.stdout.write(`Watching run ${runId}.\n${HOSTED_WATCH_LIFECYCLE_NOTE}\n`);
3159
- }
3160
- const run = await watchHostedRun({
3161
- parsed,
3162
- target,
3163
- runId,
3164
- intervalMs: parsePositiveInt(parsed.flags["interval-ms"], 1000, "interval-ms"),
3165
- timeoutMs: parseOptionalPositiveInt(parsed.flags["timeout-ms"], "timeout-ms"),
3166
- });
3167
- const outputRun = await withHostedRunFailureSummary(target, run);
3168
- writeOutput(withRunUrls(target, outputRun), parsed, io, formatHostedRunResult);
3169
- return hostedRunSucceeded(run) ? 0 : 1;
3599
+ async function readHostedCandidateSummary(target, candidateId) {
3600
+ const response = await apiRequest(projectApiPath(target.projectId, "/candidates"), {}, target.baseUrl);
3601
+ return response.candidates.find((entry) => entry.id === candidateId) ?? null;
3170
3602
  }
3171
- async function runLogs(argv, io) {
3172
- const parsed = parseArgs(argv);
3173
- rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3174
- rejectUnexpectedPositionals(parsed, "workbench cloud logs", 1);
3175
- const target = await resolveHostedTarget(parsed);
3176
- const requestedRunId = parsed.positionals[0];
3177
- if (requestedRunId) {
3178
- const response = await apiRequest(projectApiPath(target.projectId, `/runs/${encodeURIComponent(requestedRunId)}`), {}, target.baseUrl);
3179
- writeOutput({ runId: response.run.id, jobs: response.jobs }, parsed, io, formatRunLogs);
3180
- return 0;
3181
- }
3182
- const project = (await apiRequest(projectApiPath(target.projectId), {}, target.baseUrl)).project;
3183
- const runId = project.runs.at(-1)?.id;
3184
- if (!runId) {
3185
- throw new UsageError("Missing RUN_ID; the benchmark has no runs.");
3603
+ async function readEvaluatedActiveHostedCandidate(target) {
3604
+ const response = await apiRequest(projectApiPath(target.projectId), {}, target.baseUrl);
3605
+ const activeCandidateId = response.benchmark.activeCandidateId;
3606
+ if (!activeCandidateId) {
3607
+ return null;
3186
3608
  }
3187
- const jobs = project.jobs.filter((job) => job.runId === runId);
3188
- writeOutput({ runId, jobs }, parsed, io, formatRunLogs);
3189
- return 0;
3609
+ const candidate = await readHostedCandidateSummary(target, activeCandidateId);
3610
+ return candidate && hostedCandidateIsEvaluated(candidate) ? candidate : null;
3190
3611
  }
3191
- function formatRunLogs(record) {
3192
- const value = record;
3193
- return (value.jobs
3194
- .map((job) => `${job.id}\t${job.kind}\t${job.status}\t${job.subjectId ?? "-"}${job.error ? `\t${job.error}` : ""}`)
3195
- .join("\n") || `No jobs for ${value.runId}.`);
3612
+ function hostedCandidateIsEvaluated(candidate) {
3613
+ return candidate.status === "evaluated" || candidate.eval != null;
3196
3614
  }
3197
3615
  async function openWorkbench(argv, io) {
3198
3616
  const parsed = parseArgs(argv);
3199
3617
  rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "no-open", "json"]));
3200
3618
  if (parsed.positionals.length > 1) {
3201
- throw new UsageError(`Unexpected argument for workbench open: ${parsed.positionals.slice(1).join(" ")}`);
3619
+ throw new UsageError(`Unexpected argument for workbench open --hosted: ${parsed.positionals.slice(1).join(" ")}`);
3202
3620
  }
3203
3621
  const target = await resolveOpenTarget(parsed);
3204
3622
  const ref = target.openRef;
@@ -3226,7 +3644,7 @@ function buildWorkbenchWebUrl(target, ref) {
3226
3644
  if (ref.startsWith("run_")) {
3227
3645
  return benchmarkUrl;
3228
3646
  }
3229
- return buildWorkbenchResourceUrls(target, { subjectId: ref }).subjectEvaluation;
3647
+ return buildWorkbenchResourceUrls(target, { candidateId: ref }).candidateEvaluation;
3230
3648
  }
3231
3649
  async function resolveHostedTarget(parsed, options = {}) {
3232
3650
  if (options.sourceArg !== undefined && parsed.flags.dir !== undefined) {
@@ -3253,11 +3671,12 @@ async function resolveHostedTarget(parsed, options = {}) {
3253
3671
  if (!projectId) {
3254
3672
  throw new UsageError("Missing hosted benchmark. Run workbench push, workbench clone, or pass --benchmark OWNER/BENCHMARK.");
3255
3673
  }
3674
+ const originRemote = origin ? parseOriginRemote(origin) : null;
3256
3675
  return {
3257
3676
  projectId,
3258
- ...(!explicitProject && origin?.owner ? { owner: origin.owner } : {}),
3259
- ...(!explicitProject && origin?.project
3260
- ? { projectName: origin.project }
3677
+ ...(!explicitProject && originRemote ? { owner: originRemote.owner } : {}),
3678
+ ...(!explicitProject && originRemote
3679
+ ? { projectName: originRemote.project }
3261
3680
  : {}),
3262
3681
  dir,
3263
3682
  baseUrl,
@@ -3295,13 +3714,12 @@ async function resolveHostedDryRunTarget(parsed, options = {}) {
3295
3714
  };
3296
3715
  }
3297
3716
  if (origin?.projectId) {
3717
+ const originRemote = parseOriginRemote(origin);
3298
3718
  return {
3299
- projectRef: origin.owner && origin.project
3300
- ? `${origin.owner}/${origin.project}`
3301
- : origin.projectId,
3719
+ projectRef: origin.remote,
3302
3720
  projectId: origin.projectId,
3303
- ...(origin.owner ? { owner: origin.owner } : {}),
3304
- ...(origin.project ? { projectName: origin.project } : {}),
3721
+ owner: originRemote.owner,
3722
+ projectName: originRemote.project,
3305
3723
  dir,
3306
3724
  baseUrl,
3307
3725
  origin,
@@ -3313,7 +3731,7 @@ async function resolveOpenTarget(parsed) {
3313
3731
  const ref = parsed.positionals[0];
3314
3732
  if (ref &&
3315
3733
  !ref.startsWith("run_") &&
3316
- !ref.startsWith("subject_")) {
3734
+ !ref.startsWith("candidate_")) {
3317
3735
  const baseUrl = await effectiveBaseUrl();
3318
3736
  if (ref.includes("/")) {
3319
3737
  const parsedRef = parseBenchmarkRef(ref);
@@ -3347,51 +3765,44 @@ function buildWorkbenchResourceUrls(target, refs = {}) {
3347
3765
  const projectRef = `${encodeURIComponent(target.owner)}/${encodeURIComponent(target.projectName)}`;
3348
3766
  const benchmark = `${target.baseUrl}/benchmarks/${projectRef}`;
3349
3767
  const urls = { benchmark };
3350
- if (refs.subjectId) {
3768
+ if (refs.candidateId) {
3351
3769
  const evaluationId = refs.runId
3352
- ? evaluationScorecardId(refs.runId, refs.subjectId)
3770
+ ? evaluationScorecardId(refs.runId, refs.candidateId)
3353
3771
  : null;
3354
- urls.subjectEvaluation = evaluationId
3355
- ? `${benchmark}/subjects/${encodeURIComponent(refs.subjectId)}?evaluation=${encodeURIComponent(evaluationId)}`
3356
- : `${benchmark}/subjects/${encodeURIComponent(refs.subjectId)}`;
3772
+ urls.candidateEvaluation = evaluationId
3773
+ ? `${benchmark}/candidates/${encodeURIComponent(refs.candidateId)}?evaluation=${encodeURIComponent(evaluationId)}`
3774
+ : `${benchmark}/candidates/${encodeURIComponent(refs.candidateId)}`;
3357
3775
  }
3358
3776
  return urls;
3359
3777
  }
3360
3778
  function projectApiPath(projectRef, suffix = "") {
3361
3779
  return `/api/workbench/benchmarks/${encodeURIComponent(projectRef)}${suffix}`;
3362
3780
  }
3363
- function benchmarkApiPath(benchmarkRef) {
3364
- if (benchmarkRef.includes("/")) {
3365
- return publicProjectApiPath(parseBenchmarkRef(benchmarkRef));
3366
- }
3367
- return projectApiPath(benchmarkRef);
3368
- }
3369
3781
  function publicProjectApiPath(ref) {
3370
3782
  return `/api/workbench/public/benchmarks/${encodeURIComponent(ref.owner)}/${encodeURIComponent(ref.project)}`;
3371
3783
  }
3372
- function publicProjectSourceApiPath(ref) {
3373
- return `${publicProjectApiPath(ref)}/source`;
3784
+ function publicProjectStateApiPath(ref) {
3785
+ return `${publicProjectApiPath(ref)}/state`;
3374
3786
  }
3375
3787
  function readRequiredBenchmarkRef(parsed) {
3376
3788
  const ref = parsed.positionals[0];
3377
3789
  if (!ref) {
3378
- throw new UsageError("Missing required OWNER/BENCHMARK ref.");
3790
+ throw new UsageError("Missing required OWNER/BENCHMARK.");
3379
3791
  }
3380
3792
  return parseBenchmarkRef(ref);
3381
3793
  }
3382
3794
  function parseBenchmarkRef(value) {
3383
- const [namePart, versionRef, extraRef] = value.split("@");
3384
- if (extraRef !== undefined || !namePart) {
3385
- throw new UsageError("Benchmark refs must use OWNER/BENCHMARK[@REF].");
3795
+ if (value.includes("@")) {
3796
+ throw new UsageError("Benchmark refs must use OWNER/BENCHMARK.");
3386
3797
  }
3387
- const [owner, project, extra] = namePart.split("/");
3798
+ const [owner, project, extra] = value.split("/");
3388
3799
  if (!owner || !project || extra !== undefined) {
3389
- throw new UsageError("Benchmark refs must use OWNER/BENCHMARK[@REF].");
3800
+ throw new UsageError("Benchmark refs must use OWNER/BENCHMARK.");
3390
3801
  }
3391
- return { owner, project, ...(versionRef ? { ref: versionRef } : {}) };
3802
+ return { owner, project };
3392
3803
  }
3393
3804
  function formatBenchmarkRef(ref) {
3394
- return `${ref.owner}/${ref.project}${ref.ref ? `@${ref.ref}` : ""}`;
3805
+ return `${ref.owner}/${ref.project}`;
3395
3806
  }
3396
3807
  async function resolveRemoteProject(projectRef, baseUrl) {
3397
3808
  if (projectRef.includes("/")) {
@@ -3402,52 +3813,84 @@ async function resolveRemoteProject(projectRef, baseUrl) {
3402
3813
  const response = await apiRequest(projectApiPath(projectRef), {}, baseUrl);
3403
3814
  return response.benchmark;
3404
3815
  }
3405
- function formatProjectRef(project) {
3406
- return project.name ? `${project.name} (${project.id})` : project.id;
3407
- }
3408
- function originMatchesProjectRef(origin, projectRef) {
3409
- if (!origin) {
3410
- return false;
3411
- }
3412
- if (origin.projectId === projectRef) {
3413
- return true;
3414
- }
3415
- if (!projectRef.includes("/")) {
3416
- return false;
3417
- }
3418
- const ref = parseBenchmarkRef(projectRef);
3419
- return origin.owner === ref.owner && origin.project === ref.project;
3420
- }
3421
3816
  function withRunUrls(target, run) {
3422
3817
  return {
3423
3818
  ...run,
3424
3819
  urls: buildWorkbenchResourceUrls(target, {
3425
3820
  runId: run.id,
3426
- subjectId: run.outputSubjectId ?? run.subjectId,
3821
+ candidateId: run.outputCandidateId ?? run.candidateId,
3427
3822
  }),
3428
3823
  };
3429
3824
  }
3430
- function withRunDetailUrls(target, detail) {
3431
- const subjectId = hostedRunEvaluationSubjectId(detail.run, detail.jobs);
3432
- const run = withRunUrls(target, {
3433
- ...detail.run,
3434
- outputSubjectId: detail.run.outputSubjectId ?? subjectId,
3435
- });
3825
+ function hostedRunEvaluationCandidateId(run, jobs = []) {
3826
+ if (run.outputCandidateId) {
3827
+ return run.outputCandidateId;
3828
+ }
3829
+ const attemptCandidates = jobs
3830
+ .filter((job) => readRunJobPurpose(job) === "attempt")
3831
+ .map((job) => job.candidateId)
3832
+ .filter((candidateId) => Boolean(candidateId));
3833
+ return attemptCandidates.at(-1) ?? run.candidateId ?? null;
3834
+ }
3835
+ function localProjectState(args) {
3836
+ const stateSource = localProjectStateSource(args.source);
3837
+ const runtimeFingerprint = workbenchRuntimeBundleFingerprint(args.runtime);
3436
3838
  return {
3437
- run,
3438
- jobs: detail.jobs,
3439
- urls: run.urls ?? buildWorkbenchResourceUrls(target, { runId: run.id }),
3839
+ schema: "workbench.project.state.v1",
3840
+ project: {
3841
+ id: args.origin?.projectId ?? "",
3842
+ remote: args.origin?.remote ?? `local/${args.source.spec.name}`,
3843
+ ownerUsername: args.origin ? parseOriginRemote(args.origin).owner : "local",
3844
+ name: args.origin ? parseOriginRemote(args.origin).project : args.source.spec.name,
3845
+ visibility: args.visibility,
3846
+ },
3847
+ base: {
3848
+ ...(args.origin ? { sourceRevisionId: args.origin.sourceRevisionId } : {}),
3849
+ ...(args.origin ? { sourceFingerprint: args.origin.sourceFingerprint } : {}),
3850
+ runtimeFingerprint: args.origin?.runtimeFingerprint ?? runtimeFingerprint,
3851
+ },
3852
+ source: stateSource,
3853
+ runtime: args.runtime,
3440
3854
  };
3441
3855
  }
3442
- function hostedRunEvaluationSubjectId(run, jobs = []) {
3443
- if (run.outputSubjectId) {
3444
- return run.outputSubjectId;
3445
- }
3446
- const attemptSubjects = jobs
3447
- .filter((job) => readRunJobPurpose(job) === "attempt")
3448
- .map((job) => job.subjectId)
3449
- .filter((subjectId) => Boolean(subjectId));
3450
- return attemptSubjects.at(-1) ?? run.subjectId ?? null;
3856
+ function localProjectStateSource(source) {
3857
+ const request = hostedProjectSourceRequest(source);
3858
+ const stateSource = {
3859
+ source: request.source,
3860
+ files: source.sourceFiles.map((file) => ({ ...file })),
3861
+ candidateFiles: request.candidateFiles.map(toSurfaceSnapshotFile),
3862
+ engineResolveFiles: request.engineResolveFiles.map(toSurfaceSnapshotFile),
3863
+ engineResolveBinding: request.engineResolveBinding,
3864
+ adapterFiles: request.adapterFiles.map(toSurfaceSnapshotFile),
3865
+ dockerfile: request.dockerfile,
3866
+ runtimeDockerfile: request.runtimeDockerfile,
3867
+ runtimeFiles: request.runtimeFiles.map(toSurfaceSnapshotFile),
3868
+ network: request.network,
3869
+ resources: { ...request.resources },
3870
+ };
3871
+ return {
3872
+ ...stateSource,
3873
+ fingerprint: workbenchProjectSourceFingerprint(stateSource),
3874
+ };
3875
+ }
3876
+ function toSurfaceSnapshotFile(file) {
3877
+ return {
3878
+ path: file.path,
3879
+ kind: "kind" in file ? file.kind : file.encoding === "base64" ? "binary" : "text",
3880
+ encoding: file.encoding ?? "utf8",
3881
+ content: file.content,
3882
+ executable: file.executable === true,
3883
+ };
3884
+ }
3885
+ function hostedProjectSummaryFromState(state) {
3886
+ return {
3887
+ id: state.project.id,
3888
+ ownerUsername: state.project.ownerUsername,
3889
+ name: state.project.name,
3890
+ visibility: state.project.visibility,
3891
+ currentSpecVersionId: state.source.revisionId ?? state.base.sourceRevisionId,
3892
+ sourceFingerprint: state.source.fingerprint ?? state.base.sourceFingerprint,
3893
+ };
3451
3894
  }
3452
3895
  function sourceFileCount(source) {
3453
3896
  return source.sourceFiles.length;
@@ -3456,7 +3899,7 @@ function hostedProjectSourceRequest(source) {
3456
3899
  const { network, resources } = hostedEnvironmentOptions(source);
3457
3900
  return {
3458
3901
  source: source.specSource,
3459
- subjectFiles: source.subjectFiles,
3902
+ candidateFiles: source.candidateFiles,
3460
3903
  engineResolveFiles: hostedEngineResolveFiles(source),
3461
3904
  engineResolveBinding: engineResolveBindingForSpec(source.spec),
3462
3905
  adapterFiles: source.adapterFiles,
@@ -3539,68 +3982,48 @@ async function watchHostedRun(args) {
3539
3982
  }
3540
3983
  }
3541
3984
  function formatHostedRunResult(run) {
3542
- const subjectId = run.outputSubjectId ?? run.subjectId;
3543
- const activeDetail = run.activeSubjectId && subjectId && run.activeSubjectId !== subjectId
3544
- ? `; active ${run.activeSubjectId}`
3985
+ const candidateId = run.outputCandidateId ?? run.candidateId;
3986
+ const activeDetail = run.activeCandidateId && candidateId && run.activeCandidateId !== candidateId
3987
+ ? `; active ${run.activeCandidateId}`
3545
3988
  : "";
3546
- const summary = `Run ${run.id} reached ${run.status}; ${run.outcome ? `outcome ${run.outcome}; ` : ""}subject ${subjectId ?? "pending"}${activeDetail}; ${run.completedJobCount ?? 0}/${run.jobCount ?? 0} jobs completed.`;
3989
+ const summary = `Run ${run.id} reached ${run.status}; ${run.outcome ? `outcome ${run.outcome}; ` : ""}candidate ${candidateId ?? "pending"}${activeDetail}; ${run.completedJobCount ?? 0}/${run.jobCount ?? 0} jobs completed.`;
3547
3990
  return [
3548
3991
  run.error ? `${summary}\nError: ${run.error}` : summary,
3549
- ...(run.urls?.subjectEvaluation
3550
- ? [`Open evaluation: ${run.urls.subjectEvaluation}`]
3992
+ ...(run.urls?.candidateEvaluation
3993
+ ? [`Open evaluation: ${run.urls.candidateEvaluation}`]
3551
3994
  : [`Open benchmark: ${run.urls?.benchmark ?? ""}`].filter(Boolean)),
3552
3995
  ].join("\n");
3553
3996
  }
3554
- function formatHostedRunStarted(run, fallbackWorkflow) {
3555
- const subjectId = run.outputSubjectId ?? run.subjectId;
3556
- return [
3557
- `Started ${run.workflow ?? fallbackWorkflow} run ${run.id}; ${subjectId ? `subject ${subjectId}` : `${run.jobCount ?? 0} jobs queued`}.`,
3558
- ...(run.urls?.subjectEvaluation
3559
- ? [`Open evaluation: ${run.urls.subjectEvaluation}`]
3560
- : run.urls?.benchmark ? [`Open benchmark: ${run.urls.benchmark}`] : []),
3561
- "",
3562
- ].join("\n");
3563
- }
3564
- function formatRunDetail(record) {
3565
- const detail = record;
3566
- const { run, jobs, urls } = detail;
3567
- const cost = sumJobCostUsd(jobs);
3568
- const firstFailedJob = jobs.find((job) => job.status === "failed" && job.error);
3569
- const subjectId = hostedRunEvaluationSubjectId(run, jobs);
3997
/**
 * Renders the outcome of a retry command as newline-separated text.
 *
 * The headline names what was retried and whether the retry ran as a hosted
 * run (finished or started) or as a local run. Optional detail lines are
 * added for the evaluation, candidate, failed job count, error, local view
 * and the first available URL.
 *
 * @param {object} result - Retry result; `result.retried` must provide `kind` and `id`.
 * @returns {string}
 */
function formatRetryCommandResult(result) {
    const hostedRun = result.run;
    const runId = hostedRun?.id ?? result.runId ?? "unknown";
    const scope = `${result.retried.kind} ${result.retried.id}`;
    let verb;
    if (!hostedRun) {
        verb = "finished as local run";
    }
    else if (hostedRun.status === "finished") {
        verb = "finished as hosted run";
    }
    else {
        verb = "started as hosted run";
    }
    const lines = [`Retry of ${scope} ${verb} ${runId}.`];
    if (result.evaluationId) {
        lines.push(`Evaluation: ${result.evaluationId}`);
    }
    if (result.candidateId) {
        lines.push(`Candidate: ${result.candidateId}`);
    }
    if (result.failedJobCount) {
        lines.push(`Failed jobs: ${result.failedJobCount}`);
    }
    if (result.error) {
        lines.push(`Error: ${result.error}`);
    }
    if (result.localView) {
        lines.push(`Open local view: ${result.localView.command}`, result.localView.note);
    }
    // Prefer the evaluation link; fall back to the benchmark link if there is one.
    if (result.urls?.candidateEvaluation) {
        lines.push(`Open evaluation: ${result.urls.candidateEvaluation}`);
    }
    else if (result.urls?.benchmark) {
        lines.push(`Open benchmark: ${result.urls.benchmark}`);
    }
    return lines.join("\n");
}
3593
- function formatRunJobLine(job) {
4018
/**
 * Renders the "run started" message for a hosted run.
 *
 * The headline uses `run.workflow` or, when that is nullish, the supplied
 * fallback. It reports the candidate id when known and the queued job count
 * otherwise. The result always ends with a trailing newline.
 *
 * @param {object} run - Hosted run record.
 * @param {string} fallbackWorkflow - Workflow name used when the run has none.
 * @returns {string}
 */
function formatHostedRunStarted(run, fallbackWorkflow) {
    const candidateId = run.outputCandidateId ?? run.candidateId;
    const workflow = run.workflow ?? fallbackWorkflow;
    const target = candidateId
        ? `candidate ${candidateId}`
        : `${run.jobCount ?? 0} jobs queued`;
    const lines = [`Started ${workflow} run ${run.id}; ${target}.`];
    if (run.urls?.candidateEvaluation) {
        lines.push(`Open evaluation: ${run.urls.candidateEvaluation}`);
    }
    else if (run.urls?.benchmark) {
        lines.push(`Open benchmark: ${run.urls.benchmark}`);
    }
    // The empty final entry yields the trailing newline after join.
    lines.push("");
    return lines.join("\n");
}
3605
4028
  function readRunJobPurpose(job) {
3606
4029
  const input = readRecord(job.input);
@@ -3608,49 +4031,22 @@ function readRunJobPurpose(job) {
3608
4031
  const purpose = execution?.purpose;
3609
4032
  return typeof purpose === "string" && purpose ? purpose : null;
3610
4033
  }
3611
- function sumJobCostUsd(jobs) {
3612
- const sum = jobs.reduce((total, job) => total + costUsdFromUsage(readRecord(job.output)?.usage), 0);
3613
- return Number.isFinite(sum) ? Math.round(sum * 1_000_000) / 1_000_000 : 0;
3614
- }
3615
- function costUsdFromUsage(value) {
3616
- const usage = readRecord(value);
3617
- if (!usage) {
3618
- return 0;
3619
- }
3620
- const direct = readFiniteNumber(usage.costUsd);
3621
- if (direct !== null) {
3622
- return direct;
3623
- }
3624
- return ["total", "optimizer", "runner", "engine"].reduce((sum, key) => {
3625
- const nested = readRecord(usage[key]);
3626
- return sum + (readFiniteNumber(nested?.costUsd) ?? 0);
3627
- }, 0);
3628
- }
3629
4034
  function readRecord(value) {
3630
4035
  return value && typeof value === "object" && !Array.isArray(value)
3631
4036
  ? value
3632
4037
  : null;
3633
4038
  }
3634
- function readFiniteNumber(value) {
3635
- return typeof value === "number" && Number.isFinite(value) ? value : null;
4039
/**
 * Returns `value` when it is a non-empty string; otherwise null.
 *
 * @param {*} value
 * @returns {string|null}
 */
function stringValue(value) {
    if (typeof value !== "string" || value.length === 0) {
        return null;
    }
    return value;
}
3637
- function formatDurationMs(durationMs) {
3638
- if (durationMs < 1000) {
3639
- return `${Math.max(0, Math.round(durationMs))}ms`;
3640
- }
3641
- const seconds = durationMs / 1000;
3642
- if (seconds < 60) {
3643
- return `${seconds.toFixed(seconds < 10 ? 1 : 0)}s`;
3644
- }
3645
- const minutes = Math.floor(seconds / 60);
3646
- const remainingSeconds = Math.round(seconds % 60);
3647
- return `${minutes}m ${remainingSeconds}s`;
4042
/**
 * Returns `value` when it is a finite number; otherwise null.
 *
 * @param {*} value
 * @returns {number|null}
 */
function numberValue(value) {
    const isFiniteNumber = typeof value === "number" && Number.isFinite(value);
    return isFiniteNumber ? value : null;
}
3649
- function formatUsd(value) {
3650
- return `$${value.toFixed(value < 1 ? 4 : 2)}`;
4045
/**
 * Returns `value` when `Number.isSafeInteger` accepts it; otherwise null.
 *
 * @param {*} value
 * @returns {number|null}
 */
function integerValue(value) {
    if (!Number.isSafeInteger(value)) {
        return null;
    }
    return value;
}
3652
- function shortDigest(value) {
3653
- return value.length > 12 ? value.slice(0, 12) : value;
4048
/**
 * Returns `value` when it is a number and finite; otherwise null.
 *
 * @param {*} value
 * @returns {number|null}
 */
function readFiniteNumber(value) {
    if (typeof value !== "number" || !Number.isFinite(value)) {
        return null;
    }
    return value;
}
3655
4051
  async function withHostedRunFailureSummary(target, run) {
3656
4052
  if (hostedRunSucceeded(run) || run.error || (run.failedJobCount ?? 0) <= 0) {
@@ -3681,23 +4077,44 @@ function hostedRunSucceeded(run) {
3681
4077
  async function readWorkbenchOrigin(dir) {
3682
4078
  try {
3683
4079
  const parsed = JSON.parse(await fs.readFile(workbenchOriginPath(dir), "utf8"));
3684
- if (!parsed.projectId ||
3685
- !parsed.baseUrl ||
3686
- !parsed.owner ||
3687
- !parsed.project ||
3688
- typeof parsed.writable !== "boolean") {
4080
+ if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
4081
+ throw new UsageError(`Workbench origin is malformed: ${workbenchOriginPath(dir)}`);
4082
+ }
4083
+ const originRecord = parsed;
4084
+ const keys = Object.keys(originRecord).sort();
4085
+ const expectedKeys = [
4086
+ "baseUrl",
4087
+ "linkedAt",
4088
+ "projectId",
4089
+ "remote",
4090
+ "runtimeFingerprint",
4091
+ "sourceFingerprint",
4092
+ "sourceRevisionId",
4093
+ ];
4094
+ if (typeof originRecord.projectId !== "string" ||
4095
+ typeof originRecord.baseUrl !== "string" ||
4096
+ typeof originRecord.remote !== "string" ||
4097
+ typeof originRecord.sourceRevisionId !== "string" ||
4098
+ typeof originRecord.sourceFingerprint !== "string" ||
4099
+ typeof originRecord.runtimeFingerprint !== "string" ||
4100
+ typeof originRecord.linkedAt !== "string" ||
4101
+ originRecord.projectId.length === 0 ||
4102
+ originRecord.sourceRevisionId.length === 0 ||
4103
+ originRecord.sourceFingerprint.length === 0 ||
4104
+ originRecord.runtimeFingerprint.length === 0) {
4105
+ throw new UsageError(`Workbench origin is malformed: ${workbenchOriginPath(dir)}`);
4106
+ }
4107
+ if (JSON.stringify(keys) !== JSON.stringify(expectedKeys)) {
3689
4108
  throw new UsageError(`Workbench origin is malformed: ${workbenchOriginPath(dir)}`);
3690
4109
  }
3691
4110
  return {
3692
- baseUrl: normalizeBaseUrl(parsed.baseUrl),
3693
- owner: parsed.owner,
3694
- project: parsed.project,
3695
- projectId: parsed.projectId,
3696
- writable: parsed.writable,
3697
- ...(parsed.sourceRevisionId ? { sourceRevisionId: parsed.sourceRevisionId } : {}),
3698
- ...(parsed.sourceFingerprint ? { sourceFingerprint: parsed.sourceFingerprint } : {}),
3699
- ...(parsed.upstream ? { upstream: parsed.upstream } : {}),
3700
- linkedAt: parsed.linkedAt ?? new Date(0).toISOString(),
4111
+ baseUrl: normalizeBaseUrl(originRecord.baseUrl),
4112
+ remote: normalizeOriginRemote(originRecord.remote),
4113
+ projectId: originRecord.projectId,
4114
+ sourceRevisionId: originRecord.sourceRevisionId,
4115
+ sourceFingerprint: originRecord.sourceFingerprint,
4116
+ runtimeFingerprint: originRecord.runtimeFingerprint,
4117
+ linkedAt: originRecord.linkedAt,
3701
4118
  };
3702
4119
  }
3703
4120
  catch (error) {
@@ -3716,8 +4133,12 @@ async function requireWorkbenchOrigin(dir) {
3716
4133
  }
3717
4134
  async function writeWorkbenchOrigin(dir, input) {
3718
4135
  const origin = {
3719
- ...input,
3720
4136
  baseUrl: normalizeBaseUrl(input.baseUrl),
4137
+ remote: normalizeOriginRemote(input.remote),
4138
+ projectId: input.projectId,
4139
+ sourceRevisionId: input.sourceRevisionId,
4140
+ sourceFingerprint: input.sourceFingerprint,
4141
+ runtimeFingerprint: input.runtimeFingerprint,
3721
4142
  linkedAt: input.linkedAt ?? new Date().toISOString(),
3722
4143
  };
3723
4144
  const filePath = workbenchOriginPath(dir);
@@ -3725,6 +4146,56 @@ async function writeWorkbenchOrigin(dir, input) {
3725
4146
  await fs.writeFile(filePath, `${JSON.stringify(origin, null, 2)}\n`);
3726
4147
  return origin;
3727
4148
  }
4149
/**
 * Derives origin metadata from a hosted project state and writes it via
 * `writeWorkbenchOrigin`.
 *
 * Values on `args.project` take precedence over the matching values in
 * `args.state`; an explicit `args.sourceFingerprint` takes precedence over
 * both. The runtime fingerprint comes from `state.base` or is computed from
 * `state.runtime` when absent.
 *
 * @param {string} dir - Directory passed through to `writeWorkbenchOrigin`.
 * @param {object} args - `{ baseUrl, state, project?, sourceFingerprint? }`.
 * @returns {Promise<object>} The value resolved by `writeWorkbenchOrigin`.
 * @throws {UsageError} When revision id or either fingerprint is missing.
 */
async function writeWorkbenchOriginFromState(dir, args) {
    const { project, state } = args;
    const owner = project?.ownerUsername ?? state.project.ownerUsername;
    const name = project?.name ?? state.project.name;
    const sourceRevisionId = project?.currentSpecVersionId
        ?? state.source.revisionId
        ?? state.base.sourceRevisionId;
    const sourceFingerprint = args.sourceFingerprint
        ?? project?.sourceFingerprint
        ?? state.source.fingerprint
        ?? state.base.sourceFingerprint;
    const runtimeFingerprint = state.base.runtimeFingerprint
        ?? workbenchRuntimeBundleFingerprint(state.runtime);
    const hasAllMetadata = Boolean(sourceRevisionId && sourceFingerprint && runtimeFingerprint);
    if (!hasAllMetadata) {
        throw new UsageError("Hosted project state is missing required origin metadata.");
    }
    const origin = await writeWorkbenchOrigin(dir, {
        baseUrl: args.baseUrl,
        remote: `${owner}/${name}`,
        projectId: project?.id ?? state.project.id,
        sourceRevisionId,
        sourceFingerprint,
        runtimeFingerprint,
    });
    return origin;
}
4173
/**
 * Reads the local project source in `dir` and returns its fingerprint,
 * computing one when the state source does not already carry it.
 *
 * @param {string} dir
 * @returns {Promise<*>} The source fingerprint.
 */
async function localSourceFingerprint(dir) {
    const localSource = await readLocalProjectSource(dir);
    const stateSource = localProjectStateSource(localSource);
    const { fingerprint } = stateSource;
    return fingerprint ?? workbenchProjectSourceFingerprint(stateSource);
}
4177
/**
 * Parses the `remote` field of an origin record with `parseRemoteName`.
 *
 * @param {object} origin - Origin record carrying a `remote` value.
 * @returns {*} Whatever `parseRemoteName` returns for that remote.
 */
function parseOriginRemote(origin) {
    const { remote } = origin;
    return parseRemoteName(remote);
}
4180
/**
 * Parses a remote name with `parseBenchmarkRef`, converting any parse
 * failure into a `UsageError` that names the offending remote.
 *
 * @param {string} remote
 * @returns {*} The parsed reference.
 * @throws {UsageError} When `parseBenchmarkRef` throws.
 */
function parseRemoteName(remote) {
    let parsedRef;
    try {
        parsedRef = parseBenchmarkRef(remote);
    }
    catch {
        throw new UsageError(`Workbench origin remote must use OWNER/BENCHMARK: ${remote}`);
    }
    return parsedRef;
}
4188
/**
 * Trims and parses a remote name, then re-serialises it as `owner/project`.
 *
 * @param {string} remote
 * @returns {string}
 */
function normalizeOriginRemote(remote) {
    const trimmedRemote = remote.trim();
    const { owner, project } = parseRemoteName(trimmedRemote);
    return `${owner}/${project}`;
}
4192
/**
 * Splits an origin's remote into the `owner` and `projectName` parts.
 *
 * @param {object} origin - Origin record accepted by `parseOriginRemote`.
 * @returns {{owner: *, projectName: *}}
 */
function originRemoteUrlParts(origin) {
    const { owner, project: projectName } = parseOriginRemote(origin);
    return { owner, projectName };
}
3728
4199
  function workbenchOriginPath(dir) {
3729
4200
  return path.join(dir, ".workbench", "origin.json");
3730
4201
  }
@@ -3763,30 +4234,6 @@ async function readWorkbenchProfileStatus(config) {
3763
4234
  return { authenticated: true, profile: null };
3764
4235
  }
3765
4236
  }
3766
- function readOptionalSubjectId(parsed) {
3767
- return asOptionalString(parsed.flags.subject) ?? parsed.positionals[0];
3768
- }
3769
- function readRequiredSubjectId(parsed) {
3770
- const subjectId = readOptionalSubjectId(parsed);
3771
- if (!subjectId) {
3772
- throw new UsageError("Missing required SUBJECT_ID.");
3773
- }
3774
- return subjectId;
3775
- }
3776
- function readRequiredRunId(parsed) {
3777
- const runId = parsed.positionals[0];
3778
- if (!runId) {
3779
- throw new UsageError("Missing required RUN_ID.");
3780
- }
3781
- return runId;
3782
- }
3783
- function requireOutDir(parsed) {
3784
- const output = asOptionalString(parsed.flags.out);
3785
- if (!output) {
3786
- throw new UsageError("Missing required --out.");
3787
- }
3788
- return output;
3789
- }
3790
4237
  async function apiRequest(apiPath, options = {}, baseUrlOverride) {
3791
4238
  const config = await loadConfig();
3792
4239
  const baseUrl = normalizeBaseUrl(baseUrlOverride ??
@@ -4002,6 +4449,38 @@ function readInitAgent(parsed, kind) {
4002
4449
  function asOptionalString(value) {
4003
4450
  return typeof value === "string" && value.length > 0 ? value : undefined;
4004
4451
  }
4452
/**
 * Reads a `--runs` value that must name exactly one candidate run.
 *
 * @param {string|undefined} value - Raw flag value.
 * @param {string} command - Command name used in the error message.
 * @returns {string|undefined} The trimmed run id, or undefined when the value is missing or blank.
 * @throws {UsageError} When the value is `all` or contains a comma.
 */
function singleRequestedRunId(value, command) {
    const runId = value ? value.trim() : "";
    if (runId === "") {
        return undefined;
    }
    const selectsSeveralRuns = runId === "all" || runId.includes(",");
    if (selectsSeveralRuns) {
        throw new UsageError(`${command} accepts one candidate run id for --runs; use workbench eval --runs all to evaluate every run.`);
    }
    return runId;
}
4462
/**
 * Resolves a `--runs` value against the runs declared by the candidate.
 *
 * A missing or blank value selects `source.candidateRunId`; `all` selects
 * every declared run (the `source.candidateRunIds` array itself); otherwise
 * the value is a comma-separated list that is trimmed, stripped of empty
 * entries and de-duplicated in first-seen order.
 *
 * @param {object} source - Provides `candidateRunIds` and `candidateRunId`.
 * @param {string|undefined} value - Raw flag value.
 * @returns {string[]} Selected run ids.
 * @throws {UsageError} When no runs are declared, the list is empty, or an id is unknown.
 */
function resolveCandidateRunSelection(source, value) {
    const declaredRunIds = source.candidateRunIds;
    if (declaredRunIds.length === 0) {
        throw new UsageError("Candidate must declare at least one run.");
    }
    const selector = value ? value.trim() : "";
    if (selector === "") {
        return [source.candidateRunId];
    }
    if (selector === "all") {
        return declaredRunIds;
    }
    const requested = [];
    for (const part of selector.split(",")) {
        const runId = part.trim();
        if (runId && !requested.includes(runId)) {
            requested.push(runId);
        }
    }
    if (requested.length === 0) {
        throw new UsageError("--runs must include at least one run id or all.");
    }
    const missing = requested.filter((runId) => !declaredRunIds.includes(runId));
    if (missing.length > 0) {
        throw new UsageError(`Unknown candidate run(s): ${missing.join(", ")}. Available: ${declaredRunIds.join(", ")}.`);
    }
    return requested;
}
4005
4484
  function readOptionalStringFlag(value, name) {
4006
4485
  if (value == null || value === false) {
4007
4486
  return undefined;
@@ -4226,6 +4705,27 @@ function parsePortFlag(value) {
4226
4705
  }
4227
4706
  return port;
4228
4707
  }
4708
/**
 * Formats the mean `score` metric of a candidate's evaluation.
 *
 * @param {object} candidate - Candidate whose `eval.metrics.score.mean` is read.
 * @returns {string} The value formatted by `formatMetricValue`, or "n/a" when it is not a finite number.
 */
function formatCandidateEvaluationScore(candidate) {
    const meanScore = candidate.eval?.metrics?.score?.mean;
    if (typeof meanScore !== "number" || !Number.isFinite(meanScore)) {
        return "n/a";
    }
    return formatMetricValue(meanScore);
}
4714
/**
 * Builds a display label for a local candidate.
 *
 * Uses the trimmed name (or the id when the name is missing or blank),
 * appends ` v<version>` when the version is greater than zero, and always
 * ends with the id in parentheses.
 *
 * @param {object|null|undefined} candidate
 * @returns {string} The label, or "none" when no candidate is given.
 */
function formatLocalCandidateLabel(candidate) {
    if (!candidate) {
        return "none";
    }
    let label = candidate.name?.trim() || candidate.id;
    if (candidate.version > 0) {
        label = `${label} v${candidate.version}`;
    }
    return `${label} (${candidate.id})`;
}
4724
/**
 * Formats the mean metrics of a candidate's evaluation with no limit on the
 * number of metrics passed to `formatMetricSummary`.
 *
 * @param {object} candidate - Candidate whose `eval` is summarised.
 * @returns {*} The result of `formatMetricSummary`.
 */
function formatCandidateEvaluationSummary(candidate) {
    const meanMetrics = evaluationMeanMetrics(candidate.eval);
    const summaryOptions = { limit: Number.POSITIVE_INFINITY };
    return formatMetricSummary(meanMetrics, summaryOptions);
}
4229
4729
  function formatMetricSummary(metrics, options = {}) {
4230
4730
  const entries = Object.entries(metrics ?? {}).filter((entry) => Number.isFinite(entry[1]));
4231
4731
  if (entries.length === 0) {
@@ -4255,23 +4755,28 @@ function resolveSourceDir(parsed) {
4255
4755
  if (parsed.positionals.length > 1) {
4256
4756
  throw new UsageError("Expected at most one source file or directory argument.");
4257
4757
  }
4258
- if (parsed.positionals.length > 0 && parsed.flags.dir !== undefined) {
4259
- throw new UsageError("Use either --dir or SOURCE, not both.");
4758
+ const dir = asOptionalString(parsed.flags.dir);
4759
+ const source = parsed.positionals[0];
4760
+ if (dir && source) {
4761
+ return path.resolve(dir, source);
4260
4762
  }
4261
- return path.resolve(asOptionalString(parsed.flags.dir) ?? parsed.positionals[0] ?? process.cwd());
4763
+ return path.resolve(dir ?? source ?? process.cwd());
4262
4764
  }
4263
4765
  function isWorkbenchSourceYamlPath(filePath) {
4264
4766
  return path.basename(filePath) === WORKBENCH_BENCHMARK_FILE;
4265
4767
  }
4266
- function readSubjectIdFlag(parsed, snapshot) {
4267
- const explicit = asOptionalString(parsed.flags.subject) ?? asOptionalString(parsed.flags.subject);
4768
/**
 * Resolves the candidate id for a command: the explicit `--candidate` flag
 * when given, otherwise the snapshot's active candidate id.
 *
 * @param {object} parsed - Parsed command-line arguments.
 * @param {object} snapshot - Snapshot exposing `activeId`.
 * @returns {string} The candidate id.
 * @throws {UsageError} When neither source yields an id.
 */
function readCandidateIdFlag(parsed, snapshot) {
    const candidateId = readOptionalCandidateFlag(parsed) || snapshot.activeId;
    if (!candidateId) {
        throw new UsageError("Missing required --candidate; no active candidate exists.");
    }
    return candidateId;
}
4778
/**
 * Reads the `candidate` flag through `asOptionalString`.
 *
 * @param {object} parsed - Parsed command-line arguments with a `flags` object.
 * @returns {*} Whatever `asOptionalString` returns for the flag value.
 */
function readOptionalCandidateFlag(parsed) {
    const { candidate } = parsed.flags;
    return asOptionalString(candidate);
}
4276
4781
  function readPreviewMode(parsed) {
4277
4782
  const view = asOptionalString(parsed.flags.view) ?? "rendered";
@@ -4375,8 +4880,8 @@ async function copyInitSeedIfProvided(parsed, workspace, seed) {
4375
4880
  }
4376
4881
  });
4377
4882
  }
4378
- function formatSpecOptimizer(spec) {
4379
- return spec.improve ? `adapter:${spec.improve.use}` : "optimizer not configured";
4883
/**
 * Describes the improve adapter configured on a spec.
 *
 * @param {object} spec - Spec with an optional `improve` section.
 * @returns {string} `adapter:<use>` or a "not configured" notice.
 */
function formatSpecImprover(spec) {
    if (!spec.improve) {
        return "improve not configured";
    }
    return `adapter:${spec.improve.use}`;
}
4381
4886
  async function writeFiles(outputDir, files) {
4382
4887
  await fs.mkdir(outputDir, { recursive: true });
@@ -4401,6 +4906,14 @@ async function syncSourceFiles(outputDir, files) {
4401
4906
  }
4402
4907
  await writeFiles(outputDir, files);
4403
4908
  }
4909
/**
 * Verifies that the local source in `dir` still has the fingerprint recorded
 * on the origin.
 *
 * @param {string} dir - Directory holding the local project source.
 * @param {object} origin - Origin record exposing `sourceFingerprint`.
 * @returns {Promise<void>}
 * @throws {UsageError} When the fingerprints differ.
 */
async function assertLocalSourceMatchesOrigin(dir, origin) {
    const localSource = await readLocalProjectSource(dir);
    const { fingerprint } = localProjectStateSource(localSource);
    if (fingerprint !== origin.sourceFingerprint) {
        throw new UsageError("Local source changed since the last pull or push. Run `workbench push` before pulling, or restore the local source changes and try again.");
    }
}
4404
4917
  async function readManagedSourceFilePaths(outputDir) {
4405
4918
  try {
4406
4919
  const source = await readLocalProjectSource(outputDir);