@workbench-ai/workbench 0.0.48 → 0.0.50

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -5,7 +5,7 @@ import { createRequire } from "node:module";
5
5
  import os from "node:os";
6
6
  import path from "node:path";
7
7
  import { Writable } from "node:stream";
8
- import { createSubjectFilePreview, createBaselineSubjectJob as createRuntimeBaselineSubjectJob, evaluationScorecardId, executeWorkbenchExecutionJob, engineResolveBindingForSpec, filterSubjectSourceFiles, workbenchExecutionPurpose, createWorkbenchAdapterAuthBundle, createSubjectEvaluationTraceInputFiles, createSubjectRevisionTraceInputFiles, DOCKER_SANDBOX_BACKEND, localWorkbenchAdapterAuthStore, materializeWorkbenchRunResult, normalizeSurfaceFiles, planWorkbenchExecutionJobsForPurpose, runWorkbenchExecutionDag, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, summarizeSubjectFiles, validateWorkbenchRunEnvelope, parseWorkbenchAdapterAuthTarget, } from "@workbench-ai/workbench-core";
8
+ import { createCandidateFilePreview, createBaselineCandidateJob as createRuntimeBaselineCandidateJob, evaluationScorecardId, evaluationMeanMetrics, executeWorkbenchExecutionJob, engineResolveBindingForSpec, filterCandidateSourceFiles, workbenchExecutionPurpose, workbenchRunExecutionFingerprint, createWorkbenchAdapterAuthBundle, createOptimizerTraceInputFiles, DOCKER_SANDBOX_BACKEND, localWorkbenchAdapterAuthStore, materializeWorkbenchRunResult, normalizeSurfaceFiles, planWorkbenchExecutionJobsForPurpose, runWorkbenchExecutionDag, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, summarizeCandidateFiles, validateWorkbenchRunEnvelope, parseWorkbenchAdapterAuthTarget, } from "@workbench-ai/workbench-core";
9
9
  import { assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, WORKBENCH_ADAPTER_RESULT_FILE, WORKBENCH_ADAPTER_RESULT_PROTOCOL, normalizeWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationCommand, workbenchAdapterOperationResultPath, withDefaultWorkbenchAdapterAuthProfiles as applyDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
10
10
  import { builtinLocalTraceAdapter, builtinLocalTraceAdapters, sortLocalTraceRefs, } from "@workbench-ai/workbench-built-in-adapters/local-traces";
11
11
  import { commandUsage, HOSTED_WATCH_LIFECYCLE_NOTE, LOCAL_DEV_OPEN_LIFECYCLE_NOTE, rootUsage, } from "./command-model.js";
@@ -13,10 +13,10 @@ import { startLocalWorkbenchDevServer } from "./dev-open-server.js";
13
13
  import { createWorkbenchInitScaffold, } from "./init-scaffold.js";
14
14
  import { defaultAdapterManifests, composeRuntimeDockerfileWithAdapters, resolveDefaultWorkbenchAdapter, resolveProjectAdapterSource, resolveWorkbenchAdaptersForProject, WORKBENCH_ADAPTER_MANIFEST_FILE, } from "./adapter-project.js";
15
15
  import { createAdapterCommandEnv } from "./adapter-command-env.js";
16
- import { appendLocalRun, loadLocalArchive, loadLocalArchiveIndex, materializeSubjectRoot, readLocalSubject, readLocalSubjectFiles, saveLocalArchive, saveLocalJobs, setLocalActive, upsertLocalSubject, upsertLocalEvaluation, } from "./local-archive.js";
16
+ import { loadLocalArchive, loadLocalArchiveIndex, materializeCandidateRoot, readLocalCandidate, readLocalCandidateFiles, readLocalJobs, saveLocalArchive, saveLocalJobs, setLocalActive, upsertLocalRun, upsertLocalCandidate, upsertLocalEvaluation, } from "./local-archive.js";
17
17
  import { WorkspaceSnapshotError, } from "./workspace-snapshot.js";
18
18
  import { readLocalProjectSource, WORKBENCH_BENCHMARK_FILE, } from "./project-source.js";
19
- import { localBenchmarkFingerprint, localSubjectFingerprint, } from "./benchmark-fingerprint.js";
19
+ import { localBenchmarkFingerprint, localCandidateFingerprint, } from "./benchmark-fingerprint.js";
20
20
  const require = createRequire(import.meta.url);
21
21
  function getCliVersion() {
22
22
  const manifest = require("../package.json");
@@ -87,7 +87,10 @@ export async function runCli(argv, io = {
87
87
  return await runRemoteCommand(argv.slice(1), io);
88
88
  }
89
89
  if (argv[0] === "eval") {
90
- return await localEvaluateSubject(argv.slice(1), io, runtimeOptions);
90
+ return await localEvaluateCandidate(argv.slice(1), io, runtimeOptions);
91
+ }
92
+ if (argv[0] === "retry") {
93
+ return await localRetry(argv.slice(1), io, runtimeOptions);
91
94
  }
92
95
  if (argv[0] === "improve") {
93
96
  return await localRun(argv.slice(1), io, runtimeOptions);
@@ -117,14 +120,14 @@ export async function runCli(argv, io = {
117
120
  return await localRunList(rest, io);
118
121
  case "runs show":
119
122
  return await localRunShow(rest, io);
120
- case "subjects list":
121
- return await localSubjectList(rest, io);
122
- case "subjects show":
123
- return await localSubjectShow(rest, io);
124
- case "subjects files":
125
- return await localSubjectFiles(rest, io);
126
- case "subjects preview":
127
- return await localSubjectPreview(rest, io);
123
+ case "candidates list":
124
+ return await localCandidateList(rest, io);
125
+ case "candidates show":
126
+ return await localCandidateShow(rest, io);
127
+ case "candidates files":
128
+ return await localCandidateFiles(rest, io);
129
+ case "candidates preview":
130
+ return await localCandidatePreview(rest, io);
128
131
  default:
129
132
  break;
130
133
  }
@@ -163,7 +166,7 @@ function commandPathForHelp(argv) {
163
166
  ["list", "show"].includes(positionals[1] ?? "")) {
164
167
  return positionals.slice(0, 2).join(" ");
165
168
  }
166
- if (positionals[0] === "subjects" &&
169
+ if (positionals[0] === "candidates" &&
167
170
  ["list", "show", "files", "preview"].includes(positionals[1] ?? "")) {
168
171
  return positionals.slice(0, 2).join(" ");
169
172
  }
@@ -175,6 +178,8 @@ async function runCloudCommand(argv, io) {
175
178
  switch (command) {
176
179
  case "eval":
177
180
  return await startHostedWorkflow("eval", rest, io);
181
+ case "retry":
182
+ return await retryHostedWorkflow(rest, io);
178
183
  case "improve":
179
184
  return await startHostedWorkflow("improve", rest, io);
180
185
  case "open":
@@ -209,20 +214,20 @@ async function runCloudCommand(argv, io) {
209
214
  return await runShow(subRest, io);
210
215
  case "runs cancel":
211
216
  return await runCancel(subRest, io);
212
- case "subjects list":
213
- return await subjectList(subRest, io);
214
- case "subjects show":
215
- return await subjectShow(subRest, io);
216
- case "subjects files":
217
- return await subjectFiles(subRest, io);
218
- case "subjects preview":
219
- return await subjectPreview(subRest, io);
220
- case "subjects pull":
221
- return await subjectExport(subRest, io);
222
- case "subjects publish":
223
- return await subjectVisibility(subRest, io, "public");
224
- case "subjects unpublish":
225
- return await subjectVisibility(subRest, io, "private");
217
+ case "candidates list":
218
+ return await candidateList(subRest, io);
219
+ case "candidates show":
220
+ return await candidateShow(subRest, io);
221
+ case "candidates files":
222
+ return await candidateFiles(subRest, io);
223
+ case "candidates preview":
224
+ return await candidatePreview(subRest, io);
225
+ case "candidates pull":
226
+ return await candidateExport(subRest, io);
227
+ case "candidates publish":
228
+ return await candidateVisibility(subRest, io, "public");
229
+ case "candidates unpublish":
230
+ return await candidateVisibility(subRest, io, "private");
226
231
  default:
227
232
  throw new UsageError(`Unknown command: cloud ${argv.join(" ")}`);
228
233
  }
@@ -313,7 +318,7 @@ async function localInit(argv, io) {
313
318
  specPath,
314
319
  kind: scaffold.kind,
315
320
  name: scaffold.name,
316
- subjectRoot: scaffold.subjectRoot,
321
+ candidateRoot: scaffold.candidateRoot,
317
322
  }, parsed, io, () => `Initialized ${scaffold.kind} Workbench source directory at ${workspace}`);
318
323
  return 0;
319
324
  }
@@ -358,20 +363,20 @@ function buildWorkbenchCheckPlan(source) {
358
363
  files: sourceFileCount(source),
359
364
  yaml: [
360
365
  path.relative(source.dir, source.benchmarkPath) || "benchmark.yaml",
361
- path.relative(source.dir, source.subjectSpecPath) || "subject YAML",
362
- ...(source.optimizerSource !== undefined
363
- ? [path.relative(source.dir, source.optimizerPath ?? "") || "optimizer YAML"]
364
- : []),
366
+ path.relative(source.dir, source.candidateSpecPath) || "candidate YAML",
365
367
  ],
366
368
  dockerfile: source.dockerfilePath,
367
369
  },
368
- subject: {
369
- filesPath: source.spec.subject.files.path,
370
- files: source.subjectFiles.length,
370
+ candidate: {
371
+ name: source.spec.candidate.name,
372
+ selectedRunId: source.spec.candidate.selectedRunId,
373
+ runCount: Object.keys(source.spec.candidate.runs).length,
374
+ filesPath: source.spec.candidate.files.path,
375
+ files: source.candidateFiles.length,
371
376
  },
372
- optimizer: source.spec.optimizer
377
+ improve: source.spec.candidate.improve
373
378
  ? {
374
- edits: [...source.spec.optimizer.edits],
379
+ edits: [...source.spec.candidate.improve.edits],
375
380
  }
376
381
  : null,
377
382
  engine: {
@@ -394,8 +399,8 @@ function buildWorkbenchCheckPlan(source) {
394
399
  };
395
400
  }
396
401
  function formatWorkbenchCheckPlan(plan, warningSuffix) {
397
- const edits = plan.optimizer?.edits.length
398
- ? plan.optimizer.edits.join(", ")
402
+ const edits = plan.improve?.edits.length
403
+ ? plan.improve.edits.join(", ")
399
404
  : "-";
400
405
  const network = plan.environment.network.egress;
401
406
  const resources = plan.environment.resources;
@@ -404,11 +409,12 @@ function formatWorkbenchCheckPlan(plan, warningSuffix) {
404
409
  `Benchmark: ${plan.benchmarkName}`,
405
410
  `Description: ${plan.benchmarkDescription}`,
406
411
  `Source: ${plan.source.files} file(s) (${plan.source.yaml.join(", ")}, ${plan.source.dockerfile})`,
407
- `Subject files: ${plan.subject.filesPath} (${plan.subject.files} file(s))`,
408
- `Optimizer edits: ${edits}`,
412
+ `Candidate: ${plan.candidate.name} (${plan.candidate.runCount} run(s), selected ${plan.candidate.selectedRunId})`,
413
+ `Candidate files: ${plan.candidate.filesPath} (${plan.candidate.files} file(s))`,
414
+ `Improve edits: ${edits}`,
409
415
  `Engine cases: ${plan.engine.cases} case(s) from ${formatAdapterSummary(plan.engine.resolver)} at ${plan.engine.path} (${plan.engine.files} file(s))`,
410
416
  `Environment: ${plan.environment.dockerfile}, network ${network}, ${resources.cpu} CPU, ${resources.memoryGb}GB RAM, ${resources.timeoutMinutes}m timeout`,
411
- `Execution: improve ${plan.adapters.improve ? formatAdapterSummary(plan.adapters.improve) : "not configured"}, subject ${formatAdapterSummary(plan.adapters.run)}, engine ${formatAdapterSummary(plan.adapters.engine)}`,
417
+ `Execution: improve ${plan.adapters.improve ? formatAdapterSummary(plan.adapters.improve) : "not configured"}, candidate run ${formatAdapterSummary(plan.adapters.run)}, engine ${formatAdapterSummary(plan.adapters.engine)}`,
412
418
  ...adapterSourceLines(plan.adapters.sources),
413
419
  ].join("\n");
414
420
  }
@@ -493,18 +499,206 @@ function splitWorkspaceError(error) {
493
499
  const message = error instanceof Error ? error.message : String(error);
494
500
  return message.split(/\n+/u).map((entry) => entry.trim()).filter(Boolean);
495
501
  }
502
/**
 * `workbench retry TARGET_ID`: re-runs a previously failed local run or
 * evaluation by delegating to the eval or improve command with the arguments
 * recorded for the original target.
 *
 * The delegated command is always run with `--json` and its stdout is captured
 * so a compact retry summary can be printed instead of the nested output.
 * After the delegated command returns, the active candidate recorded before
 * the retry is restored.
 *
 * @param {string[]} argv Arguments following `retry`.
 * @param {object} io Streams handed to the CLI (stdin/stdout/stderr).
 * @param {object} runtimeOptions Forwarded untouched to the delegated command.
 * @returns {Promise<number>} Exit code of the delegated command.
 * @throws {UsageError} When TARGET_ID is missing (flag and positional
 *   validation is delegated to the reject* helpers).
 */
async function localRetry(argv, io, runtimeOptions) {
    const parsed = parseArgs(argv);
    rejectUnknownFlags(parsed, new Set(["dir", "json"]));
    rejectUnexpectedPositionals(parsed, "workbench retry", 1);
    const targetId = parsed.positionals[0];
    if (!targetId) {
        throw new UsageError("Missing required TARGET_ID.");
    }
    const workspace = resolveDir(parsed);
    const target = await resolveLocalRetryTarget(workspace, targetId);
    const captured = createCapturingIo(io);
    let code;
    if (target.workflow === "eval") {
        const evalArgv = [
            "--dir", workspace,
            "--candidate", target.candidateId,
            "--runs", target.candidateRunId,
            "--samples", String(target.samples),
            "--json",
        ];
        code = await localEvaluateCandidate(evalArgv, captured.io, runtimeOptions);
    }
    else {
        const improveArgv = [
            "--dir", workspace,
            "--from", target.candidateId,
            "--runs", target.candidateRunId,
            // Targets without a recorded budget are retried with a single attempt.
            "--budget", String(target.budget ?? 1),
            "--samples", String(target.samples),
            "--json",
        ];
        code = await localRun(improveArgv, captured.io, runtimeOptions);
    }
    const commandOutput = parseCapturedJson(captured.stdoutText());
    // Put the pre-retry active candidate back before reporting anything.
    await preserveLocalActiveCandidate(workspace, target.preserveActiveId);
    const outputRecord = readRecord(commandOutput) ?? {};
    const result = {
        ok: code === 0 && outputRecord.ok !== false,
        retried: {
            id: target.sourceId,
            kind: target.sourceKind,
            workflow: target.workflow,
        },
    };
    // Copy identifiers only when the delegated output supplied them; the
    // iteration order fixes the key order of the printed summary.
    for (const key of ["runId", "evaluationId", "candidateId", "activeCandidateId"]) {
        assignRetryResultString(result, key, outputRecord[key]);
    }
    const localView = localRetryViewHint(outputRecord.localView);
    if (localView) {
        result.localView = localView;
    }
    const failedJobCount = numberValue(outputRecord.failedJobCount);
    if (failedJobCount !== null) {
        result.failedJobCount = failedJobCount;
    }
    const error = stringValue(outputRecord.error);
    if (error) {
        result.error = error;
    }
    writeOutput(result, parsed, io, formatRetryCommandResult);
    return code;
}
568
/**
 * Resolves TARGET_ID against the local archive and describes how to retry it.
 *
 * The id is matched against evaluations first, then against runs. A run must
 * be finished and must have failed; an eval run must own exactly one
 * evaluation record, otherwise the caller is told to name the evaluation.
 *
 * @param {string} workspace Workspace directory whose archive is loaded.
 * @param {string} targetId Run id or evaluation id supplied by the user.
 * @returns {Promise<object>} Retry descriptor (source id/kind, workflow,
 *   candidate id, candidate run id, samples, optional budget, and the active
 *   candidate id to restore afterwards).
 * @throws {UsageError} When the target is unknown, unfinished, did not fail,
 *   is ambiguous, or lacks the metadata needed to retry.
 */
async function resolveLocalRetryTarget(workspace, targetId) {
    const snapshot = await loadLocalArchive(workspace);
    const matchedEvaluation = snapshot.evaluations.find((entry) => entry.id === targetId);
    if (matchedEvaluation) {
        const owningRun = snapshot.runs.find((entry) => entry.id === matchedEvaluation.runId) ?? null;
        return localEvaluationRetryTarget(snapshot, matchedEvaluation, owningRun, "evaluation", targetId);
    }
    const run = snapshot.runs.find((entry) => entry.id === targetId);
    if (!run) {
        throw new UsageError(`Run or evaluation not found: ${targetId}`);
    }
    if (run.status !== "finished") {
        throw new UsageError(`Run ${run.id} is ${run.status}; wait for it to finish before retrying.`);
    }
    if (!runSummaryFailed(run)) {
        throw new UsageError(`Run ${run.id} did not fail; use workbench ${run.workflow} to intentionally run it again.`);
    }
    if (run.workflow === "eval") {
        const ownedEvaluations = snapshot.evaluations.filter((entry) => entry.runId === run.id);
        if (ownedEvaluations.length === 0) {
            throw new UsageError(`Run ${run.id} has no evaluation record to retry.`);
        }
        if (ownedEvaluations.length > 1) {
            throw new UsageError(`Run ${run.id} has multiple evaluations; retry a specific evaluation id instead.`);
        }
        return localEvaluationRetryTarget(snapshot, ownedEvaluations[0], run, "run", targetId);
    }
    // Every other workflow is retried through the improve command.
    const { candidateId, candidateRunId } = run;
    if (!candidateId || !candidateRunId) {
        throw new UsageError(`Run ${run.id} is missing retry metadata; use workbench improve --from with an explicit candidate id.`);
    }
    return {
        sourceId: targetId,
        sourceKind: "run",
        workflow: "improve",
        candidateId,
        candidateRunId,
        samples: run.samples,
        budget: run.budget,
        preserveActiveId: snapshot.activeId,
    };
}
609
/**
 * Builds the retry descriptor for a failed evaluation.
 *
 * @param {object} snapshot Loaded local archive (candidates, activeId).
 * @param {object} evaluation Evaluation record being retried.
 * @param {object|null} run Run that produced the evaluation, if it was found.
 * @param {string} sourceKind How the user addressed the target ("run" or "evaluation").
 * @param {string} sourceId The id the user supplied.
 * @returns {object} Descriptor with workflow "eval".
 * @throws {UsageError} When the evaluation did not fail, its candidate is no
 *   longer in the archive, or no candidate run id can be determined.
 */
function localEvaluationRetryTarget(snapshot, evaluation, run, sourceKind, sourceId) {
    if (!evaluationScorecardFailed(evaluation, run)) {
        throw new UsageError(`Evaluation ${evaluation.id} did not fail; use workbench eval to intentionally run it again.`);
    }
    const candidateKnown = snapshot.candidates.some((candidate) => candidate.id === evaluation.candidateId);
    if (!candidateKnown) {
        throw new UsageError(`Candidate not found for evaluation ${evaluation.id}: ${evaluation.candidateId}`);
    }
    // Prefer the id stored on the evaluation; fall back to the run's.
    const candidateRunId = evaluation.candidateRunId ?? run?.candidateRunId;
    if (!candidateRunId) {
        throw new UsageError(`Evaluation ${evaluation.id} is missing its candidate run configuration.`);
    }
    // `||` on purpose: a zero or missing sample count defers to the run, then to 1.
    const samples = evaluation.sampleCount || run?.samples || 1;
    return {
        sourceId,
        sourceKind,
        workflow: "eval",
        candidateId: evaluation.candidateId,
        candidateRunId,
        samples,
        preserveActiveId: snapshot.activeId,
    };
}
630
/**
 * Restores the archive's active candidate to `activeId` after a retry.
 *
 * Nothing is written when the archive already points at `activeId`, or when
 * `activeId` is set but names a candidate that is no longer in the archive.
 *
 * @param {string} workspace Workspace directory whose archive is updated.
 * @param {string|null|undefined} activeId Active candidate id recorded before the retry.
 * @returns {Promise<void>}
 */
async function preserveLocalActiveCandidate(workspace, activeId) {
    const current = await loadLocalArchive(workspace);
    const candidateMissing = Boolean(activeId) &&
        !current.candidates.some((candidate) => candidate.id === activeId);
    if (candidateMissing || current.activeId === activeId) {
        return;
    }
    await saveLocalArchive(workspace, setLocalActive(current, activeId));
}
641
/**
 * Reports whether an evaluation counts as failed for retry purposes: it has
 * errored samples, its status is not "completed", or its run failed.
 *
 * @param {object} evaluation Evaluation record (errorSampleCount, status).
 * @param {object|null|undefined} run Run that produced the evaluation.
 * @returns {boolean}
 */
function evaluationScorecardFailed(evaluation, run) {
    if (evaluation.errorSampleCount > 0) {
        return true;
    }
    if (evaluation.status !== "completed") {
        return true;
    }
    return runSummaryFailed(run);
}
646
/**
 * Reports whether a run summary ended in failure, i.e. its outcome is
 * "error" or "cancelled". A missing run is treated as not failed.
 *
 * @param {object|null|undefined} run Run summary record.
 * @returns {boolean}
 */
function runSummaryFailed(run) {
    const outcome = run?.outcome;
    return outcome === "error" || outcome === "cancelled";
}
649
/**
 * Wraps an io triple so that everything written to stdout is buffered in
 * memory instead of reaching the real stream. stdin and stderr are passed
 * through unchanged.
 *
 * @param {{stdin: unknown, stdout: unknown, stderr: unknown}} io Original streams.
 * @returns {{io: {stdin: unknown, stdout: Writable, stderr: unknown}, stdoutText: () => string}}
 *   The substituted io plus an accessor returning all captured stdout as text.
 */
function createCapturingIo(io) {
    // Keep the raw bytes and decode once on read. Decoding chunk by chunk
    // corrupts a multi-byte UTF-8 sequence that straddles two writes.
    const chunks = [];
    const stdout = new class extends Writable {
        _write(chunk, _encoding, callback) {
            chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(String(chunk), "utf8"));
            callback();
        }
    }();
    return {
        io: {
            stdin: io.stdin,
            stdout,
            stderr: io.stderr,
        },
        stdoutText: () => Buffer.concat(chunks).toString("utf8"),
    };
}
666
/**
 * Interprets captured command output as JSON.
 *
 * @param {string} value Raw captured text.
 * @returns {unknown} `{}` for blank input, the parsed JSON value when the
 *   trimmed text parses, otherwise `{ output: <trimmed text> }`.
 */
function parseCapturedJson(value) {
    const text = value.trim();
    if (!text) {
        return {};
    }
    let parsed;
    try {
        parsed = JSON.parse(text);
    }
    catch {
        // Not JSON: hand the raw text back under `output`.
        return { output: text };
    }
    return parsed;
}
678
/**
 * Extracts the `localView` hint from a delegated command's output.
 *
 * @param {unknown} value Candidate `localView` value.
 * @returns {{command: string, note: string}|undefined} The hint when both
 *   `command` and `note` pass `stringValue` with a truthy result, otherwise
 *   `undefined`.
 */
function localRetryViewHint(value) {
    const hint = readRecord(value);
    const command = stringValue(hint?.command);
    const note = stringValue(hint?.note);
    if (!command || !note) {
        return undefined;
    }
    return { command, note };
}
684
/**
 * Sets `result[key]` to the normalized string form of `value`, leaving
 * `result` untouched when `stringValue` yields a falsy result.
 *
 * @param {object} result Object mutated in place.
 * @param {string} key Property to assign.
 * @param {unknown} value Raw value taken from the delegated command output.
 * @returns {void}
 */
function assignRetryResultString(result, key, value) {
    const text = stringValue(value);
    if (!text) {
        return;
    }
    result[key] = text;
}
496
690
  async function localRun(argv, io, runtimeOptions) {
497
691
  const parsed = parseArgs(argv);
498
- rejectUnknownFlags(parsed, new Set(["dir", "optimizer", "from", "budget", "samples", "json"]));
692
+ rejectUnknownFlags(parsed, new Set(["dir", "runs", "from", "budget", "samples", "rerun", "json"]));
499
693
  const budget = parsePositiveInt(parsed.flags.budget, 1, "budget");
500
694
  const samples = parsePositiveInt(parsed.flags.samples, 1, "samples");
501
695
  const sourceArg = resolveSourceDir(parsed);
502
696
  const projectSource = await readLocalProjectSource(sourceArg, {
503
- optimizerPath: asOptionalString(parsed.flags.optimizer),
697
+ runId: singleRequestedRunId(asOptionalString(parsed.flags.runs), "workbench improve"),
504
698
  });
505
699
  const workspace = projectSource.dir;
506
- if (!projectSource.spec.optimizer) {
507
- throw new UsageError("Optimizer YAML is required for workbench improve.");
700
+ if (!projectSource.spec.improve || !projectSource.spec.candidate.improve) {
701
+ throw new UsageError("Candidate improve configuration is required for workbench improve.");
508
702
  }
509
703
  const executionProject = await resolveLocalProjectForExecution(workspace, projectSource.specSource);
510
704
  const { spec, adapterManifests } = executionProject;
@@ -522,10 +716,8 @@ async function localRun(argv, io, runtimeOptions) {
522
716
  });
523
717
  const environmentRefs = await ensureLocalDockerfileEnvironments(workspace, spec, engineCases);
524
718
  const benchmarkFingerprint = await readLocalBenchmarkFingerprint(workspace);
525
- const runId = `run_local_${Date.now().toString(36)}`;
526
- const startedAt = new Date().toISOString();
527
- let snapshot = await loadLocalArchive(workspace);
528
- const baseSubject = await ensureLocalImproveBaseSubject({
719
+ const executionFingerprint = localRunExecutionFingerprint(projectSource);
720
+ const baseCandidate = await ensureLocalImproveBaseCandidate({
529
721
  parsed,
530
722
  sourceArg,
531
723
  workspace,
@@ -534,9 +726,47 @@ async function localRun(argv, io, runtimeOptions) {
534
726
  io,
535
727
  runtimeOptions,
536
728
  });
537
- let currentBaseId = baseSubject.id;
729
+ let snapshot = await loadLocalArchive(workspace);
730
+ if (parsed.flags.rerun !== true) {
731
+ const reusableRun = findReusableLocalImproveRun(snapshot.runs, {
732
+ benchmarkFingerprint,
733
+ candidateId: baseCandidate.id,
734
+ candidateRunId: projectSource.spec.candidate.selectedRunId,
735
+ executionFingerprint,
736
+ budget,
737
+ samples,
738
+ });
739
+ if (reusableRun) {
740
+ const evaluation = snapshot.evaluations.find((entry) => entry.runId === reusableRun.id) ?? null;
741
+ const outputCandidateId = reusableRun.outputCandidateId ?? reusableRun.candidateId ?? baseCandidate.id;
742
+ const outputCandidate = readLocalCandidate(snapshot, outputCandidateId);
743
+ const activeCandidate = snapshot.activeId
744
+ ? readLocalCandidate(snapshot, snapshot.activeId)
745
+ : null;
746
+ const result = {
747
+ ok: true,
748
+ reused: true,
749
+ runId: reusableRun.id,
750
+ evaluationId: evaluation?.id ?? null,
751
+ outputCandidateId,
752
+ outputCandidate,
753
+ activeCandidateId: snapshot.activeId,
754
+ activeCandidate,
755
+ completedJobCount: 0,
756
+ failedJobCount: 0,
757
+ localView: localDevViewHint(workspace, reusableRun.id),
758
+ };
759
+ writeOutput(result, parsed, io, () => `Reused improve run ${reusableRun.id}. Use --rerun to intentionally run it again.`);
760
+ return 0;
761
+ }
762
+ }
763
+ const runId = `run_local_${Date.now().toString(36)}`;
764
+ const startedAt = new Date().toISOString();
765
+ let currentBaseId = baseCandidate.id;
766
+ let outputCandidateId = null;
538
767
  let completedJobCount = 0;
539
768
  let failedJobCount = 0;
769
+ let attemptsExecuted = 0;
540
770
  const failedJobs = [];
541
771
  const events = [
542
772
  createLocalEvent("run_started", startedAt, {
@@ -544,232 +774,313 @@ async function localRun(argv, io, runtimeOptions) {
544
774
  detail: { budget, samples, strategy: "greedy" },
545
775
  }),
546
776
  ];
547
- const devCapacity = await localDevelopmentCapacity(workspace);
548
- const runTraceJobs = [];
549
- const attempts = budget;
550
- for (let attemptIndex = 0; attemptIndex < attempts; attemptIndex += 1) {
551
- snapshot = await loadLocalArchive(workspace);
552
- const activeSubject = readLocalSubject(snapshot, currentBaseId);
553
- const baseFiles = filterSubjectSourceFiles(readLocalSubjectFiles(snapshot, activeSubject.id));
554
- if (baseFiles.length === 0) {
555
- throw new UsageError("Subject snapshot must include at least one file.");
556
- }
557
- const subjectRevisionTraceFiles = [
558
- ...createSubjectEvaluationTraceInputFiles({ subject: activeSubject }),
559
- ...createSubjectRevisionTraceInputFiles({
560
- runId,
561
- jobs: runTraceJobs,
562
- events,
563
- }),
564
- ];
565
- const subjectId = `subject_${runId.replace(/^run_/u, "")}_${String(attemptIndex + 1).padStart(3, "0")}`;
566
- const plannedSubjectRevision = planWorkbenchExecutionJobsForPurpose({
567
- ownerUserId: "local",
568
- projectId: "local",
569
- runId,
570
- subjectId,
571
- attemptIndex,
572
- samples,
573
- caseIds,
574
- engineCases,
575
- spec,
576
- workflow: "improve",
577
- purpose: "improve",
578
- now: new Date().toISOString(),
579
- baseFiles,
580
- traceFiles: subjectRevisionTraceFiles,
581
- ...(environmentRefs.defaultRef ? { environmentRef: environmentRefs.defaultRef } : {}),
582
- baseId: activeSubject.id,
583
- })[0];
584
- const subjectRevisionJobs = await executeLocalDevelopmentDag({
585
- jobs: [plannedSubjectRevision],
586
- spec,
587
- adapterManifests,
588
- adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
589
- baseFiles,
590
- engineResolveFiles,
591
- engineCases,
592
- traceFiles: subjectRevisionTraceFiles,
593
- capacity: devCapacity,
777
+ const runningRun = {
778
+ id: runId,
779
+ workflow: "improve",
780
+ benchmarkFingerprint,
781
+ status: "running",
782
+ candidateId: baseCandidate.id,
783
+ candidateRunId: projectSource.spec.candidate.selectedRunId,
784
+ candidateRunName: projectSource.spec.candidate.selectedRunName,
785
+ startedAt,
786
+ improver: formatSpecImprover(spec),
787
+ engineRun: spec.engineRun.use,
788
+ strategy: "greedy",
789
+ budget,
790
+ repairBudget: 0,
791
+ attemptsRequested: budget,
792
+ attemptsExecuted: 0,
793
+ samples,
794
+ executionFingerprint,
795
+ activeCandidateId: snapshot.activeId,
796
+ outputCandidateId: null,
797
+ };
798
+ snapshot = upsertLocalRun(snapshot, runningRun, events);
799
+ await saveLocalArchive(workspace, snapshot);
800
+ try {
801
+ const devCapacity = await localDevelopmentCapacity(workspace);
802
+ const baselineTraceJobs = selectLocalOptimizerBaselineTraceJobs(snapshot, await readLocalJobs(workspace), {
803
+ benchmarkFingerprint,
804
+ candidateId: baseCandidate.id,
805
+ candidateRunId: projectSource.spec.candidate.selectedRunId,
806
+ executionFingerprint,
594
807
  });
595
- const subjectRevision = subjectRevisionJobs[0];
596
- const completedJobs = [subjectRevision];
597
- if (subjectRevision.status === "succeeded") {
598
- const subjectRevisionFiles = completedJobOutputFiles(subjectRevision).length > 0
599
- ? normalizeSurfaceFiles(completedJobOutputFiles(subjectRevision).filter((file) => !file.path.startsWith(".workbench/")))
600
- : baseFiles;
601
- const attemptJobs = planWorkbenchExecutionJobsForPurpose({
808
+ const runTraceJobs = [];
809
+ const attempts = budget;
810
+ for (let attemptIndex = 0; attemptIndex < attempts; attemptIndex += 1) {
811
+ snapshot = await loadLocalArchive(workspace);
812
+ const activeCandidate = readLocalCandidate(snapshot, currentBaseId);
813
+ const baseFiles = filterCandidateSourceFiles(readLocalCandidateFiles(snapshot, activeCandidate.id));
814
+ if (baseFiles.length === 0) {
815
+ throw new UsageError("Candidate snapshot must include at least one file.");
816
+ }
817
+ const candidateRevisionTraceFiles = createOptimizerTraceInputFiles({
818
+ jobs: [...baselineTraceJobs, ...runTraceJobs],
819
+ });
820
+ const candidateId = `candidate_${runId.replace(/^run_/u, "")}_${String(attemptIndex + 1).padStart(3, "0")}`;
821
+ const plannedCandidateRevision = planWorkbenchExecutionJobsForPurpose({
602
822
  ownerUserId: "local",
603
823
  projectId: "local",
604
824
  runId,
605
- subjectId,
825
+ candidateId,
606
826
  attemptIndex,
607
827
  samples,
608
- now: new Date().toISOString(),
609
828
  caseIds,
610
829
  engineCases,
611
830
  spec,
612
- environmentRefsByCase: environmentRefs.byCase,
613
831
  workflow: "improve",
614
- purpose: "attempt",
615
- });
616
- const dagJobs = await executeLocalDevelopmentDag({
617
- jobs: [subjectRevision, ...attemptJobs],
832
+ purpose: "improve",
833
+ now: new Date().toISOString(),
834
+ baseFiles,
835
+ traceFiles: candidateRevisionTraceFiles,
836
+ ...(environmentRefs.defaultRef ? { environmentRef: environmentRefs.defaultRef } : {}),
837
+ baseId: activeCandidate.id,
838
+ })[0];
839
+ const candidateRevisionJobs = await executeLocalDevelopmentDag({
840
+ jobs: [plannedCandidateRevision],
618
841
  spec,
619
842
  adapterManifests,
620
843
  adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
621
- baseFiles: subjectRevisionFiles,
844
+ baseFiles,
622
845
  engineResolveFiles,
623
846
  engineCases,
847
+ traceFiles: candidateRevisionTraceFiles,
624
848
  capacity: devCapacity,
625
849
  });
626
- completedJobs.splice(0, completedJobs.length, ...dagJobs);
627
- }
628
- runTraceJobs.push(...completedJobs);
629
- const materialized = materializeWorkbenchRunResult({
630
- runId,
631
- benchmarkFingerprint,
632
- sourceYaml: projectSource.specSource,
633
- benchmarkSourceFiles: authoredBenchmarkSourceFiles(projectSource),
634
- startedAt,
635
- spec,
636
- jobs: completedJobs,
637
- previousSubject: activeSubject,
638
- existingSubjectCount: snapshot.subjects.length,
639
- });
640
- for (const subject of materialized.subjects) {
641
- snapshot = upsertLocalSubject(snapshot, subject, materialized.subjectFiles[subject.id] ?? []);
642
- events.push(createLocalEvent("subject_created", subject.createdAt, {
850
+ const candidateRevision = candidateRevisionJobs[0];
851
+ const completedJobs = [candidateRevision];
852
+ if (candidateRevision.status === "succeeded") {
853
+ const candidateRevisionFiles = completedJobOutputFiles(candidateRevision).length > 0
854
+ ? normalizeSurfaceFiles(completedJobOutputFiles(candidateRevision).filter((file) => !file.path.startsWith(".workbench/")))
855
+ : baseFiles;
856
+ const attemptJobs = planWorkbenchExecutionJobsForPurpose({
857
+ ownerUserId: "local",
858
+ projectId: "local",
859
+ runId,
860
+ candidateId,
861
+ attemptIndex,
862
+ samples,
863
+ now: new Date().toISOString(),
864
+ caseIds,
865
+ engineCases,
866
+ spec,
867
+ environmentRefsByCase: environmentRefs.byCase,
868
+ workflow: "improve",
869
+ purpose: "attempt",
870
+ });
871
+ const dagJobs = await executeLocalDevelopmentDag({
872
+ jobs: [candidateRevision, ...attemptJobs],
873
+ spec,
874
+ adapterManifests,
875
+ adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
876
+ baseFiles: candidateRevisionFiles,
877
+ engineResolveFiles,
878
+ engineCases,
879
+ capacity: devCapacity,
880
+ });
881
+ completedJobs.splice(0, completedJobs.length, ...dagJobs);
882
+ }
883
+ runTraceJobs.push(...completedJobs);
884
+ const materialized = materializeWorkbenchRunResult({
885
+ runId,
886
+ benchmarkFingerprint,
887
+ sourceYaml: projectSource.specSource,
888
+ benchmarkSourceFiles: authoredBenchmarkSourceFiles(projectSource),
889
+ startedAt,
890
+ spec,
891
+ jobs: completedJobs,
892
+ previousCandidate: activeCandidate,
893
+ existingCandidateCount: snapshot.candidates.length,
894
+ });
895
+ for (const candidate of materialized.candidates) {
896
+ outputCandidateId = candidate.id;
897
+ snapshot = upsertLocalCandidate(snapshot, candidate, materialized.candidateFiles[candidate.id] ?? []);
898
+ events.push(createLocalEvent("candidate_created", candidate.createdAt, {
899
+ runId,
900
+ candidateId: candidate.id,
901
+ baseId: candidate.baseId,
902
+ status: candidate.status,
903
+ metrics: evaluationMeanMetrics(candidate.eval),
904
+ }));
905
+ }
906
+ for (const evaluation of materialized.evaluations) {
907
+ snapshot = upsertLocalEvaluation(snapshot, evaluation);
908
+ }
909
+ snapshot = setLocalActive(snapshot, materialized.activeCandidateId);
910
+ currentBaseId = materialized.activeCandidateId ?? currentBaseId;
911
+ completedJobCount += materialized.completedJobCount;
912
+ failedJobCount += materialized.failedJobCount;
913
+ failedJobs.push(...completedJobs
914
+ .filter((job) => job.status === "failed")
915
+ .map((job) => ({
916
+ id: job.id,
917
+ purpose: workbenchExecutionPurpose(job),
918
+ error: job.error ?? "Job failed without an error message.",
919
+ })));
920
+ events.push(createLocalEvent("active_changed", new Date().toISOString(), {
643
921
  runId,
644
- subjectId: subject.id,
645
- baseId: subject.baseId,
646
- status: subject.status,
647
- metrics: subject.metrics,
922
+ candidateId: materialized.activeCandidateId ?? undefined,
923
+ activeId: materialized.activeCandidateId ?? undefined,
924
+ status: materialized.selectedCandidate?.status,
925
+ metrics: evaluationMeanMetrics(materialized.selectedCandidate?.eval),
648
926
  }));
927
+ await saveLocalJobs(workspace, completedJobs);
928
+ await saveLocalArchive(workspace, snapshot);
929
+ attemptsExecuted += 1;
649
930
  }
650
- for (const evaluation of materialized.evaluations) {
651
- snapshot = upsertLocalEvaluation(snapshot, evaluation);
652
- }
653
- snapshot = setLocalActive(snapshot, materialized.activeSubjectId);
654
- currentBaseId = materialized.activeSubjectId ?? currentBaseId;
655
- completedJobCount += materialized.completedJobCount;
656
- failedJobCount += materialized.failedJobCount;
657
- failedJobs.push(...completedJobs
658
- .filter((job) => job.status === "failed")
659
- .map((job) => ({
660
- id: job.id,
661
- purpose: workbenchExecutionPurpose(job),
662
- error: job.error ?? "Job failed without an error message.",
663
- })));
664
- events.push(createLocalEvent("active_changed", new Date().toISOString(), {
931
+ snapshot = await loadLocalArchive(workspace);
932
+ const finishedAt = new Date().toISOString();
933
+ const run = {
934
+ id: runId,
935
+ workflow: "improve",
936
+ benchmarkFingerprint,
937
+ status: "finished",
938
+ candidateId: baseCandidate.id,
939
+ candidateRunId: projectSource.spec.candidate.selectedRunId,
940
+ candidateRunName: projectSource.spec.candidate.selectedRunName,
941
+ startedAt,
942
+ finishedAt,
943
+ durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
944
+ improver: formatSpecImprover(spec),
945
+ engineRun: spec.engineRun.use,
946
+ strategy: "greedy",
947
+ budget,
948
+ repairBudget: 0,
949
+ attemptsRequested: budget,
950
+ attemptsExecuted,
951
+ samples,
952
+ executionFingerprint,
953
+ stoppedReason: "budget_exhausted",
954
+ outcome: failedJobCount > 0 ? "error" : "ok",
955
+ activeCandidateId: snapshot.activeId,
956
+ outputCandidateId: outputCandidateId ?? snapshot.activeId,
957
+ };
958
+ events.push(createLocalEvent("run_finished", finishedAt, {
665
959
  runId,
666
- subjectId: materialized.activeSubjectId ?? undefined,
667
- activeId: materialized.activeSubjectId ?? undefined,
668
- status: materialized.selectedSubject?.status,
669
- metrics: materialized.selectedSubject?.metrics,
960
+ detail: {
961
+ outcome: run.outcome ?? null,
962
+ attemptsExecuted: run.attemptsExecuted,
963
+ durationMs: run.durationMs ?? null,
964
+ },
670
965
  }));
671
- await saveLocalJobs(workspace, completedJobs);
966
+ snapshot = upsertLocalRun(snapshot, run, events.slice(1));
672
967
  await saveLocalArchive(workspace, snapshot);
968
+ const outputCandidate = run.outputCandidateId
969
+ ? readLocalCandidate(snapshot, run.outputCandidateId)
970
+ : null;
971
+ const activeCandidate = snapshot.activeId
972
+ ? readLocalCandidate(snapshot, snapshot.activeId)
973
+ : null;
974
+ const result = {
975
+ ok: failedJobCount === 0,
976
+ runId,
977
+ outputCandidateId: run.outputCandidateId,
978
+ outputCandidate,
979
+ activeCandidateId: snapshot.activeId,
980
+ activeCandidate,
981
+ completedJobCount,
982
+ failedJobCount,
983
+ failedJobs,
984
+ localView: localDevViewHint(workspace, runId),
985
+ };
986
+ writeOutput(result, parsed, io, () => {
987
+ const outputMetricValue = outputCandidate ? formatCandidateEvaluationScore(outputCandidate) : "n/a";
988
+ const activeMetricValue = activeCandidate ? formatCandidateEvaluationScore(activeCandidate) : "n/a";
989
+ const firstFailure = result.failedJobs[0];
990
+ const failureDetail = firstFailure
991
+ ? `\nFirst failed job ${firstFailure.id}${firstFailure.purpose ? ` (${firstFailure.purpose})` : ""}: ${firstFailure.error}`
992
+ : "";
993
+ const viewDetail = failedJobCount === 0
994
+ ? `\nOpen local view: ${result.localView.command}\n${result.localView.note}`
995
+ : "";
996
+ return `Run ${runId} finished. Output candidate: ${formatLocalCandidateLabel(outputCandidate)} (score: ${outputMetricValue}). Active candidate: ${formatLocalCandidateLabel(activeCandidate)} (score: ${activeMetricValue}).${failureDetail}${viewDetail}`;
997
+ });
998
+ return failedJobCount === 0 ? 0 : 1;
999
+ }
1000
+ catch (error) {
1001
+ await markLocalRunFailed({
1002
+ workspace,
1003
+ run: {
1004
+ ...runningRun,
1005
+ attemptsExecuted,
1006
+ outputCandidateId,
1007
+ },
1008
+ startedAt,
1009
+ error,
1010
+ }).catch(() => undefined);
1011
+ throw error;
673
1012
  }
674
- snapshot = await loadLocalArchive(workspace);
675
- const finishedAt = new Date().toISOString();
676
- const run = {
677
- id: runId,
678
- workflow: "improve",
679
- benchmarkFingerprint,
680
- status: "finished",
681
- startedAt,
682
- finishedAt,
683
- durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
684
- optimizer: formatSpecOptimizer(spec),
685
- engineRun: spec.engineRun.use,
686
- strategy: "greedy",
687
- budget,
688
- repairBudget: 0,
689
- attemptsRequested: budget,
690
- attemptsExecuted: budget,
691
- samples,
692
- stoppedReason: "budget_exhausted",
693
- outcome: failedJobCount > 0 ? "error" : "ok",
694
- };
695
- events.push(createLocalEvent("run_finished", finishedAt, {
696
- runId,
697
- detail: {
698
- outcome: run.outcome ?? null,
699
- attemptsExecuted: run.attemptsExecuted,
700
- durationMs: run.durationMs ?? null,
701
- },
702
- }));
703
- snapshot = appendLocalRun(snapshot, run, events);
704
- await saveLocalArchive(workspace, snapshot);
705
- const selected = snapshot.activeId
706
- ? readLocalSubject(snapshot, snapshot.activeId)
707
- : null;
708
- const result = {
709
- ok: failedJobCount === 0,
710
- runId,
711
- activeSubjectId: snapshot.activeId,
712
- selectedSubject: selected,
713
- completedJobCount,
714
- failedJobCount,
715
- failedJobs,
716
- localView: localDevViewHint(workspace, runId),
717
- };
718
- writeOutput(result, parsed, io, () => {
719
- const metricValue = selected?.metrics?.score ?? "n/a";
720
- const firstFailure = result.failedJobs[0];
721
- const failureDetail = firstFailure
722
- ? `\nFirst failed job ${firstFailure.id}${firstFailure.purpose ? ` (${firstFailure.purpose})` : ""}: ${firstFailure.error}`
723
- : "";
724
- const viewDetail = failedJobCount === 0
725
- ? `\nOpen local view: ${result.localView.command}\n${result.localView.note}`
726
- : "";
727
- return `Run ${runId} finished. Active subject: ${snapshot.activeId ?? "none"} (score: ${metricValue}).${failureDetail}${viewDetail}`;
728
- });
729
- return failedJobCount === 0 ? 0 : 1;
730
1013
  }
731
/**
 * Resolve the evaluated candidate that a local improve run starts from.
 *
 * When `--from` names a candidate, that candidate is checked against the
 * workspace benchmark fingerprint and evaluated first if it has no evaluation.
 * Otherwise an already-evaluated candidate matching the authored source
 * fingerprint is reused, or the authored source is evaluated to create one.
 *
 * @param {object} args - Reads `workspace`, `parsed`, `projectSource`,
 *   `samples`, `sourceArg`, `io` and `runtimeOptions`.
 * @returns {Promise<object>} The base candidate record from the local archive.
 * @throws {UsageError} If the explicit base belongs to another benchmark, has
 *   no candidate fingerprint, or the required evaluation fails or yields no
 *   evaluated candidate.
 */
async function ensureLocalImproveBaseCandidate(args) {
    let snapshot = await loadLocalArchive(args.workspace);
    const explicitBase = asOptionalString(args.parsed.flags.from);
    const benchmarkFingerprint = await readLocalBenchmarkFingerprint(args.workspace);
    // A candidate counts as evaluated via its status or an attached eval record.
    const hasEvaluation = (candidate) => candidate.status === "evaluated" || Boolean(candidate.eval);
    // Flags shared by every nested eval invocation; built lazily at the call site.
    const evalOptionArgs = () => [
        "--runs",
        args.projectSource.spec.candidate.selectedRunId,
        "--samples",
        String(args.samples),
        ...(args.parsed.flags.rerun === true ? ["--rerun"] : []),
        "--json",
    ];
    // The nested eval receives the IO produced by createSilentIo.
    const runSilentEval = (argv) => localEvaluateCandidate(argv, createSilentIo(args.io), args.runtimeOptions);
    if (explicitBase) {
        let candidate = readLocalCandidate(snapshot, explicitBase);
        if (candidate.benchmarkFingerprint !== benchmarkFingerprint) {
            throw new UsageError(`Base candidate ${explicitBase} belongs to benchmark ${candidate.benchmarkFingerprint}, not ${benchmarkFingerprint}.`);
        }
        if (!candidate.candidateFingerprint) {
            throw new UsageError(`Base candidate ${explicitBase} is missing a candidate fingerprint.`);
        }
        if (!hasEvaluation(candidate)) {
            const code = await runSilentEval([
                "--dir",
                args.workspace,
                "--candidate",
                explicitBase,
                ...evalOptionArgs(),
            ]);
            if (code !== 0) {
                throw new UsageError(`Base candidate ${explicitBase} eval failed; improve was not started.`);
            }
            // Reload so the returned record reflects the evaluation just written.
            snapshot = await loadLocalArchive(args.workspace);
            candidate = readLocalCandidate(snapshot, explicitBase);
        }
        return candidate;
    }
    const candidateFingerprint = localCandidateFingerprint(args.projectSource);
    const findEvaluatedSourceCandidate = (archive) => archive.candidates.find((candidate) => candidate.benchmarkFingerprint === benchmarkFingerprint &&
        candidate.candidateFingerprint === candidateFingerprint &&
        hasEvaluation(candidate));
    const existing = findEvaluatedSourceCandidate(snapshot);
    if (existing) {
        return existing;
    }
    // Forward the positional source when one was given, else point at the workspace.
    const sourceArgs = args.parsed.positionals.length > 0
        ? [args.sourceArg]
        : ["--dir", args.workspace];
    const code = await runSilentEval([...sourceArgs, ...evalOptionArgs()]);
    if (code !== 0) {
        throw new UsageError("Parent candidate eval failed; improve was not started.");
    }
    snapshot = await loadLocalArchive(args.workspace);
    const evaluated = findEvaluatedSourceCandidate(snapshot);
    if (!evaluated) {
        throw new UsageError("Parent candidate eval did not produce an evaluated candidate.");
    }
    return evaluated;
}
@@ -785,13 +1096,62 @@ function createSilentIo(io) {
785
1096
  stderr: io.stderr,
786
1097
  };
787
1098
  }
788
- async function localEvaluateSubject(argv, io, runtimeOptions) {
1099
/**
 * Pick the jobs of the most recent evaluation run that matches `target`.
 *
 * An evaluation matches when its benchmark fingerprint, candidate id and
 * candidate run id equal the target's and the run it belongs to has the
 * target's execution fingerprint. Among matches the one with the greatest
 * `updatedAt` wins, ties broken by greatest `runId`.
 *
 * @param {object} snapshot - Archive snapshot with `runs` and `evaluations`.
 * @param {Array<object>} jobs - Job records carrying a `runId`.
 * @param {object} target - `benchmarkFingerprint`, `candidateId`,
 *   `candidateRunId` and `executionFingerprint` to match.
 * @returns {Array<object>} Jobs of the selected run, or `[]` when nothing matches.
 */
function selectLocalOptimizerBaselineTraceJobs(snapshot, jobs, target) {
    const runById = new Map();
    for (const run of snapshot.runs) {
        runById.set(run.id, run);
    }
    const matchesTarget = (entry) => {
        if (entry.benchmarkFingerprint !== target.benchmarkFingerprint) {
            return false;
        }
        if (entry.candidateId !== target.candidateId) {
            return false;
        }
        if (entry.candidateRunId !== target.candidateRunId) {
            return false;
        }
        return runById.get(entry.runId)?.executionFingerprint === target.executionFingerprint;
    };
    // Newest first; run id is the deterministic tie-breaker.
    const newestFirst = (left, right) => {
        const byUpdatedAt = right.updatedAt.localeCompare(left.updatedAt);
        return byUpdatedAt !== 0 ? byUpdatedAt : right.runId.localeCompare(left.runId);
    };
    const candidates = snapshot.evaluations.filter(matchesTarget);
    candidates.sort(newestFirst);
    const evaluation = candidates[0] ?? null;
    if (!evaluation) {
        return [];
    }
    const selectedRunId = evaluation.runId;
    return jobs.filter((job) => job.runId === selectedRunId);
}
1116
+ async function localEvaluateCandidate(argv, io, runtimeOptions) {
789
1117
  void runtimeOptions;
790
1118
  const parsed = parseArgs(argv);
791
- rejectUnknownFlags(parsed, new Set(["dir", "subject", "samples", "json"]));
1119
+ rejectUnknownFlags(parsed, new Set(["dir", "candidate", "runs", "samples", "rerun", "json"]));
792
1120
  const samples = parsePositiveInt(parsed.flags.samples, 1, "samples");
793
1121
  const sourceArg = resolveSourceDir(parsed);
794
- const projectSource = await readLocalProjectSource(sourceArg);
1122
+ const runsFlag = asOptionalString(parsed.flags.runs);
1123
+ const defaultProjectSource = await readLocalProjectSource(sourceArg);
1124
+ const selectedRunIds = resolveCandidateRunSelection(defaultProjectSource, runsFlag);
1125
+ if (selectedRunIds.length > 1) {
1126
+ let failed = 0;
1127
+ for (const runId of selectedRunIds) {
1128
+ const args = [
1129
+ "--dir",
1130
+ defaultProjectSource.dir,
1131
+ "--runs",
1132
+ runId,
1133
+ "--samples",
1134
+ String(samples),
1135
+ ...(readOptionalCandidateFlag(parsed) ? ["--candidate", readOptionalCandidateFlag(parsed)] : []),
1136
+ ...(parsed.flags.rerun === true ? ["--rerun"] : []),
1137
+ "--json",
1138
+ ];
1139
+ const code = await localEvaluateCandidate(args, createSilentIo(io), runtimeOptions);
1140
+ if (code !== 0) {
1141
+ failed += 1;
1142
+ }
1143
+ }
1144
+ writeOutput({
1145
+ ok: failed === 0,
1146
+ candidateId: defaultProjectSource.candidateName,
1147
+ candidateRunIds: selectedRunIds,
1148
+ failedRunCount: failed,
1149
+ }, parsed, io, () => `Evaluated ${selectedRunIds.length} candidate run(s); ${failed} failed.`);
1150
+ return failed === 0 ? 0 : 1;
1151
+ }
1152
+ const projectSource = selectedRunIds[0] === defaultProjectSource.candidateRunId
1153
+ ? defaultProjectSource
1154
+ : await readLocalProjectSource(sourceArg, { runId: selectedRunIds[0] });
795
1155
  const workspace = projectSource.dir;
796
1156
  const executionProject = await resolveLocalProjectForExecution(workspace, projectSource.specSource);
797
1157
  const { spec, adapterManifests } = executionProject;
@@ -810,114 +1170,367 @@ async function localEvaluateSubject(argv, io, runtimeOptions) {
810
1170
  const environmentRefs = await ensureLocalDockerfileEnvironments(workspace, spec, engineCases);
811
1171
  let snapshot = await loadLocalArchive(workspace);
812
1172
  const benchmarkFingerprint = await readLocalBenchmarkFingerprint(workspace);
813
- const sourceSubjectFingerprint = localSubjectFingerprint(projectSource);
814
- const explicitSubjectId = asOptionalString(parsed.flags.subject);
815
- const existingSourceSubject = snapshot.subjects.find((subject) => subject.benchmarkFingerprint === benchmarkFingerprint &&
816
- subject.subjectFingerprint === sourceSubjectFingerprint);
817
- const subjectId = explicitSubjectId ?? existingSourceSubject?.id ?? `subject_${sourceSubjectFingerprint.slice(0, 12)}`;
818
- const existingSubject = snapshot.subjects.find((subject) => subject.id === subjectId);
819
- const files = filterSubjectSourceFiles(existingSubject
820
- ? readLocalSubjectFiles(snapshot, subjectId)
821
- : normalizeSurfaceFiles(projectSource.subjectFiles));
1173
+ const executionFingerprint = localRunExecutionFingerprint(projectSource);
1174
+ const sourceCandidateFingerprint = localCandidateFingerprint(projectSource);
1175
+ const explicitCandidateId = readOptionalCandidateFlag(parsed);
1176
+ const existingSourceCandidate = snapshot.candidates.find((candidate) => candidate.benchmarkFingerprint === benchmarkFingerprint &&
1177
+ candidate.candidateFingerprint === sourceCandidateFingerprint);
1178
+ const candidateId = explicitCandidateId ?? existingSourceCandidate?.id ?? `candidate_${sourceCandidateFingerprint.slice(0, 12)}`;
1179
+ const existingCandidate = snapshot.candidates.find((candidate) => candidate.id === candidateId);
1180
+ const activeCandidateIdBeforeEval = snapshot.activeId;
1181
+ const selectedCandidateRunId = projectSource.spec.candidate.selectedRunId;
1182
+ const files = filterCandidateSourceFiles(existingCandidate
1183
+ ? readLocalCandidateFiles(snapshot, candidateId)
1184
+ : normalizeSurfaceFiles(projectSource.candidateFiles));
1185
+ const evaluationWork = parsed.flags.rerun !== true
1186
+ ? await resolveLocalEvaluationWork(workspace, snapshot, {
1187
+ benchmarkFingerprint,
1188
+ candidateId,
1189
+ candidateFingerprint: existingCandidate?.candidateFingerprint ?? sourceCandidateFingerprint,
1190
+ candidateRunId: selectedCandidateRunId,
1191
+ executionFingerprint,
1192
+ samples,
1193
+ caseIds,
1194
+ })
1195
+ : null;
1196
+ const reusableEvaluation = evaluationWork?.reusableEvaluation ?? null;
1197
+ if (reusableEvaluation) {
1198
+ const result = {
1199
+ ok: true,
1200
+ reused: true,
1201
+ runId: reusableEvaluation.runId,
1202
+ evaluation: reusableEvaluation,
1203
+ evaluationId: reusableEvaluation.id,
1204
+ candidateId,
1205
+ completedJobCount: 0,
1206
+ failedJobCount: 0,
1207
+ localView: localDevViewHint(workspace, reusableEvaluation.runId),
1208
+ };
1209
+ writeOutput(result, parsed, io, () => `Reused evaluation ${reusableEvaluation.id}. Use --rerun to intentionally run it again.`);
1210
+ return 0;
1211
+ }
1212
+ const selectedPairs = evaluationWork?.missingPairs.length
1213
+ ? evaluationWork.missingPairs
1214
+ : allCaseSamplePairs(caseIds, samples);
822
1215
  const runId = `eval_local_${Date.now().toString(36)}`;
823
- const evaluatedSubjectId = subjectId;
1216
+ const evaluatedCandidateId = candidateId;
824
1217
  const startedAt = new Date().toISOString();
825
- const baseline = createRuntimeBaselineSubjectJob({
826
- ownerUserId: "local",
827
- projectId: "local",
1218
+ const runStartedEvent = createLocalEvent("run_started", startedAt, {
828
1219
  runId,
829
- subjectId: evaluatedSubjectId,
830
- attemptIndex: 0,
831
- files,
832
- now: startedAt,
833
- baseId: null,
1220
+ candidateId: evaluatedCandidateId,
1221
+ detail: { samples, strategy: "direct" },
834
1222
  });
835
- const completedJobs = [baseline];
836
- const attemptJobs = planWorkbenchExecutionJobsForPurpose({
837
- ownerUserId: "local",
838
- projectId: "local",
839
- runId,
840
- subjectId: evaluatedSubjectId,
841
- attemptIndex: 0,
842
- samples,
843
- now: startedAt,
844
- caseIds,
845
- engineCases,
846
- spec,
847
- environmentRefsByCase: environmentRefs.byCase,
848
- workflow: "eval",
849
- purpose: "attempt",
850
- });
851
- const dagJobs = await executeLocalDevelopmentDag({
852
- jobs: [baseline, ...attemptJobs],
853
- spec,
854
- adapterManifests,
855
- adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
856
- baseFiles: files,
857
- engineResolveFiles,
858
- engineCases,
859
- capacity: await localDevelopmentCapacity(workspace),
860
- });
861
- completedJobs.splice(0, completedJobs.length, ...dagJobs);
862
- const materialized = materializeWorkbenchRunResult({
863
- runId,
864
- benchmarkFingerprint,
865
- sourceYaml: projectSource.specSource,
866
- benchmarkSourceFiles: authoredBenchmarkSourceFiles(projectSource),
867
- subjectFingerprint: existingSubject?.subjectFingerprint ?? sourceSubjectFingerprint,
868
- ...(!existingSubject || existingSubject.subjectFingerprint === sourceSubjectFingerprint
869
- ? { subjectSourceFiles: authoredSubjectSourceFiles(projectSource) }
870
- : {}),
871
- startedAt,
872
- spec,
873
- jobs: completedJobs,
874
- previousSubject: null,
875
- existingSubjectCount: snapshot.subjects.length,
876
- });
877
- for (const subjectRecord of materialized.subjects) {
878
- snapshot = upsertLocalSubject(snapshot, subjectRecord, materialized.subjectFiles[subjectRecord.id] ?? []);
879
- }
880
- if (materialized.activeSubjectId) {
881
- snapshot = setLocalActive(snapshot, materialized.activeSubjectId);
882
- }
883
- for (const evaluation of materialized.evaluations) {
884
- snapshot = upsertLocalEvaluation(snapshot, evaluation);
885
- }
886
- const finishedAt = new Date().toISOString();
887
- snapshot = appendLocalRun(snapshot, {
1223
+ const runningRun = {
888
1224
  id: runId,
889
1225
  workflow: "eval",
890
1226
  benchmarkFingerprint,
891
- status: "finished",
1227
+ status: "running",
1228
+ candidateId: evaluatedCandidateId,
1229
+ candidateRunId: projectSource.spec.candidate.selectedRunId,
1230
+ candidateRunName: projectSource.spec.candidate.selectedRunName,
892
1231
  startedAt,
893
- finishedAt,
894
- durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
895
- optimizer: "none",
1232
+ improver: "none",
896
1233
  engineRun: spec.engineRun.use,
897
1234
  strategy: "direct",
898
1235
  budget: 1,
899
1236
  repairBudget: 0,
900
1237
  attemptsRequested: 1,
901
- attemptsExecuted: 1,
1238
+ attemptsExecuted: 0,
902
1239
  samples,
903
- stoppedReason: "completed",
904
- outcome: materialized.failedJobCount > 0 ? "error" : "ok",
905
- }, []);
906
- await saveLocalJobs(workspace, completedJobs);
1240
+ executionFingerprint,
1241
+ activeCandidateId: activeCandidateIdBeforeEval,
1242
+ outputCandidateId: evaluatedCandidateId,
1243
+ };
1244
+ snapshot = upsertLocalRun(snapshot, runningRun, [runStartedEvent]);
907
1245
  await saveLocalArchive(workspace, snapshot);
908
- const evaluation = materialized.evaluations[0] ?? null;
909
- const result = {
910
- ok: materialized.failedJobCount === 0,
911
- runId,
912
- evaluation,
913
- evaluationId: evaluation?.id ?? null,
914
- subjectId: evaluatedSubjectId,
915
- completedJobCount: materialized.completedJobCount,
916
- failedJobCount: materialized.failedJobCount,
917
- localView: localDevViewHint(workspace, runId),
1246
+ try {
1247
+ const baseline = createRuntimeBaselineCandidateJob({
1248
+ ownerUserId: "local",
1249
+ projectId: "local",
1250
+ runId,
1251
+ candidateId: evaluatedCandidateId,
1252
+ attemptIndex: 0,
1253
+ files,
1254
+ now: startedAt,
1255
+ baseId: null,
1256
+ });
1257
+ const attemptJobs = planWorkbenchExecutionJobsForPurpose({
1258
+ ownerUserId: "local",
1259
+ projectId: "local",
1260
+ runId,
1261
+ candidateId: evaluatedCandidateId,
1262
+ attemptIndex: 0,
1263
+ samples,
1264
+ now: startedAt,
1265
+ caseIds: orderedCaseIdsForPairs(caseIds, selectedPairs),
1266
+ sampleIndexesByCase: sampleIndexesByCase(selectedPairs),
1267
+ engineCases,
1268
+ spec,
1269
+ environmentRefsByCase: environmentRefs.byCase,
1270
+ workflow: "eval",
1271
+ purpose: "attempt",
1272
+ });
1273
+ const dagJobs = await executeLocalDevelopmentDag({
1274
+ jobs: [baseline, ...attemptJobs],
1275
+ spec,
1276
+ adapterManifests,
1277
+ adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
1278
+ baseFiles: files,
1279
+ engineResolveFiles,
1280
+ engineCases,
1281
+ capacity: await localDevelopmentCapacity(workspace),
1282
+ });
1283
+ const materializationJobs = [
1284
+ ...(evaluationWork?.priorAttemptJobs ?? []),
1285
+ ...dagJobs,
1286
+ ];
1287
+ const currentRunJobs = dagJobs.filter((job) => job.runId === runId);
1288
+ const currentRunCompletedJobCount = currentRunJobs.filter((job) => job.status === "succeeded").length;
1289
+ const currentRunFailedJobCount = currentRunJobs.filter((job) => job.status === "failed").length;
1290
+ const materialized = materializeWorkbenchRunResult({
1291
+ runId,
1292
+ benchmarkFingerprint,
1293
+ sourceYaml: projectSource.specSource,
1294
+ benchmarkSourceFiles: authoredBenchmarkSourceFiles(projectSource),
1295
+ candidateFingerprint: existingCandidate?.candidateFingerprint ?? sourceCandidateFingerprint,
1296
+ ...(!existingCandidate || existingCandidate.candidateFingerprint === sourceCandidateFingerprint
1297
+ ? { candidateSourceFiles: authoredCandidateSourceFiles(projectSource) }
1298
+ : {}),
1299
+ startedAt,
1300
+ spec,
1301
+ jobs: materializationJobs,
1302
+ previousCandidate: existingCandidate ?? null,
1303
+ existingCandidateCount: snapshot.candidates.length,
1304
+ });
1305
+ for (const candidateRecord of materialized.candidates) {
1306
+ snapshot = upsertLocalCandidate(snapshot, candidateRecord, materialized.candidateFiles[candidateRecord.id] ?? []);
1307
+ }
1308
+ if (materialized.activeCandidateId) {
1309
+ snapshot = setLocalActive(snapshot, materialized.activeCandidateId);
1310
+ }
1311
+ for (const evaluation of materialized.evaluations) {
1312
+ snapshot = upsertLocalEvaluation(snapshot, evaluation);
1313
+ }
1314
+ const activeCandidateId = activeCandidateIdBeforeEval ?? materialized.activeCandidateId ?? null;
1315
+ const finishedAt = new Date().toISOString();
1316
+ if (activeCandidateId) {
1317
+ snapshot = setLocalActive(snapshot, activeCandidateId);
1318
+ }
1319
+ const runFinishedEvent = createLocalEvent("run_finished", finishedAt, {
1320
+ runId,
1321
+ candidateId: evaluatedCandidateId,
1322
+ detail: {
1323
+ outcome: currentRunFailedJobCount > 0 ? "error" : "ok",
1324
+ attemptsExecuted: 1,
1325
+ durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
1326
+ },
1327
+ });
1328
+ snapshot = upsertLocalRun(snapshot, {
1329
+ id: runId,
1330
+ workflow: "eval",
1331
+ benchmarkFingerprint,
1332
+ status: "finished",
1333
+ candidateId: evaluatedCandidateId,
1334
+ candidateRunId: projectSource.spec.candidate.selectedRunId,
1335
+ candidateRunName: projectSource.spec.candidate.selectedRunName,
1336
+ startedAt,
1337
+ finishedAt,
1338
+ durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
1339
+ improver: "none",
1340
+ engineRun: spec.engineRun.use,
1341
+ strategy: "direct",
1342
+ budget: 1,
1343
+ repairBudget: 0,
1344
+ attemptsRequested: 1,
1345
+ attemptsExecuted: 1,
1346
+ samples,
1347
+ executionFingerprint,
1348
+ stoppedReason: "completed",
1349
+ outcome: currentRunFailedJobCount > 0 ? "error" : "ok",
1350
+ activeCandidateId,
1351
+ outputCandidateId: evaluatedCandidateId,
1352
+ }, [runFinishedEvent]);
1353
+ await saveLocalJobs(workspace, currentRunJobs);
1354
+ await saveLocalArchive(workspace, snapshot);
1355
+ const evaluation = materialized.evaluations[0] ?? null;
1356
+ const result = {
1357
+ ok: currentRunFailedJobCount === 0,
1358
+ runId,
1359
+ evaluation,
1360
+ evaluationId: evaluation?.id ?? null,
1361
+ candidateId: evaluatedCandidateId,
1362
+ activeCandidateId,
1363
+ completedJobCount: currentRunCompletedJobCount,
1364
+ failedJobCount: currentRunFailedJobCount,
1365
+ localView: localDevViewHint(workspace, runId),
1366
+ };
1367
+ writeOutput(result, parsed, io, ({ evaluationId, candidateId }) => `Evaluation ${evaluationId ?? runId} finished for candidate ${candidateId}.\nOpen local view: ${result.localView.command}\n${result.localView.note}`);
1368
+ return currentRunFailedJobCount === 0 ? 0 : 1;
1369
+ }
1370
+ catch (error) {
1371
+ await markLocalRunFailed({
1372
+ workspace,
1373
+ run: runningRun,
1374
+ startedAt,
1375
+ error,
1376
+ }).catch(() => undefined);
1377
+ throw error;
1378
+ }
1379
+ }
1380
/**
 * Decide how much of an evaluation can be satisfied from earlier local runs.
 *
 * @param {string} workspace - Workspace whose stored jobs are read.
 * @param {object} snapshot - Archive snapshot with `runs` and `evaluations`.
 * @param {object} target - `benchmarkFingerprint`, `candidateId`,
 *   `candidateFingerprint`, `candidateRunId`, `executionFingerprint`,
 *   `samples` and `caseIds` describing the requested evaluation.
 * @returns {Promise<object|null>} `null` when nothing can be reused; otherwise
 *   `{ reusableEvaluation, missingPairs, priorAttemptJobs }`, where either a
 *   whole evaluation is reusable or only some case/sample pairs still need to run.
 */
async function resolveLocalEvaluationWork(workspace, snapshot, target) {
    const runById = new Map(snapshot.runs.map((run) => [run.id, run]));
    const isSameWork = (evaluation) => evaluation.benchmarkFingerprint === target.benchmarkFingerprint &&
        evaluation.candidateId === target.candidateId &&
        evaluation.candidateFingerprint === target.candidateFingerprint &&
        evaluation.candidateRunId === target.candidateRunId &&
        runById.get(evaluation.runId)?.executionFingerprint === target.executionFingerprint;
    // NOTE(review): the sample-count check compares against target.samples only;
    // confirm completedSampleCount has the same granularity.
    const isFullyUsable = (evaluation) => evaluation.status === "completed" &&
        evaluation.errorSampleCount === 0 &&
        evaluation.completedSampleCount >= target.samples;
    const newestFirst = (left, right) => right.updatedAt.localeCompare(left.updatedAt) ||
        right.id.localeCompare(left.id);
    const matchingEvaluations = snapshot.evaluations.filter(isSameWork);
    const [newestUsable] = matchingEvaluations.filter(isFullyUsable).sort(newestFirst);
    const reusableEvaluation = newestUsable ?? null;
    if (reusableEvaluation) {
        return {
            reusableEvaluation,
            missingPairs: [],
            priorAttemptJobs: [],
        };
    }
    const matchingRunIds = new Set(matchingEvaluations.map((evaluation) => evaluation.runId));
    if (matchingRunIds.size === 0) {
        return null;
    }
    const allPairs = allCaseSamplePairs(target.caseIds, target.samples);
    const desiredKeys = new Set(allPairs.map(caseSamplePairKey));
    const previousJobs = await readLocalJobs(workspace);
    // Only jobs from matching runs for this candidate may stand in for new work.
    const eligibleJobs = previousJobs.filter((job) => matchingRunIds.has(job.runId) &&
        job.candidateId === target.candidateId);
    const priorAttemptJobsByPair = latestCompletedAttemptJobsByPair(eligibleJobs, desiredKeys);
    const missingPairs = allPairs.filter((pair) => !priorAttemptJobsByPair.has(caseSamplePairKey(pair)));
    // Every pair is missing: nothing to reuse, so report "no prior work".
    if (missingPairs.length === allPairs.length) {
        return null;
    }
    return {
        reusableEvaluation: null,
        missingPairs,
        priorAttemptJobs: [...priorAttemptJobsByPair.values()],
    };
}
1422
/**
 * Persist a run as finished with outcome "error" in the local archive.
 *
 * Does nothing when the archived copy of the run is already marked finished.
 *
 * @param {object} args - `workspace`, the `run` record to finalize, its
 *   `startedAt` timestamp, and the `error` that ended it.
 * @returns {Promise<void>}
 */
async function markLocalRunFailed(args) {
    const { workspace, run, startedAt, error } = args;
    const latest = await loadLocalArchive(workspace);
    const current = latest.runs.find((entry) => entry.id === run.id);
    if (current?.status === "finished") {
        return;
    }
    const finishedAt = new Date().toISOString();
    const message = errorMessage(error);
    const durationMs = Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt));
    const failedRun = {
        ...run,
        status: "finished",
        finishedAt,
        durationMs,
        outcome: "error",
        error: message,
    };
    const finishedEvent = createLocalEvent("run_finished", finishedAt, {
        runId: run.id,
        candidateId: run.candidateId ?? undefined,
        detail: {
            outcome: "error",
            error: message,
            attemptsExecuted: failedRun.attemptsExecuted,
            durationMs: failedRun.durationMs ?? null,
        },
    });
    const updated = upsertLocalRun(latest, failedRun, [finishedEvent]);
    await saveLocalArchive(workspace, updated);
}
1451
/**
 * Turn a thrown value into a message string.
 *
 * @param {unknown} error - Any thrown value.
 * @returns {string} `error.message` for `Error` instances, otherwise `String(error)`.
 */
function errorMessage(error) {
    if (error instanceof Error) {
        return error.message;
    }
    return String(error);
}
1454
/**
 * Enumerate every (case, sample index) combination, case-major.
 *
 * @param {Array<string>} caseIds - Case identifiers, in the order to emit them.
 * @param {number} samples - Number of samples per case; indexes run 0..samples-1.
 * @returns {Array<{caseId: string, sampleIndex: number}>} One pair per case and sample.
 */
function allCaseSamplePairs(caseIds, samples) {
    const pairs = [];
    for (const caseId of caseIds) {
        const pairsForCase = Array.from({ length: samples }, (_, sampleIndex) => ({
            caseId,
            sampleIndex,
        }));
        for (const pair of pairsForCase) {
            pairs.push(pair);
        }
    }
    return pairs;
}
1460
/**
 * Keep only the case ids referenced by `pairs`, preserving `caseIds` order.
 *
 * @param {Array<string>} caseIds - All case ids, in canonical order.
 * @param {Array<{caseId: string}>} pairs - Pairs whose cases should be kept.
 * @returns {Array<string>} The subset of `caseIds` that occurs in `pairs`.
 */
function orderedCaseIdsForPairs(caseIds, pairs) {
    const wanted = new Set();
    for (const { caseId } of pairs) {
        wanted.add(caseId);
    }
    const ordered = [];
    for (const caseId of caseIds) {
        if (wanted.has(caseId)) {
            ordered.push(caseId);
        }
    }
    return ordered;
}
1464
/**
 * Group sample indexes by case id.
 *
 * Indexes are collected into one Set per case, so each pair costs constant
 * work instead of re-copying the case's growing array on every pair.
 *
 * @param {Array<{caseId: string, sampleIndex: number}>} pairs - Case/sample pairs.
 * @returns {Map<string, Array<number>>} For each case id (in first-seen order),
 *   its distinct sample indexes sorted ascending.
 */
function sampleIndexesByCase(pairs) {
    const seenByCase = new Map();
    for (const { caseId, sampleIndex } of pairs) {
        let seen = seenByCase.get(caseId);
        if (!seen) {
            seen = new Set();
            seenByCase.set(caseId, seen);
        }
        seen.add(sampleIndex);
    }
    const byCase = new Map();
    for (const [caseId, seen] of seenByCase) {
        // Numeric comparator: the default sort would order 10 before 2.
        byCase.set(caseId, [...seen].sort((left, right) => left - right));
    }
    return byCase;
}
1474
/**
 * Index the most recent succeeded "attempt" job for each wanted case/sample pair.
 *
 * @param {Array<object>} jobs - Job records to scan.
 * @param {Set<string>} desiredKeys - Pair keys (as built by `caseSamplePairKey`)
 *   that are of interest; jobs for other pairs are ignored.
 * @returns {Map<string, object>} Pair key to the newest qualifying job,
 *   as ordered by `compareJobRecency`.
 */
function latestCompletedAttemptJobsByPair(jobs, desiredKeys) {
    const newestByKey = new Map();
    for (const job of jobs) {
        const isSucceededAttempt = job.status === "succeeded" &&
            executionPurposeFromJobInput(job.input) === "attempt";
        if (!isSucceededAttempt) {
            continue;
        }
        const pair = caseSamplePairFromJob(job);
        const key = pair ? caseSamplePairKey(pair) : null;
        if (key === null || !desiredKeys.has(key)) {
            continue;
        }
        const incumbent = newestByKey.get(key);
        // Keep the incumbent unless this job is strictly more recent.
        if (incumbent && compareJobRecency(job, incumbent) <= 0) {
            continue;
        }
        newestByKey.set(key, job);
    }
    return newestByKey;
}
1495
/**
 * Extract the case id and sample index a job was run for.
 *
 * Values are read from the job input first and fall back to
 * `input.execution.metadata`.
 *
 * @param {object} job - Job record with an `input` payload.
 * @returns {{caseId: string, sampleIndex: number}|null} The pair, or `null`
 *   when the case id is missing/empty or the sample index is `null`.
 */
function caseSamplePairFromJob(job) {
    const input = readRecord(job.input);
    const metadata = readRecord(readRecord(input?.execution)?.metadata);
    const caseId = stringValue(input?.caseId) ?? stringValue(metadata?.caseId);
    const sampleIndex = integerValue(input?.sampleIndex) ?? integerValue(metadata?.sampleIndex);
    if (!caseId || sampleIndex === null) {
        return null;
    }
    return { caseId, sampleIndex };
}
1505
/**
 * Read `execution.purpose` from a job input payload.
 *
 * @param {unknown} inputValue - Raw job input.
 * @returns {*} Whatever `stringValue` yields for the `purpose` field.
 */
function executionPurposeFromJobInput(inputValue) {
    const execution = readRecord(readRecord(inputValue)?.execution);
    return stringValue(execution?.purpose);
}
1510
/**
 * Build the lookup key for a case/sample pair.
 *
 * @param {{caseId: string, sampleIndex: number}} pair - Pair to key.
 * @returns {string} Case id and sample index joined by a NUL character.
 */
function caseSamplePairKey(pair) {
    const { caseId, sampleIndex } = pair;
    return `${caseId}\0${sampleIndex}`;
}
1513
/**
 * Comparator ordering jobs from older to newer.
 *
 * @param {object} left - First job.
 * @param {object} right - Second job.
 * @returns {number} Result of comparing recency timestamps; when those compare
 *   equal, the result of comparing the job ids.
 */
function compareJobRecency(left, right) {
    const byTimestamp = jobRecencyTimestamp(left).localeCompare(jobRecencyTimestamp(right));
    if (byTimestamp !== 0) {
        return byTimestamp;
    }
    return left.id.localeCompare(right.id);
}
1517
/**
 * Pick the timestamp used to rank a job's recency.
 *
 * @param {object} job - Job record.
 * @returns {*} The first non-nullish of `finishedAt`, `updatedAt`, `startedAt`,
 *   `createdAt`; the empty string when all are nullish.
 */
function jobRecencyTimestamp(job) {
    // Checked in priority order, later lifecycle stamps first.
    const { finishedAt } = job;
    if (finishedAt != null) {
        return finishedAt;
    }
    const { updatedAt } = job;
    if (updatedAt != null) {
        return updatedAt;
    }
    const { startedAt } = job;
    if (startedAt != null) {
        return startedAt;
    }
    return job.createdAt ?? "";
}
1520
/**
 * Find the newest successful improve run that matches `target`.
 *
 * A run qualifies when it is a finished "improve" workflow with outcome "ok",
 * has an output candidate, and matches the target's benchmark fingerprint,
 * candidate id, candidate run id, execution fingerprint, budget and samples.
 *
 * @param {Array<object>} runs - Run records to search.
 * @param {object} target - Values the run must match.
 * @returns {object|null} The qualifying run with the greatest finish (or start)
 *   time, ties broken by greatest id; `null` when none qualifies.
 */
function findReusableLocalImproveRun(runs, target) {
    const isReusable = (run) => {
        const sameWork = run.workflow === "improve" &&
            run.benchmarkFingerprint === target.benchmarkFingerprint &&
            run.candidateId === target.candidateId &&
            run.candidateRunId === target.candidateRunId &&
            run.executionFingerprint === target.executionFingerprint &&
            run.budget === target.budget &&
            run.samples === target.samples;
        if (!sameWork) {
            return false;
        }
        const succeeded = run.status === "finished" && run.outcome === "ok";
        return succeeded && Boolean(run.outputCandidateId);
    };
    const endedAt = (run) => run.finishedAt ?? run.startedAt;
    const newestFirst = (left, right) => {
        const byTime = endedAt(right).localeCompare(endedAt(left));
        return byTime !== 0 ? byTime : right.id.localeCompare(left.id);
    };
    const reusable = runs.filter(isReusable);
    reusable.sort(newestFirst);
    return reusable[0] ?? null;
}
922
1535
  function localDevViewHint(workspace, runId) {
923
1536
  const runFlag = runId ? ` --run ${shellQuote(runId)}` : "";
@@ -935,20 +1548,26 @@ function localDevOpenUrl(baseUrl, snapshot, runId) {
935
1548
  .reverse()
936
1549
  .find((entry) => entry.runId === runId);
937
1550
  if (!evaluation) {
938
- return new URL("subjects", baseUrl).toString();
1551
+ return new URL("candidates", baseUrl).toString();
939
1552
  }
940
1553
  const params = new URLSearchParams({ evaluation: evaluation.id });
941
- return new URL(`subjects/${encodeURIComponent(evaluation.subjectId)}?${params.toString()}`, baseUrl).toString();
1554
+ return new URL(`candidates/${encodeURIComponent(evaluation.candidateId)}?${params.toString()}`, baseUrl).toString();
942
1555
  }
943
1556
  async function readLocalBenchmarkFingerprint(workspace) {
944
1557
  return localBenchmarkFingerprint(await readLocalProjectSource(workspace));
945
1558
  }
946
- function authoredSubjectSourceFiles(projectSource) {
1559
/**
 * Computes the run execution fingerprint for a local project source from its
 * spec YAML text and its adapter files (normalized with
 * `normalizeSurfaceFiles`).
 *
 * @param {{ specSource: string, adapterFiles: unknown }} projectSource
 * @returns The result of `workbenchRunExecutionFingerprint`.
 */
function localRunExecutionFingerprint(projectSource) {
    const { specSource, adapterFiles } = projectSource;
    const fingerprintInput = {
        sourceYaml: specSource,
        adapterFiles: normalizeSurfaceFiles(adapterFiles),
    };
    return workbenchRunExecutionFingerprint(fingerprintInput);
}
1565
/**
 * Describes the authored candidate spec as a single text source file.
 *
 * The file path is the candidate spec path relative to the project
 * directory, with platform separators replaced by `/`.
 *
 * @param {{ dir: string, candidateSpecPath: string, candidateSource: string }} projectSource
 * @returns {Array<{ path: string, kind: "text", encoding: "utf8", content: string, executable: false }>}
 */
function authoredCandidateSourceFiles(projectSource) {
    const relativeSpecPath = path.relative(projectSource.dir, projectSource.candidateSpecPath);
    const portableSpecPath = relativeSpecPath.split(path.sep).join("/");
    const specFile = {
        path: portableSpecPath,
        kind: "text",
        encoding: "utf8",
        content: projectSource.candidateSource,
        executable: false,
    };
    return [specFile];
}
@@ -1155,72 +1774,72 @@ function requireValidRunEnvelope(args) {
1155
1774
  }
1156
1775
  async function localRestore(argv, io) {
1157
1776
  const parsed = parseArgs(argv);
1158
- rejectUnknownFlags(parsed, new Set(["dir", "subject", "dry-run", "yes", "json"]));
1777
+ rejectUnknownFlags(parsed, new Set(["dir", "candidate", "dry-run", "yes", "json"]));
1159
1778
  const workspace = resolveDir(parsed);
1160
1779
  const spec = await readLocalSpecIfValid(workspace);
1161
1780
  if (!spec) {
1162
1781
  throw new UsageError("restore requires a valid Workbench project.");
1163
1782
  }
1164
- const subjectRoot = spec.subject.files.path;
1783
+ const candidateRoot = spec.candidate.files.path;
1165
1784
  const snapshot = await loadLocalArchive(workspace);
1166
- const subjectId = readSubjectIdFlag(parsed, snapshot);
1167
- const files = readLocalSubjectFiles(snapshot, subjectId);
1785
+ const candidateId = readCandidateIdFlag(parsed, snapshot);
1786
+ const files = readLocalCandidateFiles(snapshot, candidateId);
1168
1787
  if (parsed.flags["dry-run"] === true) {
1169
- writeOutput({ ok: true, subjectId, fileCount: files.length }, parsed, io, () => `Restore would write ${files.length} file(s) from ${subjectId}.`);
1788
+ writeOutput({ ok: true, candidateId: candidateId, fileCount: files.length }, parsed, io, () => `Restore would write ${files.length} file(s) from ${candidateId}.`);
1170
1789
  return 0;
1171
1790
  }
1172
1791
  if (parsed.flags.yes !== true) {
1173
1792
  throw new UsageError("restore requires --dry-run to preview or --yes to apply source directory changes.");
1174
1793
  }
1175
- const changedPaths = await materializeSubjectRoot(workspace, subjectRoot, files);
1176
- const next = setLocalActive(snapshot, subjectId);
1794
+ const changedPaths = await materializeCandidateRoot(workspace, candidateRoot, files);
1795
+ const next = setLocalActive(snapshot, candidateId);
1177
1796
  await saveLocalArchive(workspace, next);
1178
- writeOutput({ ok: true, activeAfter: subjectId, changedPaths }, parsed, io, () => `Restored ${subjectId} to ${subjectRoot}.`);
1797
+ writeOutput({ ok: true, activeCandidateId: candidateId, changedPaths }, parsed, io, () => `Restored ${candidateId} to ${candidateRoot}.`);
1179
1798
  return 0;
1180
1799
  }
1181
/**
 * `candidates list` for a local workspace: prints one tab-separated row per
 * archived candidate (id, status, evaluation score, and an `active` marker
 * for the active candidate), or "No candidates." when there are none.
 *
 * @param {string[]} argv Command-line arguments for the command.
 * @param {object} io Output streams handed to `writeOutput`.
 * @returns {Promise<number>} Always `0`.
 */
async function localCandidateList(argv, io) {
    const parsed = parseArgs(argv);
    rejectUnknownFlags(parsed, new Set(["dir", "json"]));
    const snapshot = await loadLocalArchive(resolveDir(parsed));
    const formatRow = (candidate) => {
        const activeMarker = snapshot.activeId === candidate.id ? "\tactive" : "";
        return `${candidate.id}\t${candidate.status}\tevaluation ${formatCandidateEvaluationScore(candidate)}${activeMarker}`;
    };
    writeOutput(snapshot.candidates, parsed, io, (candidates) => {
        const rows = candidates.map((candidate) => formatRow(candidate)).join("\n");
        return rows || "No candidates.";
    });
    return 0;
}
1190
/**
 * `candidates show` for a local workspace: prints the selected candidate's
 * id/status, benchmark fingerprint, candidate fingerprint, evaluation
 * summary and, when present, its base candidate id.
 *
 * @param {string[]} argv Command-line arguments for the command.
 * @param {object} io Output streams handed to `writeOutput`.
 * @returns {Promise<number>} Always `0`.
 */
async function localCandidateShow(argv, io) {
    const parsed = parseArgs(argv);
    rejectUnknownFlags(parsed, new Set(["dir", "candidate", "json"]));
    const snapshot = await loadLocalArchive(resolveDir(parsed));
    const candidateId = readCandidateIdFlag(parsed, snapshot);
    const candidate = readLocalCandidate(snapshot, candidateId);
    writeOutput(candidate, parsed, io, (record) => [
        `${record.id}\t${record.status}`,
        `benchmark\t${record.benchmarkFingerprint}`,
        // The previous `x ?? x` fallback repeated the same property and could
        // never produce a different value, so the field is read directly.
        `candidate\t${record.candidateFingerprint}`,
        `evaluation\t${formatCandidateEvaluationSummary(record)}`,
        ...(record.baseId ? [`base\t${record.baseId}`] : []),
    ].join("\n"));
    return 0;
}
1205
/**
 * `candidates files` for a local workspace: prints one tab-separated row
 * (path, status, preview kind) per file of the selected candidate, or
 * "No files." when the summary is empty.
 *
 * @param {string[]} argv Command-line arguments for the command.
 * @param {object} io Output streams handed to `writeOutput`.
 * @returns {Promise<number>} Always `0`.
 */
async function localCandidateFiles(argv, io) {
    const parsed = parseArgs(argv);
    rejectUnknownFlags(parsed, new Set(["dir", "candidate", "json"]));
    const snapshot = await loadLocalArchive(resolveDir(parsed));
    const candidateId = readCandidateIdFlag(parsed, snapshot);
    const candidate = readLocalCandidate(snapshot, candidateId);
    const storedFiles = readLocalCandidateFiles(snapshot, candidateId);
    const files = summarizeCandidateFiles(storedFiles, candidate.fileChanges);
    const formatRow = (file) => `${file.path}\t${file.status}\t${file.preview_kind}`;
    writeOutput(files, parsed, io, (records) => {
        const rows = records.map((file) => formatRow(file)).join("\n");
        return rows || "No files.";
    });
    return 0;
}
1217
- async function localSubjectPreview(argv, io) {
1836
+ async function localCandidatePreview(argv, io) {
1218
1837
  const parsed = parseArgs(argv);
1219
- rejectUnknownFlags(parsed, new Set(["dir", "subject", "path", "output", "view", "json"]));
1838
+ rejectUnknownFlags(parsed, new Set(["dir", "candidate", "path", "output", "view", "json"]));
1220
1839
  const snapshot = await loadLocalArchive(resolveDir(parsed));
1221
- const subjectId = readSubjectIdFlag(parsed, snapshot);
1222
- const preview = createSubjectFilePreview({
1223
- files: readLocalSubjectFiles(snapshot, subjectId),
1840
+ const candidateId = readCandidateIdFlag(parsed, snapshot);
1841
+ const preview = createCandidateFilePreview({
1842
+ files: readLocalCandidateFiles(snapshot, candidateId),
1224
1843
  path: requireFlag(parsed, "path"),
1225
1844
  view: readPreviewMode(parsed),
1226
1845
  });
@@ -1755,7 +2374,7 @@ function createAdapterScaffoldFiles(id) {
1755
2374
  "setup:",
1756
2375
  " - npm install --global .",
1757
2376
  "operations:",
1758
- " subject.run: {}",
2377
+ " candidate.run: {}",
1759
2378
  "",
1760
2379
  ].join("\n");
1761
2380
  const packageJson = `${JSON.stringify({
@@ -1777,11 +2396,11 @@ const request = requestPath && fs.existsSync(requestPath)
1777
2396
  ? JSON.parse(fs.readFileSync(requestPath, "utf8"))
1778
2397
  : {};
1779
2398
  fs.mkdirSync(outputRoot, { recursive: true });
1780
- const operation = request.operation || "subject.run";
2399
+ const operation = request.operation || "candidate.run";
1781
2400
  const resultPath = process.env.WORKBENCH_RESULT || request.paths?.result || path.join(outputRoot, "workbench-result.json");
1782
2401
 
1783
2402
  let value;
1784
- if (operation === "subject.run") {
2403
+ if (operation === "candidate.run") {
1785
2404
  const task = request.context?.case?.prompt || "No case prompt was provided.";
1786
2405
  fs.writeFileSync(path.join(outputRoot, "adapter-output.txt"), [
1787
2406
  "adapter: ${id}",
@@ -1790,7 +2409,7 @@ if (operation === "subject.run") {
1790
2409
  "",
1791
2410
  ].join("\\n"));
1792
2411
  } else {
1793
- console.error("${id} only implements subject.run.");
2412
+ console.error("${id} only implements candidate.run.");
1794
2413
  process.exit(2);
1795
2414
  }
1796
2415
 
@@ -2065,7 +2684,7 @@ async function resolveAdapterForAuthTarget(dir, targetRaw) {
2065
2684
  const adapters = await resolveWorkbenchAdaptersForProject(dir, spec);
2066
2685
  const adapter = adapters.find((entry) => entry.manifest.id === target.adapterId);
2067
2686
  if (!adapter) {
2068
- throw new UsageError(`Adapter ${target.adapterId} is not used by this benchmark source. Add it to the benchmark, subject, or optimizer YAML before connecting auth.`);
2687
+ throw new UsageError(`Adapter ${target.adapterId} is not used by this benchmark source. Add it to the benchmark or candidate YAML before connecting auth.`);
2069
2688
  }
2070
2689
  if (!adapter.manifest.auth) {
2071
2690
  throw new UsageError(`Adapter ${target.adapterId} does not declare auth.`);
@@ -2728,15 +3347,209 @@ async function starProject(argv, io, starred) {
2728
3347
  });
2729
3348
  return 0;
2730
3349
  }
3350
/**
 * `workbench cloud retry TARGET_ID`: starts a new hosted run that repeats a
 * previous run or evaluation, optionally watching it to completion.
 *
 * @param {string[]} argv Command-line arguments for the command.
 * @param {object} io Output streams; `io.stdout` receives the watch banner.
 * @returns {Promise<number>} `0` when the retry was started (or, with
 *   `--watch`, when the watched run succeeded); `1` when a watched run failed.
 * @throws {UsageError} When TARGET_ID is missing or when `--interval-ms` /
 *   `--timeout-ms` are given without `--watch`.
 */
async function retryHostedWorkflow(argv, io) {
    const parsed = parseArgs(argv);
    rejectUnknownFlags(parsed, new Set([
        "dir",
        "benchmark",
        "watch",
        "interval-ms",
        "timeout-ms",
        "json",
    ]));
    rejectUnexpectedPositionals(parsed, "workbench cloud retry", 1);
    const targetId = parsed.positionals[0];
    if (!targetId) {
        throw new UsageError("Missing required TARGET_ID.");
    }
    const watching = parsed.flags.watch === true;
    const hasWatchTuning = parsed.flags["interval-ms"] !== undefined ||
        parsed.flags["timeout-ms"] !== undefined;
    if (!watching && hasWatchTuning) {
        throw new UsageError("--interval-ms and --timeout-ms require --watch.");
    }
    const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
    const retryTarget = await resolveHostedRetryTarget(target, targetId);
    let watchIntervalMs;
    let watchTimeoutMs;
    if (watching) {
        watchIntervalMs = parsePositiveInt(parsed.flags["interval-ms"], 1000, "interval-ms");
        watchTimeoutMs = parseOptionalPositiveInt(parsed.flags["timeout-ms"], "timeout-ms");
    }
    const response = await apiRequest(projectApiPath(target.projectId, "/runs"), {
        method: "POST",
        body: retryTarget.request,
    }, target.baseUrl);
    const startedRun = withRunUrls(target, response.run);
    // Fields shared by the watched and unwatched result payloads, in output order.
    const describeRun = (run) => ({
        retried: {
            id: retryTarget.sourceId,
            kind: retryTarget.sourceKind,
            workflow: retryTarget.workflow,
        },
        runId: run.id,
        candidateId: run.outputCandidateId ?? run.candidateId,
        activeCandidateId: run.activeCandidateId ?? null,
        run,
        ...(run.urls ? { urls: run.urls } : {}),
    });
    if (!watching) {
        writeOutput({ ok: true, ...describeRun(startedRun) }, parsed, io, formatRetryCommandResult);
        return 0;
    }
    if (parsed.flags.json !== true) {
        io.stdout.write(`${formatHostedRunStarted(startedRun, retryTarget.workflow).trimEnd()}\n${HOSTED_WATCH_LIFECYCLE_NOTE}\n`);
    }
    const watched = await watchHostedRun({
        parsed,
        target,
        runId: response.run.id,
        intervalMs: watchIntervalMs ?? 1000,
        timeoutMs: watchTimeoutMs,
    });
    const outputRun = withRunUrls(target, await withHostedRunFailureSummary(target, watched));
    const result = {
        ok: hostedRunSucceeded(watched),
        ...describeRun(outputRun),
        ...(outputRun.failedJobCount !== undefined ? { failedJobCount: outputRun.failedJobCount } : {}),
        ...(outputRun.error ? { error: outputRun.error } : {}),
    };
    writeOutput(result, parsed, io, formatRetryCommandResult);
    return hostedRunSucceeded(watched) ? 0 : 1;
}
3428
/**
 * Turns a retry target id into the request body for a new hosted run.
 *
 * Ids starting with `eval_` are delegated to
 * `resolveHostedEvaluationRetryTarget`; every other id is read as a run id.
 * A run can be retried only when it is finished, counts as failed, and has
 * an `eval` or `improve` workflow.
 *
 * @param {object} target Resolved hosted target.
 * @param {string} targetId Run id or evaluation id to retry.
 * @returns {Promise<{ sourceId: string, sourceKind: string, workflow: string, request: object }>}
 * @throws {UsageError} When the run is unfinished, did not fail, lacks the
 *   candidate id it needs, or has no retryable workflow.
 */
async function resolveHostedRetryTarget(target, targetId) {
    if (targetId.startsWith("eval_")) {
        return await resolveHostedEvaluationRetryTarget(target, targetId);
    }
    const { run, jobs } = await readHostedRunDetail(target, targetId);
    if (run.status !== "finished") {
        throw new UsageError(`Run ${run.id} is ${run.status}; wait for it to finish before retrying.`);
    }
    if (!hostedRunRecordFailed(run)) {
        throw new UsageError(`Run ${run.id} did not fail; use workbench cloud ${run.workflow ?? "eval"} to intentionally run it again.`);
    }
    const retryOfRun = (workflow, request) => ({
        sourceId: targetId,
        sourceKind: "run",
        workflow,
        request,
    });
    switch (run.workflow) {
        case "eval": {
            const candidateId = hostedRunEvaluationCandidateId(run, jobs);
            if (!candidateId) {
                throw new UsageError(`Run ${run.id} has no candidate id to retry.`);
            }
            return retryOfRun("eval", {
                workflow: "eval",
                samples: run.samples ?? 1,
                candidateId,
                sourceYaml: hostedRetrySourceYaml(run, run.id),
                preserveActive: true,
                ...retrySampleSelectionFromJobs(jobs),
            });
        }
        case "improve": {
            const baseCandidateId = stringValue(readRecord(run.input)?.baseCandidateId);
            if (!baseCandidateId) {
                throw new UsageError(`Run ${run.id} is missing its base candidate id.`);
            }
            return retryOfRun("improve", {
                workflow: "improve",
                samples: run.samples ?? 1,
                budget: run.budget ?? run.attemptsRequested ?? 1,
                candidateId: baseCandidateId,
                sourceYaml: hostedRetrySourceYaml(run, run.id),
                preserveActive: true,
            });
        }
        default:
            throw new UsageError(`Run ${run.id} has no retryable workflow.`);
    }
}
3480
/**
 * Builds the retry request for a hosted evaluation id.
 *
 * The evaluation and its run are looked up in the hosted workbench snapshot;
 * the run detail is then fetched to recover the recorded source YAML and the
 * job list used to select the samples to retry.
 *
 * @param {object} target Resolved hosted target.
 * @param {string} evaluationId Id of the evaluation to retry.
 * @returns {Promise<{ sourceId: string, sourceKind: "evaluation", workflow: "eval", request: object }>}
 * @throws {UsageError} When the evaluation is unknown, did not fail, or has
 *   no run record.
 */
async function resolveHostedEvaluationRetryTarget(target, evaluationId) {
    const snapshotPath = projectApiPath(target.projectId, "/workbench/snapshot");
    const snapshot = await apiRequest(snapshotPath, {}, target.baseUrl);
    const evaluation = snapshot.evaluations.find((entry) => entry.id === evaluationId);
    if (!evaluation) {
        throw new UsageError(`Hosted evaluation not found: ${evaluationId}`);
    }
    const run = snapshot.runs.find((entry) => entry.id === evaluation.runId) ?? null;
    // The failure check runs before the missing-run check, so it may receive a null run.
    if (!evaluationScorecardFailed(evaluation, run)) {
        throw new UsageError(`Evaluation ${evaluation.id} did not fail; use workbench cloud eval to intentionally run it again.`);
    }
    if (!run) {
        throw new UsageError(`Evaluation ${evaluation.id} is missing its run record.`);
    }
    const { run: detailedRun, jobs } = await readHostedRunDetail(target, run.id);
    const request = {
        workflow: "eval",
        samples: evaluation.sampleCount || detailedRun.samples || 1,
        candidateId: evaluation.candidateId,
        sourceYaml: hostedRetrySourceYaml(detailedRun, detailedRun.id),
        preserveActive: true,
        ...retrySampleSelectionFromJobs(jobs),
    };
    return {
        sourceId: evaluationId,
        sourceKind: "evaluation",
        workflow: "eval",
        request,
    };
}
3509
/**
 * Derives the `selectedSamples` request field from a run's jobs.
 *
 * Only jobs whose status is not `"succeeded"` and whose execution purpose is
 * `"attempt"` contribute; jobs without a resolvable case/sample pair are
 * ignored.
 *
 * @param {Array<object>} jobs Jobs of the run being retried.
 * @returns {{ selectedSamples: Array<object> } | {}} An object carrying the
 *   de-duplicated pairs, or an empty object when there are none.
 */
function retrySampleSelectionFromJobs(jobs) {
    const pendingPairs = [];
    jobs.forEach((job) => {
        if (job.status === "succeeded") {
            return;
        }
        if (executionPurposeFromJobInput(job.input) !== "attempt") {
            return;
        }
        const pair = caseSamplePairFromJob(job);
        if (pair !== null) {
            pendingPairs.push(pair);
        }
    });
    const selectedSamples = uniqueCaseSamplePairs(pendingPairs);
    if (selectedSamples.length > 0) {
        return { selectedSamples };
    }
    return {};
}
3519
/**
 * De-duplicates case/sample pairs by their `caseSamplePairKey` and returns
 * them sorted by case id, then by ascending sample index.
 *
 * When several pairs share a key, the last one seen is kept.
 *
 * @param {Iterable<{ caseId: string, sampleIndex: number }>} pairs
 * @returns {Array<{ caseId: string, sampleIndex: number }>}
 */
function uniqueCaseSamplePairs(pairs) {
    const keyedPairs = Array.from(pairs, (pair) => [caseSamplePairKey(pair), pair]);
    const distinctPairs = Array.from(new Map(keyedPairs).values());
    distinctPairs.sort((left, right) => {
        const byCase = left.caseId.localeCompare(right.caseId);
        return byCase !== 0 ? byCase : left.sampleIndex - right.sampleIndex;
    });
    return distinctPairs;
}
3527
/**
 * Fetches the detail record of a hosted run from the project's
 * `/runs/<runId>` endpoint (the run id is URI-encoded).
 *
 * @param {{ projectId: string, baseUrl: string }} target Resolved hosted target.
 * @param {string} runId Id of the run to read.
 * @returns {Promise<object>} The API response as returned by `apiRequest`.
 */
async function readHostedRunDetail(target, runId) {
    const runPath = projectApiPath(target.projectId, `/runs/${encodeURIComponent(runId)}`);
    return await apiRequest(runPath, {}, target.baseUrl);
}
3530
/**
 * Returns the source YAML recorded on a run's input.
 *
 * @param {{ input?: unknown }} run Run whose `input.sourceYaml` is read.
 * @param {string} runId Run id used in the error message.
 * @returns {string} The recorded source YAML.
 * @throws {UsageError} When the run input carries no source YAML.
 */
function hostedRetrySourceYaml(run, runId) {
    const recordedYaml = stringValue(readRecord(run.input)?.sourceYaml);
    if (recordedYaml) {
        return recordedYaml;
    }
    throw new UsageError(`Run ${runId} is missing its recorded source configuration.`);
}
3537
/**
 * Tells whether a hosted run record counts as failed: its outcome is
 * `"error"` or `"cancelled"`, it reports at least one failed job, or it
 * carries a truthy `error`.
 *
 * @param {{ outcome?: string, failedJobCount?: number, error?: unknown }} run
 * @returns {boolean}
 */
function hostedRunRecordFailed(run) {
    if (run.outcome === "error" || run.outcome === "cancelled") {
        return true;
    }
    const failedJobs = run.failedJobCount ?? 0;
    if (failedJobs > 0) {
        return true;
    }
    return Boolean(run.error);
}
2731
3543
  async function startHostedWorkflow(workflow, argv, io) {
2732
3544
  const parsed = parseArgs(argv);
2733
3545
  rejectUnknownFlags(parsed, new Set([
2734
3546
  "dir",
2735
3547
  "benchmark",
2736
3548
  "base",
2737
- "optimizer",
3549
+ "runs",
2738
3550
  "budget",
2739
3551
  "samples",
3552
+ "rerun",
2740
3553
  "watch",
2741
3554
  "dry-run",
2742
3555
  "interval-ms",
@@ -2746,42 +3559,69 @@ async function startHostedWorkflow(workflow, argv, io) {
2746
3559
  if (parsed.positionals.length > 1) {
2747
3560
  throw new UsageError(`workbench cloud ${workflow} accepts at most one source file or directory argument.`);
2748
3561
  }
2749
- const optimizerPath = asOptionalString(parsed.flags.optimizer);
2750
3562
  const sourceArg = parsed.positionals[0] ?? asOptionalString(parsed.flags.dir) ?? process.cwd();
2751
3563
  if (parsed.positionals.length > 0 && parsed.flags.dir !== undefined) {
2752
3564
  throw new UsageError("Use either --dir or SOURCE, not both.");
2753
3565
  }
2754
- const baseSubjectId = asOptionalString(parsed.flags.base);
3566
+ const samples = parsePositiveInt(parsed.flags.samples, 1, "samples");
3567
+ const budget = workflow === "improve"
3568
+ ? parsePositiveInt(parsed.flags.budget, 1, "budget")
3569
+ : undefined;
3570
+ if (parsed.flags.watch !== true && (parsed.flags["interval-ms"] !== undefined ||
3571
+ parsed.flags["timeout-ms"] !== undefined)) {
3572
+ throw new UsageError("--interval-ms and --timeout-ms require --watch.");
3573
+ }
3574
+ const runsFlag = asOptionalString(parsed.flags.runs);
3575
+ const defaultProjectSource = await readLocalProjectSource(path.resolve(sourceArg));
3576
+ const selectedRunIds = workflow === "eval"
3577
+ ? resolveCandidateRunSelection(defaultProjectSource, runsFlag)
3578
+ : [singleRequestedRunId(runsFlag, `workbench cloud ${workflow}`) ?? defaultProjectSource.candidateRunId];
3579
+ if (workflow === "eval" && selectedRunIds.length > 1) {
3580
+ let failed = 0;
3581
+ const results = [];
3582
+ for (const runId of selectedRunIds) {
3583
+ const captured = createCapturingIo(io);
3584
+ const code = await startHostedWorkflow(workflow, hostedWorkflowArgsForRun({
3585
+ parsed,
3586
+ sourceDir: defaultProjectSource.dir,
3587
+ runId,
3588
+ }), captured.io);
3589
+ if (code !== 0) {
3590
+ failed += 1;
3591
+ }
3592
+ results.push(parseCapturedJson(captured.stdoutText()));
3593
+ }
3594
+ writeOutput({
3595
+ ok: failed === 0,
3596
+ candidateRunIds: selectedRunIds,
3597
+ failedRunCount: failed,
3598
+ results,
3599
+ }, parsed, io, () => `Processed ${selectedRunIds.length} hosted candidate run(s); ${failed} failed.`);
3600
+ return failed === 0 ? 0 : 1;
3601
+ }
3602
+ const baseCandidateId = asOptionalString(parsed.flags.base);
2755
3603
  const request = workflow === "improve"
2756
3604
  ? {
2757
3605
  workflow,
2758
- budget: parsePositiveInt(parsed.flags.budget, 1, "budget"),
2759
- samples: parsePositiveInt(parsed.flags.samples, 1, "samples"),
2760
- ...(baseSubjectId ? { subjectId: baseSubjectId } : {}),
3606
+ budget,
3607
+ samples,
3608
+ ...(baseCandidateId ? { candidateId: baseCandidateId } : {}),
2761
3609
  }
2762
3610
  : {
2763
3611
  workflow,
2764
- samples: parsePositiveInt(parsed.flags.samples, 1, "samples"),
2765
- ...(baseSubjectId ? { subjectId: baseSubjectId } : {}),
3612
+ samples,
3613
+ ...(baseCandidateId ? { candidateId: baseCandidateId } : {}),
2766
3614
  };
2767
- if (workflow === "improve" && !optimizerPath) {
2768
- throw new UsageError("workbench cloud improve requires --optimizer OPTIMIZER_YAML.");
2769
- }
2770
- if (parsed.flags.watch !== true && (parsed.flags["interval-ms"] !== undefined ||
2771
- parsed.flags["timeout-ms"] !== undefined)) {
2772
- throw new UsageError("--interval-ms and --timeout-ms require --watch.");
2773
- }
2774
- const projectSource = await readLocalProjectSource(path.resolve(sourceArg), {
2775
- optimizerPath,
2776
- });
2777
- if (workflow === "eval") {
2778
- request.subjectSource = projectSource.subjectSource;
2779
- request.subjectFiles = projectSource.subjectFiles;
2780
- request.adapterFiles = projectSource.adapterFiles;
3615
+ const projectSource = selectedRunIds[0] === defaultProjectSource.candidateRunId
3616
+ ? defaultProjectSource
3617
+ : await readLocalProjectSource(path.resolve(sourceArg), { runId: selectedRunIds[0] });
3618
+ request.sourceYaml = projectSource.specSource;
3619
+ request.adapterFiles = projectSource.adapterFiles;
3620
+ if (workflow === "eval" && !baseCandidateId) {
3621
+ request.candidateFiles = projectSource.candidateFiles;
2781
3622
  }
2782
- if (workflow === "improve" && projectSource.optimizerSource) {
2783
- request.optimizerSource = projectSource.optimizerSource;
2784
- request.adapterFiles = projectSource.adapterFiles;
3623
+ if (parsed.flags.rerun === true) {
3624
+ request.rerun = true;
2785
3625
  }
2786
3626
  const watchIntervalMs = parsed.flags.watch === true
2787
3627
  ? parsePositiveInt(parsed.flags["interval-ms"], 1000, "interval-ms")
@@ -2808,11 +3648,13 @@ async function startHostedWorkflow(workflow, argv, io) {
2808
3648
  sourceDir: projectSource.dir,
2809
3649
  });
2810
3650
  if (workflow === "improve") {
2811
- request.subjectId = await ensureHostedImproveBaseSubject({
3651
+ request.candidateId = await ensureHostedImproveBaseCandidate({
2812
3652
  parsed,
2813
3653
  target,
2814
3654
  samples: request.samples,
2815
- subjectId: baseSubjectId,
3655
+ candidateId: baseCandidateId,
3656
+ sourceYaml: projectSource.specSource,
3657
+ adapterFiles: projectSource.adapterFiles,
2816
3658
  intervalMs: watchIntervalMs ?? 1000,
2817
3659
  timeoutMs: watchTimeoutMs,
2818
3660
  });
@@ -2822,6 +3664,19 @@ async function startHostedWorkflow(workflow, argv, io) {
2822
3664
  body: request,
2823
3665
  }, target.baseUrl);
2824
3666
  const startedRun = withRunUrls(target, response.run);
3667
+ const startedRunOutput = response.reused === true
3668
+ ? { ...startedRun, reused: true }
3669
+ : startedRun;
3670
+ if (response.reused === true && response.run.status === "finished") {
3671
+ writeOutput({
3672
+ ok: hostedRunSucceeded(response.run),
3673
+ reused: true,
3674
+ workflow,
3675
+ runId: startedRun.id,
3676
+ ...startedRun,
3677
+ }, parsed, io, () => `Reused hosted ${workflow} ${startedRun.id}. Use --rerun to intentionally run it again.`);
3678
+ return hostedRunSucceeded(response.run) ? 0 : 1;
3679
+ }
2825
3680
  if (parsed.flags.watch === true) {
2826
3681
  if (parsed.flags.json !== true) {
2827
3682
  io.stdout.write(`${formatHostedRunStarted(startedRun, workflow).trimEnd()}\n${HOSTED_WATCH_LIFECYCLE_NOTE}\n`);
@@ -2837,23 +3692,23 @@ async function startHostedWorkflow(workflow, argv, io) {
2837
3692
  writeOutput(withRunUrls(target, outputRun), parsed, io, formatHostedRunResult);
2838
3693
  return hostedRunSucceeded(watched) ? 0 : 1;
2839
3694
  }
2840
- writeOutput(startedRun, parsed, io, (run) => formatHostedRunStarted(run, workflow).trimEnd());
3695
+ writeOutput(startedRunOutput, parsed, io, (run) => formatHostedRunStarted(run, workflow).trimEnd());
2841
3696
  return 0;
2842
3697
  }
2843
- async function ensureHostedImproveBaseSubject(args) {
2844
- if (args.subjectId) {
2845
- const subject = await readHostedSubjectSummary(args.target, args.subjectId);
2846
- if (!subject) {
2847
- throw new UsageError(`Base subject ${args.subjectId} was not found for the current benchmark.`);
3698
+ async function ensureHostedImproveBaseCandidate(args) {
3699
+ if (args.candidateId) {
3700
+ const candidate = await readHostedCandidateSummary(args.target, args.candidateId);
3701
+ if (!candidate) {
3702
+ throw new UsageError(`Base candidate ${args.candidateId} was not found for the current benchmark.`);
2848
3703
  }
2849
- if (hostedSubjectIsEvaluated(subject)) {
2850
- return args.subjectId;
3704
+ if (hostedCandidateIsEvaluated(candidate)) {
3705
+ return args.candidateId;
2851
3706
  }
2852
3707
  }
2853
3708
  else {
2854
- const activeSubject = await readEvaluatedActiveHostedSubject(args.target);
2855
- if (activeSubject) {
2856
- return activeSubject.id;
3709
+ const activeCandidate = await readEvaluatedActiveHostedCandidate(args.target);
3710
+ if (activeCandidate) {
3711
+ return activeCandidate.id;
2857
3712
  }
2858
3713
  }
2859
3714
  const response = await apiRequest(projectApiPath(args.target.projectId, "/runs"), {
@@ -2861,7 +3716,9 @@ async function ensureHostedImproveBaseSubject(args) {
2861
3716
  body: {
2862
3717
  workflow: "eval",
2863
3718
  samples: args.samples,
2864
- ...(args.subjectId ? { subjectId: args.subjectId } : {}),
3719
+ ...(args.candidateId ? { candidateId: args.candidateId } : {}),
3720
+ sourceYaml: args.sourceYaml,
3721
+ ...(args.adapterFiles.length > 0 ? { adapterFiles: args.adapterFiles } : {}),
2865
3722
  },
2866
3723
  }, args.target.baseUrl);
2867
3724
  const watched = await watchHostedRun({
@@ -2872,28 +3729,52 @@ async function ensureHostedImproveBaseSubject(args) {
2872
3729
  timeoutMs: args.timeoutMs,
2873
3730
  });
2874
3731
  if (!hostedRunSucceeded(watched)) {
2875
- throw new UsageError(`Parent subject eval ${watched.id} failed; improve was not started.`);
3732
+ throw new UsageError(`Parent candidate eval ${watched.id} failed; improve was not started.`);
2876
3733
  }
2877
- if (!watched.subjectId) {
2878
- throw new UsageError(`Parent subject eval ${watched.id} did not produce a subject.`);
3734
+ if (!watched.candidateId) {
3735
+ throw new UsageError(`Parent candidate eval ${watched.id} did not produce a candidate.`);
2879
3736
  }
2880
- return watched.subjectId;
3737
+ return watched.candidateId;
2881
3738
  }
2882
- async function readHostedSubjectSummary(target, subjectId) {
2883
- const response = await apiRequest(projectApiPath(target.projectId, "/subjects"), {}, target.baseUrl);
2884
- return response.subjects.find((entry) => entry.id === subjectId) ?? null;
3739
/**
 * Builds the argv used to run a hosted workflow for one candidate run.
 *
 * The result always starts with `--dir`, `--runs` and `--json`; the string
 * flags `benchmark`, `base`, `samples`, `budget`, `interval-ms`,
 * `timeout-ms` and the boolean flags `watch`, `dry-run`, `rerun` are copied
 * over from the parsed arguments when set.
 *
 * @param {{ parsed: { flags: object }, sourceDir: string, runId: string }} args
 * @returns {string[]}
 */
function hostedWorkflowArgsForRun(args) {
    const { flags } = args.parsed;
    const next = ["--dir", args.sourceDir, "--runs", args.runId, "--json"];
    const forwardedStringFlags = ["benchmark", "base", "samples", "budget", "interval-ms", "timeout-ms"];
    for (const name of forwardedStringFlags) {
        appendStringFlag(next, name, asOptionalString(flags[name]));
    }
    const forwardedSwitches = ["watch", "dry-run", "rerun"];
    for (const name of forwardedSwitches) {
        if (flags[name] === true) {
            next.push(`--${name}`);
        }
    }
    return next;
}
3758
/**
 * Appends `--<name> <value>` to an argv array, unless the value is
 * `undefined`. The array is modified in place.
 *
 * @param {string[]} args Argv array to extend.
 * @param {string} name Flag name without the leading dashes.
 * @param {string | undefined} value Flag value.
 * @returns {void}
 */
function appendStringFlag(args, name, value) {
    if (value === undefined) {
        return;
    }
    args.push(`--${name}`, value);
}
3763
/**
 * Looks up one candidate in the project's hosted `/candidates` listing.
 *
 * @param {{ projectId: string, baseUrl: string }} target Resolved hosted target.
 * @param {string} candidateId Id of the candidate to find.
 * @returns {Promise<object | null>} The matching entry, or `null` when the
 *   listing has no candidate with that id.
 */
async function readHostedCandidateSummary(target, candidateId) {
    const listingPath = projectApiPath(target.projectId, "/candidates");
    const { candidates } = await apiRequest(listingPath, {}, target.baseUrl);
    const match = candidates.find((entry) => entry.id === candidateId);
    return match ?? null;
}
2886
/**
 * Returns the hosted benchmark's active candidate, but only when it has
 * already been evaluated.
 *
 * @param {{ projectId: string, baseUrl: string }} target Resolved hosted target.
 * @returns {Promise<object | null>} The active candidate summary, or `null`
 *   when no candidate is active, it cannot be found, or it is not evaluated.
 */
async function readEvaluatedActiveHostedCandidate(target) {
    const response = await apiRequest(projectApiPath(target.projectId), {}, target.baseUrl);
    const { activeCandidateId } = response.benchmark;
    if (!activeCandidateId) {
        return null;
    }
    const candidate = await readHostedCandidateSummary(target, activeCandidateId);
    if (candidate && hostedCandidateIsEvaluated(candidate)) {
        return candidate;
    }
    return null;
}
2895
/**
 * Tells whether a hosted candidate summary has been evaluated: its status is
 * `"evaluated"`, or its `eval` field is neither `null` nor `undefined`.
 *
 * @param {{ status?: string, eval?: unknown }} candidate
 * @returns {boolean}
 */
function hostedCandidateIsEvaluated(candidate) {
    if (candidate.status === "evaluated") {
        return true;
    }
    const evaluation = candidate.eval;
    return evaluation !== null && evaluation !== undefined;
}
2898
3779
  async function benchmarkList(argv, io) {
2899
3780
  const parsed = parseArgs(argv);
@@ -2905,7 +3786,7 @@ async function benchmarkList(argv, io) {
2905
3786
  return "No hosted Workbench benchmarks.";
2906
3787
  }
2907
3788
  return projects
2908
- .map((project) => `${project.id}\t${project.name}\t${project.runCount} runs\t${project.subjectCount} subjects`)
3789
+ .map((project) => `${project.id}\t${project.name}\t${project.runCount} runs\t${project.candidateCount} candidates`)
2909
3790
  .join("\n");
2910
3791
  });
2911
3792
  return 0;
@@ -2924,7 +3805,7 @@ async function benchmarkShow(argv, io) {
2924
3805
  const response = await apiRequest(benchmarkApiPath(projectRef), {}, await effectiveBaseUrl(origin?.baseUrl));
2925
3806
  writeOutput(response.benchmark, parsed, io, (project) => {
2926
3807
  const record = project;
2927
- return `${record.name} (${record.id})\n${record.runs.length} runs\n${record.subjects.length} subjects`;
3808
+ return `${record.name} (${record.id})\n${record.runs.length} runs\n${record.candidates.length} candidates`;
2928
3809
  });
2929
3810
  return 0;
2930
3811
  }
@@ -3012,61 +3893,61 @@ async function benchmarkStarred(argv, io) {
3012
3893
  });
3013
3894
  return 0;
3014
3895
  }
3015
- async function subjectList(argv, io) {
3896
+ async function candidateList(argv, io) {
3016
3897
  const parsed = parseArgs(argv);
3017
3898
  rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3018
- rejectUnexpectedPositionals(parsed, "workbench cloud subjects list", 0);
3899
+ rejectUnexpectedPositionals(parsed, "workbench cloud candidates list", 0);
3019
3900
  const target = await resolveHostedTarget(parsed);
3020
- const response = await apiRequest(projectApiPath(target.projectId, "/subjects"), {}, target.baseUrl);
3021
- writeOutput(response.subjects, parsed, io, (subjects) => {
3022
- if (subjects.length === 0) {
3023
- return "No subjects yet.";
3901
+ const response = await apiRequest(projectApiPath(target.projectId, "/candidates"), {}, target.baseUrl);
3902
+ writeOutput(response.candidates, parsed, io, (candidates) => {
3903
+ if (candidates.length === 0) {
3904
+ return "No candidates yet.";
3024
3905
  }
3025
- return subjects
3026
- .map((subject) => `${subject.id}\t${subject.status}\tmetrics ${formatMetricSummary(subject.metrics)}\t${subject.fileChanges?.length ?? 0} files`)
3906
+ return candidates
3907
+ .map((candidate) => `${candidate.id}\t${candidate.status}\t${candidate.fileChanges?.length ?? 0} files`)
3027
3908
  .join("\n");
3028
3909
  });
3029
3910
  return 0;
3030
3911
  }
3031
- async function subjectShow(argv, io) {
3912
+ async function candidateShow(argv, io) {
3032
3913
  const parsed = parseArgs(argv);
3033
3914
  rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3034
- rejectUnexpectedPositionals(parsed, "workbench cloud subjects show", 1);
3915
+ rejectUnexpectedPositionals(parsed, "workbench cloud candidates show", 1);
3035
3916
  const target = await resolveHostedTarget(parsed);
3036
- const subjectId = readRequiredSubjectId(parsed);
3037
- const params = new URLSearchParams({ id: subjectId });
3038
- const subject = await apiRequest(projectApiPath(target.projectId, `/workbench/record?${params.toString()}`), {}, target.baseUrl);
3039
- writeOutput(subject, parsed, io, (record) => {
3917
+ const candidateId = readRequiredCandidateId(parsed);
3918
+ const params = new URLSearchParams({ id: candidateId });
3919
+ const candidate = await apiRequest(projectApiPath(target.projectId, `/workbench/record?${params.toString()}`), {}, target.baseUrl);
3920
+ writeOutput(candidate, parsed, io, (record) => {
3040
3921
  const value = record;
3041
3922
  return [
3042
- `${value.id ?? subjectId}\t${value.status ?? "unknown"}`,
3923
+ `${value.id ?? candidateId}\t${value.status ?? "unknown"}`,
3043
3924
  ...(value.benchmarkFingerprint ? [`Benchmark version: ${shortDigest(value.benchmarkFingerprint)}`] : []),
3044
- ...(value.subjectFingerprint ? [`Subject digest: ${shortDigest(value.subjectFingerprint)}`] : []),
3925
+ ...(value.candidateFingerprint ? [`Candidate digest: ${shortDigest(value.candidateFingerprint)}`] : []),
3045
3926
  ].join("\n");
3046
3927
  });
3047
3928
  return 0;
3048
3929
  }
3049
- async function subjectFiles(argv, io) {
3930
+ async function candidateFiles(argv, io) {
3050
3931
  const parsed = parseArgs(argv);
3051
3932
  rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3052
- rejectUnexpectedPositionals(parsed, "workbench cloud subjects files", 1);
3933
+ rejectUnexpectedPositionals(parsed, "workbench cloud candidates files", 1);
3053
3934
  const target = await resolveHostedTarget(parsed);
3054
- const subjectId = readRequiredSubjectId(parsed);
3055
- const response = await apiRequest(projectApiPath(target.projectId, `/subjects/${encodeURIComponent(subjectId)}/files`), {}, target.baseUrl);
3935
+ const candidateId = readRequiredCandidateId(parsed);
3936
+ const response = await apiRequest(projectApiPath(target.projectId, `/candidates/${encodeURIComponent(candidateId)}/files`), {}, target.baseUrl);
3056
3937
  writeOutput(response.files, parsed, io, (files) => files
3057
3938
  .map((file) => `${file.path}\t${file.status}\t${file.preview_kind}`)
3058
3939
  .join("\n") || "No files.");
3059
3940
  return 0;
3060
3941
  }
3061
- async function subjectPreview(argv, io) {
3942
+ async function candidatePreview(argv, io) {
3062
3943
  const parsed = parseArgs(argv);
3063
3944
  rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "path", "output", "json"]));
3064
- rejectUnexpectedPositionals(parsed, "workbench cloud subjects preview", 1);
3945
+ rejectUnexpectedPositionals(parsed, "workbench cloud candidates preview", 1);
3065
3946
  const target = await resolveHostedTarget(parsed);
3066
- const subjectId = readRequiredSubjectId(parsed);
3947
+ const candidateId = readRequiredCandidateId(parsed);
3067
3948
  const filePath = requireFlag(parsed, "path");
3068
3949
  const params = new URLSearchParams({ path: filePath });
3069
- const response = await apiRequest(projectApiPath(target.projectId, `/subjects/${encodeURIComponent(subjectId)}/files?${params.toString()}`), {}, target.baseUrl);
3950
+ const response = await apiRequest(projectApiPath(target.projectId, `/candidates/${encodeURIComponent(candidateId)}/files?${params.toString()}`), {}, target.baseUrl);
3070
3951
  const content = response.preview.source?.content ??
3071
3952
  response.preview.rendered_html ??
3072
3953
  response.preview.diff ??
@@ -3084,14 +3965,14 @@ async function subjectPreview(argv, io) {
3084
3965
  }
3085
3966
  return 0;
3086
3967
  }
3087
- async function subjectExport(argv, io) {
3968
+ async function candidateExport(argv, io) {
3088
3969
  const parsed = parseArgs(argv);
3089
3970
  rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "out", "json"]));
3090
- rejectUnexpectedPositionals(parsed, "workbench cloud subjects pull", 1);
3971
+ rejectUnexpectedPositionals(parsed, "workbench cloud candidates pull", 1);
3091
3972
  const target = await resolveHostedTarget(parsed);
3092
- const subjectId = readRequiredSubjectId(parsed);
3973
+ const candidateId = readRequiredCandidateId(parsed);
3093
3974
  const outputDir = requireOutDir(parsed);
3094
- const response = await apiRequest(projectApiPath(target.projectId, `/subjects/${encodeURIComponent(subjectId)}/export`), {}, target.baseUrl);
3975
+ const response = await apiRequest(projectApiPath(target.projectId, `/candidates/${encodeURIComponent(candidateId)}/export`), {}, target.baseUrl);
3095
3976
  await writeFiles(outputDir, response.files);
3096
3977
  writeOutput({ ok: true, outputDir, files: response.files.length }, parsed, io, (result) => {
3097
3978
  const record = result;
@@ -3099,14 +3980,14 @@ async function subjectExport(argv, io) {
3099
3980
  });
3100
3981
  return 0;
3101
3982
  }
3102
- async function subjectVisibility(argv, io, visibility) {
3983
+ async function candidateVisibility(argv, io, visibility) {
3103
3984
  const parsed = parseArgs(argv);
3104
3985
  rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
3105
- rejectUnexpectedPositionals(parsed, `workbench cloud subjects ${visibility === "public" ? "publish" : "unpublish"}`, 1);
3986
+ rejectUnexpectedPositionals(parsed, `workbench cloud candidates ${visibility === "public" ? "publish" : "unpublish"}`, 1);
3106
3987
  const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
3107
- const subjectId = readRequiredSubjectId(parsed);
3108
- const response = await apiRequest(projectApiPath(target.projectId, `/subjects/${encodeURIComponent(subjectId)}/publish`), { method: visibility === "public" ? "PUT" : "DELETE" }, target.baseUrl);
3109
- writeOutput({ ok: true, visibility, subject: response.subject }, parsed, io, () => `${visibility === "public" ? "Published" : "Unpublished"} subject ${subjectId}.`);
3988
+ const candidateId = readRequiredCandidateId(parsed);
3989
+ const response = await apiRequest(projectApiPath(target.projectId, `/candidates/${encodeURIComponent(candidateId)}/publish`), { method: visibility === "public" ? "PUT" : "DELETE" }, target.baseUrl);
3990
+ writeOutput({ ok: true, visibility, candidate: response.candidate }, parsed, io, () => `${visibility === "public" ? "Published" : "Unpublished"} candidate ${candidateId}.`);
3110
3991
  return 0;
3111
3992
  }
3112
3993
  async function runList(argv, io) {
@@ -3116,7 +3997,7 @@ async function runList(argv, io) {
3116
3997
  const target = await resolveHostedTarget(parsed);
3117
3998
  const response = await apiRequest(projectApiPath(target.projectId, "/runs"), {}, target.baseUrl);
3118
3999
  writeOutput(response.runs, parsed, io, (runs) => runs
3119
- .map((run) => `${run.id}\t${run.status}\t${run.subjectId ?? "pending"}`)
4000
+ .map((run) => `${run.id}\t${run.status}\t${run.candidateId ?? "pending"}`)
3120
4001
  .join("\n") || "No runs.");
3121
4002
  return 0;
3122
4003
  }
@@ -3191,7 +4072,7 @@ async function runLogs(argv, io) {
3191
4072
  function formatRunLogs(record) {
3192
4073
  const value = record;
3193
4074
  return (value.jobs
3194
- .map((job) => `${job.id}\t${job.kind}\t${job.status}\t${job.subjectId ?? "-"}${job.error ? `\t${job.error}` : ""}`)
4075
+ .map((job) => `${job.id}\t${job.kind}\t${job.status}\t${job.candidateId ?? "-"}${job.error ? `\t${job.error}` : ""}`)
3195
4076
  .join("\n") || `No jobs for ${value.runId}.`);
3196
4077
  }
3197
4078
  async function openWorkbench(argv, io) {
@@ -3226,7 +4107,7 @@ function buildWorkbenchWebUrl(target, ref) {
3226
4107
  if (ref.startsWith("run_")) {
3227
4108
  return benchmarkUrl;
3228
4109
  }
3229
- return buildWorkbenchResourceUrls(target, { subjectId: ref }).subjectEvaluation;
4110
+ return buildWorkbenchResourceUrls(target, { candidateId: ref }).candidateEvaluation;
3230
4111
  }
3231
4112
  async function resolveHostedTarget(parsed, options = {}) {
3232
4113
  if (options.sourceArg !== undefined && parsed.flags.dir !== undefined) {
@@ -3313,7 +4194,7 @@ async function resolveOpenTarget(parsed) {
3313
4194
  const ref = parsed.positionals[0];
3314
4195
  if (ref &&
3315
4196
  !ref.startsWith("run_") &&
3316
- !ref.startsWith("subject_")) {
4197
+ !ref.startsWith("candidate_")) {
3317
4198
  const baseUrl = await effectiveBaseUrl();
3318
4199
  if (ref.includes("/")) {
3319
4200
  const parsedRef = parseBenchmarkRef(ref);
@@ -3347,13 +4228,13 @@ function buildWorkbenchResourceUrls(target, refs = {}) {
3347
4228
  const projectRef = `${encodeURIComponent(target.owner)}/${encodeURIComponent(target.projectName)}`;
3348
4229
  const benchmark = `${target.baseUrl}/benchmarks/${projectRef}`;
3349
4230
  const urls = { benchmark };
3350
- if (refs.subjectId) {
4231
+ if (refs.candidateId) {
3351
4232
  const evaluationId = refs.runId
3352
- ? evaluationScorecardId(refs.runId, refs.subjectId)
4233
+ ? evaluationScorecardId(refs.runId, refs.candidateId)
3353
4234
  : null;
3354
- urls.subjectEvaluation = evaluationId
3355
- ? `${benchmark}/subjects/${encodeURIComponent(refs.subjectId)}?evaluation=${encodeURIComponent(evaluationId)}`
3356
- : `${benchmark}/subjects/${encodeURIComponent(refs.subjectId)}`;
4235
+ urls.candidateEvaluation = evaluationId
4236
+ ? `${benchmark}/candidates/${encodeURIComponent(refs.candidateId)}?evaluation=${encodeURIComponent(evaluationId)}`
4237
+ : `${benchmark}/candidates/${encodeURIComponent(refs.candidateId)}`;
3357
4238
  }
3358
4239
  return urls;
3359
4240
  }
@@ -3423,15 +4304,15 @@ function withRunUrls(target, run) {
3423
4304
  ...run,
3424
4305
  urls: buildWorkbenchResourceUrls(target, {
3425
4306
  runId: run.id,
3426
- subjectId: run.outputSubjectId ?? run.subjectId,
4307
+ candidateId: run.outputCandidateId ?? run.candidateId,
3427
4308
  }),
3428
4309
  };
3429
4310
  }
3430
4311
  function withRunDetailUrls(target, detail) {
3431
- const subjectId = hostedRunEvaluationSubjectId(detail.run, detail.jobs);
4312
+ const candidateId = hostedRunEvaluationCandidateId(detail.run, detail.jobs);
3432
4313
  const run = withRunUrls(target, {
3433
4314
  ...detail.run,
3434
- outputSubjectId: detail.run.outputSubjectId ?? subjectId,
4315
+ outputCandidateId: detail.run.outputCandidateId ?? candidateId,
3435
4316
  });
3436
4317
  return {
3437
4318
  run,
@@ -3439,15 +4320,15 @@ function withRunDetailUrls(target, detail) {
3439
4320
  urls: run.urls ?? buildWorkbenchResourceUrls(target, { runId: run.id }),
3440
4321
  };
3441
4322
  }
3442
- function hostedRunEvaluationSubjectId(run, jobs = []) {
3443
- if (run.outputSubjectId) {
3444
- return run.outputSubjectId;
4323
+ function hostedRunEvaluationCandidateId(run, jobs = []) {
4324
+ if (run.outputCandidateId) {
4325
+ return run.outputCandidateId;
3445
4326
  }
3446
- const attemptSubjects = jobs
4327
+ const attemptCandidates = jobs
3447
4328
  .filter((job) => readRunJobPurpose(job) === "attempt")
3448
- .map((job) => job.subjectId)
3449
- .filter((subjectId) => Boolean(subjectId));
3450
- return attemptSubjects.at(-1) ?? run.subjectId ?? null;
4329
+ .map((job) => job.candidateId)
4330
+ .filter((candidateId) => Boolean(candidateId));
4331
+ return attemptCandidates.at(-1) ?? run.candidateId ?? null;
3451
4332
  }
3452
4333
  function sourceFileCount(source) {
3453
4334
  return source.sourceFiles.length;
@@ -3456,7 +4337,7 @@ function hostedProjectSourceRequest(source) {
3456
4337
  const { network, resources } = hostedEnvironmentOptions(source);
3457
4338
  return {
3458
4339
  source: source.specSource,
3459
- subjectFiles: source.subjectFiles,
4340
+ candidateFiles: source.candidateFiles,
3460
4341
  engineResolveFiles: hostedEngineResolveFiles(source),
3461
4342
  engineResolveBinding: engineResolveBindingForSpec(source.spec),
3462
4343
  adapterFiles: source.adapterFiles,
@@ -3539,24 +4420,45 @@ async function watchHostedRun(args) {
3539
4420
  }
3540
4421
  }
3541
4422
  function formatHostedRunResult(run) {
3542
- const subjectId = run.outputSubjectId ?? run.subjectId;
3543
- const activeDetail = run.activeSubjectId && subjectId && run.activeSubjectId !== subjectId
3544
- ? `; active ${run.activeSubjectId}`
4423
+ const candidateId = run.outputCandidateId ?? run.candidateId;
4424
+ const activeDetail = run.activeCandidateId && candidateId && run.activeCandidateId !== candidateId
4425
+ ? `; active ${run.activeCandidateId}`
3545
4426
  : "";
3546
- const summary = `Run ${run.id} reached ${run.status}; ${run.outcome ? `outcome ${run.outcome}; ` : ""}subject ${subjectId ?? "pending"}${activeDetail}; ${run.completedJobCount ?? 0}/${run.jobCount ?? 0} jobs completed.`;
4427
+ const summary = `Run ${run.id} reached ${run.status}; ${run.outcome ? `outcome ${run.outcome}; ` : ""}candidate ${candidateId ?? "pending"}${activeDetail}; ${run.completedJobCount ?? 0}/${run.jobCount ?? 0} jobs completed.`;
3547
4428
  return [
3548
4429
  run.error ? `${summary}\nError: ${run.error}` : summary,
3549
- ...(run.urls?.subjectEvaluation
3550
- ? [`Open evaluation: ${run.urls.subjectEvaluation}`]
4430
+ ...(run.urls?.candidateEvaluation
4431
+ ? [`Open evaluation: ${run.urls.candidateEvaluation}`]
3551
4432
  : [`Open benchmark: ${run.urls?.benchmark ?? ""}`].filter(Boolean)),
3552
4433
  ].join("\n");
3553
4434
  }
4435
+ function formatRetryCommandResult(result) {
4436
+ const run = result.run;
4437
+ const runId = run?.id ?? result.runId ?? "unknown";
4438
+ const scope = `${result.retried.kind} ${result.retried.id}`;
4439
+ const verb = run
4440
+ ? run.status === "finished" ? "finished as hosted run" : "started as hosted run"
4441
+ : "finished as local run";
4442
+ return [
4443
+ `Retry of ${scope} ${verb} ${runId}.`,
4444
+ ...(result.evaluationId ? [`Evaluation: ${result.evaluationId}`] : []),
4445
+ ...(result.candidateId ? [`Candidate: ${result.candidateId}`] : []),
4446
+ ...(result.failedJobCount ? [`Failed jobs: ${result.failedJobCount}`] : []),
4447
+ ...(result.error ? [`Error: ${result.error}`] : []),
4448
+ ...(result.localView
4449
+ ? [`Open local view: ${result.localView.command}`, result.localView.note]
4450
+ : []),
4451
+ ...(result.urls?.candidateEvaluation
4452
+ ? [`Open evaluation: ${result.urls.candidateEvaluation}`]
4453
+ : result.urls?.benchmark ? [`Open benchmark: ${result.urls.benchmark}`] : []),
4454
+ ].join("\n");
4455
+ }
3554
4456
  function formatHostedRunStarted(run, fallbackWorkflow) {
3555
- const subjectId = run.outputSubjectId ?? run.subjectId;
4457
+ const candidateId = run.outputCandidateId ?? run.candidateId;
3556
4458
  return [
3557
- `Started ${run.workflow ?? fallbackWorkflow} run ${run.id}; ${subjectId ? `subject ${subjectId}` : `${run.jobCount ?? 0} jobs queued`}.`,
3558
- ...(run.urls?.subjectEvaluation
3559
- ? [`Open evaluation: ${run.urls.subjectEvaluation}`]
4459
+ `Started ${run.workflow ?? fallbackWorkflow} run ${run.id}; ${candidateId ? `candidate ${candidateId}` : `${run.jobCount ?? 0} jobs queued`}.`,
4460
+ ...(run.urls?.candidateEvaluation
4461
+ ? [`Open evaluation: ${run.urls.candidateEvaluation}`]
3560
4462
  : run.urls?.benchmark ? [`Open benchmark: ${run.urls.benchmark}`] : []),
3561
4463
  "",
3562
4464
  ].join("\n");
@@ -3566,13 +4468,13 @@ function formatRunDetail(record) {
3566
4468
  const { run, jobs, urls } = detail;
3567
4469
  const cost = sumJobCostUsd(jobs);
3568
4470
  const firstFailedJob = jobs.find((job) => job.status === "failed" && job.error);
3569
- const subjectId = hostedRunEvaluationSubjectId(run, jobs);
4471
+ const candidateId = hostedRunEvaluationCandidateId(run, jobs);
3570
4472
  return [
3571
4473
  `Run ${run.id}: ${run.status}${run.outcome ? ` (${run.outcome})` : ""}`,
3572
4474
  `Workflow: ${run.workflow ?? "improve"}`,
3573
- `Subject: ${subjectId ?? "pending"}`,
3574
- ...(run.activeSubjectId && subjectId && run.activeSubjectId !== subjectId
3575
- ? [`Active subject: ${run.activeSubjectId}`]
4475
+ `Candidate: ${candidateId ?? "pending"}`,
4476
+ ...(run.activeCandidateId && candidateId && run.activeCandidateId !== candidateId
4477
+ ? [`Active candidate: ${run.activeCandidateId}`]
3576
4478
  : []),
3577
4479
  `Samples: ${run.samples ?? 0}`,
3578
4480
  `Attempts: ${run.attemptsExecuted ?? 0}/${run.attemptsRequested ?? run.attemptsExecuted ?? 0}`,
@@ -3584,8 +4486,8 @@ function formatRunDetail(record) {
3584
4486
  ...(firstFailedJob?.error
3585
4487
  ? [`First failed job ${firstFailedJob.id}: ${firstFailedJob.error}`]
3586
4488
  : []),
3587
- ...(urls.subjectEvaluation
3588
- ? [`Open evaluation: ${urls.subjectEvaluation}`]
4489
+ ...(urls.candidateEvaluation
4490
+ ? [`Open evaluation: ${urls.candidateEvaluation}`]
3589
4491
  : [`Open benchmark: ${urls.benchmark}`]),
3590
4492
  ...(jobs.length > 0 ? ["", "Jobs:", ...jobs.map(formatRunJobLine)] : []),
3591
4493
  ].join("\n");
@@ -3595,7 +4497,7 @@ function formatRunJobLine(job) {
3595
4497
  job.id,
3596
4498
  readRunJobPurpose(job) ?? job.kind ?? "job",
3597
4499
  job.status,
3598
- job.subjectId ?? "-",
4500
+ job.candidateId ?? "-",
3599
4501
  job.error ?? "",
3600
4502
  ].filter((value, index) => index < 4 || value !== "").join("\t");
3601
4503
  }
@@ -3621,7 +4523,7 @@ function costUsdFromUsage(value) {
3621
4523
  if (direct !== null) {
3622
4524
  return direct;
3623
4525
  }
3624
- return ["total", "optimizer", "runner", "engine"].reduce((sum, key) => {
4526
+ return ["total", "improver", "runner", "engine"].reduce((sum, key) => {
3625
4527
  const nested = readRecord(usage[key]);
3626
4528
  return sum + (readFiniteNumber(nested?.costUsd) ?? 0);
3627
4529
  }, 0);
@@ -3631,6 +4533,15 @@ function readRecord(value) {
3631
4533
  ? value
3632
4534
  : null;
3633
4535
  }
4536
+ function stringValue(value) {
4537
+ return typeof value === "string" && value.length > 0 ? value : null;
4538
+ }
4539
+ function numberValue(value) {
4540
+ return readFiniteNumber(value);
4541
+ }
4542
+ function integerValue(value) {
4543
+ return Number.isSafeInteger(value) ? value : null;
4544
+ }
3634
4545
  function readFiniteNumber(value) {
3635
4546
  return typeof value === "number" && Number.isFinite(value) ? value : null;
3636
4547
  }
@@ -3763,15 +4674,15 @@ async function readWorkbenchProfileStatus(config) {
3763
4674
  return { authenticated: true, profile: null };
3764
4675
  }
3765
4676
  }
3766
- function readOptionalSubjectId(parsed) {
3767
- return asOptionalString(parsed.flags.subject) ?? parsed.positionals[0];
4677
+ function readOptionalCandidateId(parsed) {
4678
+ return asOptionalString(parsed.flags.candidate) ?? parsed.positionals[0];
3768
4679
  }
3769
- function readRequiredSubjectId(parsed) {
3770
- const subjectId = readOptionalSubjectId(parsed);
3771
- if (!subjectId) {
3772
- throw new UsageError("Missing required SUBJECT_ID.");
4680
+ function readRequiredCandidateId(parsed) {
4681
+ const candidateId = readOptionalCandidateId(parsed);
4682
+ if (!candidateId) {
4683
+ throw new UsageError("Missing required CANDIDATE_ID.");
3773
4684
  }
3774
- return subjectId;
4685
+ return candidateId;
3775
4686
  }
3776
4687
  function readRequiredRunId(parsed) {
3777
4688
  const runId = parsed.positionals[0];
@@ -4002,6 +4913,38 @@ function readInitAgent(parsed, kind) {
4002
4913
  function asOptionalString(value) {
4003
4914
  return typeof value === "string" && value.length > 0 ? value : undefined;
4004
4915
  }
4916
+ function singleRequestedRunId(value, command) {
4917
+ if (!value || value.trim() === "") {
4918
+ return undefined;
4919
+ }
4920
+ const trimmed = value.trim();
4921
+ if (trimmed === "all" || trimmed.includes(",")) {
4922
+ throw new UsageError(`${command} accepts one candidate run id for --runs; use workbench eval --runs all to evaluate every run.`);
4923
+ }
4924
+ return trimmed;
4925
+ }
4926
+ function resolveCandidateRunSelection(source, value) {
4927
+ const available = source.candidateRunIds;
4928
+ if (available.length === 0) {
4929
+ throw new UsageError("Candidate must declare at least one run.");
4930
+ }
4931
+ if (!value || value.trim() === "") {
4932
+ return [source.candidateRunId];
4933
+ }
4934
+ const trimmed = value.trim();
4935
+ if (trimmed === "all") {
4936
+ return available;
4937
+ }
4938
+ const requested = [...new Set(trimmed.split(",").map((entry) => entry.trim()).filter(Boolean))];
4939
+ if (requested.length === 0) {
4940
+ throw new UsageError("--runs must include at least one run id or all.");
4941
+ }
4942
+ const missing = requested.filter((runId) => !available.includes(runId));
4943
+ if (missing.length > 0) {
4944
+ throw new UsageError(`Unknown candidate run(s): ${missing.join(", ")}. Available: ${available.join(", ")}.`);
4945
+ }
4946
+ return requested;
4947
+ }
4005
4948
  function readOptionalStringFlag(value, name) {
4006
4949
  if (value == null || value === false) {
4007
4950
  return undefined;
@@ -4226,6 +5169,27 @@ function parsePortFlag(value) {
4226
5169
  }
4227
5170
  return port;
4228
5171
  }
5172
+ function formatCandidateEvaluationScore(candidate) {
5173
+ const score = candidate.eval?.metrics?.score?.mean;
5174
+ return typeof score === "number" && Number.isFinite(score)
5175
+ ? formatMetricValue(score)
5176
+ : "n/a";
5177
+ }
5178
+ function formatLocalCandidateLabel(candidate) {
5179
+ if (!candidate) {
5180
+ return "none";
5181
+ }
5182
+ const name = candidate.name?.trim() || candidate.id;
5183
+ const displayName = candidate.version > 0
5184
+ ? `${name} v${candidate.version}`
5185
+ : name;
5186
+ return `${displayName} (${candidate.id})`;
5187
+ }
5188
+ function formatCandidateEvaluationSummary(candidate) {
5189
+ return formatMetricSummary(evaluationMeanMetrics(candidate.eval), {
5190
+ limit: Number.POSITIVE_INFINITY,
5191
+ });
5192
+ }
4229
5193
  function formatMetricSummary(metrics, options = {}) {
4230
5194
  const entries = Object.entries(metrics ?? {}).filter((entry) => Number.isFinite(entry[1]));
4231
5195
  if (entries.length === 0) {
@@ -4263,15 +5227,18 @@ function resolveSourceDir(parsed) {
4263
5227
  function isWorkbenchSourceYamlPath(filePath) {
4264
5228
  return path.basename(filePath) === WORKBENCH_BENCHMARK_FILE;
4265
5229
  }
4266
- function readSubjectIdFlag(parsed, snapshot) {
4267
- const explicit = asOptionalString(parsed.flags.subject) ?? asOptionalString(parsed.flags.subject);
5230
+ function readCandidateIdFlag(parsed, snapshot) {
5231
+ const explicit = readOptionalCandidateFlag(parsed);
4268
5232
  if (explicit) {
4269
5233
  return explicit;
4270
5234
  }
4271
5235
  if (snapshot.activeId) {
4272
5236
  return snapshot.activeId;
4273
5237
  }
4274
- throw new UsageError("Missing required --subject; no active subject exists.");
5238
+ throw new UsageError("Missing required --candidate; no active candidate exists.");
5239
+ }
5240
+ function readOptionalCandidateFlag(parsed) {
5241
+ return asOptionalString(parsed.flags.candidate);
4275
5242
  }
4276
5243
  function readPreviewMode(parsed) {
4277
5244
  const view = asOptionalString(parsed.flags.view) ?? "rendered";
@@ -4375,8 +5342,8 @@ async function copyInitSeedIfProvided(parsed, workspace, seed) {
4375
5342
  }
4376
5343
  });
4377
5344
  }
4378
- function formatSpecOptimizer(spec) {
4379
- return spec.improve ? `adapter:${spec.improve.use}` : "optimizer not configured";
5345
+ function formatSpecImprover(spec) {
5346
+ return spec.improve ? `adapter:${spec.improve.use}` : "improve not configured";
4380
5347
  }
4381
5348
  async function writeFiles(outputDir, files) {
4382
5349
  await fs.mkdir(outputDir, { recursive: true });