@workbench-ai/workbench-core 0.0.48 → 0.0.50

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -4,19 +4,19 @@ import path from "node:path";
4
4
  import { fileURLToPath } from "node:url";
5
5
  import YAML from "yaml";
6
6
  import { adapterCommandName, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, parseWorkbenchAdapterManifest, readWorkbenchAdapterOperationResult, WORKBENCH_RUNTIME_CONTROL_TOKEN_ENV, WORKBENCH_RUNTIME_CONTROL_URL_ENV, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
7
- import { BENCHMARK_SPEC_FILE, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal, isWorkbenchSubjectManifestPath, } from "./generic-spec.js";
7
+ import { BENCHMARK_SPEC_FILE, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal, isWorkbenchCandidateManifestPath, } from "./generic-spec.js";
8
8
  import { attachSandboxMetadataToJob, createWorkbenchSandboxFileStore, isSurfaceSnapshotFile, readWorkbenchExecutionSpec, } from "./sandbox-inputs.js";
9
9
  import { asRuntimeRecord, importNodeModule, isJsonPayload, jsonRecord, nodeBuiltin, quoteShellArg, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
10
10
  import { createWorkbenchExecutionCapability, createWorkbenchSandboxAllocation, collectExecutionCapabilityScopeIssues, collectSandboxAllocationScopeIssues, collectSandboxHandleScopeIssues, assertSandboxBackendSupportsNetworkPolicy, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
11
11
  import { createSandboxBackendPlaneForProvider, } from "./sandbox-backends/index.js";
12
- import { applyWorkbenchSubjectPatch } from "./subject-patch.js";
12
+ import { applyWorkbenchCandidatePatch } from "./candidate-patch.js";
13
13
  import { assignUsageRole, completeUsageSummary, mergeUsageSummaries, normalizeUsageSummary, usageStats, } from "./execution-usage.js";
14
14
  import { traceFilePaths, workbenchTraceExecutionDirectory, } from "./trace-files.js";
15
15
  import { engineCaseForCase, } from "./execution-jobs.js";
16
16
  import { createWorkbenchExecutionEventPublisher, publishCommandStepEvent, } from "./execution-events.js";
17
17
  import { readWorkbenchExecutionPurpose } from "./execution-evidence.js";
18
18
  import { adapterAuthEnv, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
19
- export { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml, isWorkbenchSubjectManifestPath, parseWorkbenchSourceFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, resolveWorkbenchSourceFiles, runtimeNetwork, runtimeResources, serializeWorkbenchResolvedSourceYaml, validateWorkbenchResolvedSourceYaml, } from "./generic-spec.js";
19
+ export { BENCHMARK_SPEC_FILE, CANDIDATE_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml, isWorkbenchCandidateManifestPath, parseWorkbenchSourceFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, resolveWorkbenchSourceFiles, runtimeNetwork, runtimeResources, serializeWorkbenchResolvedSourceYaml, validateWorkbenchResolvedSourceYaml, } from "./generic-spec.js";
20
20
  export { composeRuntimeDockerfileWithAdapterInstallers, } from "./runtime-dockerfile.js";
21
21
  export { adapterCommandName, cloneWorkbenchAdapterManifest, collectWorkbenchAdapterAuthRequirements, collectWorkbenchAdapterInvocations, parseWorkbenchAdapterManifest, workbenchAdapterManifestRequiresAuth, workbenchAdapterManifestSupportsOperation, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, withDefaultWorkbenchAdapterAuth, withDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
22
22
  export { adapterAuthEnv, createWorkbenchAdapterAuthBundle, defaultWorkbenchAdapterAuthStoreRoot, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, parseWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
@@ -26,14 +26,14 @@ export { createWorkbenchProgressStdoutParser, publishWorkbenchProgressStdoutEnve
26
26
  export { resolveSandboxTemplateImage, } from "./sandbox-backends/template-images.js";
27
27
  export { readOutputTraceFiles, workbenchTraceExecutionDirectory, workbenchTraceRunDirectory, workbenchTraceRunDirectoryName, } from "./trace-files.js";
28
28
  export { assertWorkbenchAdapterOperationSupport, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterOperationIssues, collectWorkbenchAdapterOperationRequirements, ensureWorkbenchAdapterOutputDir, WORKBENCH_ADAPTER_RESULT_FILE, normalizeWorkbenchAdapterOperationRequest, normalizeWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, writeWorkbenchAdapterOperationResult, } from "@workbench-ai/workbench-protocol";
29
- export { applyWorkbenchSubjectPatch, } from "./subject-patch.js";
29
+ export { applyWorkbenchCandidatePatch, } from "./candidate-patch.js";
30
30
  export { createWorkbenchSandboxFileStore, createSandboxAdapterRequest, executionResultFromCompletedSandboxJob, materializeWorkbenchSandboxInput, readWorkbenchExecutionSpec, sanitizeWorkbenchExecutionJobForSandbox, } from "./sandbox-inputs.js";
31
31
  export { compileWorkbenchExecutionGraph, } from "./execution-graph.js";
32
- export { createBaselineSubjectExecution, createBaselineSubjectJob, createWorkbenchExecutionJob, expectedWorkbenchRunJobCount, engineCaseForCase, engineCaseIds, attemptJobCountForRunSpec, workbenchExecutionJobPurpose, MAX_WORKBENCH_RUN_BUDGET, planWorkbenchExecutionJobsForPurpose, validateWorkbenchRunEnvelope, workbenchExecutionJobId, } from "./execution-jobs.js";
32
+ export { createBaselineCandidateExecution, createBaselineCandidateJob, createWorkbenchExecutionJob, expectedWorkbenchRunJobCount, engineCaseForCase, engineCaseIds, attemptJobCountForRunSpec, workbenchExecutionJobPurpose, MAX_WORKBENCH_RUN_BUDGET, planWorkbenchExecutionJobsForPurpose, validateWorkbenchRunEnvelope, workbenchExecutionJobId, } from "./execution-jobs.js";
33
33
  export { addCapacity, capacityFits, runWorkbenchExecutionDag, subtractCapacity, workbenchJobDependencies, workbenchJobHostCost, workbenchJobResources, } from "./execution-scheduler.js";
34
34
  export { assertWorkbenchExecutionIsolation, collectWorkbenchExecutionIsolationIssues, validateWorkbenchExecutionOutputPayloads, } from "./execution-outputs.js";
35
35
  export { collectSandboxAllocationScopeIssues, collectExecutionCapabilityScopeIssues, collectSandboxHandleScopeIssues, createWorkbenchSandboxAllocation, createWorkbenchSandboxExecutionMetadata, createWorkbenchExecutionCapability, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
36
- export { buildSubjectCaseExecutionRefs, buildWorkbenchExecutionEvidence, isWorkbenchExecutionActive, readWorkbenchExecutionId, readWorkbenchExecutionMetadataNumber, readWorkbenchExecutionMetadataString, readWorkbenchExecutionPurpose, resolveWorkbenchJobGroupStatus, } from "./execution-evidence.js";
36
+ export { buildCandidateCaseExecutionRefs, buildWorkbenchExecutionEvidence, isWorkbenchExecutionActive, readWorkbenchExecutionId, readWorkbenchExecutionMetadataNumber, readWorkbenchExecutionMetadataString, readWorkbenchExecutionPurpose, resolveWorkbenchJobGroupStatus, } from "./execution-evidence.js";
37
37
  export { buildWorkbenchTraceSessionsFromFiles, combineWorkbenchTraceSessions, finalizeWorkbenchExecutionTraceForJob, mergeWorkbenchExecutionTracesByJob, readWorkbenchExecutionTraceFiles, traceSessionLabel, } from "./execution-traces.js";
38
38
  export { DOCKER_SANDBOX_BACKEND, assertSandboxHostHealthForProvider, createDockerSandboxBackendDescriptor, createDockerSandboxPlane, resolveWorkbenchSandboxProviderName, sandboxProviderAdmissionForResources, sandboxProviderDefaultMaxConcurrentJobs, sandboxProviderLeaseScope, sandboxHostHealthExpectationForProvider, } from "./sandbox-backends/index.js";
39
39
  export const DEFAULT_ENVIRONMENT_VERSIONS = [
@@ -153,7 +153,7 @@ export const DEFAULT_ENVIRONMENTS = [
153
153
  {
154
154
  id: "env_node",
155
155
  name: "Node",
156
- description: "Node runtime for JavaScript and TypeScript subjects.",
156
+ description: "Node runtime for JavaScript and TypeScript candidates.",
157
157
  currentVersionId: "envv_node_22",
158
158
  builtIn: true,
159
159
  createdAt: "2026-04-23T00:00:00.000Z",
@@ -191,8 +191,7 @@ function splitAuthoredSourceYaml(sourceYaml) {
191
191
  }
192
192
  const entries = [
193
193
  [BENCHMARK_SPEC_FILE, parsed.benchmark],
194
- ["subjects/current/subject.yaml", splitSubjectSourceRecord(parsed.subject)],
195
- ["optimizers/current.yaml", splitOptimizerSourceRecord(parsed.optimizer)],
194
+ ["candidates/current/candidate.yaml", splitCandidateSourceRecord(parsed.candidate)],
196
195
  ];
197
196
  return entries.flatMap(([filePath, value]) => {
198
197
  if (!value || typeof value !== "object" || Array.isArray(value)) {
@@ -204,23 +203,20 @@ function splitAuthoredSourceYaml(sourceYaml) {
204
203
  }];
205
204
  });
206
205
  }
207
- function splitSubjectSourceRecord(value) {
206
+ function splitCandidateSourceRecord(value) {
208
207
  const record = cloneYamlRecord(value);
209
208
  if (!record) {
210
209
  return value;
211
210
  }
212
211
  delete record.benchmark;
213
212
  delete record.path;
214
- rewriteAdapterSources(record, "subjects");
213
+ stripCandidateRuntimeSelection(record);
214
+ rewriteAdapterSources(record, "candidates/current");
215
215
  return record;
216
216
  }
217
- function splitOptimizerSourceRecord(value) {
218
- const record = cloneYamlRecord(value);
219
- if (!record) {
220
- return value;
221
- }
222
- rewriteAdapterSources(record, "optimizers");
223
- return record;
217
+ function stripCandidateRuntimeSelection(record) {
218
+ delete record.selectedRunId;
219
+ delete record.selectedRunName;
224
220
  }
225
221
  function cloneYamlRecord(value) {
226
222
  return value && typeof value === "object" && !Array.isArray(value)
@@ -242,11 +238,10 @@ function sourcePathRelativeTo(yamlDir, sourcePath) {
242
238
  }
243
239
  function isAuthoredSourceYamlPath(filePath) {
244
240
  return filePath === BENCHMARK_SPEC_FILE ||
245
- isWorkbenchSubjectManifestPath(filePath) ||
246
- /^optimizers\/[^/]+\.ya?ml$/iu.test(filePath);
241
+ isWorkbenchCandidateManifestPath(filePath);
247
242
  }
248
- function formatOptimizerSummary(spec) {
249
- return spec.improve ? `adapter:${spec.improve.use}` : "optimizer not configured";
243
+ function formatImproveSummary(spec) {
244
+ return spec.improve ? `adapter:${spec.improve.use}` : "improve not configured";
250
245
  }
251
246
  function formatEngineRunSummary(spec) {
252
247
  return `adapter:${spec.engineRun.use}`;
@@ -287,10 +282,10 @@ function protocolStepForExecution(execution, manifests) {
287
282
  if (execution.purpose !== "improve") {
288
283
  throw new Error(`Protocol execution step only supports improve executions, not ${execution.purpose}.`);
289
284
  }
290
- const operation = "optimizer.improve";
285
+ const operation = "candidate.improve";
291
286
  const command = adapterProtocolCommandSpec(execution.adapter, operation, manifests);
292
287
  return {
293
- kind: "optimizer",
288
+ kind: "improver",
294
289
  label: execution.purpose,
295
290
  operation,
296
291
  executor: command.executor,
@@ -387,31 +382,31 @@ export function materializeWorkbenchRunResult(args) {
387
382
  const completed = args.jobs.filter((job) => job.status === "succeeded");
388
383
  const failedJobCount = args.jobs.filter((job) => job.status === "failed").length;
389
384
  const completedJobCount = args.jobs.filter((job) => job.status === "succeeded").length;
390
- const subjectRevisions = completed
385
+ const candidateRevisions = completed
391
386
  .filter((job) => workbenchExecutionPurpose(job) === "improve")
392
- .map((job) => normalizeSubjectRevisionJobOutput(job.output))
387
+ .map((job) => normalizeCandidateRevisionJobOutput(job.output))
393
388
  .filter((output) => output !== null)
394
389
  .sort((left, right) => left.attemptIndex - right.attemptIndex);
395
390
  const evaluationJobs = args.jobs.filter((job) => workbenchExecutionPurpose(job) === "attempt");
396
- const evaluationsBySubject = new Map();
391
+ const evaluationsByCandidate = new Map();
397
392
  for (const job of evaluationJobs) {
398
- const subjectId = readJobString(job.output, "subjectId") ??
399
- readJobString(job.input, "subjectId") ??
400
- job.subjectId;
401
- if (subjectId) {
402
- evaluationsBySubject.set(subjectId, [
403
- ...(evaluationsBySubject.get(subjectId) ?? []),
393
+ const candidateId = readJobString(job.output, "candidateId") ??
394
+ readJobString(job.input, "candidateId") ??
395
+ job.candidateId;
396
+ if (candidateId) {
397
+ evaluationsByCandidate.set(candidateId, [
398
+ ...(evaluationsByCandidate.get(candidateId) ?? []),
404
399
  job,
405
400
  ]);
406
401
  }
407
402
  }
408
- const subjects = [];
409
- const subjectFiles = {};
403
+ const candidates = [];
404
+ const candidateFiles = {};
410
405
  const evaluations = [];
411
- for (const subjectRevision of subjectRevisions) {
412
- const subjectId = subjectRevision.subjectId;
413
- const subjectJobs = evaluationsBySubject.get(subjectId) ?? [];
414
- const succeededEvaluationJobs = subjectJobs.filter((job) => job.status === "succeeded");
406
+ for (const candidateRevision of candidateRevisions) {
407
+ const candidateId = candidateRevision.candidateId;
408
+ const candidateJobs = evaluationsByCandidate.get(candidateId) ?? [];
409
+ const succeededEvaluationJobs = candidateJobs.filter((job) => job.status === "succeeded");
415
410
  const outputs = normalizeEvaluationSampleOutputs({
416
411
  jobs: succeededEvaluationJobs,
417
412
  allJobs: completed,
@@ -425,39 +420,38 @@ export function materializeWorkbenchRunResult(args) {
425
420
  ])
426
421
  .filter((key) => key !== null));
427
422
  const errorSampleJobs = [
428
- ...subjectJobs.filter((job) => job.status === "failed"),
423
+ ...candidateJobs.filter((job) => job.status === "failed"),
429
424
  ...succeededEvaluationJobs.filter((job) => !outputJobIds.has(job.id)),
430
425
  ];
431
- const errorSamples = errorEvaluationSamplesFromJobs(errorSampleJobs, subjectId, subjectRevision.attemptIndex, completedSampleKeys);
426
+ const errorSamples = errorEvaluationSamplesFromJobs(errorSampleJobs, candidateId, candidateRevision.attemptIndex, completedSampleKeys);
432
427
  const samples = [
433
428
  ...outputs.map(({ jobs, output }) => withJobUsage(output.sample, completed, jobs[0])),
434
429
  ...errorSamples,
435
430
  ].sort((left, right) => left.index - right.index || left.id.localeCompare(right.id));
436
- const subjectName = normalizedSubjectDisplayName(args.spec.subject.name);
437
- const evalRecord = createEvaluationRecord(subjectId, subjectName, samples);
431
+ const candidateName = normalizedCandidateDisplayName(args.spec.candidate.name);
432
+ const evalRecord = createEvaluationRecord(candidateId, candidateName, samples);
438
433
  const usage = mergeUsageSummaries([
439
- subjectRevision.usage,
434
+ candidateRevision.usage,
440
435
  ...samples.map((sample) => sample.usage),
441
436
  ]);
442
- const metrics = evaluationMeanMetrics(evalRecord);
443
- const attemptIndex = subjectRevision.attemptIndex;
437
+ const attemptIndex = candidateRevision.attemptIndex;
444
438
  const evaluationTraces = [
445
439
  ...outputs.flatMap(({ output }) => output.traces),
446
440
  ...errorSampleJobs.flatMap(jobTracePaths),
447
441
  ].sort();
448
- const baseId = subjectRevision.baseId && subjectRevision.baseId !== subjectId
449
- ? subjectRevision.baseId
442
+ const baseId = candidateRevision.baseId && candidateRevision.baseId !== candidateId
443
+ ? candidateRevision.baseId
450
444
  : null;
451
- const sourceMeta = subjectSourceMetadata(args.subjectSourceFiles);
445
+ const sourceMeta = candidateSourceMetadata(args.candidateSourceFiles);
452
446
  const benchmarkMeta = benchmarkSourceMetadata(args.benchmarkSourceFiles);
453
447
  const meta = {
454
448
  attemptIndex,
455
449
  sampleCount: evalRecord.sampleCount,
456
- optimizer: formatOptimizerSummary(args.spec),
450
+ improver: formatImproveSummary(args.spec),
457
451
  engineRun: formatEngineRunSummary(args.spec),
458
452
  strategy: "greedy",
459
453
  traces: {
460
- improve: subjectRevision.traces,
454
+ improve: candidateRevision.traces,
461
455
  evaluations: evaluationTraces,
462
456
  },
463
457
  };
@@ -467,52 +461,114 @@ export function materializeWorkbenchRunResult(args) {
467
461
  if (benchmarkMeta) {
468
462
  meta.benchmark = benchmarkMeta;
469
463
  }
470
- const record = {
471
- id: subjectId,
472
- ...(subjectName ? { name: subjectName } : {}),
473
- ordinal: args.existingSubjectCount + subjects.length,
474
- benchmarkFingerprint: args.benchmarkFingerprint,
475
- subjectFingerprint: args.subjectFingerprint ?? materializedSubjectFingerprint(args.spec, subjectRevision.files),
476
- createdAt: args.startedAt,
477
- ...(baseId ? { baseId } : {}),
478
- referenceIds: [],
479
- status: evalRecord.completedSampleCount > 0 ? "evaluated" : "eval_error",
480
- fileChanges: subjectRevision.fileChanges,
481
- ...(metrics ? { metrics } : {}),
482
- ...(usage ? { usage } : {}),
483
- eval: evalRecord,
484
- ...(subjectRevision.prompt ? { prompt: subjectRevision.prompt } : {}),
485
- meta,
486
- };
487
- subjects.push(record);
464
+ const record = preserveExistingCandidateIdentity({
465
+ candidate: {
466
+ id: candidateId,
467
+ ...(candidateName ? { name: candidateName } : {}),
468
+ version: args.existingCandidateCount + candidates.length + 1,
469
+ ordinal: args.existingCandidateCount + candidates.length + 1,
470
+ benchmarkFingerprint: args.benchmarkFingerprint,
471
+ candidateFingerprint: args.candidateFingerprint ?? materializedCandidateFingerprint(args.spec, candidateRevision.files),
472
+ createdAt: args.startedAt,
473
+ ...(baseId ? { baseId } : {}),
474
+ referenceIds: [],
475
+ status: evalRecord.completedSampleCount > 0 ? "evaluated" : "eval_error",
476
+ fileChanges: candidateRevision.fileChanges,
477
+ ...(usage ? { usage } : {}),
478
+ eval: evalRecord,
479
+ ...(candidateRevision.prompt ? { prompt: candidateRevision.prompt } : {}),
480
+ meta,
481
+ },
482
+ previousCandidate: args.previousCandidate ?? null,
483
+ });
484
+ candidates.push(record);
488
485
  evaluations.push(createEvaluationScorecard({
489
486
  runId: args.runId,
490
487
  benchmarkFingerprint: args.benchmarkFingerprint,
491
488
  createdAt: args.startedAt,
492
- subject: record,
489
+ candidate: record,
490
+ candidateRunId: args.spec.candidate.selectedRunId,
491
+ candidateRunName: args.spec.candidate.selectedRunName,
493
492
  evaluation: evalRecord,
494
493
  }));
495
- subjectFiles[subjectId] = materializedSubjectFiles({
496
- subjectRevisionFiles: subjectRevision.files,
494
+ candidateFiles[candidateId] = materializedCandidateFiles({
495
+ candidateRevisionFiles: candidateRevision.files,
497
496
  });
498
497
  }
499
- const selectedSubject = selectSubject({
500
- subjects,
501
- previousSubject: args.previousSubject ?? null,
498
+ const selectedCandidate = selectCandidate({
499
+ candidates,
500
+ previousCandidate: args.previousCandidate ?? null,
502
501
  });
503
502
  return {
504
- subjects,
505
- subjectFiles,
503
+ candidates,
504
+ candidateFiles,
506
505
  evaluations,
507
- activeSubjectId: selectedSubject?.id ?? args.previousSubject?.id ?? null,
508
- selectedSubject,
506
+ activeCandidateId: selectedCandidate?.id ?? args.previousCandidate?.id ?? null,
507
+ selectedCandidate,
509
508
  completedJobCount,
510
509
  failedJobCount,
511
510
  };
512
511
  }
513
- function subjectSourceMetadata(files) {
512
+ function preserveExistingCandidateIdentity(args) {
513
+ const previous = args.previousCandidate;
514
+ if (!previous || previous.id !== args.candidate.id) {
515
+ return args.candidate;
516
+ }
517
+ const baseId = args.candidate.baseId ?? previous.baseId;
518
+ const prompt = args.candidate.prompt ?? previous.prompt;
519
+ const meta = mergeExistingCandidateMeta(previous.meta, args.candidate.meta);
520
+ return {
521
+ ...args.candidate,
522
+ version: previous.version,
523
+ ordinal: previous.version,
524
+ createdAt: previous.createdAt,
525
+ ...(args.candidate.name ?? previous.name
526
+ ? { name: (args.candidate.name ?? previous.name) }
527
+ : {}),
528
+ ...(baseId ? { baseId } : {}),
529
+ referenceIds: previous.referenceIds.length > 0
530
+ ? [...previous.referenceIds]
531
+ : args.candidate.referenceIds,
532
+ fileChanges: args.candidate.fileChanges.length > 0
533
+ ? args.candidate.fileChanges
534
+ : [...previous.fileChanges],
535
+ ...(prompt ? { prompt } : {}),
536
+ ...(meta ? { meta } : {}),
537
+ };
538
+ }
539
+ function mergeExistingCandidateMeta(previousMeta, candidateMeta) {
540
+ const previous = jsonRecord(previousMeta);
541
+ const candidate = jsonRecord(candidateMeta);
542
+ if (!previous) {
543
+ return candidateMeta;
544
+ }
545
+ if (!candidate) {
546
+ return previousMeta;
547
+ }
548
+ const previousTraces = jsonRecord(previous.traces);
549
+ const candidateTraces = jsonRecord(candidate.traces);
550
+ if (!previousTraces || !candidateTraces) {
551
+ return { ...previous, ...candidate };
552
+ }
553
+ const traces = {
554
+ ...previousTraces,
555
+ ...candidateTraces,
556
+ };
557
+ const candidateImproveTraces = Array.isArray(candidateTraces.improve)
558
+ ? candidateTraces.improve
559
+ : [];
560
+ if (candidateImproveTraces.length === 0 && previousTraces.improve !== undefined) {
561
+ traces.improve = previousTraces.improve;
562
+ }
563
+ return {
564
+ ...previous,
565
+ ...candidate,
566
+ traces,
567
+ };
568
+ }
569
+ function candidateSourceMetadata(files) {
514
570
  const sourceFiles = (files ?? [])
515
- .filter((file) => /^subjects\/[^/]+\/subject\.ya?ml$/iu.test(file.path))
571
+ .filter((file) => /^candidates\/[^/]+\/candidate\.ya?ml$/iu.test(file.path))
516
572
  .sort((left, right) => left.path.localeCompare(right.path))
517
573
  .map((file) => ({
518
574
  path: file.path,
@@ -536,14 +592,13 @@ function benchmarkSourceMetadata(files) {
536
592
  }));
537
593
  return sourceFiles.length > 0 ? { files: sourceFiles } : null;
538
594
  }
539
- function materializedSubjectFingerprint(spec, files) {
595
+ function materializedCandidateFingerprint(spec, files) {
540
596
  const hash = createHash("sha256");
541
- hash.update("workbench-subject-v1\0");
542
- hash.update("materialized\0runner\0");
543
- hash.update(JSON.stringify(spec.run));
597
+ hash.update("workbench-candidate-v1\0");
598
+ hash.update("materialized\0");
544
599
  hash.update("prepare");
545
- hash.update(JSON.stringify(spec.subject.prepare ?? null));
546
- for (const file of filterSubjectSourceFiles(files).slice().sort((left, right) => left.path.localeCompare(right.path))) {
600
+ hash.update(JSON.stringify(spec.candidate.prepare ?? null));
601
+ for (const file of filterCandidateSourceFiles(files).slice().sort((left, right) => left.path.localeCompare(right.path))) {
547
602
  hash.update("\0file\0");
548
603
  hash.update(file.path);
549
604
  hash.update("\0");
@@ -555,9 +610,9 @@ function materializedSubjectFingerprint(spec, files) {
555
610
  }
556
611
  return hash.digest("hex");
557
612
  }
558
- function materializedSubjectFiles(args) {
613
+ function materializedCandidateFiles(args) {
559
614
  const byPath = new Map();
560
- for (const file of filterSubjectSourceFiles(args.subjectRevisionFiles)) {
615
+ for (const file of filterCandidateSourceFiles(args.candidateRevisionFiles)) {
561
616
  byPath.set(file.path, { ...file });
562
617
  }
563
618
  return [...byPath.values()].sort((left, right) => left.path.localeCompare(right.path));
@@ -565,12 +620,15 @@ function materializedSubjectFiles(args) {
565
620
  function createEvaluationScorecard(args) {
566
621
  const evaluation = args.evaluation;
567
622
  return {
568
- id: evaluationScorecardId(args.runId, args.subject.id),
623
+ id: evaluationScorecardId(args.runId, args.candidate.id),
569
624
  runId: args.runId,
570
625
  benchmarkFingerprint: args.benchmarkFingerprint,
571
- subjectFingerprint: args.subject.subjectFingerprint,
572
- subjectId: args.subject.id,
573
- ...(args.subject.name ? { subjectName: args.subject.name } : {}),
626
+ candidateFingerprint: args.candidate.candidateFingerprint,
627
+ candidateId: args.candidate.id,
628
+ ...(args.candidate.name ? { candidateName: args.candidate.name } : {}),
629
+ candidateVersion: args.candidate.version,
630
+ ...(args.candidateRunId ? { candidateRunId: args.candidateRunId } : {}),
631
+ ...(args.candidateRunName ? { candidateRunName: args.candidateRunName } : {}),
574
632
  createdAt: args.createdAt,
575
633
  updatedAt: evaluation.finishedAt ?? args.createdAt,
576
634
  status: evaluation.status,
@@ -584,10 +642,10 @@ function createEvaluationScorecard(args) {
584
642
  evaluation,
585
643
  };
586
644
  }
587
- export function evaluationScorecardId(runId, subjectId) {
645
+ export function evaluationScorecardId(runId, candidateId) {
588
646
  const runPart = runId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
589
- const subjectPart = subjectId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
590
- return `eval_${runPart}_${subjectPart}`;
647
+ const candidatePart = candidateId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
648
+ return `eval_${runPart}_${candidatePart}`;
591
649
  }
592
650
  export function selectExecutionOutputFilesForInspection(args) {
593
651
  return args.files.filter((file) => !isWorkbenchInternalOutputPath(file.path));
@@ -602,56 +660,89 @@ export function isWorkbenchInternalOutputPath(filePath) {
602
660
  normalized === "exit_code" ||
603
661
  /^[a-z_-]+_(stdout\.log|stderr\.log|exit_code)$/u.test(normalized));
604
662
  }
605
- export function createSubjectRevisionTraceInputFiles(args) {
663
+ export function createOptimizerTraceInputFiles(args) {
606
664
  const files = [];
607
- const manifestJobs = [];
665
+ const executions = [];
608
666
  const jobs = args.jobs
609
- .filter((job) => job.runId === args.runId && isTerminalExecutionJob(job))
667
+ .filter(isOptimizerTraceInputJob)
610
668
  .sort(compareTraceInputJobs);
611
- for (const job of jobs) {
669
+ jobs.forEach((job, index) => {
670
+ const sequence = String(index + 1).padStart(6, "0");
671
+ const executionPath = `executions/${sequence}`;
672
+ const operation = "engine.run";
612
673
  const jobFiles = completedJobOutputFiles(job);
613
- const rawTraceFiles = jobFiles.filter((file) => normalizeRelativePath(file.path).startsWith(".workbench/traces/"));
614
- files.push(...rawTraceFiles.map((file) => ({ ...file })));
615
- const events = args.events
616
- .filter((event) => event.runId === args.runId && event.jobId === job.id)
617
- .sort((left, right) => left.at.localeCompare(right.at));
618
- const eventPath = `events/${job.id}.ndjson`;
619
- if (events.length > 0) {
620
- files.push(textSurfaceFile(eventPath, `${events.map((event) => JSON.stringify(event)).join("\n")}\n`));
621
- }
622
- const summaryPath = `jobs/${job.id}.json`;
623
- const summary = subjectRevisionTraceJobSummary(job, {
624
- eventPath: events.length > 0 ? eventPath : null,
625
- rawTracePaths: rawTraceFiles.map((file) => file.path).sort(),
626
- });
627
- files.push(textSurfaceFile(summaryPath, `${JSON.stringify(summary, null, 2)}\n`));
628
- manifestJobs.push({
629
- ...summary,
630
- summary_path: summaryPath,
674
+ const requestFile = traceInputRequestFile(jobFiles, operation);
675
+ const resultFile = traceInputResultFile(jobFiles, operation);
676
+ const requestPath = `${executionPath}/request.json`;
677
+ const resultPath = `${executionPath}/result.json`;
678
+ const filesPath = `${executionPath}/files`;
679
+ files.push(textSurfaceFile(requestPath, requestFile?.content ?? `${JSON.stringify(traceInputRequestFallback(job, operation), null, 2)}\n`));
680
+ files.push(textSurfaceFile(resultPath, resultFile?.content ?? `${JSON.stringify(traceInputResultFallback(job, operation), null, 2)}\n`));
681
+ files.push(...jobFiles.map((file) => ({
682
+ ...file,
683
+ path: normalizeRelativePath(`${filesPath}/${file.path}`),
684
+ })));
685
+ executions.push({
686
+ path: executionPath,
687
+ operation,
688
+ status: job.status,
689
+ candidateId: job.candidateId ?? readJobString(job.input, "candidateId") ?? null,
690
+ runId: job.runId,
691
+ jobId: job.id,
692
+ attemptIndex: readOptionalJobNumber(job.input, "attemptIndex") ?? null,
693
+ sampleIndex: readOptionalJobNumber(job.input, "sampleIndex") ?? null,
694
+ caseId: readJobString(job.input, "caseId") ?? null,
695
+ requestPath,
696
+ resultPath,
697
+ filesPath,
631
698
  });
632
- }
633
- files.push(textSurfaceFile("manifest.json", `${JSON.stringify({
634
- run_id: args.runId,
635
- jobs: manifestJobs,
699
+ });
700
+ files.push(textSurfaceFile("index.json", `${JSON.stringify({
701
+ schema: "workbench.optimizer-traces.v1",
702
+ executions,
636
703
  }, null, 2)}\n`));
637
704
  return dedupeSurfaceFiles(files);
638
705
  }
639
- export function createSubjectEvaluationTraceInputFiles(args) {
640
- const subject = args.subject;
641
- if (!subject?.eval && !subject?.metrics) {
642
- return [];
706
+ export function evaluationMeanMetrics(evaluation) {
707
+ const entries = Object.entries(evaluation?.metrics ?? {})
708
+ .filter((entry) => Number.isFinite(entry[1].mean));
709
+ return entries.length > 0
710
+ ? Object.fromEntries(entries.map(([key, stats]) => [key, stats.mean]))
711
+ : undefined;
712
+ }
713
+ export function candidateRecordWithoutDerivedFields(candidate) {
714
+ const { metrics: _metrics, candidateRunId: _candidateRunId, candidateRunName: _candidateRunName, ...record } = candidate;
715
+ return record;
716
+ }
717
+ export function candidateSummaryFromRecord(candidate) {
718
+ const { eval: _eval, prompt: _prompt, meta: _meta, ...summary } = candidateRecordWithoutDerivedFields(candidate);
719
+ return summary;
720
+ }
721
+ export function workbenchRunExecutionFingerprint(args) {
722
+ const hash = createHash("sha256");
723
+ hash.update("workbench-run-execution-v1\0");
724
+ hash.update(args.specVersionId ?? "");
725
+ hash.update("\0");
726
+ hash.update(args.environmentVersionId ?? "");
727
+ hash.update("\0");
728
+ hash.update(args.sourceYaml ?? "");
729
+ for (const file of (args.adapterFiles ?? []).slice().sort((left, right) => left.path.localeCompare(right.path))) {
730
+ hash.update("\0file\0");
731
+ hash.update(file.path);
732
+ hash.update("\0");
733
+ hash.update(file.kind);
734
+ hash.update("\0");
735
+ hash.update(file.encoding);
736
+ hash.update("\0");
737
+ hash.update(file.executable ? "1" : "0");
738
+ hash.update("\0");
739
+ hash.update(file.content);
643
740
  }
644
- const filePath = normalizeRelativePath(args.path ?? `base-subject/${subject.id}/evaluation.json`);
645
- const payload = {
646
- kind: "subject_evaluation",
647
- subjectId: subject.id,
648
- status: subject.status,
649
- metrics: subject.metrics ?? null,
650
- fileChanges: subject.fileChanges,
651
- eval: subject.eval ?? null,
652
- prompt: subject.prompt ?? null,
653
- };
654
- return [textSurfaceFile(filePath, `${JSON.stringify(payload, null, 2)}\n`)];
741
+ return hash.digest("hex");
742
+ }
743
+ function isOptimizerTraceInputJob(job) {
744
+ return isTerminalExecutionJob(job) &&
745
+ workbenchExecutionPurpose(job) === "attempt";
655
746
  }
656
747
  function isTerminalExecutionJob(job) {
657
748
  return job.kind === "execute" && (job.status === "succeeded" ||
@@ -662,20 +753,10 @@ function compareTraceInputJobs(left, right) {
662
753
  const leftAttempt = readOptionalJobNumber(left.input, "attemptIndex") ?? -1;
663
754
  const rightAttempt = readOptionalJobNumber(right.input, "attemptIndex") ?? -1;
664
755
  return leftAttempt - rightAttempt ||
665
- purposeSortKey(workbenchExecutionPurpose(left)) - purposeSortKey(workbenchExecutionPurpose(right)) ||
666
756
  (readOptionalJobNumber(left.input, "sampleIndex") ?? -1) - (readOptionalJobNumber(right.input, "sampleIndex") ?? -1) ||
667
757
  (readJobString(left.input, "caseId") ?? "").localeCompare(readJobString(right.input, "caseId") ?? "") ||
668
758
  left.id.localeCompare(right.id);
669
759
  }
670
- function purposeSortKey(purpose) {
671
- if (purpose === "improve") {
672
- return 0;
673
- }
674
- if (purpose === "attempt") {
675
- return 1;
676
- }
677
- return 3;
678
- }
679
760
  function completedJobOutputFiles(job) {
680
761
  const output = jsonRecord(job.output);
681
762
  if (!Array.isArray(output.files)) {
@@ -689,35 +770,70 @@ function completedJobOutputFiles(job) {
689
770
  }
690
771
  return files;
691
772
  }
692
- function subjectRevisionTraceJobSummary(job, paths) {
693
- const output = jsonRecord(job.output);
773
/**
 * Finds the first trace JSON file with the given file name that records the
 * given operation.
 *
 * A file matches when its normalized path starts with `.workbench/traces/`,
 * ends with `/<fileName>`, its encoding is `"utf8"`, and the `operation`
 * read from its JSON content equals `operation`. The JSON is only parsed for
 * files that already passed the cheaper path and encoding checks.
 *
 * @param {Array<object>} files - Surface files to search.
 * @param {string} fileName - Trailing file name, e.g. `"request.json"`.
 * @param {string} operation - Adapter operation to match.
 * @returns {object|null} The first matching file, or null when none match.
 */
function findTraceInputJsonFile(files, fileName, operation) {
    const suffix = `/${fileName}`;
    const match = files.find((file) => {
        const normalized = normalizeRelativePath(file.path);
        return normalized.startsWith(".workbench/traces/") &&
            normalized.endsWith(suffix) &&
            file.encoding === "utf8" &&
            traceJsonOperation(file) === operation;
    });
    return match ?? null;
}
/**
 * Finds the trace `request.json` file recorded for an operation.
 *
 * @param {Array<object>} files - Surface files to search.
 * @param {string} operation - Adapter operation to match.
 * @returns {object|null} The first matching request file, or null.
 */
function traceInputRequestFile(files, operation) {
    return findTraceInputJsonFile(files, "request.json", operation);
}
/**
 * Finds the trace `result.json` file recorded for an operation.
 *
 * @param {Array<object>} files - Surface files to search.
 * @param {string} operation - Adapter operation to match.
 * @returns {object|null} The first matching result file, or null.
 */
function traceInputResultFile(files, operation) {
    return findTraceInputJsonFile(files, "result.json", operation);
}
791
/**
 * Reads the `operation` field from a file whose content is JSON.
 *
 * @param {object} file - File whose `content` is parsed as JSON.
 * @returns {string|null} The `operation` value when it is a string; null
 *   when the content cannot be parsed or holds no string `operation`.
 */
function traceJsonOperation(file) {
    let payload;
    try {
        payload = JSON.parse(file.content);
    }
    catch {
        // Unparseable content is treated as "no operation recorded".
        return null;
    }
    const operation = payload?.operation;
    if (typeof operation !== "string") {
        return null;
    }
    return operation;
}
800
/**
 * Builds a `workbench.adapter.v3` request payload from a job's own fields.
 *
 * The request id is the job input's `execution.id` when that is a string,
 * otherwise the job id. The candidate id prefers `job.candidateId` over the
 * job input's `candidateId`; candidate and attempt values that are missing
 * are reported as null.
 *
 * @param {object} job - Job whose `id`, `candidateId`, and `input` are read.
 * @param {string} operation - Adapter operation to record on the request.
 * @returns {object} The request payload.
 */
function traceInputRequestFallback(job, operation) {
    const input = job.input;
    const execution = jsonRecord(jsonRecord(input).execution);
    const requestId = typeof execution.id === "string" ? execution.id : job.id;
    const invocation = jsonRecord(execution.adapter);
    const candidate = {
        id: job.candidateId ?? readJobString(input, "candidateId") ?? null,
    };
    const attempt = {
        attemptIndex: readOptionalJobNumber(input, "attemptIndex") ?? null,
        sampleIndex: readOptionalJobNumber(input, "sampleIndex") ?? null,
        caseId: readJobString(input, "caseId") ?? null,
    };
    return {
        protocol: "workbench.adapter.v3",
        id: requestId,
        jobId: job.id,
        operation,
        invocation,
        context: { candidate, attempt },
    };
}
712
- function summarizeJobOutputForTrace(output) {
713
- const { files: _files, fileSet: _fileSet, subjectPatch, ...rest } = output;
714
- const patch = jsonRecord(subjectPatch);
715
- const { files: _patchFiles, ...patchSummary } = patch;
820
/**
 * Builds a `workbench.adapter-result.v1` payload from a job's own fields.
 *
 * The result is `ok` when the job status is `"succeeded"` and the job output
 * does not carry `ok: false`. The `value` comes from `output.candidatePatch`
 * for `"candidate.improve"` and from `output.result` for `"engine.run"`; it
 * is omitted when empty. `summary`, `feedback`, and `usage` are copied from
 * the output when present, and an `error` is attached when not `ok`.
 *
 * @param {object} job - Job whose `status`, `output`, and `error` are read.
 * @param {string} operation - Adapter operation to record on the result.
 * @returns {object} The result payload.
 */
function traceInputResultFallback(job, operation) {
    const output = jsonRecord(job.output);
    const ok = job.status === "succeeded" && output.ok !== false;
    let value = {};
    if (operation === "candidate.improve") {
        value = jsonRecord(output.candidatePatch);
    }
    else if (operation === "engine.run") {
        value = jsonRecord(output.result);
    }
    const payload = {
        protocol: "workbench.adapter-result.v1",
        operation,
        ok,
    };
    if (Object.keys(value).length > 0) {
        payload.value = value;
    }
    if (typeof output.summary === "string") {
        payload.summary = output.summary;
    }
    if (output.feedback !== undefined) {
        payload.feedback = output.feedback;
    }
    if (output.usage !== undefined) {
        payload.usage = output.usage;
    }
    if (!ok) {
        payload.error = job.error ?? "Execution did not complete successfully.";
    }
    return payload;
}
723
839
  function textSurfaceFile(path, content) {
@@ -744,7 +860,7 @@ export function buildWorkbenchProjectSourceFiles(input) {
744
860
  ...(input.specFiles
745
861
  ? input.specFiles.map((file) => ({ ...file }))
746
862
  : [textSurfaceFile("benchmark.yaml", input.specSource ?? "")]),
747
- ...prefixProjectSourceFiles(input.subjectFiles, input.subjectFilesPath),
863
+ ...prefixProjectSourceFiles(input.candidateFiles, input.candidateFilesPath),
748
864
  ...prefixProjectSourceFiles(input.engineResolveFiles, input.engineResolveFilesPath),
749
865
  ...(input.adapterFiles ?? []).map((file) => ({ ...file })),
750
866
  ...(input.dockerfiles ?? []).map((file) => ({ ...file })),
@@ -772,18 +888,18 @@ function prefixProjectSourceFiles(files, rootPath) {
772
888
  };
773
889
  });
774
890
  }
775
- export function isSubjectSourceFilePath(filePath) {
891
/**
 * Reports whether a path counts as a candidate source file.
 *
 * After normalization, the `.workbench` entry itself, anything below
 * `.workbench/`, and `workbench-result.json` are rejected; every other path
 * is accepted.
 *
 * @param {string} filePath - Relative file path to test.
 * @returns {boolean} True when the path is not one of the rejected paths.
 */
export function isCandidateSourceFilePath(filePath) {
    const relativePath = normalizeRelativePath(filePath);
    if (relativePath === ".workbench") {
        return false;
    }
    if (relativePath.startsWith(".workbench/")) {
        return false;
    }
    return relativePath !== "workbench-result.json";
}
781
- export function filterSubjectSourceFiles(files) {
897
/**
 * Returns shallow copies of the files whose path is a candidate source path.
 *
 * Files rejected by `isCandidateSourceFilePath` are dropped; the remaining
 * files keep their order and are copied, so the input objects are not shared
 * with the result.
 *
 * @param {Array<object>} files - Files with a `path` property.
 * @returns {Array<object>} Copies of the accepted files.
 */
export function filterCandidateSourceFiles(files) {
    return files.flatMap((file) => (isCandidateSourceFilePath(file.path) ? [{ ...file }] : []));
}
786
- export function buildSubjectLineage(args) {
902
+ export function buildCandidateLineage(args) {
787
903
  const orderedSummaries = args.summaries.slice().sort((left, right) => {
788
904
  const createdAt = left.createdAt.localeCompare(right.createdAt);
789
905
  return createdAt !== 0 ? createdAt : left.id.localeCompare(right.id);
@@ -856,7 +972,7 @@ function globPatternToRegExp(pattern) {
856
972
  function escapeRegExp(value) {
857
973
  return value.replace(/[\\^$.*+?()[\]{}|]/gu, "\\$&");
858
974
  }
859
- export function summarizeSubjectFiles(files, changedPaths = files.map((file) => file.path)) {
975
+ export function summarizeCandidateFiles(files, changedPaths = files.map((file) => file.path)) {
860
976
  const changed = new Set(changedPaths);
861
977
  return [...files]
862
978
  .sort((left, right) => left.path.localeCompare(right.path))
@@ -875,7 +991,7 @@ export function summarizeSubjectFiles(files, changedPaths = files.map((file) =>
875
991
  };
876
992
  });
877
993
  }
878
- export function createSubjectFilePreview(args) {
994
+ export function createCandidateFilePreview(args) {
879
995
  if (args.view === "diff") {
880
996
  throw new Error("Diff previews require explicit before and after file content.");
881
997
  }
@@ -901,14 +1017,14 @@ export function createSubjectFilePreview(args) {
901
1017
  export function createCaseReview(args) {
902
1018
  const preferredSampleIndex = uniqueExecutionSampleIndex(args.executions ?? []);
903
1019
  const sampleMatchesCase = (sample) => (sample.cases ?? []).some((entry) => entry.id === args.caseId);
904
- const samples = args.subject.eval?.samples ?? [];
1020
+ const samples = args.candidate.eval?.samples ?? [];
905
1021
  const sampleResult = samples.find((sample) => typeof preferredSampleIndex === "number" &&
906
1022
  sample.index === preferredSampleIndex &&
907
1023
  sampleMatchesCase(sample)) ?? samples.find(sampleMatchesCase);
908
1024
  const caseResult = sampleResult?.cases?.find((entry) => entry.id === args.caseId);
909
1025
  if (!sampleResult && (args.executions?.length ?? 0) > 0) {
910
1026
  return {
911
- subjectId: args.subject.id,
1027
+ candidateId: args.candidate.id,
912
1028
  caseId: args.caseId,
913
1029
  caseLabel: args.caseId,
914
1030
  ...(typeof preferredSampleIndex === "number"
@@ -920,13 +1036,13 @@ export function createCaseReview(args) {
920
1036
  };
921
1037
  }
922
1038
  if (!sampleResult) {
923
- throw new Error(`Case ${args.caseId} was not found on subject ${args.subject.id}.`);
1039
+ throw new Error(`Case ${args.caseId} was not found on candidate ${args.candidate.id}.`);
924
1040
  }
925
1041
  const durationMs = typeof caseResult?.durationMs === "number"
926
1042
  ? caseResult.durationMs
927
1043
  : undefined;
928
1044
  return {
929
- subjectId: args.subject.id,
1045
+ candidateId: args.candidate.id,
930
1046
  caseId: caseResult?.id ?? args.caseId,
931
1047
  caseLabel: caseResult?.label ?? args.caseId,
932
1048
  sampleId: sampleResult.id,
@@ -965,37 +1081,39 @@ function parseAuthoredWorkbenchSourceSpec(source) {
965
1081
  }
966
1082
  const resolved = resolveWorkbenchResolvedSourceYamlInternal(source);
967
1083
  return {
968
- version: 3,
1084
+ version: 4,
969
1085
  benchmark: {
970
1086
  name: resolved.benchmark.name,
971
1087
  description: resolved.benchmark.description,
972
1088
  engine: authoredAdapterSpecFromInvocation(resolved.engine),
973
1089
  },
974
- subject: {
975
- name: resolved.subject.name,
976
- description: resolved.subject.description,
977
- files: { path: resolved.subject.files.path },
978
- ...(resolved.subject.prepare ? { prepare: { ...resolved.subject.prepare } } : {}),
979
- run: runSpecFromInvocation(resolved.run),
980
- },
981
- ...(resolved.optimizer
982
- ? {
983
- optimizer: {
984
- name: resolved.optimizer.name,
985
- ...(resolved.optimizer.description ? { description: resolved.optimizer.description } : {}),
986
- edits: [...resolved.optimizer.edits],
987
- improve: improveSpecFromInvocation(resolved.improve),
1090
+ candidate: {
1091
+ name: resolved.candidate.name,
1092
+ description: resolved.candidate.description,
1093
+ files: { path: resolved.candidate.files.path },
1094
+ ...(resolved.candidate.prepare ? { prepare: { ...resolved.candidate.prepare } } : {}),
1095
+ defaultRun: resolved.candidate.defaultRun,
1096
+ runs: Object.fromEntries(Object.entries(resolved.candidate.runs).map(([runId, run]) => [
1097
+ runId,
1098
+ {
1099
+ name: run.name,
1100
+ ...authoredAdapterSpecFromInvocation(run),
988
1101
  },
989
- }
990
- : {}),
1102
+ ])),
1103
+ ...(resolved.candidate.improve
1104
+ ? {
1105
+ improve: {
1106
+ edits: [...resolved.candidate.improve.edits],
1107
+ ...improveSpecFromInvocation(resolved.improve),
1108
+ },
1109
+ }
1110
+ : {}),
1111
+ },
991
1112
  };
992
1113
  }
993
1114
  function improveSpecFromInvocation(invocation) {
994
1115
  return authoredAdapterSpecFromInvocation(invocation);
995
1116
  }
996
- function runSpecFromInvocation(invocation) {
997
- return authoredAdapterSpecFromInvocation(invocation);
998
- }
999
1117
  function authoredAdapterSpecFromInvocation(invocation) {
1000
1118
  const config = jsonRecord(invocation.with);
1001
1119
  return {
@@ -1048,9 +1166,9 @@ export function createWorkbenchRunWorkload(args) {
1048
1166
  if (!purpose) {
1049
1167
  throw new Error(`Unsupported runtime job kind: ${args.job.kind}`);
1050
1168
  }
1051
- const subjectId = readJobString(args.job.input, "subjectId") ?? args.job.subjectId;
1052
- if (!subjectId) {
1053
- throw new Error(`${purpose} execution job is missing subjectId.`);
1169
+ const candidateId = readJobString(args.job.input, "candidateId") ?? args.job.candidateId;
1170
+ if (!candidateId) {
1171
+ throw new Error(`${purpose} execution job is missing candidateId.`);
1054
1172
  }
1055
1173
  const attemptIndex = readRequiredJobNumber(args.job.input, "attemptIndex", `${purpose} execution job`);
1056
1174
  const sampleIndex = purpose === "improve"
@@ -1066,7 +1184,7 @@ export function createWorkbenchRunWorkload(args) {
1066
1184
  ? engineCaseFilesForRuntimeInput({ spec: args.spec, engineCase })
1067
1185
  : [];
1068
1186
  const engineCaseSpec = engineCase?.case;
1069
- const initial = createInitialSubjectFiles({
1187
+ const initial = createInitialCandidateFiles({
1070
1188
  baseFiles: args.baseFiles,
1071
1189
  spec: args.spec,
1072
1190
  attemptIndex,
@@ -1074,10 +1192,10 @@ export function createWorkbenchRunWorkload(args) {
1074
1192
  return {
1075
1193
  job: args.job,
1076
1194
  spec: args.spec,
1077
- subjectId,
1195
+ candidateId,
1078
1196
  attemptIndex,
1079
1197
  sampleIndex,
1080
- subjectFiles: initial.files,
1198
+ candidateFiles: initial.files,
1081
1199
  caseId,
1082
1200
  engineResolveFiles: selectedEngineResolveFiles,
1083
1201
  traceFiles: (args.traceFiles ?? []).map((file) => ({ ...file })),
@@ -1088,22 +1206,22 @@ export function createWorkbenchRunWorkload(args) {
1088
1206
  baseId: readJobString(args.job.input, "baseId"),
1089
1207
  };
1090
1208
  }
1091
- function createInitialSubjectFiles(args) {
1092
- const editablePaths = optimizerEdits(args.spec).map(normalizeRelativePath);
1209
+ function createInitialCandidateFiles(args) {
1210
+ const editablePaths = improveEdits(args.spec).map(normalizeRelativePath);
1093
1211
  const editPath = editablePaths[0];
1094
- const subjectPaths = editPath ? [editPath] : [];
1212
+ const candidatePaths = editPath ? [editPath] : [];
1095
1213
  const files = args.baseFiles.length > 0
1096
1214
  ? args.baseFiles.map((file) => ({ ...file }))
1097
1215
  : editPath
1098
1216
  ? normalizeSurfaceFiles([{ path: editPath, content: "" }])
1099
1217
  : [];
1100
1218
  const prompt = [
1101
- `Run the subject workload for benchmark: ${args.spec.benchmark.description}`,
1102
- `Attempt ${args.attemptIndex + 1} uses ${formatOptimizerSummary(args.spec)}; the improve adapter may edit the subject before Workbench scores it.`,
1219
+ `Run the candidate workload for benchmark: ${args.spec.benchmark.description}`,
1220
+ `Attempt ${args.attemptIndex + 1} uses ${formatImproveSummary(args.spec)}; the improve adapter may edit the candidate before Workbench scores it.`,
1103
1221
  ].join("\n");
1104
1222
  const byPath = new Map(files.map((file) => [file.path, file]));
1105
1223
  if (editPath &&
1106
- ![...byPath.keys()].some((filePath) => subjectPaths.includes(filePath))) {
1224
+ ![...byPath.keys()].some((filePath) => candidatePaths.includes(filePath))) {
1107
1225
  byPath.set(editPath, {
1108
1226
  path: editPath,
1109
1227
  kind: "text",
@@ -1167,7 +1285,7 @@ export function workbenchExecutionExecutorForRuntimeInput(args) {
1167
1285
  }
1168
1286
  function adapterOperationForExecutionPurpose(purpose) {
1169
1287
  if (purpose === "improve") {
1170
- return "optimizer.improve";
1288
+ return "candidate.improve";
1171
1289
  }
1172
1290
  if (purpose === "attempt") {
1173
1291
  return "engine.run";
@@ -1281,8 +1399,8 @@ function normalizeRuntimeControlInputs(value) {
1281
1399
  }
1282
1400
  const record = value;
1283
1401
  const inputs = {};
1284
- if (hasOwn(record, "subject")) {
1285
- inputs.subject = normalizeRuntimeControlFiles(record.subject, "inputs.subject");
1402
+ if (hasOwn(record, "candidate")) {
1403
+ inputs.candidate = normalizeRuntimeControlFiles(record.candidate, "inputs.candidate");
1286
1404
  }
1287
1405
  if (hasOwn(record, "case")) {
1288
1406
  inputs.case = normalizeRuntimeControlFiles(record.case, "inputs.case");
@@ -1326,8 +1444,8 @@ function normalizeRuntimeControlOperation(value, label) {
1326
1444
  const operation = record.operation;
1327
1445
  if (operation !== "engine.resolve" &&
1328
1446
  operation !== "engine.run" &&
1329
- operation !== "subject.run" &&
1330
- operation !== "optimizer.improve") {
1447
+ operation !== "candidate.run" &&
1448
+ operation !== "candidate.improve") {
1331
1449
  throw new Error(`Workbench runtime-control ${label}.operation is invalid.`);
1332
1450
  }
1333
1451
  const invocation = record.invocation;
@@ -1415,7 +1533,7 @@ export async function executeAdapterInCurrentRuntime(args, execution, startedAt,
1415
1533
  };
1416
1534
  try {
1417
1535
  if (execution.purpose === "improve") {
1418
- return await executeSubjectRevisionExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
1536
+ return await executeCandidateRevisionExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
1419
1537
  }
1420
1538
  if (execution.purpose === "attempt") {
1421
1539
  return await executeAttemptExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
@@ -1589,22 +1707,22 @@ function completedJobFromSandboxResult(fallbackJob, startedAt, result) {
1589
1707
  }
1590
1708
  return attachSandboxMetadataToJob(failWorkbenchRunJob(fallbackJob, result.startedAt || startedAt, result.error ?? `Sandbox execution ${result.status}.`, result.finishedAt), asRuntimeRecord(result.metadata).sandbox);
1591
1709
  }
1592
- async function executeSubjectRevisionExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
1710
+ async function executeCandidateRevisionExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
1593
1711
  const { workload, result } = await runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher);
1594
1712
  if (result.error || (result.exitCode ?? 0) !== 0) {
1595
1713
  return failWorkbenchRunJob(args.job, startedAt, result.error ?? `Adapter ${execution.adapter.use} exited with status ${result.exitCode}.`, result.finishedAt, result);
1596
1714
  }
1597
1715
  const finishedAt = result.finishedAt ?? new Date().toISOString();
1598
- const subjectPatch = createSubjectPatchFromResult(result, args.spec);
1599
- if (subjectPatch.fileChanges.length === 0) {
1600
- return failWorkbenchRunJob(args.job, startedAt, `${execution.adapter.use === "command" ? "Command improve adapter" : `Adapter ${execution.adapter.use}`} completed without changing subject files covered by optimizer edits.`, finishedAt, result);
1601
- }
1602
- const subjectRevisionFiles = applyWorkbenchSubjectPatch({
1603
- baseFiles: workload.subjectFiles,
1604
- patch: subjectPatch,
1605
- edits: requireOptimizerEdits(args.spec),
1716
+ const candidatePatch = createCandidatePatchFromResult(result, args.spec);
1717
+ if (candidatePatch.fileChanges.length === 0) {
1718
+ return failWorkbenchRunJob(args.job, startedAt, `${execution.adapter.use === "command" ? "Command improve adapter" : `Adapter ${execution.adapter.use}`} completed without changing candidate files covered by improve edits.`, finishedAt, result);
1719
+ }
1720
+ const candidateRevisionFiles = applyWorkbenchCandidatePatch({
1721
+ baseFiles: workload.candidateFiles,
1722
+ patch: candidatePatch,
1723
+ edits: requireImproveEdits(args.spec),
1606
1724
  });
1607
- const usage = assignUsageRole("optimizer", result.usage);
1725
+ const usage = assignUsageRole("improver", result.usage);
1608
1726
  return {
1609
1727
  ...args.job,
1610
1728
  status: "succeeded",
@@ -1616,13 +1734,13 @@ async function executeSubjectRevisionExecutionInCurrentRuntime(args, execution,
1616
1734
  ok: true,
1617
1735
  executionId: execution.id,
1618
1736
  purpose: execution.purpose,
1619
- subjectId: workload.subjectId,
1737
+ candidateId: workload.candidateId,
1620
1738
  attemptIndex: workload.attemptIndex,
1621
1739
  baseId: workload.baseId,
1622
1740
  prompt: workload.prompt,
1623
- subjectPatch,
1624
- fileChanges: subjectPatch.fileChanges,
1625
- files: subjectRevisionFiles,
1741
+ candidatePatch,
1742
+ fileChanges: candidatePatch.fileChanges,
1743
+ files: candidateRevisionFiles,
1626
1744
  traces: traceFilePaths(result.files),
1627
1745
  ...(usage ? { usage } : {}),
1628
1746
  ...(result.summary !== undefined ? { summary: result.summary } : {}),
@@ -1655,7 +1773,7 @@ async function executeAttemptExecutionInCurrentRuntime(args, execution, startedA
1655
1773
  const finishedAt = workloadResult.finishedAt ?? new Date().toISOString();
1656
1774
  const usage = attemptUsageSummary(workloadResult.usage, engineResult.usage);
1657
1775
  const sample = evaluateSample({
1658
- subjectId: workload.subjectId,
1776
+ candidateId: workload.candidateId,
1659
1777
  files: workloadResult.files,
1660
1778
  engineResolveFiles: workload.engineResolveFiles,
1661
1779
  spec: workload.spec,
@@ -1682,7 +1800,7 @@ async function executeAttemptExecutionInCurrentRuntime(args, execution, startedA
1682
1800
  ok: true,
1683
1801
  executionId: execution.id,
1684
1802
  purpose: execution.purpose,
1685
- subjectId: workload.subjectId,
1803
+ candidateId: workload.candidateId,
1686
1804
  attemptIndex: workload.attemptIndex,
1687
1805
  sampleIndex: workload.sampleIndex,
1688
1806
  caseId: workload.caseId,
@@ -1725,7 +1843,7 @@ export async function executeRuntimeControlOperationSequenceInCurrentRuntime(arg
1725
1843
  ? { adapterAuthEnv: adapterAuth.env }
1726
1844
  : {}),
1727
1845
  }, workload, args.runtimeControlOperation.operations.map((operation, index) => runtimeControlStepForOperation(operation, index, args.adapterManifests)), startedAt, {
1728
- runSubjectPrepare: args.runtimeControlOperation.prepare ?? false,
1846
+ runCandidatePrepare: args.runtimeControlOperation.prepare ?? false,
1729
1847
  workspaceFiles: args.runtimeControlOperation.inputs?.workspace ?? [],
1730
1848
  outputFiles: args.runtimeControlOperation.inputs?.output ?? [],
1731
1849
  collectWorkspace: args.runtimeControlOperation.collectWorkspace ?? false,
@@ -1823,7 +1941,7 @@ function createRuntimeControlSandboxInput(args, request) {
1823
1941
  const parentInput = asRuntimeRecord(args.job.input);
1824
1942
  const publicFiles = runtimeControlInputFiles(request.inputs, "case", parentWorkload.engineCase ? engineCasePublicFiles(parentWorkload.engineCase) : []);
1825
1943
  const privateFiles = runtimeControlInputFiles(request.inputs, "enginePrivate", parentWorkload.engineCase ? engineCasePrivateFiles(parentWorkload.engineCase) : []);
1826
- const subjectFiles = runtimeControlInputFiles(request.inputs, "subject", parentWorkload.subjectFiles);
1944
+ const candidateFiles = runtimeControlInputFiles(request.inputs, "candidate", parentWorkload.candidateFiles);
1827
1945
  const traceFiles = runtimeControlInputFiles(request.inputs, "traces", parentWorkload.traceFiles);
1828
1946
  const adapter = request.operations[request.operations.length - 1]?.invocation;
1829
1947
  const childExecution = {
@@ -1866,7 +1984,7 @@ function createRuntimeControlSandboxInput(args, request) {
1866
1984
  const childArgs = {
1867
1985
  ...args,
1868
1986
  job: childJob,
1869
- baseFiles: subjectFiles,
1987
+ baseFiles: candidateFiles,
1870
1988
  engineResolveFiles: [...publicFiles, ...privateFiles],
1871
1989
  engineCases: [engineCase],
1872
1990
  traceFiles,
@@ -1890,10 +2008,10 @@ function runtimeControlStepForOperation(operation, index, manifests = []) {
1890
2008
  ...(operation.invocation.auth !== undefined ? { auth: operation.invocation.auth } : {}),
1891
2009
  }, operation.operation, manifests).command;
1892
2010
  return {
1893
- kind: operation.operation === "subject.run"
1894
- ? "subject"
1895
- : operation.operation === "optimizer.improve"
1896
- ? "optimizer"
2011
+ kind: operation.operation === "candidate.run"
2012
+ ? "candidate"
2013
+ : operation.operation === "candidate.improve"
2014
+ ? "improver"
1897
2015
  : "engine",
1898
2016
  label: operation.label ?? `${operation.operation.replace(".", "_")}_${index + 1}`,
1899
2017
  operation: operation.operation,
@@ -1960,8 +2078,8 @@ function isWorkbenchAdapterOperationResult(value) {
1960
2078
  return record.protocol === "workbench.adapter-result.v1" &&
1961
2079
  (record.operation === "engine.resolve" ||
1962
2080
  record.operation === "engine.run" ||
1963
- record.operation === "subject.run" ||
1964
- record.operation === "optimizer.improve");
2081
+ record.operation === "candidate.run" ||
2082
+ record.operation === "candidate.improve");
1965
2083
  }
1966
2084
  function cloneSurfaceFiles(files) {
1967
2085
  return files.map((file) => ({ ...file, path: normalizeRelativePath(file.path) }));
@@ -2040,9 +2158,11 @@ async function runHostedCommandExecutionSteps(args, workload, steps, startedAt,
2040
2158
  const stepTimeoutMs = environmentVersion
2041
2159
  ? environmentVersionTimeoutMs(environmentVersion)
2042
2160
  : 5 * 60 * 1000;
2043
- const shouldRunSubjectPrepare = options.runSubjectPrepare ?? steps.some((step) => step.executor === "sandbox");
2044
- if (shouldRunSubjectPrepare) {
2045
- await runSubjectPrepareCommand({
2161
+ const shouldRunCandidatePrepare = options.runCandidatePrepare ??
2162
+ (readWorkloadExecutionPurpose(workload) === "attempt" &&
2163
+ steps.some((step) => step.executor === "sandbox"));
2164
+ if (shouldRunCandidatePrepare) {
2165
+ await runCandidatePrepareCommand({
2046
2166
  root: workspace.root,
2047
2167
  workload,
2048
2168
  execution,
@@ -2081,6 +2201,9 @@ async function runHostedCommandExecutionSteps(args, workload, steps, startedAt,
2081
2201
  });
2082
2202
  const operationResult = await readWorkbenchAdapterOperationResult(outputDir(workspace.root), step.operation);
2083
2203
  assertWorkbenchAdapterOperationResultOk(operationResult, `Adapter ${step.adapter?.use ?? execution.adapter.use} ${step.operation}`);
2204
+ await writeSurfaceFiles(outputDir(workspace.root), [
2205
+ textSurfaceFile(`.workbench/traces/${workload.job.id}/${step.label}/result.json`, `${JSON.stringify(operationResult, null, 2)}\n`),
2206
+ ]);
2084
2207
  operationResults.push(operationResult);
2085
2208
  await publishCommandStepEvent(options.eventPublisher, {
2086
2209
  step: step.label,
@@ -2132,19 +2255,19 @@ async function runHostedCommandExecutionSteps(args, workload, steps, startedAt,
2132
2255
  await workspace.cleanup();
2133
2256
  }
2134
2257
  }
2135
- async function runSubjectPrepareCommand(args) {
2136
- const command = args.workload.spec.subject.prepare?.command;
2258
+ async function runCandidatePrepareCommand(args) {
2259
+ const command = args.workload.spec.candidate.prepare?.command;
2137
2260
  if (!command) {
2138
2261
  return;
2139
2262
  }
2140
- const role = args.execution.purpose === "improve" ? "optimizer" : "runner";
2263
+ const role = args.execution.purpose === "improve" ? "improver" : "runner";
2141
2264
  await publishCommandStepEvent(args.eventPublisher, {
2142
- step: "subject_prepare",
2265
+ step: "candidate_prepare",
2143
2266
  status: "started",
2144
2267
  role,
2145
2268
  });
2146
2269
  try {
2147
- const shellCommand = createHostedWorkloadShellCommand(args.root, command, "subject_prepare");
2270
+ const shellCommand = createHostedWorkloadShellCommand(args.root, command, "candidate_prepare");
2148
2271
  await args.execFileAsync("sh", ["-c", shellCommand], {
2149
2272
  cwd: args.root,
2150
2273
  env: createHostedWorkloadPrepareEnv(args.root),
@@ -2152,20 +2275,20 @@ async function runSubjectPrepareCommand(args) {
2152
2275
  timeout: args.timeoutMs,
2153
2276
  });
2154
2277
  await publishCommandStepEvent(args.eventPublisher, {
2155
- step: "subject_prepare",
2278
+ step: "candidate_prepare",
2156
2279
  status: "succeeded",
2157
2280
  role,
2158
2281
  });
2159
2282
  }
2160
2283
  catch (error) {
2161
2284
  await publishCommandStepEvent(args.eventPublisher, {
2162
- step: "subject_prepare",
2285
+ step: "candidate_prepare",
2163
2286
  status: "failed",
2164
2287
  exitCode: readExitCode(error),
2165
2288
  error: error instanceof Error ? error.message : String(error),
2166
2289
  role,
2167
2290
  });
2168
- throw new Error(`Subject prepare command failed: ${error instanceof Error ? error.message : String(error)}`);
2291
+ throw new Error(`Candidate prepare command failed: ${error instanceof Error ? error.message : String(error)}`);
2169
2292
  }
2170
2293
  }
2171
2294
  async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
@@ -2204,10 +2327,10 @@ async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
2204
2327
  };
2205
2328
  }
2206
2329
  function stepEventRole(step) {
2207
- if (step.kind === "optimizer") {
2208
- return "optimizer";
2330
+ if (step.kind === "improver") {
2331
+ return "improver";
2209
2332
  }
2210
- if (step.kind === "subject") {
2333
+ if (step.kind === "candidate") {
2211
2334
  return "runner";
2212
2335
  }
2213
2336
  if (step.kind === "engine") {
@@ -2219,10 +2342,10 @@ function adapterOperationUsageSummary(result) {
2219
2342
  if (hasExplicitUsageRole(result.usage)) {
2220
2343
  return completeUsageSummary(result.usage);
2221
2344
  }
2222
- if (result.operation === "optimizer.improve") {
2223
- return assignUsageRole("optimizer", result.usage);
2345
+ if (result.operation === "candidate.improve") {
2346
+ return assignUsageRole("improver", result.usage);
2224
2347
  }
2225
- if (result.operation === "subject.run") {
2348
+ if (result.operation === "candidate.run") {
2226
2349
  return assignUsageRole("runner", result.usage);
2227
2350
  }
2228
2351
  if (result.operation === "engine.run") {
@@ -2239,16 +2362,16 @@ function attemptUsageSummary(workloadUsage, resultUsage) {
2239
2362
  }
2240
2363
  function hasExplicitUsageRole(usage) {
2241
2364
  const normalized = completeUsageSummary(usage);
2242
- return Boolean(normalized?.optimizer || normalized?.runner || normalized?.engine);
2365
+ return Boolean(normalized?.improver || normalized?.runner || normalized?.engine);
2243
2366
  }
2244
- function createSubjectPatchFromResult(result, spec) {
2245
- if (result.subjectPatch) {
2246
- return result.subjectPatch;
2367
+ function createCandidatePatchFromResult(result, spec) {
2368
+ if (result.candidatePatch) {
2369
+ return result.candidatePatch;
2247
2370
  }
2248
2371
  const changedEditPaths = result.fileChanges
2249
2372
  .map(normalizeRelativePath)
2250
2373
  .filter((filePath) => !filePath.startsWith(".workbench/") &&
2251
- isSubjectEditPath(filePath, optimizerEdits(spec)));
2374
+ isCandidateEditPath(filePath, improveEdits(spec)));
2252
2375
  const changedSet = new Set(changedEditPaths);
2253
2376
  const files = result.files
2254
2377
  .filter((file) => changedSet.has(normalizeRelativePath(file.path)))
@@ -2260,7 +2383,7 @@ function createSubjectPatchFromResult(result, spec) {
2260
2383
  ...(result.feedback !== undefined ? { feedback: result.feedback } : {}),
2261
2384
  };
2262
2385
  }
2263
- function isSubjectEditPath(filePath, edits) {
2386
+ function isCandidateEditPath(filePath, edits) {
2264
2387
  const normalized = normalizeRelativePath(filePath);
2265
2388
  return edits.some((entry) => {
2266
2389
  const editPath = normalizeRelativePath(entry).replace(/\/+$/u, "");
@@ -2320,21 +2443,33 @@ export async function stageWorkbenchRunWorkload(root, workload) {
2320
2443
  ]);
2321
2444
  await fs.mkdir(inputDir(root), { recursive: true });
2322
2445
  await fs.mkdir(outputDir(root), { recursive: true });
2446
+ await clearMutableWorkspaceFiles(root);
2323
2447
  if (purpose === "attempt") {
2324
- await fs.mkdir(subjectDir(root), { recursive: true });
2448
+ await fs.mkdir(candidateDir(root), { recursive: true });
2325
2449
  await fs.mkdir(caseDir(root), { recursive: true });
2326
2450
  const engineCase = requireWorkloadEngineCase(workload, "Attempt staging");
2327
- await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
2451
+ await writeSurfaceFiles(candidateDir(root), workload.candidateFiles);
2328
2452
  await writeSurfaceFiles(caseDir(root), engineCasePublicFiles(engineCase));
2329
2453
  return;
2330
2454
  }
2331
2455
  if (purpose === "improve") {
2332
- await fs.mkdir(subjectDir(root), { recursive: true });
2333
- await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
2456
+ await writeSurfaceFiles(root, workload.candidateFiles.filter((file) => isMutableWorkspaceSnapshotPath(file.path)));
2334
2457
  await fs.mkdir(tracesDir(root), { recursive: true });
2335
2458
  await writeSurfaceFiles(tracesDir(root), workload.traceFiles);
2336
2459
  }
2337
2460
  }
2461
+ async function clearMutableWorkspaceFiles(root) {
2462
+ const fs = await importNodeModule(nodeBuiltin("fs/promises"));
2463
+ const path = await importNodeModule(nodeBuiltin("path"));
2464
+ const entries = await fs.readdir(root, { withFileTypes: true }).catch(() => []);
2465
+ await Promise.all(entries.map(async (entry) => {
2466
+ const relativePath = normalizeRelativePath(entry.name);
2467
+ if (!isMutableWorkspaceSnapshotPath(relativePath)) {
2468
+ return;
2469
+ }
2470
+ await fs.rm(path.join(root, entry.name), { recursive: true, force: true });
2471
+ }));
2472
+ }
2338
2473
  async function stageWorkbenchEnginePrivateFiles(root, workload) {
2339
2474
  if (readWorkloadExecutionPurpose(workload) !== "attempt") {
2340
2475
  return;
@@ -2417,7 +2552,7 @@ function adapterFilePathWithinRoot(filePath, sourceRoot) {
2417
2552
  }
2418
2553
  async function readHostedRunFailureResult(root, workload, options) {
2419
2554
  const traceFiles = await readRuntimeTraceFiles(root, workload);
2420
- const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root)));
2555
+ const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root), { ignorePath: isWorkbenchInternalOutputPath }));
2421
2556
  const startedAt = options.startedAt ?? new Date().toISOString();
2422
2557
  const finishedAt = new Date().toISOString();
2423
2558
  const files = [...outputFiles, ...traceFiles].sort((left, right) => left.path.localeCompare(right.path));
@@ -2433,13 +2568,13 @@ async function readHostedRunFailureResult(root, workload, options) {
2433
2568
  async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
2434
2569
  const path = await importNodeModule(nodeBuiltin("path"));
2435
2570
  const traceFiles = await readRuntimeTraceFiles(root, workload);
2436
- const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root)));
2571
+ const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root), { ignorePath: isWorkbenchInternalOutputPath }));
2437
2572
  const outputExitCode = await readOptionalNumber(path.join(outputDir(root), "exit_code"));
2438
2573
  const startedAt = options.startedAt ?? new Date().toISOString();
2439
2574
  const finishedAt = new Date().toISOString();
2440
2575
  const purpose = readWorkloadExecutionPurpose(workload);
2441
2576
  const primaryOperation = purpose === "improve"
2442
- ? "optimizer.improve"
2577
+ ? "candidate.improve"
2443
2578
  : "engine.run";
2444
2579
  const primaryResult = [...(options.operationResults ?? [])]
2445
2580
  .reverse()
@@ -2453,9 +2588,9 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
2453
2588
  const cases = normalizeResultCases(resultPayload.cases);
2454
2589
  const includeResultScoring = purpose === "attempt";
2455
2590
  const files = [...outputFiles, ...traceFiles].sort((left, right) => left.path.localeCompare(right.path));
2456
- const subjectPatch = purpose === "improve" ? primaryResult?.value : undefined;
2591
+ const candidatePatch = purpose === "improve" ? primaryResult?.value : undefined;
2457
2592
  const engineResult = purpose === "attempt" ? primaryResult?.value : undefined;
2458
- const declaredChanges = subjectPatch?.fileChanges ??
2593
+ const declaredChanges = candidatePatch?.fileChanges ??
2459
2594
  (Array.isArray(resultPayload.fileChanges)
2460
2595
  ? resultPayload.fileChanges.filter((entry) => typeof entry === "string")
2461
2596
  : files.map((file) => file.path));
@@ -2463,7 +2598,7 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
2463
2598
  files,
2464
2599
  fileChanges: declaredChanges,
2465
2600
  ...(options.operationResults ? { operationResults: [...options.operationResults] } : {}),
2466
- ...(subjectPatch ? { subjectPatch } : {}),
2601
+ ...(candidatePatch ? { candidatePatch } : {}),
2467
2602
  ...(engineResult ? { result: engineResult } : {}),
2468
2603
  ...(includeResultScoring && metrics ? { metrics } : {}),
2469
2604
  ...(includeResultScoring && cases ? { cases } : {}),
@@ -2537,8 +2672,8 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, step, aut
2537
2672
  await fs.mkdir(path.dirname(requestPath), { recursive: true });
2538
2673
  const casePrompt = workload.engineCaseSpec?.prompt;
2539
2674
  const adapter = step.adapter ?? execution.adapter;
2540
- const subjectCommand = adapterProtocolCommandSpec(workload.spec.run, "subject.run", manifests).command;
2541
- await fs.writeFile(requestPath, `${JSON.stringify({
2675
+ const candidateCommand = adapterProtocolCommandSpec(workload.spec.run, "candidate.run", manifests).command;
2676
+ const payload = {
2542
2677
  protocol: "workbench.adapter.v3",
2543
2678
  id: execution.id,
2544
2679
  jobId: workload.job.id,
@@ -2554,17 +2689,17 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, step, aut
2554
2689
  name: workload.spec.benchmark.name,
2555
2690
  description: workload.spec.benchmark.description,
2556
2691
  },
2557
- subject: {
2558
- id: workload.subjectId,
2559
- path: workload.spec.subject.files.path,
2560
- ...(workload.spec.subject.prepare ? { prepare: { ...workload.spec.subject.prepare } } : {}),
2692
+ candidate: {
2693
+ id: workload.candidateId,
2694
+ path: workload.spec.candidate.files.path,
2695
+ ...(workload.spec.candidate.prepare ? { prepare: { ...workload.spec.candidate.prepare } } : {}),
2561
2696
  run: {
2562
2697
  ...workload.spec.run,
2563
- command: subjectCommand,
2698
+ command: candidateCommand,
2564
2699
  },
2565
2700
  },
2566
- ...(workload.spec.optimizer
2567
- ? { optimizer: { edits: [...workload.spec.optimizer.edits] } }
2701
+ ...(workload.spec.candidate.improve
2702
+ ? { improve: { edits: [...workload.spec.candidate.improve.edits] } }
2568
2703
  : {}),
2569
2704
  attempt: {
2570
2705
  attemptIndex: workload.attemptIndex,
@@ -2580,21 +2715,41 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, step, aut
2580
2715
  workspace: root,
2581
2716
  output: outputDir(root),
2582
2717
  result: workbenchAdapterOperationResultPath(outputDir(root)),
2583
- subject: subjectDir(root),
2718
+ ...(readWorkloadExecutionPurpose(workload) === "attempt" ? { candidate: candidateDir(root) } : {}),
2584
2719
  ...(workload.engineCaseSpec ? { case: caseDir(root) } : {}),
2585
2720
  traces: tracesDir(root),
2586
2721
  ...(step.kind === "engine" ? { enginePrivate: runtimeEnginePrivateDir(root) } : {}),
2587
2722
  },
2588
- }, null, 2)}\n`);
2723
+ };
2724
+ await fs.writeFile(requestPath, `${JSON.stringify(payload, null, 2)}\n`);
2725
+ await writeSurfaceFiles(outputDir(root), [
2726
+ textSurfaceFile(`.workbench/traces/${workload.job.id}/${step.label}/request.json`, `${JSON.stringify(sanitizeAdapterRequestTracePayload(payload), null, 2)}\n`),
2727
+ ]);
2589
2728
  return requestPath;
2590
2729
  }
2591
- function optimizerEdits(spec) {
2592
- return spec.optimizer?.edits ?? [];
2730
+ function sanitizeAdapterRequestTracePayload(value) {
2731
+ if (Array.isArray(value)) {
2732
+ return value.map((entry) => sanitizeAdapterRequestTracePayload(entry));
2733
+ }
2734
+ if (!value || typeof value !== "object") {
2735
+ return (value ?? null);
2736
+ }
2737
+ const sanitized = {};
2738
+ for (const [key, entry] of Object.entries(value)) {
2739
+ if (key === "auth" || key === "enginePrivate") {
2740
+ continue;
2741
+ }
2742
+ sanitized[key] = sanitizeAdapterRequestTracePayload(entry);
2743
+ }
2744
+ return sanitized;
2745
+ }
2746
+ function improveEdits(spec) {
2747
+ return spec.candidate.improve?.edits ?? [];
2593
2748
  }
2594
- function requireOptimizerEdits(spec) {
2595
- const edits = optimizerEdits(spec);
2749
+ function requireImproveEdits(spec) {
2750
+ const edits = improveEdits(spec);
2596
2751
  if (edits.length === 0) {
2597
- throw new Error("Optimizer YAML must declare at least one entry in edits.");
2752
+ throw new Error("Candidate improve configuration must declare at least one entry in edits.");
2598
2753
  }
2599
2754
  return edits;
2600
2755
  }
@@ -2691,8 +2846,8 @@ function requireWorkloadEngineCase(workload, label) {
2691
2846
  }
2692
2847
  return workload.engineCase;
2693
2848
  }
2694
- function subjectDir(root) {
2695
- return `${inputDir(root)}/subject`;
2849
+ function candidateDir(root) {
2850
+ return `${inputDir(root)}/candidate`;
2696
2851
  }
2697
2852
  function caseDir(root) {
2698
2853
  return `${inputDir(root)}/case`;
@@ -2727,7 +2882,7 @@ async function writeSurfaceFiles(root, files) {
2727
2882
  }
2728
2883
  }
2729
2884
  }
2730
- async function readSurfaceFiles(root) {
2885
+ async function readSurfaceFiles(root, options = {}) {
2731
2886
  const fs = await importNodeModule(nodeBuiltin("fs/promises"));
2732
2887
  const path = await importNodeModule(nodeBuiltin("path"));
2733
2888
  const utf8Decoder = new TextDecoder("utf-8", { fatal: true });
@@ -2738,6 +2893,10 @@ async function readSurfaceFiles(root) {
2738
2893
  .catch(() => []);
2739
2894
  for (const entry of entries) {
2740
2895
  const absolutePath = path.join(directory, entry.name);
2896
+ const relativePath = normalizeRelativePath(path.relative(root, absolutePath).replace(/\\/gu, "/"));
2897
+ if (options.ignorePath?.(relativePath)) {
2898
+ continue;
2899
+ }
2741
2900
  if (entry.isDirectory()) {
2742
2901
  await walk(absolutePath);
2743
2902
  continue;
@@ -2745,9 +2904,18 @@ async function readSurfaceFiles(root) {
2745
2904
  if (!entry.isFile()) {
2746
2905
  continue;
2747
2906
  }
2748
- const relativePath = normalizeRelativePath(path.relative(root, absolutePath).replace(/\\/gu, "/"));
2749
- const body = await fs.readFile(absolutePath);
2750
- const stats = await fs.stat(absolutePath);
2907
+ let body;
2908
+ let stats;
2909
+ try {
2910
+ body = await fs.readFile(absolutePath);
2911
+ stats = await fs.stat(absolutePath);
2912
+ }
2913
+ catch (error) {
2914
+ if (isVanishedWalkEntry(error)) {
2915
+ continue;
2916
+ }
2917
+ throw error;
2918
+ }
2751
2919
  const content = encodeSurfaceSnapshotContent(body, utf8Decoder);
2752
2920
  files.push({
2753
2921
  path: relativePath,
@@ -2761,6 +2929,10 @@ async function readSurfaceFiles(root) {
2761
2929
  await walk(root);
2762
2930
  return files.sort((left, right) => left.path.localeCompare(right.path));
2763
2931
  }
2932
+ function isVanishedWalkEntry(error) {
2933
+ const code = error?.code;
2934
+ return code === "ENOENT" || code === "ENOTDIR";
2935
+ }
2764
2936
  function encodeSurfaceSnapshotContent(body, utf8Decoder) {
2765
2937
  try {
2766
2938
  return {
@@ -2943,7 +3115,13 @@ function evaluateSample(args) {
2943
3115
  if (metrics.score === undefined) {
2944
3116
  metrics.score = sampleScore;
2945
3117
  }
2946
- const cases = args.workload.cases?.length ? args.workload.cases : undefined;
3118
+ const cases = runtimeTimedCaseResults({
3119
+ caseId: args.caseId,
3120
+ status: "completed",
3121
+ durationMs,
3122
+ metrics,
3123
+ cases: args.workload.cases,
3124
+ });
2947
3125
  const feedback = {
2948
3126
  ...(args.workload.summary !== undefined
2949
3127
  ? { summary: args.workload.summary }
@@ -2956,10 +3134,10 @@ function evaluateSample(args) {
2956
3134
  return {
2957
3135
  id: `${args.caseId}__sample_${String(args.sampleIndex + 1).padStart(3, "0")}`,
2958
3136
  index: args.sampleIndex,
2959
- subject: {
2960
- id: args.subjectId,
2961
- kind: "subject",
2962
- label: args.subjectId,
3137
+ candidate: {
3138
+ id: args.candidateId,
3139
+ kind: "candidate",
3140
+ label: args.candidateId,
2963
3141
  },
2964
3142
  status: "completed",
2965
3143
  startedAt: args.startedAt,
@@ -2967,7 +3145,7 @@ function evaluateSample(args) {
2967
3145
  durationMs,
2968
3146
  metrics,
2969
3147
  ...(usage ? { usage } : {}),
2970
- ...(cases ? { cases } : {}),
3148
+ cases,
2971
3149
  feedback,
2972
3150
  };
2973
3151
  }
@@ -2976,7 +3154,7 @@ function normalizeSampleJobOutput(value) {
2976
3154
  return null;
2977
3155
  }
2978
3156
  const record = value;
2979
- if (record.ok !== true || typeof record.subjectId !== "string") {
3157
+ if (record.ok !== true || typeof record.candidateId !== "string") {
2980
3158
  return null;
2981
3159
  }
2982
3160
  const files = Array.isArray(record.files)
@@ -2991,7 +3169,7 @@ function normalizeSampleJobOutput(value) {
2991
3169
  return null;
2992
3170
  }
2993
3171
  return {
2994
- subjectId: record.subjectId,
3172
+ candidateId: record.candidateId,
2995
3173
  attemptIndex: record.attemptIndex,
2996
3174
  sample,
2997
3175
  fileChanges: Array.isArray(record.fileChanges)
@@ -3006,9 +3184,57 @@ function normalizeSampleJobOutput(value) {
3006
3184
  function normalizeEvaluationSampleOutputs(args) {
3007
3185
  return args.jobs.flatMap((job) => {
3008
3186
  const output = normalizeSampleJobOutput(job.output);
3009
- return output ? [{ jobs: [job], output }] : [];
3187
+ if (!output) {
3188
+ return [];
3189
+ }
3190
+ const caseId = readJobString(job.input, "caseId") ?? output.sample.cases?.[0]?.id ?? null;
3191
+ const durationMs = runtimeJobDurationMs(job) ?? output.sample.durationMs;
3192
+ const sample = caseId && typeof durationMs === "number" && Number.isFinite(durationMs)
3193
+ ? {
3194
+ ...output.sample,
3195
+ cases: runtimeTimedCaseResults({
3196
+ caseId,
3197
+ status: output.sample.status === "error" ? "error" : "completed",
3198
+ durationMs,
3199
+ metrics: output.sample.metrics ?? {},
3200
+ cases: output.sample.cases,
3201
+ }),
3202
+ }
3203
+ : output.sample;
3204
+ return [{
3205
+ jobs: [job],
3206
+ output: {
3207
+ ...output,
3208
+ sample,
3209
+ },
3210
+ }];
3010
3211
  });
3011
3212
  }
3213
+ function runtimeTimedCaseResults(args) {
3214
+ const cases = args.cases?.length
3215
+ ? args.cases
3216
+ : [{
3217
+ id: args.caseId,
3218
+ status: args.status,
3219
+ metrics: args.metrics,
3220
+ }];
3221
+ return cases.map((entry) => ({
3222
+ ...entry,
3223
+ status: entry.status ?? args.status,
3224
+ metrics: entry.metrics ?? args.metrics,
3225
+ durationMs: args.durationMs,
3226
+ }));
3227
+ }
3228
+ function runtimeJobDurationMs(job) {
3229
+ if (typeof job.startedAt !== "string" || typeof job.finishedAt !== "string") {
3230
+ return undefined;
3231
+ }
3232
+ const startedMs = Date.parse(job.startedAt);
3233
+ const finishedMs = Date.parse(job.finishedAt);
3234
+ return Number.isFinite(startedMs) && Number.isFinite(finishedMs)
3235
+ ? Math.max(0, finishedMs - startedMs)
3236
+ : undefined;
3237
+ }
3012
3238
  function meanFinite(values) {
3013
3239
  const finite = values.filter((value) => typeof value === "number" && Number.isFinite(value));
3014
3240
  if (finite.length === 0) {
@@ -3039,12 +3265,12 @@ function withJobUsage(sample, _jobs, attemptJob) {
3039
3265
  usage,
3040
3266
  };
3041
3267
  }
3042
- function normalizeSubjectRevisionJobOutput(value) {
3268
+ function normalizeCandidateRevisionJobOutput(value) {
3043
3269
  if (!value || typeof value !== "object" || Array.isArray(value)) {
3044
3270
  return null;
3045
3271
  }
3046
3272
  const record = value;
3047
- if (record.ok !== true || typeof record.subjectId !== "string") {
3273
+ if (record.ok !== true || typeof record.candidateId !== "string") {
3048
3274
  return null;
3049
3275
  }
3050
3276
  const files = Array.isArray(record.files)
@@ -3056,7 +3282,7 @@ function normalizeSubjectRevisionJobOutput(value) {
3056
3282
  }
3057
3283
  const usage = normalizeUsageSummary(record.usage);
3058
3284
  return {
3059
- subjectId: record.subjectId,
3285
+ candidateId: record.candidateId,
3060
3286
  attemptIndex: record.attemptIndex,
3061
3287
  baseId: typeof record.baseId === "string" && record.baseId.length > 0
3062
3288
  ? record.baseId
@@ -3072,7 +3298,7 @@ function normalizeSubjectRevisionJobOutput(value) {
3072
3298
  ...(usage ? { usage } : {}),
3073
3299
  };
3074
3300
  }
3075
- function errorEvaluationSamplesFromJobs(jobs, subjectId, attemptIndex, completedSampleKeys) {
3301
+ function errorEvaluationSamplesFromJobs(jobs, candidateId, attemptIndex, completedSampleKeys) {
3076
3302
  const groups = new Map();
3077
3303
  for (const job of jobs) {
3078
3304
  const key = evaluationSampleGroupKeyFromJob(job);
@@ -3082,10 +3308,10 @@ function errorEvaluationSamplesFromJobs(jobs, subjectId, attemptIndex, completed
3082
3308
  groups.set(key, [...(groups.get(key) ?? []), job]);
3083
3309
  }
3084
3310
  return [...groups.values()]
3085
- .map((group) => errorEvaluationSampleFromJobGroup(group, subjectId, attemptIndex))
3311
+ .map((group) => errorEvaluationSampleFromJobGroup(group, candidateId, attemptIndex))
3086
3312
  .filter((sample) => sample !== null);
3087
3313
  }
3088
- function errorEvaluationSampleFromJobGroup(jobs, subjectId, attemptIndex) {
3314
+ function errorEvaluationSampleFromJobGroup(jobs, candidateId, attemptIndex) {
3089
3315
  const job = jobs[0];
3090
3316
  if (!job) {
3091
3317
  return null;
@@ -3097,25 +3323,27 @@ function errorEvaluationSampleFromJobGroup(jobs, subjectId, attemptIndex) {
3097
3323
  }
3098
3324
  const startedAt = minIsoTimestamp(jobs.map((entry) => entry.startedAt ?? entry.createdAt));
3099
3325
  const finishedAt = maxIsoTimestamp(jobs.map((entry) => entry.finishedAt ?? entry.updatedAt ?? entry.startedAt));
3326
+ const durationMs = startedAt && finishedAt
3327
+ ? Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt))
3328
+ : undefined;
3100
3329
  const error = summarizeEvaluationJobErrors(jobs) ?? "Evaluation job did not produce a valid sample.";
3101
3330
  return {
3102
3331
  id: `${caseId}__sample_${String(sampleIndex + 1).padStart(3, "0")}`,
3103
3332
  index: sampleIndex,
3104
- subject: {
3105
- id: subjectId,
3106
- kind: "subject",
3107
- label: subjectId,
3333
+ candidate: {
3334
+ id: candidateId,
3335
+ kind: "candidate",
3336
+ label: candidateId,
3108
3337
  },
3109
3338
  status: "error",
3110
3339
  ...(startedAt ? { startedAt } : {}),
3111
3340
  ...(finishedAt ? { finishedAt } : {}),
3112
- ...(startedAt && finishedAt
3113
- ? { durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)) }
3114
- : {}),
3341
+ ...(durationMs !== undefined ? { durationMs } : {}),
3115
3342
  ...(error ? { error } : {}),
3116
3343
  cases: [{
3117
3344
  id: caseId,
3118
3345
  status: "error",
3346
+ ...(durationMs !== undefined ? { durationMs } : {}),
3119
3347
  metrics: {},
3120
3348
  ...(error ? { feedback: { summary: error } } : {}),
3121
3349
  }],
@@ -3171,13 +3399,13 @@ function compareSampleOutputs(left, right) {
3171
3399
  }
3172
3400
  return left.sample.id.localeCompare(right.sample.id);
3173
3401
  }
3174
- function createEvaluationRecord(subjectId, subjectName, rawSamples) {
3175
- const samples = mergeEvaluationSampleRecords(rawSamples).map((sample) => subjectName
3402
+ function createEvaluationRecord(candidateId, candidateName, rawSamples) {
3403
+ const samples = mergeEvaluationSampleRecords(rawSamples).map((sample) => candidateName
3176
3404
  ? {
3177
3405
  ...sample,
3178
- subject: {
3179
- ...sample.subject,
3180
- label: subjectName,
3406
+ candidate: {
3407
+ ...sample.candidate,
3408
+ label: candidateName,
3181
3409
  },
3182
3410
  }
3183
3411
  : sample);
@@ -3191,10 +3419,10 @@ function createEvaluationRecord(subjectId, subjectName, rawSamples) {
3191
3419
  const errorSampleCount = samples.filter((sample) => sample.status === "error")
3192
3420
  .length;
3193
3421
  return {
3194
- subject: {
3195
- id: subjectId,
3196
- kind: "subject",
3197
- ...(subjectName ? { label: subjectName } : {}),
3422
+ candidate: {
3423
+ id: candidateId,
3424
+ kind: "candidate",
3425
+ ...(candidateName ? { label: candidateName } : {}),
3198
3426
  },
3199
3427
  status: samples.length > 0 && completedSampleCount === samples.length
3200
3428
  ? "completed"
@@ -3215,7 +3443,7 @@ function createEvaluationRecord(subjectId, subjectName, rawSamples) {
3215
3443
  samples,
3216
3444
  };
3217
3445
  }
3218
- function normalizedSubjectDisplayName(value) {
3446
+ function normalizedCandidateDisplayName(value) {
3219
3447
  const normalized = value?.trim();
3220
3448
  return normalized ? normalized : null;
3221
3449
  }
@@ -3263,7 +3491,7 @@ function mergeEvaluationSampleGroup(group) {
3263
3491
  return {
3264
3492
  id: `sample_${String(first.index + 1).padStart(3, "0")}`,
3265
3493
  index: first.index,
3266
- subject: first.subject,
3494
+ candidate: first.candidate,
3267
3495
  status: mergeEvaluationSampleStatus(group),
3268
3496
  ...(startedAt ? { startedAt } : {}),
3269
3497
  ...(finishedAt ? { finishedAt } : {}),
@@ -3355,34 +3583,28 @@ function aggregateCaseStatus(results) {
3355
3583
  }
3356
3584
  return undefined;
3357
3585
  }
3358
- function evaluationMeanMetrics(evaluation) {
3359
- const entries = Object.entries(evaluation.metrics ?? {}).filter((entry) => Number.isFinite(entry[1].mean));
3360
- return entries.length > 0
3361
- ? Object.fromEntries(entries.map(([key, stats]) => [key, Number(stats.mean.toFixed(3))]))
3362
- : undefined;
3363
- }
3364
- function selectSubject(args) {
3365
- let selected = args.previousSubject;
3366
- for (const subject of args.subjects) {
3367
- if (!selected || hasHigherScore(subject, selected)) {
3368
- selected = subject;
3586
+ function selectCandidate(args) {
3587
+ let selected = args.previousCandidate;
3588
+ for (const candidate of args.candidates) {
3589
+ if (!selected || hasHigherScore(candidate, selected)) {
3590
+ selected = candidate;
3369
3591
  }
3370
3592
  }
3371
3593
  return selected;
3372
3594
  }
3373
- function hasHigherScore(subject, incumbent) {
3374
- const subjectValue = readMetric(subject, "score");
3375
- const incumbentValue = readMetric(incumbent, "score");
3376
- if (subjectValue == null) {
3595
+ function hasHigherScore(candidate, incumbent) {
3596
+ const candidateValue = readEvaluationMean(candidate.eval, "score");
3597
+ const incumbentValue = readEvaluationMean(incumbent.eval, "score");
3598
+ if (candidateValue == null) {
3377
3599
  return false;
3378
3600
  }
3379
3601
  if (incumbentValue == null) {
3380
3602
  return true;
3381
3603
  }
3382
- return subjectValue > incumbentValue;
3604
+ return candidateValue > incumbentValue;
3383
3605
  }
3384
- function readMetric(subject, metric) {
3385
- const direct = subject.metrics?.[metric];
3606
+ function readEvaluationMean(evaluation, metric) {
3607
+ const direct = evaluation?.metrics?.[metric]?.mean;
3386
3608
  return typeof direct === "number" && Number.isFinite(direct) ? direct : null;
3387
3609
  }
3388
3610
  function metricStats(values) {
@@ -3501,7 +3723,7 @@ function isEvaluationSampleRecord(value) {
3501
3723
  !Array.isArray(value) &&
3502
3724
  typeof record.id === "string" &&
3503
3725
  typeof record.index === "number" &&
3504
- typeof record.subject === "object" &&
3726
+ typeof record.candidate === "object" &&
3505
3727
  isEvaluationSampleStatus(record.status) &&
3506
3728
  hasOperationalCaseStatuses(record.cases));
3507
3729
  }