@workbench-ai/workbench-core 0.0.49 → 0.0.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -4,19 +4,19 @@ import path from "node:path";
4
4
  import { fileURLToPath } from "node:url";
5
5
  import YAML from "yaml";
6
6
  import { adapterCommandName, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, parseWorkbenchAdapterManifest, readWorkbenchAdapterOperationResult, WORKBENCH_RUNTIME_CONTROL_TOKEN_ENV, WORKBENCH_RUNTIME_CONTROL_URL_ENV, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
7
- import { BENCHMARK_SPEC_FILE, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal, isWorkbenchSubjectManifestPath, } from "./generic-spec.js";
7
+ import { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal, isWorkbenchCandidateManifestPath, } from "./generic-spec.js";
8
8
  import { attachSandboxMetadataToJob, createWorkbenchSandboxFileStore, isSurfaceSnapshotFile, readWorkbenchExecutionSpec, } from "./sandbox-inputs.js";
9
9
  import { asRuntimeRecord, importNodeModule, isJsonPayload, jsonRecord, nodeBuiltin, quoteShellArg, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
10
10
  import { createWorkbenchExecutionCapability, createWorkbenchSandboxAllocation, collectExecutionCapabilityScopeIssues, collectSandboxAllocationScopeIssues, collectSandboxHandleScopeIssues, assertSandboxBackendSupportsNetworkPolicy, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
11
11
  import { createSandboxBackendPlaneForProvider, } from "./sandbox-backends/index.js";
12
- import { applyWorkbenchSubjectPatch } from "./subject-patch.js";
12
+ import { applyWorkbenchCandidatePatch } from "./candidate-patch.js";
13
13
  import { assignUsageRole, completeUsageSummary, mergeUsageSummaries, normalizeUsageSummary, usageStats, } from "./execution-usage.js";
14
14
  import { traceFilePaths, workbenchTraceExecutionDirectory, } from "./trace-files.js";
15
15
  import { engineCaseForCase, } from "./execution-jobs.js";
16
16
  import { createWorkbenchExecutionEventPublisher, publishCommandStepEvent, } from "./execution-events.js";
17
17
  import { readWorkbenchExecutionPurpose } from "./execution-evidence.js";
18
18
  import { adapterAuthEnv, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
19
- export { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml, isWorkbenchSubjectManifestPath, parseWorkbenchSourceFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, resolveWorkbenchSourceFiles, runtimeNetwork, runtimeResources, serializeWorkbenchResolvedSourceYaml, validateWorkbenchResolvedSourceYaml, } from "./generic-spec.js";
19
+ export { BENCHMARK_SPEC_FILE, CANDIDATE_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml, isWorkbenchCandidateManifestPath, parseWorkbenchSourceFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, resolveWorkbenchSourceFiles, runtimeNetwork, runtimeResources, serializeWorkbenchResolvedSourceYaml, validateWorkbenchResolvedSourceYaml, } from "./generic-spec.js";
20
20
  export { composeRuntimeDockerfileWithAdapterInstallers, } from "./runtime-dockerfile.js";
21
21
  export { adapterCommandName, cloneWorkbenchAdapterManifest, collectWorkbenchAdapterAuthRequirements, collectWorkbenchAdapterInvocations, parseWorkbenchAdapterManifest, workbenchAdapterManifestRequiresAuth, workbenchAdapterManifestSupportsOperation, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, withDefaultWorkbenchAdapterAuth, withDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
22
22
  export { adapterAuthEnv, createWorkbenchAdapterAuthBundle, defaultWorkbenchAdapterAuthStoreRoot, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, parseWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
@@ -26,16 +26,127 @@ export { createWorkbenchProgressStdoutParser, publishWorkbenchProgressStdoutEnve
26
26
  export { resolveSandboxTemplateImage, } from "./sandbox-backends/template-images.js";
27
27
  export { readOutputTraceFiles, workbenchTraceExecutionDirectory, workbenchTraceRunDirectory, workbenchTraceRunDirectoryName, } from "./trace-files.js";
28
28
  export { assertWorkbenchAdapterOperationSupport, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterOperationIssues, collectWorkbenchAdapterOperationRequirements, ensureWorkbenchAdapterOutputDir, WORKBENCH_ADAPTER_RESULT_FILE, normalizeWorkbenchAdapterOperationRequest, normalizeWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, writeWorkbenchAdapterOperationResult, } from "@workbench-ai/workbench-protocol";
29
- export { applyWorkbenchSubjectPatch, } from "./subject-patch.js";
29
+ export { applyWorkbenchCandidatePatch, } from "./candidate-patch.js";
30
30
  export { createWorkbenchSandboxFileStore, createSandboxAdapterRequest, executionResultFromCompletedSandboxJob, materializeWorkbenchSandboxInput, readWorkbenchExecutionSpec, sanitizeWorkbenchExecutionJobForSandbox, } from "./sandbox-inputs.js";
31
31
  export { compileWorkbenchExecutionGraph, } from "./execution-graph.js";
32
- export { createBaselineSubjectExecution, createBaselineSubjectJob, createWorkbenchExecutionJob, expectedWorkbenchRunJobCount, engineCaseForCase, engineCaseIds, attemptJobCountForRunSpec, workbenchExecutionJobPurpose, MAX_WORKBENCH_RUN_BUDGET, planWorkbenchExecutionJobsForPurpose, validateWorkbenchRunEnvelope, workbenchExecutionJobId, } from "./execution-jobs.js";
32
+ export { createBaselineCandidateExecution, createBaselineCandidateJob, createWorkbenchExecutionJob, expectedWorkbenchRunJobCount, engineCaseForCase, engineCaseIds, attemptJobCountForRunSpec, workbenchExecutionJobPurpose, MAX_WORKBENCH_RUN_BUDGET, planWorkbenchExecutionJobsForPurpose, validateWorkbenchRunEnvelope, workbenchExecutionJobId, } from "./execution-jobs.js";
33
33
  export { addCapacity, capacityFits, runWorkbenchExecutionDag, subtractCapacity, workbenchJobDependencies, workbenchJobHostCost, workbenchJobResources, } from "./execution-scheduler.js";
34
34
  export { assertWorkbenchExecutionIsolation, collectWorkbenchExecutionIsolationIssues, validateWorkbenchExecutionOutputPayloads, } from "./execution-outputs.js";
35
35
  export { collectSandboxAllocationScopeIssues, collectExecutionCapabilityScopeIssues, collectSandboxHandleScopeIssues, createWorkbenchSandboxAllocation, createWorkbenchSandboxExecutionMetadata, createWorkbenchExecutionCapability, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
36
- export { buildSubjectCaseExecutionRefs, buildWorkbenchExecutionEvidence, isWorkbenchExecutionActive, readWorkbenchExecutionId, readWorkbenchExecutionMetadataNumber, readWorkbenchExecutionMetadataString, readWorkbenchExecutionPurpose, resolveWorkbenchJobGroupStatus, } from "./execution-evidence.js";
36
+ export { buildCandidateCaseExecutionRefs, buildWorkbenchExecutionEvidence, isWorkbenchExecutionActive, readWorkbenchExecutionId, readWorkbenchExecutionMetadataNumber, readWorkbenchExecutionMetadataString, readWorkbenchExecutionPurpose, resolveWorkbenchJobGroupStatus, } from "./execution-evidence.js";
37
37
  export { buildWorkbenchTraceSessionsFromFiles, combineWorkbenchTraceSessions, finalizeWorkbenchExecutionTraceForJob, mergeWorkbenchExecutionTracesByJob, readWorkbenchExecutionTraceFiles, traceSessionLabel, } from "./execution-traces.js";
38
38
  export { DOCKER_SANDBOX_BACKEND, assertSandboxHostHealthForProvider, createDockerSandboxBackendDescriptor, createDockerSandboxPlane, resolveWorkbenchSandboxProviderName, sandboxProviderAdmissionForResources, sandboxProviderDefaultMaxConcurrentJobs, sandboxProviderLeaseScope, sandboxHostHealthExpectationForProvider, } from "./sandbox-backends/index.js";
39
+ export function sanitizeWorkbenchRuntimeJobForExchange(job) {
40
+ const { leaseUntil: _leaseUntil, wakeupLeaseUntil: _wakeupLeaseUntil, hostId: _hostId, workerId: _workerId, claimTokenHash: _claimTokenHash, trace: _trace, traceSessions: _traceSessions, ...portable } = job;
41
+ return { ...portable };
42
+ }
43
+ export function sanitizeWorkbenchRuntimeCandidateForExchange(candidate) {
44
+ const { ownerUserId: _ownerUserId, ownerUsername: _ownerUsername, visibility: _visibility, metrics: _metrics, candidateRunId: _candidateRunId, candidateRunName: _candidateRunName, ...portable } = candidate;
45
+ return { ...portable };
46
+ }
47
+ export function workbenchProjectSourceFingerprint(input) {
48
+ const canonical = {
49
+ sourceYaml: normalizeTextForProjectStateFingerprint(input.source),
50
+ candidateFiles: canonicalFilesForProjectStateFingerprint(input.candidateFiles),
51
+ engineResolveFiles: canonicalFilesForProjectStateFingerprint(input.engineResolveFiles),
52
+ engineResolveBinding: {
53
+ engine: input.engineResolveBinding.engine,
54
+ resolver: {
55
+ use: input.engineResolveBinding.resolver.use,
56
+ withFingerprint: input.engineResolveBinding.resolver.withFingerprint,
57
+ },
58
+ },
59
+ adapterFiles: canonicalFilesForProjectStateFingerprint(input.adapterFiles),
60
+ runtimeFiles: canonicalFilesForProjectStateFingerprint(input.runtimeFiles),
61
+ dockerfile: normalizeTextForProjectStateFingerprint(input.dockerfile),
62
+ runtimeDockerfile: normalizeTextForProjectStateFingerprint(input.runtimeDockerfile),
63
+ resources: normalizeProjectStateResources(input.resources),
64
+ network: input.network,
65
+ };
66
+ return createHash("sha256").update(JSON.stringify(canonicalizeProjectState(canonical))).digest("hex");
67
+ }
68
+ export function workbenchRuntimeBundleFingerprint(bundle) {
69
+ const canonical = {
70
+ schema: bundle.schema,
71
+ activeId: bundle.activeId,
72
+ candidates: sortByStableKey(bundle.candidates.map(sanitizeWorkbenchRuntimeCandidateForExchange), (candidate) => candidate.id),
73
+ candidateFiles: sortByStableKey(bundle.candidateFiles.map((group) => ({
74
+ candidateId: group.candidateId,
75
+ files: canonicalFilesForProjectStateFingerprint(group.files),
76
+ })), (group) => group.candidateId),
77
+ evaluations: sortByStableKey(bundle.evaluations, (evaluation) => evaluation.id),
78
+ runs: sortByStableKey(bundle.runs, (run) => run.id),
79
+ jobs: sortByStableKey(bundle.jobs.map(runtimeJobForProjectStateFingerprint), (job) => job.id),
80
+ executionFiles: sortByStableKey(bundle.executionFiles.map((group) => ({
81
+ jobId: group.jobId,
82
+ files: canonicalFilesForProjectStateFingerprint(group.files),
83
+ })), (group) => group.jobId),
84
+ events: sortByStableKey(bundle.events, (event) => [event.runId ?? "_", event.jobId ?? "_", event.at, event.id].join("#")),
85
+ };
86
+ return createHash("sha256").update(JSON.stringify(canonicalizeProjectState(canonical))).digest("hex");
87
+ }
88
+ export function workbenchSurfaceFilesEqualForExchange(left, right) {
89
+ return JSON.stringify(canonicalFilesForProjectStateFingerprint(left)) ===
90
+ JSON.stringify(canonicalFilesForProjectStateFingerprint(right));
91
+ }
92
+ export function workbenchRuntimeBundleStats(bundle) {
93
+ return {
94
+ candidates: bundle.candidates.length,
95
+ candidateFiles: bundle.candidateFiles.reduce((sum, group) => sum + group.files.length, 0),
96
+ evaluations: bundle.evaluations.length,
97
+ runs: bundle.runs.length,
98
+ jobs: bundle.jobs.length,
99
+ executionFiles: bundle.executionFiles.reduce((sum, group) => sum + group.files.length, 0),
100
+ events: bundle.events.length,
101
+ activeId: bundle.activeId,
102
+ };
103
+ }
104
+ function runtimeJobForProjectStateFingerprint(job) {
105
+ const portable = sanitizeWorkbenchRuntimeJobForExchange(job);
106
+ const output = portable.output;
107
+ if (!output || typeof output !== "object" || Array.isArray(output)) {
108
+ return portable;
109
+ }
110
+ const { files: _files, fileSet: _fileSet, ...portableOutput } = output;
111
+ return {
112
+ ...portable,
113
+ output: portableOutput,
114
+ };
115
+ }
116
+ function canonicalFilesForProjectStateFingerprint(files) {
117
+ return sortByStableKey(files.map((file) => ({
118
+ path: file.path,
119
+ encoding: file.encoding,
120
+ executable: Boolean(file.executable),
121
+ content: file.content,
122
+ })), (file) => file.path);
123
+ }
124
+ function normalizeTextForProjectStateFingerprint(value) {
125
+ return value.replace(/\r\n/gu, "\n").replace(/\r/gu, "\n");
126
+ }
127
+ function normalizeProjectStateResources(resources) {
128
+ return {
129
+ cpu: resources.cpu ?? DEFAULT_EXECUTION_RESOURCES.cpu,
130
+ memoryGb: resources.memoryGb ?? DEFAULT_EXECUTION_RESOURCES.memoryGb,
131
+ diskGb: resources.diskGb ?? DEFAULT_EXECUTION_RESOURCES.diskGb,
132
+ timeoutMinutes: resources.timeoutMinutes ?? DEFAULT_EXECUTION_RESOURCES.timeoutMinutes,
133
+ };
134
+ }
135
+ function sortByStableKey(items, keyFor) {
136
+ return [...items].sort((left, right) => keyFor(left).localeCompare(keyFor(right)));
137
+ }
138
+ function canonicalizeProjectState(value) {
139
+ if (Array.isArray(value)) {
140
+ return value.map(canonicalizeProjectState);
141
+ }
142
+ if (!value || typeof value !== "object") {
143
+ return value;
144
+ }
145
+ const record = value;
146
+ return Object.fromEntries(Object.keys(record)
147
+ .sort()
148
+ .map((key) => [key, canonicalizeProjectState(record[key])]));
149
+ }
39
150
  export const DEFAULT_ENVIRONMENT_VERSIONS = [
40
151
  {
41
152
  id: "envv_python_3_12",
@@ -153,7 +264,7 @@ export const DEFAULT_ENVIRONMENTS = [
153
264
  {
154
265
  id: "env_node",
155
266
  name: "Node",
156
- description: "Node runtime for JavaScript and TypeScript subjects.",
267
+ description: "Node runtime for JavaScript and TypeScript candidates.",
157
268
  currentVersionId: "envv_node_22",
158
269
  builtIn: true,
159
270
  createdAt: "2026-04-23T00:00:00.000Z",
@@ -191,8 +302,7 @@ function splitAuthoredSourceYaml(sourceYaml) {
191
302
  }
192
303
  const entries = [
193
304
  [BENCHMARK_SPEC_FILE, parsed.benchmark],
194
- ["subjects/current/subject.yaml", splitSubjectSourceRecord(parsed.subject)],
195
- ["optimizers/current.yaml", splitOptimizerSourceRecord(parsed.optimizer)],
305
+ ["candidates/current/candidate.yaml", splitCandidateSourceRecord(parsed.candidate)],
196
306
  ];
197
307
  return entries.flatMap(([filePath, value]) => {
198
308
  if (!value || typeof value !== "object" || Array.isArray(value)) {
@@ -204,23 +314,20 @@ function splitAuthoredSourceYaml(sourceYaml) {
204
314
  }];
205
315
  });
206
316
  }
207
- function splitSubjectSourceRecord(value) {
317
+ function splitCandidateSourceRecord(value) {
208
318
  const record = cloneYamlRecord(value);
209
319
  if (!record) {
210
320
  return value;
211
321
  }
212
322
  delete record.benchmark;
213
323
  delete record.path;
214
- rewriteAdapterSources(record, "subjects");
324
+ stripCandidateRuntimeSelection(record);
325
+ rewriteAdapterSources(record, "candidates/current");
215
326
  return record;
216
327
  }
217
- function splitOptimizerSourceRecord(value) {
218
- const record = cloneYamlRecord(value);
219
- if (!record) {
220
- return value;
221
- }
222
- rewriteAdapterSources(record, "optimizers");
223
- return record;
328
+ function stripCandidateRuntimeSelection(record) {
329
+ delete record.selectedRunId;
330
+ delete record.selectedRunName;
224
331
  }
225
332
  function cloneYamlRecord(value) {
226
333
  return value && typeof value === "object" && !Array.isArray(value)
@@ -242,11 +349,10 @@ function sourcePathRelativeTo(yamlDir, sourcePath) {
242
349
  }
243
350
  function isAuthoredSourceYamlPath(filePath) {
244
351
  return filePath === BENCHMARK_SPEC_FILE ||
245
- isWorkbenchSubjectManifestPath(filePath) ||
246
- /^optimizers\/[^/]+\.ya?ml$/iu.test(filePath);
352
+ isWorkbenchCandidateManifestPath(filePath);
247
353
  }
248
- function formatOptimizerSummary(spec) {
249
- return spec.improve ? `adapter:${spec.improve.use}` : "optimizer not configured";
354
+ function formatImproveSummary(spec) {
355
+ return spec.improve ? `adapter:${spec.improve.use}` : "improve not configured";
250
356
  }
251
357
  function formatEngineRunSummary(spec) {
252
358
  return `adapter:${spec.engineRun.use}`;
@@ -287,10 +393,10 @@ function protocolStepForExecution(execution, manifests) {
287
393
  if (execution.purpose !== "improve") {
288
394
  throw new Error(`Protocol execution step only supports improve executions, not ${execution.purpose}.`);
289
395
  }
290
- const operation = "optimizer.improve";
396
+ const operation = "candidate.improve";
291
397
  const command = adapterProtocolCommandSpec(execution.adapter, operation, manifests);
292
398
  return {
293
- kind: "optimizer",
399
+ kind: "improver",
294
400
  label: execution.purpose,
295
401
  operation,
296
402
  executor: command.executor,
@@ -387,35 +493,32 @@ export function materializeWorkbenchRunResult(args) {
387
493
  const completed = args.jobs.filter((job) => job.status === "succeeded");
388
494
  const failedJobCount = args.jobs.filter((job) => job.status === "failed").length;
389
495
  const completedJobCount = args.jobs.filter((job) => job.status === "succeeded").length;
390
- const subjectRevisions = completed
496
+ const candidateRevisions = completed
391
497
  .filter((job) => workbenchExecutionPurpose(job) === "improve")
392
- .map((job) => normalizeSubjectRevisionJobOutput(job.output))
498
+ .map((job) => normalizeCandidateRevisionJobOutput(job.output))
393
499
  .filter((output) => output !== null)
394
500
  .sort((left, right) => left.attemptIndex - right.attemptIndex);
395
501
  const evaluationJobs = args.jobs.filter((job) => workbenchExecutionPurpose(job) === "attempt");
396
- const evaluationsBySubject = new Map();
502
+ const evaluationsByCandidate = new Map();
397
503
  for (const job of evaluationJobs) {
398
- const subjectId = readJobString(job.output, "subjectId") ??
399
- readJobString(job.input, "subjectId") ??
400
- job.subjectId;
401
- if (subjectId) {
402
- evaluationsBySubject.set(subjectId, [
403
- ...(evaluationsBySubject.get(subjectId) ?? []),
504
+ const candidateId = readJobString(job.output, "candidateId") ??
505
+ readJobString(job.input, "candidateId") ??
506
+ job.candidateId;
507
+ if (candidateId) {
508
+ evaluationsByCandidate.set(candidateId, [
509
+ ...(evaluationsByCandidate.get(candidateId) ?? []),
404
510
  job,
405
511
  ]);
406
512
  }
407
513
  }
408
- const subjects = [];
409
- const subjectFiles = {};
514
+ const candidates = [];
515
+ const candidateFiles = {};
410
516
  const evaluations = [];
411
- for (const subjectRevision of subjectRevisions) {
412
- const subjectId = subjectRevision.subjectId;
413
- const subjectJobs = evaluationsBySubject.get(subjectId) ?? [];
414
- const succeededEvaluationJobs = subjectJobs.filter((job) => job.status === "succeeded");
415
- const outputs = normalizeEvaluationSampleOutputs({
416
- jobs: succeededEvaluationJobs,
417
- allJobs: completed,
418
- })
517
+ for (const candidateRevision of candidateRevisions) {
518
+ const candidateId = candidateRevision.candidateId;
519
+ const candidateJobs = evaluationsByCandidate.get(candidateId) ?? [];
520
+ const succeededEvaluationJobs = candidateJobs.filter((job) => job.status === "succeeded");
521
+ const outputs = normalizeEvaluationSampleOutputs(succeededEvaluationJobs)
419
522
  .sort((left, right) => compareSampleOutputs(left.output, right.output));
420
523
  const outputJobIds = new Set(outputs.flatMap(({ jobs }) => jobs.map((job) => job.id)));
421
524
  const completedSampleKeys = new Set(outputs
@@ -425,39 +528,38 @@ export function materializeWorkbenchRunResult(args) {
425
528
  ])
426
529
  .filter((key) => key !== null));
427
530
  const errorSampleJobs = [
428
- ...subjectJobs.filter((job) => job.status === "failed"),
531
+ ...candidateJobs.filter((job) => job.status === "failed"),
429
532
  ...succeededEvaluationJobs.filter((job) => !outputJobIds.has(job.id)),
430
533
  ];
431
- const errorSamples = errorEvaluationSamplesFromJobs(errorSampleJobs, subjectId, subjectRevision.attemptIndex, completedSampleKeys);
534
+ const errorSamples = errorEvaluationSamplesFromJobs(errorSampleJobs, candidateId, candidateRevision.attemptIndex, completedSampleKeys);
432
535
  const samples = [
433
536
  ...outputs.map(({ jobs, output }) => withJobUsage(output.sample, completed, jobs[0])),
434
537
  ...errorSamples,
435
538
  ].sort((left, right) => left.index - right.index || left.id.localeCompare(right.id));
436
- const subjectName = normalizedSubjectDisplayName(args.spec.subject.name);
437
- const evalRecord = createEvaluationRecord(subjectId, subjectName, samples);
539
+ const candidateName = normalizedCandidateDisplayName(args.spec.candidate.name);
540
+ const evalRecord = createEvaluationRecord(candidateId, candidateName, samples);
438
541
  const usage = mergeUsageSummaries([
439
- subjectRevision.usage,
542
+ candidateRevision.usage,
440
543
  ...samples.map((sample) => sample.usage),
441
544
  ]);
442
- const metrics = evaluationMeanMetrics(evalRecord);
443
- const attemptIndex = subjectRevision.attemptIndex;
545
+ const attemptIndex = candidateRevision.attemptIndex;
444
546
  const evaluationTraces = [
445
547
  ...outputs.flatMap(({ output }) => output.traces),
446
548
  ...errorSampleJobs.flatMap(jobTracePaths),
447
549
  ].sort();
448
- const baseId = subjectRevision.baseId && subjectRevision.baseId !== subjectId
449
- ? subjectRevision.baseId
550
+ const baseId = candidateRevision.baseId && candidateRevision.baseId !== candidateId
551
+ ? candidateRevision.baseId
450
552
  : null;
451
- const sourceMeta = subjectSourceMetadata(args.subjectSourceFiles);
553
+ const sourceMeta = candidateSourceMetadata(args.candidateSourceFiles);
452
554
  const benchmarkMeta = benchmarkSourceMetadata(args.benchmarkSourceFiles);
453
555
  const meta = {
454
556
  attemptIndex,
455
557
  sampleCount: evalRecord.sampleCount,
456
- optimizer: formatOptimizerSummary(args.spec),
558
+ improver: formatImproveSummary(args.spec),
457
559
  engineRun: formatEngineRunSummary(args.spec),
458
560
  strategy: "greedy",
459
561
  traces: {
460
- improve: subjectRevision.traces,
562
+ improve: candidateRevision.traces,
461
563
  evaluations: evaluationTraces,
462
564
  },
463
565
  };
@@ -467,52 +569,124 @@ export function materializeWorkbenchRunResult(args) {
467
569
  if (benchmarkMeta) {
468
570
  meta.benchmark = benchmarkMeta;
469
571
  }
470
- const record = {
471
- id: subjectId,
472
- ...(subjectName ? { name: subjectName } : {}),
473
- ordinal: args.existingSubjectCount + subjects.length,
474
- benchmarkFingerprint: args.benchmarkFingerprint,
475
- subjectFingerprint: args.subjectFingerprint ?? materializedSubjectFingerprint(args.spec, subjectRevision.files),
476
- createdAt: args.startedAt,
477
- ...(baseId ? { baseId } : {}),
478
- referenceIds: [],
479
- status: evalRecord.completedSampleCount > 0 ? "evaluated" : "eval_error",
480
- fileChanges: subjectRevision.fileChanges,
481
- ...(metrics ? { metrics } : {}),
482
- ...(usage ? { usage } : {}),
483
- eval: evalRecord,
484
- ...(subjectRevision.prompt ? { prompt: subjectRevision.prompt } : {}),
485
- meta,
486
- };
487
- subjects.push(record);
572
+ const record = preserveExistingCandidateIdentity({
573
+ candidate: {
574
+ id: candidateId,
575
+ ...(candidateName ? { name: candidateName } : {}),
576
+ version: args.existingCandidateCount + candidates.length + 1,
577
+ ordinal: args.existingCandidateCount + candidates.length + 1,
578
+ benchmarkFingerprint: args.benchmarkFingerprint,
579
+ candidateFingerprint: args.candidateFingerprint ?? materializedCandidateFingerprint(args.spec, candidateRevision.files),
580
+ createdAt: args.startedAt,
581
+ ...(baseId ? { baseId } : {}),
582
+ referenceIds: [],
583
+ status: evalRecord.completedSampleCount > 0 ? "evaluated" : "eval_error",
584
+ fileChanges: candidateRevision.fileChanges,
585
+ ...(usage ? { usage } : {}),
586
+ eval: evalRecord,
587
+ ...(candidateRevision.prompt ? { prompt: candidateRevision.prompt } : {}),
588
+ meta,
589
+ },
590
+ previousCandidate: args.previousCandidate ?? null,
591
+ });
592
+ candidates.push(record);
488
593
  evaluations.push(createEvaluationScorecard({
489
594
  runId: args.runId,
490
595
  benchmarkFingerprint: args.benchmarkFingerprint,
491
596
  createdAt: args.startedAt,
492
- subject: record,
597
+ candidate: record,
598
+ candidateRunId: args.spec.candidate.selectedRunId,
599
+ candidateRunName: args.spec.candidate.selectedRunName,
493
600
  evaluation: evalRecord,
601
+ ...(args.selection
602
+ ? {
603
+ selection: {
604
+ metric: args.selection.metric,
605
+ caseIds: args.selection.caseIds,
606
+ ...(args.selection.label ? { label: args.selection.label } : {}),
607
+ },
608
+ }
609
+ : {}),
494
610
  }));
495
- subjectFiles[subjectId] = materializedSubjectFiles({
496
- subjectRevisionFiles: subjectRevision.files,
611
+ candidateFiles[candidateId] = materializedCandidateFiles({
612
+ candidateRevisionFiles: candidateRevision.files,
497
613
  });
498
614
  }
499
- const selectedSubject = selectSubject({
500
- subjects,
501
- previousSubject: args.previousSubject ?? null,
615
+ const selectedCandidate = selectCandidate({
616
+ candidates,
617
+ previousCandidate: args.previousCandidate ?? null,
618
+ selection: args.selection,
502
619
  });
503
620
  return {
504
- subjects,
505
- subjectFiles,
621
+ candidates,
622
+ candidateFiles,
506
623
  evaluations,
507
- activeSubjectId: selectedSubject?.id ?? args.previousSubject?.id ?? null,
508
- selectedSubject,
624
+ activeCandidateId: selectedCandidate?.id ?? args.previousCandidate?.id ?? null,
625
+ selectedCandidate,
509
626
  completedJobCount,
510
627
  failedJobCount,
511
628
  };
512
629
  }
513
- function subjectSourceMetadata(files) {
630
+ function preserveExistingCandidateIdentity(args) {
631
+ const previous = args.previousCandidate;
632
+ if (!previous || previous.id !== args.candidate.id) {
633
+ return args.candidate;
634
+ }
635
+ const baseId = args.candidate.baseId ?? previous.baseId;
636
+ const prompt = args.candidate.prompt ?? previous.prompt;
637
+ const meta = mergeExistingCandidateMeta(previous.meta, args.candidate.meta);
638
+ return {
639
+ ...args.candidate,
640
+ version: previous.version,
641
+ ordinal: previous.version,
642
+ createdAt: previous.createdAt,
643
+ ...(args.candidate.name ?? previous.name
644
+ ? { name: (args.candidate.name ?? previous.name) }
645
+ : {}),
646
+ ...(baseId ? { baseId } : {}),
647
+ referenceIds: previous.referenceIds.length > 0
648
+ ? [...previous.referenceIds]
649
+ : args.candidate.referenceIds,
650
+ fileChanges: args.candidate.fileChanges.length > 0
651
+ ? args.candidate.fileChanges
652
+ : [...previous.fileChanges],
653
+ ...(prompt ? { prompt } : {}),
654
+ ...(meta ? { meta } : {}),
655
+ };
656
+ }
657
+ function mergeExistingCandidateMeta(previousMeta, candidateMeta) {
658
+ const previous = jsonRecord(previousMeta);
659
+ const candidate = jsonRecord(candidateMeta);
660
+ if (!previous) {
661
+ return candidateMeta;
662
+ }
663
+ if (!candidate) {
664
+ return previousMeta;
665
+ }
666
+ const previousTraces = jsonRecord(previous.traces);
667
+ const candidateTraces = jsonRecord(candidate.traces);
668
+ if (!previousTraces || !candidateTraces) {
669
+ return { ...previous, ...candidate };
670
+ }
671
+ const traces = {
672
+ ...previousTraces,
673
+ ...candidateTraces,
674
+ };
675
+ const candidateImproveTraces = Array.isArray(candidateTraces.improve)
676
+ ? candidateTraces.improve
677
+ : [];
678
+ if (candidateImproveTraces.length === 0 && previousTraces.improve !== undefined) {
679
+ traces.improve = previousTraces.improve;
680
+ }
681
+ return {
682
+ ...previous,
683
+ ...candidate,
684
+ traces,
685
+ };
686
+ }
687
+ function candidateSourceMetadata(files) {
514
688
  const sourceFiles = (files ?? [])
515
- .filter((file) => /^subjects\/[^/]+\/subject\.ya?ml$/iu.test(file.path))
689
+ .filter((file) => /^candidates\/[^/]+\/candidate\.ya?ml$/iu.test(file.path))
516
690
  .sort((left, right) => left.path.localeCompare(right.path))
517
691
  .map((file) => ({
518
692
  path: file.path,
@@ -536,14 +710,13 @@ function benchmarkSourceMetadata(files) {
536
710
  }));
537
711
  return sourceFiles.length > 0 ? { files: sourceFiles } : null;
538
712
  }
539
- function materializedSubjectFingerprint(spec, files) {
713
+ function materializedCandidateFingerprint(spec, files) {
540
714
  const hash = createHash("sha256");
541
- hash.update("workbench-subject-v1\0");
542
- hash.update("materialized\0runner\0");
543
- hash.update(JSON.stringify(spec.run));
715
+ hash.update("workbench-candidate-v1\0");
716
+ hash.update("materialized\0");
544
717
  hash.update("prepare");
545
- hash.update(JSON.stringify(spec.subject.prepare ?? null));
546
- for (const file of filterSubjectSourceFiles(files).slice().sort((left, right) => left.path.localeCompare(right.path))) {
718
+ hash.update(JSON.stringify(spec.candidate.prepare ?? null));
719
+ for (const file of filterCandidateSourceFiles(files).slice().sort((left, right) => left.path.localeCompare(right.path))) {
547
720
  hash.update("\0file\0");
548
721
  hash.update(file.path);
549
722
  hash.update("\0");
@@ -555,22 +728,28 @@ function materializedSubjectFingerprint(spec, files) {
555
728
  }
556
729
  return hash.digest("hex");
557
730
  }
558
- function materializedSubjectFiles(args) {
731
+ function materializedCandidateFiles(args) {
559
732
  const byPath = new Map();
560
- for (const file of filterSubjectSourceFiles(args.subjectRevisionFiles)) {
733
+ for (const file of filterCandidateSourceFiles(args.candidateRevisionFiles)) {
561
734
  byPath.set(file.path, { ...file });
562
735
  }
563
736
  return [...byPath.values()].sort((left, right) => left.path.localeCompare(right.path));
564
737
  }
565
738
  function createEvaluationScorecard(args) {
566
739
  const evaluation = args.evaluation;
740
+ const selectionScore = args.selection
741
+ ? readEvaluationSelectionStats(evaluation, args.selection.metric, args.selection.caseIds)
742
+ : null;
567
743
  return {
568
- id: evaluationScorecardId(args.runId, args.subject.id),
744
+ id: evaluationScorecardId(args.runId, args.candidate.id),
569
745
  runId: args.runId,
570
746
  benchmarkFingerprint: args.benchmarkFingerprint,
571
- subjectFingerprint: args.subject.subjectFingerprint,
572
- subjectId: args.subject.id,
573
- ...(args.subject.name ? { subjectName: args.subject.name } : {}),
747
+ candidateFingerprint: args.candidate.candidateFingerprint,
748
+ candidateId: args.candidate.id,
749
+ ...(args.candidate.name ? { candidateName: args.candidate.name } : {}),
750
+ candidateVersion: args.candidate.version,
751
+ ...(args.candidateRunId ? { candidateRunId: args.candidateRunId } : {}),
752
+ ...(args.candidateRunName ? { candidateRunName: args.candidateRunName } : {}),
574
753
  createdAt: args.createdAt,
575
754
  updatedAt: evaluation.finishedAt ?? args.createdAt,
576
755
  status: evaluation.status,
@@ -578,16 +757,19 @@ function createEvaluationScorecard(args) {
578
757
  completedSampleCount: evaluation.completedSampleCount,
579
758
  errorSampleCount: evaluation.errorSampleCount,
580
759
  ...(evaluation.metrics ? { metrics: evaluation.metrics } : {}),
760
+ ...(args.selection ? { selectionMetric: args.selection.metric } : {}),
761
+ ...(args.selection ? { selectionLabel: args.selection.label ?? `${args.selection.metric} on selected cases` } : {}),
762
+ ...(selectionScore ? { selectionScore } : {}),
581
763
  ...(evaluation.durationMs ? { durationMs: evaluation.durationMs } : {}),
582
764
  ...(evaluation.usage ? { usage: evaluation.usage } : {}),
583
765
  ...(evaluation.error ? { error: evaluation.error } : {}),
584
766
  evaluation,
585
767
  };
586
768
  }
587
- export function evaluationScorecardId(runId, subjectId) {
769
+ export function evaluationScorecardId(runId, candidateId) {
588
770
  const runPart = runId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
589
- const subjectPart = subjectId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
590
- return `eval_${runPart}_${subjectPart}`;
771
+ const candidatePart = candidateId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
772
+ return `eval_${runPart}_${candidatePart}`;
591
773
  }
592
774
  export function selectExecutionOutputFilesForInspection(args) {
593
775
  return args.files.filter((file) => !isWorkbenchInternalOutputPath(file.path));
@@ -602,56 +784,145 @@ export function isWorkbenchInternalOutputPath(filePath) {
602
784
  normalized === "exit_code" ||
603
785
  /^[a-z_-]+_(stdout\.log|stderr\.log|exit_code)$/u.test(normalized));
604
786
  }
605
- export function createSubjectRevisionTraceInputFiles(args) {
787
+ export function createOptimizerTraceInputFiles(args) {
606
788
  const files = [];
607
- const manifestJobs = [];
789
+ const executions = [];
608
790
  const jobs = args.jobs
609
- .filter((job) => job.runId === args.runId && isTerminalExecutionJob(job))
791
+ .filter(isOptimizerTraceInputJob)
610
792
  .sort(compareTraceInputJobs);
611
- for (const job of jobs) {
793
+ jobs.forEach((job, index) => {
794
+ const sequence = String(index + 1).padStart(6, "0");
795
+ const executionPath = `executions/${sequence}`;
796
+ const operation = "engine.run";
612
797
  const jobFiles = completedJobOutputFiles(job);
613
- const rawTraceFiles = jobFiles.filter((file) => normalizeRelativePath(file.path).startsWith(".workbench/traces/"));
614
- files.push(...rawTraceFiles.map((file) => ({ ...file })));
615
- const events = args.events
616
- .filter((event) => event.runId === args.runId && event.jobId === job.id)
617
- .sort((left, right) => left.at.localeCompare(right.at));
618
- const eventPath = `events/${job.id}.ndjson`;
619
- if (events.length > 0) {
620
- files.push(textSurfaceFile(eventPath, `${events.map((event) => JSON.stringify(event)).join("\n")}\n`));
621
- }
622
- const summaryPath = `jobs/${job.id}.json`;
623
- const summary = subjectRevisionTraceJobSummary(job, {
624
- eventPath: events.length > 0 ? eventPath : null,
625
- rawTracePaths: rawTraceFiles.map((file) => file.path).sort(),
626
- });
627
- files.push(textSurfaceFile(summaryPath, `${JSON.stringify(summary, null, 2)}\n`));
628
- manifestJobs.push({
629
- ...summary,
630
- summary_path: summaryPath,
798
+ const requestFile = traceInputRequestFile(jobFiles, operation);
799
+ const resultFile = traceInputResultFile(jobFiles, operation);
800
+ const requestPath = `${executionPath}/request.json`;
801
+ const resultPath = `${executionPath}/result.json`;
802
+ const filesPath = `${executionPath}/files`;
803
+ files.push(textSurfaceFile(requestPath, requestFile?.content ?? `${JSON.stringify(traceInputRequestFallback(job, operation), null, 2)}\n`));
804
+ files.push(textSurfaceFile(resultPath, resultFile?.content ?? `${JSON.stringify(traceInputResultFallback(job, operation), null, 2)}\n`));
805
+ files.push(...jobFiles.map((file) => ({
806
+ ...file,
807
+ path: normalizeRelativePath(`${filesPath}/${file.path}`),
808
+ })));
809
+ executions.push({
810
+ path: executionPath,
811
+ operation,
812
+ status: job.status,
813
+ candidateId: job.candidateId ?? readJobString(job.input, "candidateId") ?? null,
814
+ runId: job.runId,
815
+ jobId: job.id,
816
+ attemptIndex: readOptionalJobNumber(job.input, "attemptIndex") ?? null,
817
+ sampleIndex: readOptionalJobNumber(job.input, "sampleIndex") ?? null,
818
+ caseId: readJobString(job.input, "caseId") ?? null,
819
+ requestPath,
820
+ resultPath,
821
+ filesPath,
631
822
  });
632
- }
633
- files.push(textSurfaceFile("manifest.json", `${JSON.stringify({
634
- run_id: args.runId,
635
- jobs: manifestJobs,
823
+ });
824
+ files.push(textSurfaceFile("index.json", `${JSON.stringify({
825
+ schema: "workbench.optimizer-traces.v1",
826
+ executions,
636
827
  }, null, 2)}\n`));
637
828
  return dedupeSurfaceFiles(files);
638
829
  }
639
- export function createSubjectEvaluationTraceInputFiles(args) {
640
- const subject = args.subject;
641
- if (!subject?.eval && !subject?.metrics) {
/**
 * Returns the case selector that improve runs optimize on.
 *
 * Reads `spec.candidate.improve.optimizeOn`; when the spec has no improve
 * section or no `optimizeOn`, `{ all: true }` is used instead. The chosen
 * selector is passed through `cloneWorkbenchCaseSelector` before returning.
 *
 * @param {object} spec - Spec object exposing `candidate.improve`.
 * @returns {object} The cloned case selector.
 */
export function workbenchImproveOptimizeSelector(spec) {
  const configured = spec.candidate.improve?.optimizeOn;
  return cloneWorkbenchCaseSelector(configured ?? { all: true });
}
/**
 * Builds the selection policy (metric + case selector) for improve runs.
 *
 * The metric is `spec.candidate.improve.selectBy.metric`, defaulting to
 * "score". The selector is `selectBy.cases` when present, otherwise the
 * optimize selector from `workbenchImproveOptimizeSelector(spec)`.
 *
 * @param {object} spec - Spec object exposing `candidate.improve`.
 * @returns {{ metric: string, selector: object }} The selection policy.
 */
export function workbenchImproveSelectionPolicy(spec) {
  const fallbackSelector = workbenchImproveOptimizeSelector(spec);
  const selectBy = spec.candidate.improve?.selectBy;
  const metric = selectBy?.metric ?? "score";
  const selector = cloneWorkbenchCaseSelector(selectBy?.cases ?? fallbackSelector);
  return { metric, selector };
}
/**
 * Collects the ids of the engine cases accepted by a selector.
 *
 * Input order is preserved; matching is delegated to
 * `workbenchEngineCaseMatchesSelector`.
 *
 * @param {Array<object>} engineCases - Engine cases, each with an `id`.
 * @param {object} selector - Case selector to match against.
 * @returns {Array} Ids of the matching cases, in input order.
 */
export function workbenchEngineCaseIdsForSelector(engineCases, selector) {
  return engineCases.reduce((matchingIds, engineCase) => {
    if (workbenchEngineCaseMatchesSelector(engineCase, selector)) {
      matchingIds.push(engineCase.id);
    }
    return matchingIds;
  }, []);
}
/**
 * Lists the ids of the cases an improve evaluation has to cover: every case
 * matched by the optimize selector or by the selection-policy selector.
 *
 * The result follows the order of `args.engineCases`.
 *
 * @param {{ engineCases: Array<object>, spec: object }} args
 * @returns {Array} Case ids matched by either selector.
 */
export function workbenchEngineCaseIdsForImproveEvaluation(args) {
  const optimizeSelector = workbenchImproveOptimizeSelector(args.spec);
  const selectionSelector = workbenchImproveSelectionPolicy(args.spec).selector;
  // Union of both selections; membership is what matters, order comes from engineCases.
  const wantedIds = new Set([
    ...workbenchEngineCaseIdsForSelector(args.engineCases, optimizeSelector),
    ...workbenchEngineCaseIdsForSelector(args.engineCases, selectionSelector),
  ]);
  const allIds = args.engineCases.map((engineCase) => engineCase.id);
  return allIds.filter((caseId) => wantedIds.has(caseId));
}
853
+ export function filterOptimizerTraceJobsForCaseIds(jobs, caseIds) {
854
+ const allowed = new Set(caseIds);
855
+ if (allowed.size === 0) {
642
856
  return [];
643
857
  }
644
- const filePath = normalizeRelativePath(args.path ?? `base-subject/${subject.id}/evaluation.json`);
645
- const payload = {
646
- kind: "subject_evaluation",
647
- subjectId: subject.id,
648
- status: subject.status,
649
- metrics: subject.metrics ?? null,
650
- fileChanges: subject.fileChanges,
651
- eval: subject.eval ?? null,
652
- prompt: subject.prompt ?? null,
653
- };
654
- return [textSurfaceFile(filePath, `${JSON.stringify(payload, null, 2)}\n`)];
858
+ return jobs.filter((job) => {
859
+ if (workbenchExecutionPurpose(job) !== "attempt") {
860
+ return false;
861
+ }
862
+ const caseId = readJobString(job.input, "caseId");
863
+ return caseId !== null && allowed.has(caseId);
864
+ });
865
+ }
/**
 * Renders a case selector as short text: "all cases" when the selector
 * targets every case, otherwise `split=<name>`.
 *
 * @param {object} selector - Case selector.
 * @returns {string} Display text for the selector.
 */
export function formatWorkbenchCaseSelector(selector) {
  if (workbenchCaseSelectorUsesAllCases(selector)) {
    return "all cases";
  }
  return `split=${selector.split}`;
}
/**
 * Renders a selection policy as `<metric> on <selector text>`, using
 * `formatWorkbenchCaseSelector` for the selector part.
 *
 * @param {{ metric: string, selector: object }} policy
 * @returns {string} Display text for the policy.
 */
export function formatWorkbenchSelectionPolicy(policy) {
  const { metric, selector } = policy;
  return `${metric} on ${formatWorkbenchCaseSelector(selector)}`;
}
/**
 * Tells whether a selector targets every case, i.e. it carries no truthy
 * `split` value.
 *
 * @param {object} selector - Case selector.
 * @returns {boolean} `true` when `selector.split` is falsy.
 */
export function workbenchCaseSelectorUsesAllCases(selector) {
  const { split } = selector;
  return split ? false : true;
}
/**
 * Checks one engine case against a selector: an all-cases selector matches
 * everything, otherwise the case's `case.split` must equal `selector.split`.
 *
 * @param {object} engineCase - Engine case exposing `case.split`.
 * @param {object} selector - Case selector.
 * @returns {boolean} Whether the case is selected.
 */
function workbenchEngineCaseMatchesSelector(engineCase, selector) {
  return workbenchCaseSelectorUsesAllCases(selector) ||
    engineCase.case.split === selector.split;
}
/**
 * Produces a fresh, normalized selector object: `{ split }` when the input
 * has a truthy `split`, otherwise `{ all: true }`. Any other properties on
 * the input are not copied.
 *
 * @param {object} selector - Case selector to copy.
 * @returns {{ split: * } | { all: true }} New selector object.
 */
function cloneWorkbenchCaseSelector(selector) {
  const { split } = selector;
  if (split) {
    return { split };
  }
  return { all: true };
}
/**
 * Reduces an evaluation's per-metric stats to a `{ metricName: mean }` map.
 *
 * Only metrics whose `mean` is a finite number are kept. Metric entries that
 * are `null`/`undefined` are skipped instead of raising a TypeError on the
 * `.mean` access.
 *
 * @param {{ metrics?: Object<string, { mean?: number } | null> } | null | undefined} evaluation
 * @returns {Object<string, number> | undefined} The mean per metric, or
 *   `undefined` when there is no evaluation, no metrics, or no finite mean.
 */
export function evaluationMeanMetrics(evaluation) {
  const finiteMeans = Object.entries(evaluation?.metrics ?? {})
    // Optional chaining guards against null stats entries (e.g. from JSON data).
    .map(([name, stats]) => [name, stats?.mean])
    .filter(([, mean]) => Number.isFinite(mean));
  return finiteMeans.length > 0
    ? Object.fromEntries(finiteMeans)
    : undefined;
}
/**
 * Returns a shallow copy of a candidate without the `metrics`,
 * `candidateRunId` and `candidateRunName` properties. The input object is
 * left untouched.
 *
 * @param {object} candidate - Candidate object.
 * @returns {object} Copy of the candidate minus those three properties.
 */
export function candidateRecordWithoutDerivedFields(candidate) {
  const {
    metrics: _droppedMetrics,
    candidateRunId: _droppedRunId,
    candidateRunName: _droppedRunName,
    ...persistedFields
  } = candidate;
  return persistedFields;
}
/**
 * Derives a summary from a candidate: starts from
 * `candidateRecordWithoutDerivedFields(candidate)` and additionally leaves
 * out the `eval`, `prompt` and `meta` properties.
 *
 * @param {object} candidate - Candidate object.
 * @returns {object} Shallow copy without the omitted properties.
 */
export function candidateSummaryFromRecord(candidate) {
  const record = candidateRecordWithoutDerivedFields(candidate);
  const {
    eval: _droppedEval,
    prompt: _droppedPrompt,
    meta: _droppedMeta,
    ...summaryFields
  } = record;
  return summaryFields;
}
/**
 * Computes a hex SHA-256 fingerprint over the inputs of a run execution.
 *
 * The digest is fed, in order: a version tag, `specVersionId`,
 * `environmentVersionId` and `sourceYaml` (each defaulting to "" and
 * separated by NUL), followed by every adapter file sorted by path. For each
 * file the path, kind, encoding, executable flag ("1"/"0") and content are
 * fed, NUL-separated, so the result does not depend on the order in which
 * `adapterFiles` is supplied.
 *
 * @param {{ specVersionId?: string, environmentVersionId?: string,
 *   sourceYaml?: string, adapterFiles?: Array<object> }} args
 * @returns {string} Hex-encoded SHA-256 digest.
 */
export function workbenchRunExecutionFingerprint(args) {
  const digest = createHash("sha256");
  // Each part is fed with its own update() call, exactly as listed.
  const feed = (...parts) => {
    for (const part of parts) {
      digest.update(part);
    }
  };
  feed(
    "workbench-run-execution-v1\0",
    args.specVersionId ?? "",
    "\0",
    args.environmentVersionId ?? "",
    "\0",
    args.sourceYaml ?? "",
  );
  // Sort a copy so the caller's array is not reordered.
  const orderedFiles = [...(args.adapterFiles ?? [])]
    .sort((left, right) => left.path.localeCompare(right.path));
  for (const file of orderedFiles) {
    feed(
      "\0file\0",
      file.path,
      "\0",
      file.kind,
      "\0",
      file.encoding,
      "\0",
      file.executable ? "1" : "0",
      "\0",
      file.content,
    );
  }
  return digest.digest("hex");
}
/**
 * Decides whether a job contributes to the optimizer trace inputs: it must
 * satisfy `isTerminalExecutionJob` and its execution purpose must be
 * "attempt".
 *
 * @param {object} job - Job object.
 * @returns {boolean} Whether the job is an eligible trace input.
 */
function isOptimizerTraceInputJob(job) {
  if (!isTerminalExecutionJob(job)) {
    return false;
  }
  return workbenchExecutionPurpose(job) === "attempt";
}
656
927
  function isTerminalExecutionJob(job) {
657
928
  return job.kind === "execute" && (job.status === "succeeded" ||
@@ -662,20 +933,10 @@ function compareTraceInputJobs(left, right) {
662
933
  const leftAttempt = readOptionalJobNumber(left.input, "attemptIndex") ?? -1;
663
934
  const rightAttempt = readOptionalJobNumber(right.input, "attemptIndex") ?? -1;
664
935
  return leftAttempt - rightAttempt ||
665
- purposeSortKey(workbenchExecutionPurpose(left)) - purposeSortKey(workbenchExecutionPurpose(right)) ||
666
936
  (readOptionalJobNumber(left.input, "sampleIndex") ?? -1) - (readOptionalJobNumber(right.input, "sampleIndex") ?? -1) ||
667
937
  (readJobString(left.input, "caseId") ?? "").localeCompare(readJobString(right.input, "caseId") ?? "") ||
668
938
  left.id.localeCompare(right.id);
669
939
  }
670
- function purposeSortKey(purpose) {
671
- if (purpose === "improve") {
672
- return 0;
673
- }
674
- if (purpose === "attempt") {
675
- return 1;
676
- }
677
- return 3;
678
- }
679
940
  function completedJobOutputFiles(job) {
680
941
  const output = jsonRecord(job.output);
681
942
  if (!Array.isArray(output.files)) {
@@ -689,35 +950,70 @@ function completedJobOutputFiles(job) {
689
950
  }
690
951
  return files;
691
952
  }
692
- function subjectRevisionTraceJobSummary(job, paths) {
693
- const output = jsonRecord(job.output);
/**
 * Shared lookup behind `traceInputRequestFile` / `traceInputResultFile`.
 *
 * Returns the first file that, after path normalization, lives under
 * `.workbench/traces/`, ends with `/<fileName>`, is utf8-encoded, and whose
 * JSON content declares the given `operation`.
 *
 * @param {Array<object>} files - Job output files.
 * @param {string} fileName - Trailing file name, e.g. "request.json".
 * @param {string} operation - Operation the file's JSON must declare.
 * @returns {object | null} The matching file, or `null` when none matches.
 */
function findTraceJsonFileForOperation(files, fileName, operation) {
  const suffix = `/${fileName}`;
  return files.find((file) => {
    const normalized = normalizeRelativePath(file.path);
    // Cheap path/encoding checks run first; the JSON parse happens last.
    return normalized.startsWith(".workbench/traces/") &&
      normalized.endsWith(suffix) &&
      file.encoding === "utf8" &&
      traceJsonOperation(file) === operation;
  }) ?? null;
}
/**
 * Finds the trace `request.json` file recorded for an operation.
 *
 * @param {Array<object>} files - Job output files.
 * @param {string} operation - Operation name to match.
 * @returns {object | null} The matching file, or `null`.
 */
function traceInputRequestFile(files, operation) {
  return findTraceJsonFileForOperation(files, "request.json", operation);
}
/**
 * Finds the trace `result.json` file recorded for an operation.
 *
 * @param {Array<object>} files - Job output files.
 * @param {string} operation - Operation name to match.
 * @returns {object | null} The matching file, or `null`.
 */
function traceInputResultFile(files, operation) {
  return findTraceJsonFileForOperation(files, "result.json", operation);
}
/**
 * Extracts the `operation` string from a file whose content is JSON.
 *
 * @param {{ content: string }} file - File whose `content` is parsed as JSON.
 * @returns {string | null} The `operation` value when it is a string;
 *   `null` when the content is not valid JSON or has no string `operation`.
 */
function traceJsonOperation(file) {
  let payload;
  try {
    payload = JSON.parse(file.content);
  }
  catch {
    // Unparseable content simply means "no operation declared".
    return null;
  }
  const operation = payload?.operation;
  return typeof operation === "string" ? operation : null;
}
980
+ function traceInputRequestFallback(job, operation) {
981
+ const execution = jsonRecord(jsonRecord(job.input).execution);
694
982
  return {
695
- job_id: job.id,
696
- purpose: workbenchExecutionPurpose(job) ?? "unknown",
697
- status: job.status,
698
- subject_id: job.subjectId ?? readJobString(job.input, "subjectId"),
699
- attempt_index: readOptionalJobNumber(job.input, "attemptIndex"),
700
- sample_index: readOptionalJobNumber(job.input, "sampleIndex"),
701
- case_id: readJobString(job.input, "caseId"),
702
- created_at: job.createdAt,
703
- ...(job.startedAt ? { started_at: job.startedAt } : {}),
704
- ...(job.finishedAt ? { finished_at: job.finishedAt } : {}),
705
- ...(job.error ? { error: job.error } : {}),
706
- traces: jobTracePaths(job),
707
- event_path: paths.eventPath,
708
- raw_trace_paths: [...paths.rawTracePaths],
709
- output: summarizeJobOutputForTrace(output),
983
+ protocol: "workbench.adapter.v3",
984
+ id: typeof execution.id === "string" ? execution.id : job.id,
985
+ jobId: job.id,
986
+ operation,
987
+ invocation: jsonRecord(execution.adapter),
988
+ context: {
989
+ candidate: {
990
+ id: job.candidateId ?? readJobString(job.input, "candidateId") ?? null,
991
+ },
992
+ attempt: {
993
+ attemptIndex: readOptionalJobNumber(job.input, "attemptIndex") ?? null,
994
+ sampleIndex: readOptionalJobNumber(job.input, "sampleIndex") ?? null,
995
+ caseId: readJobString(job.input, "caseId") ?? null,
996
+ },
997
+ },
710
998
  };
711
999
  }
712
- function summarizeJobOutputForTrace(output) {
713
- const { files: _files, fileSet: _fileSet, subjectPatch, ...rest } = output;
714
- const patch = jsonRecord(subjectPatch);
715
- const { files: _patchFiles, ...patchSummary } = patch;
1000
+ function traceInputResultFallback(job, operation) {
1001
+ const output = jsonRecord(job.output);
1002
+ const ok = job.status === "succeeded" && output.ok !== false;
1003
+ const value = operation === "candidate.improve"
1004
+ ? jsonRecord(output.candidatePatch)
1005
+ : operation === "engine.run"
1006
+ ? jsonRecord(output.result)
1007
+ : {};
716
1008
  return {
717
- ...rest,
718
- ...(Object.keys(patch).length > 0
719
- ? { subjectPatch: patchSummary }
720
- : {}),
1009
+ protocol: "workbench.adapter-result.v1",
1010
+ operation,
1011
+ ok,
1012
+ ...(Object.keys(value).length > 0 ? { value: value } : {}),
1013
+ ...(typeof output.summary === "string" ? { summary: output.summary } : {}),
1014
+ ...(output.feedback !== undefined ? { feedback: output.feedback } : {}),
1015
+ ...(output.usage !== undefined ? { usage: output.usage } : {}),
1016
+ ...(!ok ? { error: job.error ?? "Execution did not complete successfully." } : {}),
721
1017
  };
722
1018
  }
723
1019
  function textSurfaceFile(path, content) {
@@ -744,7 +1040,7 @@ export function buildWorkbenchProjectSourceFiles(input) {
744
1040
  ...(input.specFiles
745
1041
  ? input.specFiles.map((file) => ({ ...file }))
746
1042
  : [textSurfaceFile("benchmark.yaml", input.specSource ?? "")]),
747
- ...prefixProjectSourceFiles(input.subjectFiles, input.subjectFilesPath),
1043
+ ...prefixProjectSourceFiles(input.candidateFiles, input.candidateFilesPath),
748
1044
  ...prefixProjectSourceFiles(input.engineResolveFiles, input.engineResolveFilesPath),
749
1045
  ...(input.adapterFiles ?? []).map((file) => ({ ...file })),
750
1046
  ...(input.dockerfiles ?? []).map((file) => ({ ...file })),
@@ -772,18 +1068,18 @@ function prefixProjectSourceFiles(files, rootPath) {
772
1068
  };
773
1069
  });
774
1070
  }
775
- export function isSubjectSourceFilePath(filePath) {
1071
+ export function isCandidateSourceFilePath(filePath) {
776
1072
  const normalized = normalizeRelativePath(filePath);
777
1073
  return (normalized !== ".workbench" &&
778
1074
  !normalized.startsWith(".workbench/") &&
779
1075
  normalized !== "workbench-result.json");
780
1076
  }
781
- export function filterSubjectSourceFiles(files) {
1077
+ export function filterCandidateSourceFiles(files) {
782
1078
  return files
783
- .filter((file) => isSubjectSourceFilePath(file.path))
1079
+ .filter((file) => isCandidateSourceFilePath(file.path))
784
1080
  .map((file) => ({ ...file }));
785
1081
  }
786
- export function buildSubjectLineage(args) {
1082
+ export function buildCandidateLineage(args) {
787
1083
  const orderedSummaries = args.summaries.slice().sort((left, right) => {
788
1084
  const createdAt = left.createdAt.localeCompare(right.createdAt);
789
1085
  return createdAt !== 0 ? createdAt : left.id.localeCompare(right.id);
@@ -856,7 +1152,7 @@ function globPatternToRegExp(pattern) {
856
1152
  function escapeRegExp(value) {
857
1153
  return value.replace(/[\\^$.*+?()[\]{}|]/gu, "\\$&");
858
1154
  }
859
- export function summarizeSubjectFiles(files, changedPaths = files.map((file) => file.path)) {
1155
+ export function summarizeCandidateFiles(files, changedPaths = files.map((file) => file.path)) {
860
1156
  const changed = new Set(changedPaths);
861
1157
  return [...files]
862
1158
  .sort((left, right) => left.path.localeCompare(right.path))
@@ -875,7 +1171,7 @@ export function summarizeSubjectFiles(files, changedPaths = files.map((file) =>
875
1171
  };
876
1172
  });
877
1173
  }
878
- export function createSubjectFilePreview(args) {
1174
+ export function createCandidateFilePreview(args) {
879
1175
  if (args.view === "diff") {
880
1176
  throw new Error("Diff previews require explicit before and after file content.");
881
1177
  }
@@ -901,14 +1197,14 @@ export function createSubjectFilePreview(args) {
901
1197
  export function createCaseReview(args) {
902
1198
  const preferredSampleIndex = uniqueExecutionSampleIndex(args.executions ?? []);
903
1199
  const sampleMatchesCase = (sample) => (sample.cases ?? []).some((entry) => entry.id === args.caseId);
904
- const samples = args.subject.eval?.samples ?? [];
1200
+ const samples = args.candidate.eval?.samples ?? [];
905
1201
  const sampleResult = samples.find((sample) => typeof preferredSampleIndex === "number" &&
906
1202
  sample.index === preferredSampleIndex &&
907
1203
  sampleMatchesCase(sample)) ?? samples.find(sampleMatchesCase);
908
1204
  const caseResult = sampleResult?.cases?.find((entry) => entry.id === args.caseId);
909
1205
  if (!sampleResult && (args.executions?.length ?? 0) > 0) {
910
1206
  return {
911
- subjectId: args.subject.id,
1207
+ candidateId: args.candidate.id,
912
1208
  caseId: args.caseId,
913
1209
  caseLabel: args.caseId,
914
1210
  ...(typeof preferredSampleIndex === "number"
@@ -920,13 +1216,13 @@ export function createCaseReview(args) {
920
1216
  };
921
1217
  }
922
1218
  if (!sampleResult) {
923
- throw new Error(`Case ${args.caseId} was not found on subject ${args.subject.id}.`);
1219
+ throw new Error(`Case ${args.caseId} was not found on candidate ${args.candidate.id}.`);
924
1220
  }
925
1221
  const durationMs = typeof caseResult?.durationMs === "number"
926
1222
  ? caseResult.durationMs
927
1223
  : undefined;
928
1224
  return {
929
- subjectId: args.subject.id,
1225
+ candidateId: args.candidate.id,
930
1226
  caseId: caseResult?.id ?? args.caseId,
931
1227
  caseLabel: caseResult?.label ?? args.caseId,
932
1228
  sampleId: sampleResult.id,
@@ -965,37 +1261,45 @@ function parseAuthoredWorkbenchSourceSpec(source) {
965
1261
  }
966
1262
  const resolved = resolveWorkbenchResolvedSourceYamlInternal(source);
967
1263
  return {
968
- version: 3,
1264
+ version: 4,
969
1265
  benchmark: {
970
1266
  name: resolved.benchmark.name,
971
1267
  description: resolved.benchmark.description,
972
1268
  engine: authoredAdapterSpecFromInvocation(resolved.engine),
973
1269
  },
974
- subject: {
975
- name: resolved.subject.name,
976
- description: resolved.subject.description,
977
- files: { path: resolved.subject.files.path },
978
- ...(resolved.subject.prepare ? { prepare: { ...resolved.subject.prepare } } : {}),
979
- run: runSpecFromInvocation(resolved.run),
980
- },
981
- ...(resolved.optimizer
982
- ? {
983
- optimizer: {
984
- name: resolved.optimizer.name,
985
- ...(resolved.optimizer.description ? { description: resolved.optimizer.description } : {}),
986
- edits: [...resolved.optimizer.edits],
987
- improve: improveSpecFromInvocation(resolved.improve),
1270
+ candidate: {
1271
+ name: resolved.candidate.name,
1272
+ description: resolved.candidate.description,
1273
+ files: { path: resolved.candidate.files.path },
1274
+ ...(resolved.candidate.prepare ? { prepare: { ...resolved.candidate.prepare } } : {}),
1275
+ defaultRun: resolved.candidate.defaultRun,
1276
+ runs: Object.fromEntries(Object.entries(resolved.candidate.runs).map(([runId, run]) => [
1277
+ runId,
1278
+ {
1279
+ name: run.name,
1280
+ ...authoredAdapterSpecFromInvocation(run),
988
1281
  },
989
- }
990
- : {}),
1282
+ ])),
1283
+ ...(resolved.candidate.improve
1284
+ ? {
1285
+ improve: {
1286
+ edits: [...resolved.candidate.improve.edits],
1287
+ ...(resolved.candidate.improve.optimizeOn
1288
+ ? { optimizeOn: resolved.candidate.improve.optimizeOn }
1289
+ : {}),
1290
+ ...(resolved.candidate.improve.selectBy
1291
+ ? { selectBy: resolved.candidate.improve.selectBy }
1292
+ : {}),
1293
+ ...improveSpecFromInvocation(resolved.improve),
1294
+ },
1295
+ }
1296
+ : {}),
1297
+ },
991
1298
  };
992
1299
  }
993
1300
  function improveSpecFromInvocation(invocation) {
994
1301
  return authoredAdapterSpecFromInvocation(invocation);
995
1302
  }
996
- function runSpecFromInvocation(invocation) {
997
- return authoredAdapterSpecFromInvocation(invocation);
998
- }
999
1303
  function authoredAdapterSpecFromInvocation(invocation) {
1000
1304
  const config = jsonRecord(invocation.with);
1001
1305
  return {
@@ -1048,9 +1352,9 @@ export function createWorkbenchRunWorkload(args) {
1048
1352
  if (!purpose) {
1049
1353
  throw new Error(`Unsupported runtime job kind: ${args.job.kind}`);
1050
1354
  }
1051
- const subjectId = readJobString(args.job.input, "subjectId") ?? args.job.subjectId;
1052
- if (!subjectId) {
1053
- throw new Error(`${purpose} execution job is missing subjectId.`);
1355
+ const candidateId = readJobString(args.job.input, "candidateId") ?? args.job.candidateId;
1356
+ if (!candidateId) {
1357
+ throw new Error(`${purpose} execution job is missing candidateId.`);
1054
1358
  }
1055
1359
  const attemptIndex = readRequiredJobNumber(args.job.input, "attemptIndex", `${purpose} execution job`);
1056
1360
  const sampleIndex = purpose === "improve"
@@ -1066,7 +1370,7 @@ export function createWorkbenchRunWorkload(args) {
1066
1370
  ? engineCaseFilesForRuntimeInput({ spec: args.spec, engineCase })
1067
1371
  : [];
1068
1372
  const engineCaseSpec = engineCase?.case;
1069
- const initial = createInitialSubjectFiles({
1373
+ const initial = createInitialCandidateFiles({
1070
1374
  baseFiles: args.baseFiles,
1071
1375
  spec: args.spec,
1072
1376
  attemptIndex,
@@ -1074,10 +1378,10 @@ export function createWorkbenchRunWorkload(args) {
1074
1378
  return {
1075
1379
  job: args.job,
1076
1380
  spec: args.spec,
1077
- subjectId,
1381
+ candidateId,
1078
1382
  attemptIndex,
1079
1383
  sampleIndex,
1080
- subjectFiles: initial.files,
1384
+ candidateFiles: initial.files,
1081
1385
  caseId,
1082
1386
  engineResolveFiles: selectedEngineResolveFiles,
1083
1387
  traceFiles: (args.traceFiles ?? []).map((file) => ({ ...file })),
@@ -1088,22 +1392,22 @@ export function createWorkbenchRunWorkload(args) {
1088
1392
  baseId: readJobString(args.job.input, "baseId"),
1089
1393
  };
1090
1394
  }
1091
- function createInitialSubjectFiles(args) {
1092
- const editablePaths = optimizerEdits(args.spec).map(normalizeRelativePath);
1395
+ function createInitialCandidateFiles(args) {
1396
+ const editablePaths = improveEdits(args.spec).map(normalizeRelativePath);
1093
1397
  const editPath = editablePaths[0];
1094
- const subjectPaths = editPath ? [editPath] : [];
1398
+ const candidatePaths = editPath ? [editPath] : [];
1095
1399
  const files = args.baseFiles.length > 0
1096
1400
  ? args.baseFiles.map((file) => ({ ...file }))
1097
1401
  : editPath
1098
1402
  ? normalizeSurfaceFiles([{ path: editPath, content: "" }])
1099
1403
  : [];
1100
1404
  const prompt = [
1101
- `Run the subject workload for benchmark: ${args.spec.benchmark.description}`,
1102
- `Attempt ${args.attemptIndex + 1} uses ${formatOptimizerSummary(args.spec)}; the improve adapter may edit the subject before Workbench scores it.`,
1405
+ `Run the candidate workload for benchmark: ${args.spec.benchmark.description}`,
1406
+ `Attempt ${args.attemptIndex + 1} uses ${formatImproveSummary(args.spec)}; the improve adapter may edit the candidate before Workbench scores it.`,
1103
1407
  ].join("\n");
1104
1408
  const byPath = new Map(files.map((file) => [file.path, file]));
1105
1409
  if (editPath &&
1106
- ![...byPath.keys()].some((filePath) => subjectPaths.includes(filePath))) {
1410
+ ![...byPath.keys()].some((filePath) => candidatePaths.includes(filePath))) {
1107
1411
  byPath.set(editPath, {
1108
1412
  path: editPath,
1109
1413
  kind: "text",
@@ -1167,7 +1471,7 @@ export function workbenchExecutionExecutorForRuntimeInput(args) {
1167
1471
  }
1168
1472
  function adapterOperationForExecutionPurpose(purpose) {
1169
1473
  if (purpose === "improve") {
1170
- return "optimizer.improve";
1474
+ return "candidate.improve";
1171
1475
  }
1172
1476
  if (purpose === "attempt") {
1173
1477
  return "engine.run";
@@ -1281,8 +1585,8 @@ function normalizeRuntimeControlInputs(value) {
1281
1585
  }
1282
1586
  const record = value;
1283
1587
  const inputs = {};
1284
- if (hasOwn(record, "subject")) {
1285
- inputs.subject = normalizeRuntimeControlFiles(record.subject, "inputs.subject");
1588
+ if (hasOwn(record, "candidate")) {
1589
+ inputs.candidate = normalizeRuntimeControlFiles(record.candidate, "inputs.candidate");
1286
1590
  }
1287
1591
  if (hasOwn(record, "case")) {
1288
1592
  inputs.case = normalizeRuntimeControlFiles(record.case, "inputs.case");
@@ -1326,8 +1630,8 @@ function normalizeRuntimeControlOperation(value, label) {
1326
1630
  const operation = record.operation;
1327
1631
  if (operation !== "engine.resolve" &&
1328
1632
  operation !== "engine.run" &&
1329
- operation !== "subject.run" &&
1330
- operation !== "optimizer.improve") {
1633
+ operation !== "candidate.run" &&
1634
+ operation !== "candidate.improve") {
1331
1635
  throw new Error(`Workbench runtime-control ${label}.operation is invalid.`);
1332
1636
  }
1333
1637
  const invocation = record.invocation;
@@ -1415,7 +1719,7 @@ export async function executeAdapterInCurrentRuntime(args, execution, startedAt,
1415
1719
  };
1416
1720
  try {
1417
1721
  if (execution.purpose === "improve") {
1418
- return await executeSubjectRevisionExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
1722
+ return await executeCandidateRevisionExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
1419
1723
  }
1420
1724
  if (execution.purpose === "attempt") {
1421
1725
  return await executeAttemptExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
@@ -1589,22 +1893,22 @@ function completedJobFromSandboxResult(fallbackJob, startedAt, result) {
1589
1893
  }
1590
1894
  return attachSandboxMetadataToJob(failWorkbenchRunJob(fallbackJob, result.startedAt || startedAt, result.error ?? `Sandbox execution ${result.status}.`, result.finishedAt), asRuntimeRecord(result.metadata).sandbox);
1591
1895
  }
1592
- async function executeSubjectRevisionExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
1896
+ async function executeCandidateRevisionExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
1593
1897
  const { workload, result } = await runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher);
1594
1898
  if (result.error || (result.exitCode ?? 0) !== 0) {
1595
1899
  return failWorkbenchRunJob(args.job, startedAt, result.error ?? `Adapter ${execution.adapter.use} exited with status ${result.exitCode}.`, result.finishedAt, result);
1596
1900
  }
1597
1901
  const finishedAt = result.finishedAt ?? new Date().toISOString();
1598
- const subjectPatch = createSubjectPatchFromResult(result, args.spec);
1599
- if (subjectPatch.fileChanges.length === 0) {
1600
- return failWorkbenchRunJob(args.job, startedAt, `${execution.adapter.use === "command" ? "Command improve adapter" : `Adapter ${execution.adapter.use}`} completed without changing subject files covered by optimizer edits.`, finishedAt, result);
1601
- }
1602
- const subjectRevisionFiles = applyWorkbenchSubjectPatch({
1603
- baseFiles: workload.subjectFiles,
1604
- patch: subjectPatch,
1605
- edits: requireOptimizerEdits(args.spec),
1902
+ const candidatePatch = createCandidatePatchFromResult(result, args.spec);
1903
+ if (candidatePatch.fileChanges.length === 0) {
1904
+ return failWorkbenchRunJob(args.job, startedAt, `${execution.adapter.use === "command" ? "Command improve adapter" : `Adapter ${execution.adapter.use}`} completed without changing candidate files covered by improve edits.`, finishedAt, result);
1905
+ }
1906
+ const candidateRevisionFiles = applyWorkbenchCandidatePatch({
1907
+ baseFiles: workload.candidateFiles,
1908
+ patch: candidatePatch,
1909
+ edits: requireImproveEdits(args.spec),
1606
1910
  });
1607
- const usage = assignUsageRole("optimizer", result.usage);
1911
+ const usage = assignUsageRole("improver", result.usage);
1608
1912
  return {
1609
1913
  ...args.job,
1610
1914
  status: "succeeded",
@@ -1616,13 +1920,13 @@ async function executeSubjectRevisionExecutionInCurrentRuntime(args, execution,
1616
1920
  ok: true,
1617
1921
  executionId: execution.id,
1618
1922
  purpose: execution.purpose,
1619
- subjectId: workload.subjectId,
1923
+ candidateId: workload.candidateId,
1620
1924
  attemptIndex: workload.attemptIndex,
1621
1925
  baseId: workload.baseId,
1622
1926
  prompt: workload.prompt,
1623
- subjectPatch,
1624
- fileChanges: subjectPatch.fileChanges,
1625
- files: subjectRevisionFiles,
1927
+ candidatePatch,
1928
+ fileChanges: candidatePatch.fileChanges,
1929
+ files: candidateRevisionFiles,
1626
1930
  traces: traceFilePaths(result.files),
1627
1931
  ...(usage ? { usage } : {}),
1628
1932
  ...(result.summary !== undefined ? { summary: result.summary } : {}),
@@ -1655,13 +1959,14 @@ async function executeAttemptExecutionInCurrentRuntime(args, execution, startedA
1655
1959
  const finishedAt = workloadResult.finishedAt ?? new Date().toISOString();
1656
1960
  const usage = attemptUsageSummary(workloadResult.usage, engineResult.usage);
1657
1961
  const sample = evaluateSample({
1658
- subjectId: workload.subjectId,
1962
+ candidateId: workload.candidateId,
1659
1963
  files: workloadResult.files,
1660
1964
  engineResolveFiles: workload.engineResolveFiles,
1661
1965
  spec: workload.spec,
1662
1966
  attemptIndex: workload.attemptIndex,
1663
1967
  sampleIndex: workload.sampleIndex,
1664
1968
  caseId: workload.caseId,
1969
+ split: workload.engineCaseSpec?.split,
1665
1970
  startedAt,
1666
1971
  finishedAt,
1667
1972
  durationMs: workloadResult.durationMs,
@@ -1682,7 +1987,7 @@ async function executeAttemptExecutionInCurrentRuntime(args, execution, startedA
1682
1987
  ok: true,
1683
1988
  executionId: execution.id,
1684
1989
  purpose: execution.purpose,
1685
- subjectId: workload.subjectId,
1990
+ candidateId: workload.candidateId,
1686
1991
  attemptIndex: workload.attemptIndex,
1687
1992
  sampleIndex: workload.sampleIndex,
1688
1993
  caseId: workload.caseId,
@@ -1725,7 +2030,7 @@ export async function executeRuntimeControlOperationSequenceInCurrentRuntime(arg
1725
2030
  ? { adapterAuthEnv: adapterAuth.env }
1726
2031
  : {}),
1727
2032
  }, workload, args.runtimeControlOperation.operations.map((operation, index) => runtimeControlStepForOperation(operation, index, args.adapterManifests)), startedAt, {
1728
- runSubjectPrepare: args.runtimeControlOperation.prepare ?? false,
2033
+ runCandidatePrepare: args.runtimeControlOperation.prepare ?? false,
1729
2034
  workspaceFiles: args.runtimeControlOperation.inputs?.workspace ?? [],
1730
2035
  outputFiles: args.runtimeControlOperation.inputs?.output ?? [],
1731
2036
  collectWorkspace: args.runtimeControlOperation.collectWorkspace ?? false,
@@ -1823,7 +2128,7 @@ function createRuntimeControlSandboxInput(args, request) {
1823
2128
  const parentInput = asRuntimeRecord(args.job.input);
1824
2129
  const publicFiles = runtimeControlInputFiles(request.inputs, "case", parentWorkload.engineCase ? engineCasePublicFiles(parentWorkload.engineCase) : []);
1825
2130
  const privateFiles = runtimeControlInputFiles(request.inputs, "enginePrivate", parentWorkload.engineCase ? engineCasePrivateFiles(parentWorkload.engineCase) : []);
1826
- const subjectFiles = runtimeControlInputFiles(request.inputs, "subject", parentWorkload.subjectFiles);
2131
+ const candidateFiles = runtimeControlInputFiles(request.inputs, "candidate", parentWorkload.candidateFiles);
1827
2132
  const traceFiles = runtimeControlInputFiles(request.inputs, "traces", parentWorkload.traceFiles);
1828
2133
  const adapter = request.operations[request.operations.length - 1]?.invocation;
1829
2134
  const childExecution = {
@@ -1866,7 +2171,7 @@ function createRuntimeControlSandboxInput(args, request) {
1866
2171
  const childArgs = {
1867
2172
  ...args,
1868
2173
  job: childJob,
1869
- baseFiles: subjectFiles,
2174
+ baseFiles: candidateFiles,
1870
2175
  engineResolveFiles: [...publicFiles, ...privateFiles],
1871
2176
  engineCases: [engineCase],
1872
2177
  traceFiles,
@@ -1890,10 +2195,10 @@ function runtimeControlStepForOperation(operation, index, manifests = []) {
1890
2195
  ...(operation.invocation.auth !== undefined ? { auth: operation.invocation.auth } : {}),
1891
2196
  }, operation.operation, manifests).command;
1892
2197
  return {
1893
- kind: operation.operation === "subject.run"
1894
- ? "subject"
1895
- : operation.operation === "optimizer.improve"
1896
- ? "optimizer"
2198
+ kind: operation.operation === "candidate.run"
2199
+ ? "candidate"
2200
+ : operation.operation === "candidate.improve"
2201
+ ? "improver"
1897
2202
  : "engine",
1898
2203
  label: operation.label ?? `${operation.operation.replace(".", "_")}_${index + 1}`,
1899
2204
  operation: operation.operation,
@@ -1960,8 +2265,8 @@ function isWorkbenchAdapterOperationResult(value) {
1960
2265
  return record.protocol === "workbench.adapter-result.v1" &&
1961
2266
  (record.operation === "engine.resolve" ||
1962
2267
  record.operation === "engine.run" ||
1963
- record.operation === "subject.run" ||
1964
- record.operation === "optimizer.improve");
2268
+ record.operation === "candidate.run" ||
2269
+ record.operation === "candidate.improve");
1965
2270
  }
1966
2271
  function cloneSurfaceFiles(files) {
1967
2272
  return files.map((file) => ({ ...file, path: normalizeRelativePath(file.path) }));
@@ -2040,9 +2345,11 @@ async function runHostedCommandExecutionSteps(args, workload, steps, startedAt,
2040
2345
  const stepTimeoutMs = environmentVersion
2041
2346
  ? environmentVersionTimeoutMs(environmentVersion)
2042
2347
  : 5 * 60 * 1000;
2043
- const shouldRunSubjectPrepare = options.runSubjectPrepare ?? steps.some((step) => step.executor === "sandbox");
2044
- if (shouldRunSubjectPrepare) {
2045
- await runSubjectPrepareCommand({
2348
+ const shouldRunCandidatePrepare = options.runCandidatePrepare ??
2349
+ (readWorkloadExecutionPurpose(workload) === "attempt" &&
2350
+ steps.some((step) => step.executor === "sandbox"));
2351
+ if (shouldRunCandidatePrepare) {
2352
+ await runCandidatePrepareCommand({
2046
2353
  root: workspace.root,
2047
2354
  workload,
2048
2355
  execution,
@@ -2081,6 +2388,9 @@ async function runHostedCommandExecutionSteps(args, workload, steps, startedAt,
2081
2388
  });
2082
2389
  const operationResult = await readWorkbenchAdapterOperationResult(outputDir(workspace.root), step.operation);
2083
2390
  assertWorkbenchAdapterOperationResultOk(operationResult, `Adapter ${step.adapter?.use ?? execution.adapter.use} ${step.operation}`);
2391
+ await writeSurfaceFiles(outputDir(workspace.root), [
2392
+ textSurfaceFile(`.workbench/traces/${workload.job.id}/${step.label}/result.json`, `${JSON.stringify(operationResult, null, 2)}\n`),
2393
+ ]);
2084
2394
  operationResults.push(operationResult);
2085
2395
  await publishCommandStepEvent(options.eventPublisher, {
2086
2396
  step: step.label,
@@ -2132,19 +2442,19 @@ async function runHostedCommandExecutionSteps(args, workload, steps, startedAt,
2132
2442
  await workspace.cleanup();
2133
2443
  }
2134
2444
  }
2135
- async function runSubjectPrepareCommand(args) {
2136
- const command = args.workload.spec.subject.prepare?.command;
2445
+ async function runCandidatePrepareCommand(args) {
2446
+ const command = args.workload.spec.candidate.prepare?.command;
2137
2447
  if (!command) {
2138
2448
  return;
2139
2449
  }
2140
- const role = args.execution.purpose === "improve" ? "optimizer" : "runner";
2450
+ const role = args.execution.purpose === "improve" ? "improver" : "runner";
2141
2451
  await publishCommandStepEvent(args.eventPublisher, {
2142
- step: "subject_prepare",
2452
+ step: "candidate_prepare",
2143
2453
  status: "started",
2144
2454
  role,
2145
2455
  });
2146
2456
  try {
2147
- const shellCommand = createHostedWorkloadShellCommand(args.root, command, "subject_prepare");
2457
+ const shellCommand = createHostedWorkloadShellCommand(args.root, command, "candidate_prepare");
2148
2458
  await args.execFileAsync("sh", ["-c", shellCommand], {
2149
2459
  cwd: args.root,
2150
2460
  env: createHostedWorkloadPrepareEnv(args.root),
@@ -2152,20 +2462,20 @@ async function runSubjectPrepareCommand(args) {
2152
2462
  timeout: args.timeoutMs,
2153
2463
  });
2154
2464
  await publishCommandStepEvent(args.eventPublisher, {
2155
- step: "subject_prepare",
2465
+ step: "candidate_prepare",
2156
2466
  status: "succeeded",
2157
2467
  role,
2158
2468
  });
2159
2469
  }
2160
2470
  catch (error) {
2161
2471
  await publishCommandStepEvent(args.eventPublisher, {
2162
- step: "subject_prepare",
2472
+ step: "candidate_prepare",
2163
2473
  status: "failed",
2164
2474
  exitCode: readExitCode(error),
2165
2475
  error: error instanceof Error ? error.message : String(error),
2166
2476
  role,
2167
2477
  });
2168
- throw new Error(`Subject prepare command failed: ${error instanceof Error ? error.message : String(error)}`);
2478
+ throw new Error(`Candidate prepare command failed: ${error instanceof Error ? error.message : String(error)}`);
2169
2479
  }
2170
2480
  }
2171
2481
  async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
@@ -2204,10 +2514,10 @@ async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
2204
2514
  };
2205
2515
  }
2206
2516
  function stepEventRole(step) {
2207
- if (step.kind === "optimizer") {
2208
- return "optimizer";
2517
+ if (step.kind === "improver") {
2518
+ return "improver";
2209
2519
  }
2210
- if (step.kind === "subject") {
2520
+ if (step.kind === "candidate") {
2211
2521
  return "runner";
2212
2522
  }
2213
2523
  if (step.kind === "engine") {
@@ -2219,10 +2529,10 @@ function adapterOperationUsageSummary(result) {
2219
2529
  if (hasExplicitUsageRole(result.usage)) {
2220
2530
  return completeUsageSummary(result.usage);
2221
2531
  }
2222
- if (result.operation === "optimizer.improve") {
2223
- return assignUsageRole("optimizer", result.usage);
2532
+ if (result.operation === "candidate.improve") {
2533
+ return assignUsageRole("improver", result.usage);
2224
2534
  }
2225
- if (result.operation === "subject.run") {
2535
+ if (result.operation === "candidate.run") {
2226
2536
  return assignUsageRole("runner", result.usage);
2227
2537
  }
2228
2538
  if (result.operation === "engine.run") {
@@ -2239,16 +2549,16 @@ function attemptUsageSummary(workloadUsage, resultUsage) {
2239
2549
  }
2240
2550
  function hasExplicitUsageRole(usage) {
2241
2551
  const normalized = completeUsageSummary(usage);
2242
- return Boolean(normalized?.optimizer || normalized?.runner || normalized?.engine);
2552
+ return Boolean(normalized?.improver || normalized?.runner || normalized?.engine);
2243
2553
  }
2244
- function createSubjectPatchFromResult(result, spec) {
2245
- if (result.subjectPatch) {
2246
- return result.subjectPatch;
2554
+ function createCandidatePatchFromResult(result, spec) {
2555
+ if (result.candidatePatch) {
2556
+ return result.candidatePatch;
2247
2557
  }
2248
2558
  const changedEditPaths = result.fileChanges
2249
2559
  .map(normalizeRelativePath)
2250
2560
  .filter((filePath) => !filePath.startsWith(".workbench/") &&
2251
- isSubjectEditPath(filePath, optimizerEdits(spec)));
2561
+ isCandidateEditPath(filePath, improveEdits(spec)));
2252
2562
  const changedSet = new Set(changedEditPaths);
2253
2563
  const files = result.files
2254
2564
  .filter((file) => changedSet.has(normalizeRelativePath(file.path)))
@@ -2260,7 +2570,7 @@ function createSubjectPatchFromResult(result, spec) {
2260
2570
  ...(result.feedback !== undefined ? { feedback: result.feedback } : {}),
2261
2571
  };
2262
2572
  }
2263
- function isSubjectEditPath(filePath, edits) {
2573
+ function isCandidateEditPath(filePath, edits) {
2264
2574
  const normalized = normalizeRelativePath(filePath);
2265
2575
  return edits.some((entry) => {
2266
2576
  const editPath = normalizeRelativePath(entry).replace(/\/+$/u, "");
@@ -2320,21 +2630,33 @@ export async function stageWorkbenchRunWorkload(root, workload) {
2320
2630
  ]);
2321
2631
  await fs.mkdir(inputDir(root), { recursive: true });
2322
2632
  await fs.mkdir(outputDir(root), { recursive: true });
2633
+ await clearMutableWorkspaceFiles(root);
2323
2634
  if (purpose === "attempt") {
2324
- await fs.mkdir(subjectDir(root), { recursive: true });
2635
+ await fs.mkdir(candidateDir(root), { recursive: true });
2325
2636
  await fs.mkdir(caseDir(root), { recursive: true });
2326
2637
  const engineCase = requireWorkloadEngineCase(workload, "Attempt staging");
2327
- await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
2638
+ await writeSurfaceFiles(candidateDir(root), workload.candidateFiles);
2328
2639
  await writeSurfaceFiles(caseDir(root), engineCasePublicFiles(engineCase));
2329
2640
  return;
2330
2641
  }
2331
2642
  if (purpose === "improve") {
2332
- await fs.mkdir(subjectDir(root), { recursive: true });
2333
- await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
2643
+ await writeSurfaceFiles(root, workload.candidateFiles.filter((file) => isMutableWorkspaceSnapshotPath(file.path)));
2334
2644
  await fs.mkdir(tracesDir(root), { recursive: true });
2335
2645
  await writeSurfaceFiles(tracesDir(root), workload.traceFiles);
2336
2646
  }
2337
2647
  }
2648
+ async function clearMutableWorkspaceFiles(root) {
2649
+ const fs = await importNodeModule(nodeBuiltin("fs/promises"));
2650
+ const path = await importNodeModule(nodeBuiltin("path"));
2651
+ const entries = await fs.readdir(root, { withFileTypes: true }).catch(() => []);
2652
+ await Promise.all(entries.map(async (entry) => {
2653
+ const relativePath = normalizeRelativePath(entry.name);
2654
+ if (!isMutableWorkspaceSnapshotPath(relativePath)) {
2655
+ return;
2656
+ }
2657
+ await fs.rm(path.join(root, entry.name), { recursive: true, force: true });
2658
+ }));
2659
+ }
2338
2660
  async function stageWorkbenchEnginePrivateFiles(root, workload) {
2339
2661
  if (readWorkloadExecutionPurpose(workload) !== "attempt") {
2340
2662
  return;
@@ -2417,7 +2739,7 @@ function adapterFilePathWithinRoot(filePath, sourceRoot) {
2417
2739
  }
2418
2740
  async function readHostedRunFailureResult(root, workload, options) {
2419
2741
  const traceFiles = await readRuntimeTraceFiles(root, workload);
2420
- const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root)));
2742
+ const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root), { ignorePath: isWorkbenchInternalOutputPath }));
2421
2743
  const startedAt = options.startedAt ?? new Date().toISOString();
2422
2744
  const finishedAt = new Date().toISOString();
2423
2745
  const files = [...outputFiles, ...traceFiles].sort((left, right) => left.path.localeCompare(right.path));
@@ -2433,13 +2755,13 @@ async function readHostedRunFailureResult(root, workload, options) {
2433
2755
  async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
2434
2756
  const path = await importNodeModule(nodeBuiltin("path"));
2435
2757
  const traceFiles = await readRuntimeTraceFiles(root, workload);
2436
- const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root)));
2758
+ const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root), { ignorePath: isWorkbenchInternalOutputPath }));
2437
2759
  const outputExitCode = await readOptionalNumber(path.join(outputDir(root), "exit_code"));
2438
2760
  const startedAt = options.startedAt ?? new Date().toISOString();
2439
2761
  const finishedAt = new Date().toISOString();
2440
2762
  const purpose = readWorkloadExecutionPurpose(workload);
2441
2763
  const primaryOperation = purpose === "improve"
2442
- ? "optimizer.improve"
2764
+ ? "candidate.improve"
2443
2765
  : "engine.run";
2444
2766
  const primaryResult = [...(options.operationResults ?? [])]
2445
2767
  .reverse()
@@ -2453,9 +2775,9 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
2453
2775
  const cases = normalizeResultCases(resultPayload.cases);
2454
2776
  const includeResultScoring = purpose === "attempt";
2455
2777
  const files = [...outputFiles, ...traceFiles].sort((left, right) => left.path.localeCompare(right.path));
2456
- const subjectPatch = purpose === "improve" ? primaryResult?.value : undefined;
2778
+ const candidatePatch = purpose === "improve" ? primaryResult?.value : undefined;
2457
2779
  const engineResult = purpose === "attempt" ? primaryResult?.value : undefined;
2458
- const declaredChanges = subjectPatch?.fileChanges ??
2780
+ const declaredChanges = candidatePatch?.fileChanges ??
2459
2781
  (Array.isArray(resultPayload.fileChanges)
2460
2782
  ? resultPayload.fileChanges.filter((entry) => typeof entry === "string")
2461
2783
  : files.map((file) => file.path));
@@ -2463,7 +2785,7 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
2463
2785
  files,
2464
2786
  fileChanges: declaredChanges,
2465
2787
  ...(options.operationResults ? { operationResults: [...options.operationResults] } : {}),
2466
- ...(subjectPatch ? { subjectPatch } : {}),
2788
+ ...(candidatePatch ? { candidatePatch } : {}),
2467
2789
  ...(engineResult ? { result: engineResult } : {}),
2468
2790
  ...(includeResultScoring && metrics ? { metrics } : {}),
2469
2791
  ...(includeResultScoring && cases ? { cases } : {}),
@@ -2536,9 +2858,10 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, step, aut
2536
2858
  const requestPath = path.join(root, ".workbench", "request.json");
2537
2859
  await fs.mkdir(path.dirname(requestPath), { recursive: true });
2538
2860
  const casePrompt = workload.engineCaseSpec?.prompt;
2861
+ const caseSplit = workload.engineCaseSpec?.split;
2539
2862
  const adapter = step.adapter ?? execution.adapter;
2540
- const subjectCommand = adapterProtocolCommandSpec(workload.spec.run, "subject.run", manifests).command;
2541
- await fs.writeFile(requestPath, `${JSON.stringify({
2863
+ const candidateCommand = adapterProtocolCommandSpec(workload.spec.run, "candidate.run", manifests).command;
2864
+ const payload = {
2542
2865
  protocol: "workbench.adapter.v3",
2543
2866
  id: execution.id,
2544
2867
  jobId: workload.job.id,
@@ -2554,17 +2877,17 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, step, aut
2554
2877
  name: workload.spec.benchmark.name,
2555
2878
  description: workload.spec.benchmark.description,
2556
2879
  },
2557
- subject: {
2558
- id: workload.subjectId,
2559
- path: workload.spec.subject.files.path,
2560
- ...(workload.spec.subject.prepare ? { prepare: { ...workload.spec.subject.prepare } } : {}),
2880
+ candidate: {
2881
+ id: workload.candidateId,
2882
+ path: workload.spec.candidate.files.path,
2883
+ ...(workload.spec.candidate.prepare ? { prepare: { ...workload.spec.candidate.prepare } } : {}),
2561
2884
  run: {
2562
2885
  ...workload.spec.run,
2563
- command: subjectCommand,
2886
+ command: candidateCommand,
2564
2887
  },
2565
2888
  },
2566
- ...(workload.spec.optimizer
2567
- ? { optimizer: { edits: [...workload.spec.optimizer.edits] } }
2889
+ ...(workload.spec.candidate.improve
2890
+ ? { improve: { edits: [...workload.spec.candidate.improve.edits] } }
2568
2891
  : {}),
2569
2892
  attempt: {
2570
2893
  attemptIndex: workload.attemptIndex,
@@ -2574,27 +2897,48 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, step, aut
2574
2897
  case: {
2575
2898
  id: workload.caseId,
2576
2899
  ...(casePrompt ? { prompt: casePrompt } : {}),
2900
+ ...(caseSplit ? { split: caseSplit } : {}),
2577
2901
  },
2578
2902
  },
2579
2903
  paths: {
2580
2904
  workspace: root,
2581
2905
  output: outputDir(root),
2582
2906
  result: workbenchAdapterOperationResultPath(outputDir(root)),
2583
- subject: subjectDir(root),
2907
+ ...(readWorkloadExecutionPurpose(workload) === "attempt" ? { candidate: candidateDir(root) } : {}),
2584
2908
  ...(workload.engineCaseSpec ? { case: caseDir(root) } : {}),
2585
2909
  traces: tracesDir(root),
2586
2910
  ...(step.kind === "engine" ? { enginePrivate: runtimeEnginePrivateDir(root) } : {}),
2587
2911
  },
2588
- }, null, 2)}\n`);
2912
+ };
2913
+ await fs.writeFile(requestPath, `${JSON.stringify(payload, null, 2)}\n`);
2914
+ await writeSurfaceFiles(outputDir(root), [
2915
+ textSurfaceFile(`.workbench/traces/${workload.job.id}/${step.label}/request.json`, `${JSON.stringify(sanitizeAdapterRequestTracePayload(payload), null, 2)}\n`),
2916
+ ]);
2589
2917
  return requestPath;
2590
2918
  }
2591
- function optimizerEdits(spec) {
2592
- return spec.optimizer?.edits ?? [];
2919
+ function sanitizeAdapterRequestTracePayload(value) {
2920
+ if (Array.isArray(value)) {
2921
+ return value.map((entry) => sanitizeAdapterRequestTracePayload(entry));
2922
+ }
2923
+ if (!value || typeof value !== "object") {
2924
+ return (value ?? null);
2925
+ }
2926
+ const sanitized = {};
2927
+ for (const [key, entry] of Object.entries(value)) {
2928
+ if (key === "auth" || key === "enginePrivate") {
2929
+ continue;
2930
+ }
2931
+ sanitized[key] = sanitizeAdapterRequestTracePayload(entry);
2932
+ }
2933
+ return sanitized;
2934
+ }
2935
+ function improveEdits(spec) {
2936
+ return spec.candidate.improve?.edits ?? [];
2593
2937
  }
2594
- function requireOptimizerEdits(spec) {
2595
- const edits = optimizerEdits(spec);
2938
+ function requireImproveEdits(spec) {
2939
+ const edits = improveEdits(spec);
2596
2940
  if (edits.length === 0) {
2597
- throw new Error("Optimizer YAML must declare at least one entry in edits.");
2941
+ throw new Error("Candidate improve configuration must declare at least one entry in edits.");
2598
2942
  }
2599
2943
  return edits;
2600
2944
  }
@@ -2691,8 +3035,8 @@ function requireWorkloadEngineCase(workload, label) {
2691
3035
  }
2692
3036
  return workload.engineCase;
2693
3037
  }
2694
- function subjectDir(root) {
2695
- return `${inputDir(root)}/subject`;
3038
+ function candidateDir(root) {
3039
+ return `${inputDir(root)}/candidate`;
2696
3040
  }
2697
3041
  function caseDir(root) {
2698
3042
  return `${inputDir(root)}/case`;
@@ -2727,7 +3071,7 @@ async function writeSurfaceFiles(root, files) {
2727
3071
  }
2728
3072
  }
2729
3073
  }
2730
- async function readSurfaceFiles(root) {
3074
+ async function readSurfaceFiles(root, options = {}) {
2731
3075
  const fs = await importNodeModule(nodeBuiltin("fs/promises"));
2732
3076
  const path = await importNodeModule(nodeBuiltin("path"));
2733
3077
  const utf8Decoder = new TextDecoder("utf-8", { fatal: true });
@@ -2738,6 +3082,10 @@ async function readSurfaceFiles(root) {
2738
3082
  .catch(() => []);
2739
3083
  for (const entry of entries) {
2740
3084
  const absolutePath = path.join(directory, entry.name);
3085
+ const relativePath = normalizeRelativePath(path.relative(root, absolutePath).replace(/\\/gu, "/"));
3086
+ if (options.ignorePath?.(relativePath)) {
3087
+ continue;
3088
+ }
2741
3089
  if (entry.isDirectory()) {
2742
3090
  await walk(absolutePath);
2743
3091
  continue;
@@ -2745,9 +3093,18 @@ async function readSurfaceFiles(root) {
2745
3093
  if (!entry.isFile()) {
2746
3094
  continue;
2747
3095
  }
2748
- const relativePath = normalizeRelativePath(path.relative(root, absolutePath).replace(/\\/gu, "/"));
2749
- const body = await fs.readFile(absolutePath);
2750
- const stats = await fs.stat(absolutePath);
3096
+ let body;
3097
+ let stats;
3098
+ try {
3099
+ body = await fs.readFile(absolutePath);
3100
+ stats = await fs.stat(absolutePath);
3101
+ }
3102
+ catch (error) {
3103
+ if (isVanishedWalkEntry(error)) {
3104
+ continue;
3105
+ }
3106
+ throw error;
3107
+ }
2751
3108
  const content = encodeSurfaceSnapshotContent(body, utf8Decoder);
2752
3109
  files.push({
2753
3110
  path: relativePath,
@@ -2761,6 +3118,10 @@ async function readSurfaceFiles(root) {
2761
3118
  await walk(root);
2762
3119
  return files.sort((left, right) => left.path.localeCompare(right.path));
2763
3120
  }
3121
+ function isVanishedWalkEntry(error) {
3122
+ const code = error?.code;
3123
+ return code === "ENOENT" || code === "ENOTDIR";
3124
+ }
2764
3125
  function encodeSurfaceSnapshotContent(body, utf8Decoder) {
2765
3126
  try {
2766
3127
  return {
@@ -2943,7 +3304,14 @@ function evaluateSample(args) {
2943
3304
  if (metrics.score === undefined) {
2944
3305
  metrics.score = sampleScore;
2945
3306
  }
2946
- const cases = args.workload.cases?.length ? args.workload.cases : undefined;
3307
+ const cases = runtimeTimedCaseResults({
3308
+ caseId: args.caseId,
3309
+ split: args.split,
3310
+ status: "completed",
3311
+ durationMs,
3312
+ metrics,
3313
+ cases: args.workload.cases,
3314
+ });
2947
3315
  const feedback = {
2948
3316
  ...(args.workload.summary !== undefined
2949
3317
  ? { summary: args.workload.summary }
@@ -2956,10 +3324,10 @@ function evaluateSample(args) {
2956
3324
  return {
2957
3325
  id: `${args.caseId}__sample_${String(args.sampleIndex + 1).padStart(3, "0")}`,
2958
3326
  index: args.sampleIndex,
2959
- subject: {
2960
- id: args.subjectId,
2961
- kind: "subject",
2962
- label: args.subjectId,
3327
+ candidate: {
3328
+ id: args.candidateId,
3329
+ kind: "candidate",
3330
+ label: args.candidateId,
2963
3331
  },
2964
3332
  status: "completed",
2965
3333
  startedAt: args.startedAt,
@@ -2967,7 +3335,7 @@ function evaluateSample(args) {
2967
3335
  durationMs,
2968
3336
  metrics,
2969
3337
  ...(usage ? { usage } : {}),
2970
- ...(cases ? { cases } : {}),
3338
+ cases,
2971
3339
  feedback,
2972
3340
  };
2973
3341
  }
@@ -2976,7 +3344,7 @@ function normalizeSampleJobOutput(value) {
2976
3344
  return null;
2977
3345
  }
2978
3346
  const record = value;
2979
- if (record.ok !== true || typeof record.subjectId !== "string") {
3347
+ if (record.ok !== true || typeof record.candidateId !== "string") {
2980
3348
  return null;
2981
3349
  }
2982
3350
  const files = Array.isArray(record.files)
@@ -2991,7 +3359,7 @@ function normalizeSampleJobOutput(value) {
2991
3359
  return null;
2992
3360
  }
2993
3361
  return {
2994
- subjectId: record.subjectId,
3362
+ candidateId: record.candidateId,
2995
3363
  attemptIndex: record.attemptIndex,
2996
3364
  sample,
2997
3365
  fileChanges: Array.isArray(record.fileChanges)
@@ -3003,12 +3371,72 @@ function normalizeSampleJobOutput(value) {
3003
3371
  : traceFilePaths(files),
3004
3372
  };
3005
3373
  }
3006
- function normalizeEvaluationSampleOutputs(args) {
3007
- return args.jobs.flatMap((job) => {
3374
+ function normalizeEvaluationSampleOutputs(jobs) {
3375
+ return jobs.flatMap((job) => {
3008
3376
  const output = normalizeSampleJobOutput(job.output);
3009
- return output ? [{ jobs: [job], output }] : [];
3377
+ if (!output) {
3378
+ return [];
3379
+ }
3380
+ const caseId = readJobString(job.input, "caseId") ?? output.sample.cases?.[0]?.id ?? null;
3381
+ const durationMs = runtimeJobDurationMs(job) ?? output.sample.durationMs;
3382
+ const sample = caseId && typeof durationMs === "number" && Number.isFinite(durationMs)
3383
+ ? {
3384
+ ...output.sample,
3385
+ cases: runtimeTimedCaseResults({
3386
+ caseId,
3387
+ split: readJobEngineCaseSplit(job),
3388
+ status: output.sample.status === "error" ? "error" : "completed",
3389
+ durationMs,
3390
+ metrics: output.sample.metrics ?? {},
3391
+ cases: output.sample.cases,
3392
+ }),
3393
+ }
3394
+ : output.sample;
3395
+ return [{
3396
+ jobs: [job],
3397
+ output: {
3398
+ ...output,
3399
+ sample,
3400
+ },
3401
+ }];
3010
3402
  });
3011
3403
  }
3404
+ function runtimeTimedCaseResults(args) {
3405
+ const cases = args.cases?.length
3406
+ ? args.cases
3407
+ : [{
3408
+ id: args.caseId,
3409
+ status: args.status,
3410
+ metrics: args.metrics,
3411
+ }];
3412
+ return cases.map((entry) => ({
3413
+ ...entry,
3414
+ ...(!entry.split && args.split && entry.id === args.caseId ? { split: args.split } : {}),
3415
+ status: entry.status ?? args.status,
3416
+ metrics: entry.metrics ?? args.metrics,
3417
+ durationMs: args.durationMs,
3418
+ }));
3419
+ }
3420
+ function readJobEngineCaseSplit(job) {
3421
+ const input = jsonRecord(job.input);
3422
+ const execution = jsonRecord(input.execution);
3423
+ const metadata = jsonRecord(execution.metadata);
3424
+ const engineCase = jsonRecord(metadata.engineCase);
3425
+ const split = engineCase.split;
3426
+ return typeof split === "string" && split.trim().length > 0
3427
+ ? split.trim()
3428
+ : undefined;
3429
+ }
3430
+ function runtimeJobDurationMs(job) {
3431
+ if (typeof job.startedAt !== "string" || typeof job.finishedAt !== "string") {
3432
+ return undefined;
3433
+ }
3434
+ const startedMs = Date.parse(job.startedAt);
3435
+ const finishedMs = Date.parse(job.finishedAt);
3436
+ return Number.isFinite(startedMs) && Number.isFinite(finishedMs)
3437
+ ? Math.max(0, finishedMs - startedMs)
3438
+ : undefined;
3439
+ }
3012
3440
  function meanFinite(values) {
3013
3441
  const finite = values.filter((value) => typeof value === "number" && Number.isFinite(value));
3014
3442
  if (finite.length === 0) {
@@ -3039,12 +3467,12 @@ function withJobUsage(sample, _jobs, attemptJob) {
3039
3467
  usage,
3040
3468
  };
3041
3469
  }
3042
- function normalizeSubjectRevisionJobOutput(value) {
3470
+ function normalizeCandidateRevisionJobOutput(value) {
3043
3471
  if (!value || typeof value !== "object" || Array.isArray(value)) {
3044
3472
  return null;
3045
3473
  }
3046
3474
  const record = value;
3047
- if (record.ok !== true || typeof record.subjectId !== "string") {
3475
+ if (record.ok !== true || typeof record.candidateId !== "string") {
3048
3476
  return null;
3049
3477
  }
3050
3478
  const files = Array.isArray(record.files)
@@ -3056,7 +3484,7 @@ function normalizeSubjectRevisionJobOutput(value) {
3056
3484
  }
3057
3485
  const usage = normalizeUsageSummary(record.usage);
3058
3486
  return {
3059
- subjectId: record.subjectId,
3487
+ candidateId: record.candidateId,
3060
3488
  attemptIndex: record.attemptIndex,
3061
3489
  baseId: typeof record.baseId === "string" && record.baseId.length > 0
3062
3490
  ? record.baseId
@@ -3072,7 +3500,7 @@ function normalizeSubjectRevisionJobOutput(value) {
3072
3500
  ...(usage ? { usage } : {}),
3073
3501
  };
3074
3502
  }
3075
- function errorEvaluationSamplesFromJobs(jobs, subjectId, attemptIndex, completedSampleKeys) {
3503
+ function errorEvaluationSamplesFromJobs(jobs, candidateId, attemptIndex, completedSampleKeys) {
3076
3504
  const groups = new Map();
3077
3505
  for (const job of jobs) {
3078
3506
  const key = evaluationSampleGroupKeyFromJob(job);
@@ -3082,40 +3510,44 @@ function errorEvaluationSamplesFromJobs(jobs, subjectId, attemptIndex, completed
3082
3510
  groups.set(key, [...(groups.get(key) ?? []), job]);
3083
3511
  }
3084
3512
  return [...groups.values()]
3085
- .map((group) => errorEvaluationSampleFromJobGroup(group, subjectId, attemptIndex))
3513
+ .map((group) => errorEvaluationSampleFromJobGroup(group, candidateId, attemptIndex))
3086
3514
  .filter((sample) => sample !== null);
3087
3515
  }
3088
- function errorEvaluationSampleFromJobGroup(jobs, subjectId, attemptIndex) {
3516
+ function errorEvaluationSampleFromJobGroup(jobs, candidateId, attemptIndex) {
3089
3517
  const job = jobs[0];
3090
3518
  if (!job) {
3091
3519
  return null;
3092
3520
  }
3093
3521
  const sampleIndex = readOptionalJobNumber(job.input, "sampleIndex");
3094
3522
  const caseId = readJobString(job.input, "caseId");
3523
+ const split = readJobEngineCaseSplit(job);
3095
3524
  if (sampleIndex === null || !caseId) {
3096
3525
  return null;
3097
3526
  }
3098
3527
  const startedAt = minIsoTimestamp(jobs.map((entry) => entry.startedAt ?? entry.createdAt));
3099
3528
  const finishedAt = maxIsoTimestamp(jobs.map((entry) => entry.finishedAt ?? entry.updatedAt ?? entry.startedAt));
3529
+ const durationMs = startedAt && finishedAt
3530
+ ? Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt))
3531
+ : undefined;
3100
3532
  const error = summarizeEvaluationJobErrors(jobs) ?? "Evaluation job did not produce a valid sample.";
3101
3533
  return {
3102
3534
  id: `${caseId}__sample_${String(sampleIndex + 1).padStart(3, "0")}`,
3103
3535
  index: sampleIndex,
3104
- subject: {
3105
- id: subjectId,
3106
- kind: "subject",
3107
- label: subjectId,
3536
+ candidate: {
3537
+ id: candidateId,
3538
+ kind: "candidate",
3539
+ label: candidateId,
3108
3540
  },
3109
3541
  status: "error",
3110
3542
  ...(startedAt ? { startedAt } : {}),
3111
3543
  ...(finishedAt ? { finishedAt } : {}),
3112
- ...(startedAt && finishedAt
3113
- ? { durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)) }
3114
- : {}),
3544
+ ...(durationMs !== undefined ? { durationMs } : {}),
3115
3545
  ...(error ? { error } : {}),
3116
3546
  cases: [{
3117
3547
  id: caseId,
3548
+ ...(split ? { split } : {}),
3118
3549
  status: "error",
3550
+ ...(durationMs !== undefined ? { durationMs } : {}),
3119
3551
  metrics: {},
3120
3552
  ...(error ? { feedback: { summary: error } } : {}),
3121
3553
  }],
@@ -3171,13 +3603,13 @@ function compareSampleOutputs(left, right) {
3171
3603
  }
3172
3604
  return left.sample.id.localeCompare(right.sample.id);
3173
3605
  }
3174
- function createEvaluationRecord(subjectId, subjectName, rawSamples) {
3175
- const samples = mergeEvaluationSampleRecords(rawSamples).map((sample) => subjectName
3606
+ function createEvaluationRecord(candidateId, candidateName, rawSamples) {
3607
+ const samples = mergeEvaluationSampleRecords(rawSamples).map((sample) => candidateName
3176
3608
  ? {
3177
3609
  ...sample,
3178
- subject: {
3179
- ...sample.subject,
3180
- label: subjectName,
3610
+ candidate: {
3611
+ ...sample.candidate,
3612
+ label: candidateName,
3181
3613
  },
3182
3614
  }
3183
3615
  : sample);
@@ -3191,10 +3623,10 @@ function createEvaluationRecord(subjectId, subjectName, rawSamples) {
3191
3623
  const errorSampleCount = samples.filter((sample) => sample.status === "error")
3192
3624
  .length;
3193
3625
  return {
3194
- subject: {
3195
- id: subjectId,
3196
- kind: "subject",
3197
- ...(subjectName ? { label: subjectName } : {}),
3626
+ candidate: {
3627
+ id: candidateId,
3628
+ kind: "candidate",
3629
+ ...(candidateName ? { label: candidateName } : {}),
3198
3630
  },
3199
3631
  status: samples.length > 0 && completedSampleCount === samples.length
3200
3632
  ? "completed"
@@ -3215,7 +3647,7 @@ function createEvaluationRecord(subjectId, subjectName, rawSamples) {
3215
3647
  samples,
3216
3648
  };
3217
3649
  }
3218
- function normalizedSubjectDisplayName(value) {
3650
+ function normalizedCandidateDisplayName(value) {
3219
3651
  const normalized = value?.trim();
3220
3652
  return normalized ? normalized : null;
3221
3653
  }
@@ -3263,7 +3695,7 @@ function mergeEvaluationSampleGroup(group) {
3263
3695
  return {
3264
3696
  id: `sample_${String(first.index + 1).padStart(3, "0")}`,
3265
3697
  index: first.index,
3266
- subject: first.subject,
3698
+ candidate: first.candidate,
3267
3699
  status: mergeEvaluationSampleStatus(group),
3268
3700
  ...(startedAt ? { startedAt } : {}),
3269
3701
  ...(finishedAt ? { finishedAt } : {}),
@@ -3355,35 +3787,49 @@ function aggregateCaseStatus(results) {
3355
3787
  }
3356
3788
  return undefined;
3357
3789
  }
3358
- function evaluationMeanMetrics(evaluation) {
3359
- const entries = Object.entries(evaluation.metrics ?? {}).filter((entry) => Number.isFinite(entry[1].mean));
3360
- return entries.length > 0
3361
- ? Object.fromEntries(entries.map(([key, stats]) => [key, Number(stats.mean.toFixed(3))]))
3362
- : undefined;
3363
- }
3364
- function selectSubject(args) {
3365
- let selected = args.previousSubject;
3366
- for (const subject of args.subjects) {
3367
- if (!selected || hasHigherScore(subject, selected)) {
3368
- selected = subject;
3790
+ function selectCandidate(args) {
3791
+ let selected = args.previousCandidate;
3792
+ for (const candidate of args.candidates) {
3793
+ if (!selected || hasHigherEvaluationMetric(candidate, selected, args.selection)) {
3794
+ selected = candidate;
3369
3795
  }
3370
3796
  }
3371
3797
  return selected;
3372
3798
  }
3373
- function hasHigherScore(subject, incumbent) {
3374
- const subjectValue = readMetric(subject, "score");
3375
- const incumbentValue = readMetric(incumbent, "score");
3376
- if (subjectValue == null) {
3799
+ function hasHigherEvaluationMetric(candidate, incumbent, selection) {
3800
+ const metric = selection?.metric ?? "score";
3801
+ const candidateValue = readEvaluationSelectionMean(candidate.eval, metric, selection?.caseIds);
3802
+ const incumbentValue = readEvaluationSelectionMean(incumbent.eval, metric, selection?.caseIds);
3803
+ if (candidateValue == null) {
3377
3804
  return false;
3378
3805
  }
3379
3806
  if (incumbentValue == null) {
3380
3807
  return true;
3381
3808
  }
3382
- return subjectValue > incumbentValue;
3809
+ return candidateValue > incumbentValue;
3810
+ }
3811
+ function readEvaluationSelectionMean(evaluation, metric, caseIds) {
3812
+ const stats = readEvaluationSelectionStats(evaluation, metric, caseIds);
3813
+ return stats ? stats.mean : null;
3383
3814
  }
3384
- function readMetric(subject, metric) {
3385
- const direct = subject.metrics?.[metric];
3386
- return typeof direct === "number" && Number.isFinite(direct) ? direct : null;
3815
+ function readEvaluationSelectionStats(evaluation, metric, caseIds) {
3816
+ if (!caseIds) {
3817
+ const direct = evaluation?.metrics?.[metric];
3818
+ return direct && Number.isFinite(direct.mean) ? direct : null;
3819
+ }
3820
+ if (caseIds.length === 0) {
3821
+ return null;
3822
+ }
3823
+ const allowed = new Set(caseIds);
3824
+ const values = (evaluation?.samples ?? [])
3825
+ .flatMap((sample) => sample.cases ?? [])
3826
+ .flatMap((caseResult) => {
3827
+ const metricValue = caseResult.metrics[metric];
3828
+ return allowed.has(caseResult.id) && typeof metricValue === "number" && Number.isFinite(metricValue)
3829
+ ? [metricValue]
3830
+ : [];
3831
+ });
3832
+ return values.length > 0 ? metricStats(values) : null;
3387
3833
  }
3388
3834
  function metricStats(values) {
3389
3835
  const count = values.length;
@@ -3501,7 +3947,7 @@ function isEvaluationSampleRecord(value) {
3501
3947
  !Array.isArray(value) &&
3502
3948
  typeof record.id === "string" &&
3503
3949
  typeof record.index === "number" &&
3504
- typeof record.subject === "object" &&
3950
+ typeof record.candidate === "object" &&
3505
3951
  isEvaluationSampleStatus(record.status) &&
3506
3952
  hasOperationalCaseStatuses(record.cases));
3507
3953
  }